diff --git "a/checkpoint-5000/trainer_state.json" "b/checkpoint-5000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-5000/trainer_state.json" @@ -0,0 +1,95034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.96875, + "epoch": 0.0002, + "grad_norm": 1.330741047859192, + "kl": 0.0, + "learning_rate": 0.0, + "loss": -0.0091, + "num_tokens": 7118.0, + "reward": 0.78277587890625, + "reward_std": 0.019820334389805794, + "rewards//mean": 0.78277587890625, + "rewards//std": 0.025473881512880325, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.125, + "epoch": 0.0004, + "grad_norm": 1.3642058372497559, + "kl": 8.770232670940459e-05, + "learning_rate": 2e-08, + "loss": -0.0214, + "num_tokens": 14310.0, + "reward": 0.7691650390625, + "reward_std": 0.020717589184641838, + "rewards//mean": 0.7691650390625, + "rewards//std": 0.021729234606027603, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.390625, + "epoch": 0.0006, + "grad_norm": 1.5292798280715942, + "kl": 0.0004639196158677805, + "learning_rate": 4e-08, + "loss": 0.0266, + "num_tokens": 21599.0, + "reward": 0.776611328125, + "reward_std": 0.019365321844816208, + "rewards//mean": 0.776611328125, + "rewards//std": 0.02419889159500599, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.609375, + "epoch": 0.0008, + "grad_norm": 1.3712643384933472, + "kl": 0.0005618960894935299, + "learning_rate": 6e-08, + "loss": 0.0088, + "num_tokens": 28878.0, + "reward": 0.75933837890625, + "reward_std": 0.02479785680770874, + "rewards//mean": 0.75933837890625, + "rewards//std": 0.030126258730888367, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.001, + "grad_norm": 1.3365423679351807, + "kl": 0.0006071559328120202, + "learning_rate": 8e-08, + "loss": 0.0, + "num_tokens": 36222.0, + "reward": 0.69512939453125, + "reward_std": 0.02599060907959938, + "rewards//mean": 0.69512939453125, + "rewards//std": 0.027458040043711662, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.421875, + "epoch": 0.0012, + "grad_norm": 1.3917468786239624, + "kl": 0.0006030387594364583, + "learning_rate": 1e-07, + "loss": 0.0102, + "num_tokens": 43513.0, + "reward": 0.7738037109375, + "reward_std": 0.0216459259390831, + "rewards//mean": 0.7738037109375, + "rewards//std": 0.0289713554084301, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "epoch": 0.0014, + "grad_norm": 1.3228261470794678, + "kl": 0.0006229907703527715, + "learning_rate": 1.2e-07, + "loss": 0.0195, + "num_tokens": 50793.0, + "reward": 0.7625732421875, + "reward_std": 0.02057132124900818, + "rewards//mean": 0.7625732421875, + "rewards//std": 0.02158103883266449, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.421875, + "epoch": 0.0016, + "grad_norm": 1.3880749940872192, + "kl": 0.000710429361788556, + "learning_rate": 1.4e-07, + "loss": 0.0046, + "num_tokens": 58044.0, + "reward": 0.76055908203125, + "reward_std": 0.016441747546195984, + "rewards//mean": 0.76055908203125, + "rewards//std": 0.017185240983963013, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.546875, + "epoch": 0.0018, + "grad_norm": 1.2208527326583862, + "kl": 0.00050989046212635, + "learning_rate": 1.6e-07, + "loss": 0.0133, + "num_tokens": 65271.0, + "reward": 0.7808837890625, + "reward_std": 0.019830580800771713, + "rewards//mean": 0.7808837890625, + "rewards//std": 0.021762648597359657, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.671875, + "epoch": 0.002, + "grad_norm": 1.3162200450897217, + "kl": 0.0005903646742808633, + "learning_rate": 1.8e-07, + "loss": 0.005, + "num_tokens": 72474.0, + "reward": 0.7845458984375, + "reward_std": 0.020996030420064926, + "rewards//mean": 0.7845458984375, + "rewards//std": 0.022371798753738403, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.4375, + "epoch": 0.0022, + "grad_norm": 1.4269388914108276, + "kl": 0.0006086856737965718, + "learning_rate": 2e-07, + "loss": 0.0064, + "num_tokens": 79902.0, + "reward": 0.765625, + "reward_std": 0.01927652768790722, + "rewards//mean": 0.765625, + "rewards//std": 0.0238167904317379, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.34375, + "epoch": 0.0024, + "grad_norm": 1.2612816095352173, + "kl": 0.0005421320165623911, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0213, + "num_tokens": 87092.0, + "reward": 0.77197265625, + "reward_std": 0.021374139934778214, + "rewards//mean": 0.77197265625, + "rewards//std": 0.023287175223231316, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "epoch": 0.0026, + "grad_norm": 1.2021592855453491, + "kl": 0.0006030967088008765, + "learning_rate": 2.4e-07, + "loss": 0.0251, + "num_tokens": 94260.0, + "reward": 0.750732421875, + "reward_std": 0.02469783090054989, + "rewards//mean": 0.750732421875, + "rewards//std": 0.02573174051940441, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.296875, + "epoch": 0.0028, + "grad_norm": 1.3700822591781616, + "kl": 0.0006516266366816126, + "learning_rate": 2.6e-07, + "loss": -0.0234, + "num_tokens": 101487.0, + "reward": 0.7679443359375, + "reward_std": 0.02249380201101303, + "rewards//mean": 0.7679443359375, + "rewards//std": 0.024819916114211082, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.109375, + "epoch": 0.003, + "grad_norm": 1.255008578300476, + "kl": 0.0006202829972608015, + "learning_rate": 2.8e-07, + "loss": 0.0087, + "num_tokens": 108790.0, + "reward": 0.78741455078125, + "reward_std": 0.024374691769480705, + "rewards//mean": 0.78741455078125, + "rewards//std": 0.03397418186068535, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.3125, + "epoch": 0.0032, + "grad_norm": 1.4460698366165161, + "kl": 0.0005603266690741293, + "learning_rate": 3e-07, + "loss": 0.0009, + "num_tokens": 115938.0, + "reward": 0.7344970703125, + "reward_std": 0.0203128419816494, + "rewards//mean": 0.7344970703125, + "rewards//std": 0.025732623413205147, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0625, + "epoch": 0.0034, + "grad_norm": 1.6037236452102661, + "kl": 0.0005951909479335882, + "learning_rate": 3.2e-07, + "loss": -0.0142, + "num_tokens": 123174.0, + "reward": 0.78466796875, + "reward_std": 0.023104771971702576, + "rewards//mean": 0.78466796875, + "rewards//std": 0.027370311319828033, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.375, + "epoch": 0.0036, + "grad_norm": 1.6774663925170898, + "kl": 0.0006116131335147657, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0181, + "num_tokens": 130430.0, + "reward": 0.78021240234375, + "reward_std": 0.019533738493919373, + "rewards//mean": 0.78021240234375, + "rewards//std": 0.023242847993969917, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.3125, + "epoch": 0.0038, + "grad_norm": 1.415427327156067, + "kl": 0.0005582675548794214, + "learning_rate": 3.6e-07, + "loss": -0.0006, + "num_tokens": 137666.0, + "reward": 0.78228759765625, + "reward_std": 0.025690637528896332, + "rewards//mean": 0.78228759765625, + "rewards//std": 0.029472993686795235, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.734375, + "epoch": 0.004, + "grad_norm": 1.195767879486084, + "kl": 0.000582047134230379, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0003, + "num_tokens": 144801.0, + "reward": 0.78521728515625, + "reward_std": 0.019779382273554802, + "rewards//mean": 0.78521728515625, + "rewards//std": 0.0258033387362957, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.640625, + "epoch": 0.0042, + "grad_norm": 1.3072420358657837, + "kl": 0.000576665155676892, + "learning_rate": 4e-07, + "loss": -0.0325, + "num_tokens": 151978.0, + "reward": 0.75439453125, + "reward_std": 0.021154703572392464, + "rewards//mean": 0.75439453125, + "rewards//std": 0.030973635613918304, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.15625, + "epoch": 0.0044, + "grad_norm": 1.4500961303710938, + "kl": 0.0006909124494995922, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0228, + "num_tokens": 159172.0, + "reward": 0.77825927734375, + "reward_std": 0.020673532038927078, + "rewards//mean": 0.77825927734375, + "rewards//std": 0.025861937552690506, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.640625, + "epoch": 0.0046, + "grad_norm": 1.5054388046264648, + "kl": 0.0005879784148419276, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0156, + "num_tokens": 166493.0, + "reward": 0.77227783203125, + "reward_std": 0.018953248858451843, + "rewards//mean": 0.77227783203125, + "rewards//std": 0.021717648953199387, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.359375, + "epoch": 0.0048, + "grad_norm": 1.4031546115875244, + "kl": 0.0005716818413930014, + "learning_rate": 4.6e-07, + "loss": 0.0093, + "num_tokens": 173668.0, + "reward": 0.78045654296875, + "reward_std": 0.02023874595761299, + "rewards//mean": 0.78045654296875, + "rewards//std": 0.024529967457056046, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.640625, + "epoch": 0.005, + "grad_norm": 1.3101736307144165, + "kl": 0.0006665766013611574, + "learning_rate": 4.8e-07, + "loss": 0.0105, + "num_tokens": 180877.0, + "reward": 0.75958251953125, + "reward_std": 0.024649150669574738, + "rewards//mean": 0.75958251953125, + "rewards//std": 0.029214540496468544, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.15625, + "epoch": 0.0052, + "grad_norm": 1.2071986198425293, + "kl": 0.000541045083082281, + "learning_rate": 5e-07, + "loss": 0.0072, + "num_tokens": 188143.0, + "reward": 0.7794189453125, + "reward_std": 0.01972513645887375, + "rewards//mean": 0.7794189453125, + "rewards//std": 0.02137240581214428, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.96875, + "epoch": 0.0054, + "grad_norm": 1.38238525390625, + "kl": 0.0006756391485396307, + "learning_rate": 5.2e-07, + "loss": -0.0255, + "num_tokens": 195205.0, + "reward": 0.78662109375, + "reward_std": 0.026358000934123993, + "rewards//mean": 0.78662109375, + "rewards//std": 0.03140076994895935, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.609375, + "epoch": 0.0056, + "grad_norm": 1.4248371124267578, + "kl": 0.0006071090174373239, + "learning_rate": 5.4e-07, + "loss": 0.0076, + "num_tokens": 202444.0, + "reward": 0.7886962890625, + "reward_std": 0.017779318615794182, + "rewards//mean": 0.7886962890625, + "rewards//std": 0.028937894850969315, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.203125, + "epoch": 0.0058, + "grad_norm": 1.2917896509170532, + "kl": 0.0005683702693204395, + "learning_rate": 5.6e-07, + "loss": -0.0167, + "num_tokens": 209737.0, + "reward": 0.78009033203125, + "reward_std": 0.018400968983769417, + "rewards//mean": 0.78009033203125, + "rewards//std": 0.01979215256869793, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.140625, + "epoch": 0.006, + "grad_norm": 1.7145577669143677, + "kl": 0.0005475290890899487, + "learning_rate": 5.8e-07, + "loss": 0.0202, + "num_tokens": 216922.0, + "reward": 0.771728515625, + "reward_std": 0.022254066541790962, + "rewards//mean": 0.771728515625, + "rewards//std": 0.028695465996861458, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.34375, + "epoch": 0.0062, + "grad_norm": 1.3456392288208008, + "kl": 0.0006371552917698864, + "learning_rate": 6e-07, + "loss": -0.0209, + "num_tokens": 224016.0, + "reward": 0.77581787109375, + "reward_std": 0.018139831721782684, + "rewards//mean": 0.77581787109375, + "rewards//std": 0.022741157561540604, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.390625, + "epoch": 0.0064, + "grad_norm": 1.3876322507858276, + "kl": 0.0006511218271043617, + "learning_rate": 6.2e-07, + "loss": -0.0293, + "num_tokens": 231225.0, + "reward": 0.80853271484375, + "reward_std": 0.019414987415075302, + "rewards//mean": 0.80853271484375, + "rewards//std": 0.02734368108212948, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.15625, + "epoch": 0.0066, + "grad_norm": 1.3585008382797241, + "kl": 0.0005951397397438996, + "learning_rate": 6.4e-07, + "loss": -0.0266, + "num_tokens": 238523.0, + "reward": 0.7796630859375, + "reward_std": 0.014973786659538746, + "rewards//mean": 0.7796630859375, + "rewards//std": 0.021084317937493324, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.078125, + "epoch": 0.0068, + "grad_norm": 1.3869155645370483, + "kl": 0.0005850151464983355, + "learning_rate": 6.6e-07, + "loss": -0.0237, + "num_tokens": 245744.0, + "reward": 0.77142333984375, + "reward_std": 0.02189992368221283, + "rewards//mean": 0.77142333984375, + "rewards//std": 0.025770464912056923, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.515625, + "epoch": 0.007, + "grad_norm": 1.2167606353759766, + "kl": 0.0005352018706616946, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0015, + "num_tokens": 253097.0, + "reward": 0.78759765625, + "reward_std": 0.01706944778561592, + "rewards//mean": 0.78759765625, + "rewards//std": 0.0200815349817276, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.6875, + "epoch": 0.0072, + "grad_norm": 1.6206326484680176, + "kl": 0.0005966624157736078, + "learning_rate": 7e-07, + "loss": 0.0215, + "num_tokens": 260269.0, + "reward": 0.78643798828125, + "reward_std": 0.030387528240680695, + "rewards//mean": 0.78643798828125, + "rewards//std": 0.032999563962221146, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.28125, + "epoch": 0.0074, + "grad_norm": 1.5001860857009888, + "kl": 0.000601245163124986, + "learning_rate": 7.2e-07, + "loss": 0.0216, + "num_tokens": 267575.0, + "reward": 0.78106689453125, + "reward_std": 0.01981106773018837, + "rewards//mean": 0.78106689453125, + "rewards//std": 0.02713863179087639, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.0076, + "grad_norm": 1.373761534690857, + "kl": 0.0006264550174819306, + "learning_rate": 7.4e-07, + "loss": -0.0125, + "num_tokens": 274877.0, + "reward": 0.7655029296875, + "reward_std": 0.021658534184098244, + "rewards//mean": 0.7655029296875, + "rewards//std": 0.024832110852003098, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.015625, + "epoch": 0.0078, + "grad_norm": 1.3701711893081665, + "kl": 0.000635052150755655, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0123, + "num_tokens": 282182.0, + "reward": 0.81976318359375, + "reward_std": 0.01955665647983551, + "rewards//mean": 0.81976318359375, + "rewards//std": 0.02329099550843239, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.046875, + "epoch": 0.008, + "grad_norm": 1.3612878322601318, + "kl": 0.0006571996127604507, + "learning_rate": 7.799999999999999e-07, + "loss": -0.0062, + "num_tokens": 289409.0, + "reward": 0.768798828125, + "reward_std": 0.020438021048903465, + "rewards//mean": 0.768798828125, + "rewards//std": 0.02359071746468544, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.28125, + "epoch": 0.0082, + "grad_norm": 1.430421233177185, + "kl": 0.0006672534655081108, + "learning_rate": 8e-07, + "loss": -0.0177, + "num_tokens": 296667.0, + "reward": 0.74969482421875, + "reward_std": 0.028876055032014847, + "rewards//mean": 0.74969482421875, + "rewards//std": 0.035559069365262985, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.375, + "epoch": 0.0084, + "grad_norm": 1.7381017208099365, + "kl": 0.0005666987308359239, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0104, + "num_tokens": 303899.0, + "reward": 0.77166748046875, + "reward_std": 0.01432991586625576, + "rewards//mean": 0.77166748046875, + "rewards//std": 0.01917368359863758, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.4375, + "epoch": 0.0086, + "grad_norm": 1.449127197265625, + "kl": 0.0006344818830257282, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0052, + "num_tokens": 311055.0, + "reward": 0.76495361328125, + "reward_std": 0.020354636013507843, + "rewards//mean": 0.76495361328125, + "rewards//std": 0.026259826496243477, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.453125, + "epoch": 0.0088, + "grad_norm": 1.7169703245162964, + "kl": 0.0006883836977067403, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0007, + "num_tokens": 318380.0, + "reward": 0.7899169921875, + "reward_std": 0.025784596800804138, + "rewards//mean": 0.7899169921875, + "rewards//std": 0.02802586741745472, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.953125, + "epoch": 0.009, + "grad_norm": 1.984134316444397, + "kl": 0.0006288373369898181, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0075, + "num_tokens": 325617.0, + "reward": 0.7906494140625, + "reward_std": 0.023792359977960587, + "rewards//mean": 0.7906494140625, + "rewards//std": 0.025645412504673004, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.203125, + "epoch": 0.0092, + "grad_norm": 1.6195120811462402, + "kl": 0.0007290957728400826, + "learning_rate": 9e-07, + "loss": 0.0618, + "num_tokens": 332830.0, + "reward": 0.78948974609375, + "reward_std": 0.022750750184059143, + "rewards//mean": 0.78948974609375, + "rewards//std": 0.02881542406976223, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "epoch": 0.0094, + "grad_norm": 1.561281681060791, + "kl": 0.0007416203734464943, + "learning_rate": 9.2e-07, + "loss": -0.0551, + "num_tokens": 340006.0, + "reward": 0.77471923828125, + "reward_std": 0.02468026988208294, + "rewards//mean": 0.77471923828125, + "rewards//std": 0.026181310415267944, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.40625, + "epoch": 0.0096, + "grad_norm": 1.7003390789031982, + "kl": 0.0007731970545137301, + "learning_rate": 9.399999999999999e-07, + "loss": 0.0112, + "num_tokens": 347160.0, + "reward": 0.77001953125, + "reward_std": 0.01727737858891487, + "rewards//mean": 0.77001953125, + "rewards//std": 0.02261173538863659, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.984375, + "epoch": 0.0098, + "grad_norm": 1.3565831184387207, + "kl": 0.0006613830701098777, + "learning_rate": 9.6e-07, + "loss": 0.0123, + "num_tokens": 354311.0, + "reward": 0.77838134765625, + "reward_std": 0.02317187562584877, + "rewards//mean": 0.77838134765625, + "rewards//std": 0.029751086607575417, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.828125, + "epoch": 0.01, + "grad_norm": 1.6535234451293945, + "kl": 0.0006520196839119308, + "learning_rate": 9.8e-07, + "loss": 0.0201, + "num_tokens": 361492.0, + "reward": 0.76300048828125, + "reward_std": 0.01997113972902298, + "rewards//mean": 0.76300048828125, + "rewards//std": 0.027814079076051712, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.234375, + "epoch": 0.0102, + "grad_norm": 1.2742284536361694, + "kl": 0.0005557646909437608, + "learning_rate": 1e-06, + "loss": 0.0124, + "num_tokens": 368691.0, + "reward": 0.80963134765625, + "reward_std": 0.021810419857501984, + "rewards//mean": 0.80963134765625, + "rewards//std": 0.02619229443371296, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.203125, + "epoch": 0.0104, + "grad_norm": 1.3828446865081787, + "kl": 0.0007919049021438695, + "learning_rate": 9.999998993000298e-07, + "loss": -0.0056, + "num_tokens": 375848.0, + "reward": 0.7681884765625, + "reward_std": 0.021554943174123764, + "rewards//mean": 0.7681884765625, + "rewards//std": 0.021967574954032898, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.40625, + "epoch": 0.0106, + "grad_norm": 1.5640984773635864, + "kl": 0.0007231131385196932, + "learning_rate": 9.999995972001601e-07, + "loss": -0.0255, + "num_tokens": 382954.0, + "reward": 0.81829833984375, + "reward_std": 0.019432753324508667, + "rewards//mean": 0.81829833984375, + "rewards//std": 0.027830945327878, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.125, + "epoch": 0.0108, + "grad_norm": 1.1990188360214233, + "kl": 0.0006687208733637817, + "learning_rate": 9.999990937005123e-07, + "loss": 0.0193, + "num_tokens": 390074.0, + "reward": 0.76806640625, + "reward_std": 0.01621391624212265, + "rewards//mean": 0.76806640625, + "rewards//std": 0.02153645269572735, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "epoch": 0.011, + "grad_norm": 1.6972732543945312, + "kl": 0.0007027534302324057, + "learning_rate": 9.999983888012896e-07, + "loss": -0.0748, + "num_tokens": 397186.0, + "reward": 0.76104736328125, + "reward_std": 0.024184027686715126, + "rewards//mean": 0.76104736328125, + "rewards//std": 0.033339109271764755, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.875, + "epoch": 0.0112, + "grad_norm": 1.3704986572265625, + "kl": 0.0006907701172167435, + "learning_rate": 9.999974825027754e-07, + "loss": -0.0115, + "num_tokens": 404402.0, + "reward": 0.77001953125, + "reward_std": 0.02191847376525402, + "rewards//mean": 0.77001953125, + "rewards//std": 0.024523256346583366, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.640625, + "epoch": 0.0114, + "grad_norm": 1.2531187534332275, + "kl": 0.0006101112630858552, + "learning_rate": 9.999963748053354e-07, + "loss": -0.0107, + "num_tokens": 411603.0, + "reward": 0.78790283203125, + "reward_std": 0.02264866977930069, + "rewards//mean": 0.78790283203125, + "rewards//std": 0.025880077853798866, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.796875, + "epoch": 0.0116, + "grad_norm": 1.4559237957000732, + "kl": 0.0007085944671416655, + "learning_rate": 9.99995065709415e-07, + "loss": -0.0002, + "num_tokens": 418806.0, + "reward": 0.774658203125, + "reward_std": 0.014307827688753605, + "rewards//mean": 0.774658203125, + "rewards//std": 0.019855627790093422, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.234375, + "epoch": 0.0118, + "grad_norm": 1.4714115858078003, + "kl": 0.0007497215774492361, + "learning_rate": 9.999935552155421e-07, + "loss": -0.023, + "num_tokens": 425997.0, + "reward": 0.78729248046875, + "reward_std": 0.024219848215579987, + "rewards//mean": 0.78729248046875, + "rewards//std": 0.02876337058842182, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.421875, + "epoch": 0.012, + "grad_norm": 1.4400527477264404, + "kl": 0.000683333542838227, + "learning_rate": 9.99991843324325e-07, + "loss": -0.0033, + "num_tokens": 433208.0, + "reward": 0.7880859375, + "reward_std": 0.019430354237556458, + "rewards//mean": 0.7880859375, + "rewards//std": 0.020351096987724304, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5, + "epoch": 0.0122, + "grad_norm": 1.3697925806045532, + "kl": 0.0007930825813673437, + "learning_rate": 9.999899300364532e-07, + "loss": -0.0164, + "num_tokens": 440336.0, + "reward": 0.72802734375, + "reward_std": 0.0178694948554039, + "rewards//mean": 0.72802734375, + "rewards//std": 0.02182689495384693, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.796875, + "epoch": 0.0124, + "grad_norm": 1.672784447669983, + "kl": 0.0007618076779181138, + "learning_rate": 9.999878153526972e-07, + "loss": -0.0245, + "num_tokens": 447499.0, + "reward": 0.76947021484375, + "reward_std": 0.02417871728539467, + "rewards//mean": 0.76947021484375, + "rewards//std": 0.028181197121739388, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.453125, + "epoch": 0.0126, + "grad_norm": 1.3753050565719604, + "kl": 0.0007644057623110712, + "learning_rate": 9.999854992739093e-07, + "loss": 0.0211, + "num_tokens": 454680.0, + "reward": 0.7822265625, + "reward_std": 0.026227232068777084, + "rewards//mean": 0.7822265625, + "rewards//std": 0.028194153681397438, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.703125, + "epoch": 0.0128, + "grad_norm": 1.6537467241287231, + "kl": 0.0009215796599164605, + "learning_rate": 9.999829818010219e-07, + "loss": -0.0168, + "num_tokens": 461853.0, + "reward": 0.7657470703125, + "reward_std": 0.017894916236400604, + "rewards//mean": 0.7657470703125, + "rewards//std": 0.020786413922905922, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.328125, + "epoch": 0.013, + "grad_norm": 1.65345299243927, + "kl": 0.0010513342640479095, + "learning_rate": 9.999802629350491e-07, + "loss": -0.01, + "num_tokens": 469034.0, + "reward": 0.76171875, + "reward_std": 0.022089308127760887, + "rewards//mean": 0.76171875, + "rewards//std": 0.025296252220869064, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.40625, + "epoch": 0.0132, + "grad_norm": 1.5205495357513428, + "kl": 0.0009956455323845148, + "learning_rate": 9.999773426770863e-07, + "loss": -0.0109, + "num_tokens": 476180.0, + "reward": 0.78424072265625, + "reward_std": 0.024543695151805878, + "rewards//mean": 0.78424072265625, + "rewards//std": 0.029353594407439232, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.546875, + "epoch": 0.0134, + "grad_norm": 1.4638257026672363, + "kl": 0.001019660776364617, + "learning_rate": 9.999742210283097e-07, + "loss": -0.0003, + "num_tokens": 483503.0, + "reward": 0.7811279296875, + "reward_std": 0.025582896545529366, + "rewards//mean": 0.7811279296875, + "rewards//std": 0.027664894238114357, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.65625, + "epoch": 0.0136, + "grad_norm": 1.6010429859161377, + "kl": 0.0008906045331968926, + "learning_rate": 9.999708979899767e-07, + "loss": 0.0039, + "num_tokens": 490705.0, + "reward": 0.75555419921875, + "reward_std": 0.019358504563570023, + "rewards//mean": 0.75555419921875, + "rewards//std": 0.022732501849532127, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.140625, + "epoch": 0.0138, + "grad_norm": 1.4374825954437256, + "kl": 0.001124387577874586, + "learning_rate": 9.999673735634259e-07, + "loss": 0.0062, + "num_tokens": 497850.0, + "reward": 0.7647705078125, + "reward_std": 0.021568913012742996, + "rewards//mean": 0.7647705078125, + "rewards//std": 0.023203495889902115, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.8125, + "epoch": 0.014, + "grad_norm": 1.4206154346466064, + "kl": 0.0009053266476257704, + "learning_rate": 9.999636477500764e-07, + "loss": -0.0188, + "num_tokens": 505070.0, + "reward": 0.7135009765625, + "reward_std": 0.024471838027238846, + "rewards//mean": 0.7135009765625, + "rewards//std": 0.026939677074551582, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5, + "epoch": 0.0142, + "grad_norm": 1.4083281755447388, + "kl": 0.0011126465251436457, + "learning_rate": 9.999597205514296e-07, + "loss": 0.0126, + "num_tokens": 512190.0, + "reward": 0.76934814453125, + "reward_std": 0.020879924297332764, + "rewards//mean": 0.76934814453125, + "rewards//std": 0.028008239343762398, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.0144, + "grad_norm": 1.4578380584716797, + "kl": 0.0009499725492787547, + "learning_rate": 9.999555919690672e-07, + "loss": 0.0088, + "num_tokens": 519337.0, + "reward": 0.77325439453125, + "reward_std": 0.024022122845053673, + "rewards//mean": 0.77325439453125, + "rewards//std": 0.030140826478600502, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.328125, + "epoch": 0.0146, + "grad_norm": 1.2108057737350464, + "kl": 0.0008634059413452633, + "learning_rate": 9.99951262004652e-07, + "loss": -0.0089, + "num_tokens": 526590.0, + "reward": 0.76165771484375, + "reward_std": 0.018969528377056122, + "rewards//mean": 0.76165771484375, + "rewards//std": 0.02251504175364971, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.40625, + "epoch": 0.0148, + "grad_norm": 1.3392640352249146, + "kl": 0.0009661525109549984, + "learning_rate": 9.999467306599285e-07, + "loss": -0.0422, + "num_tokens": 533800.0, + "reward": 0.78167724609375, + "reward_std": 0.020327871665358543, + "rewards//mean": 0.78167724609375, + "rewards//std": 0.0216219462454319, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.171875, + "epoch": 0.015, + "grad_norm": 1.586457371711731, + "kl": 0.0010271763749187812, + "learning_rate": 9.999419979367214e-07, + "loss": 0.0093, + "num_tokens": 541099.0, + "reward": 0.76690673828125, + "reward_std": 0.024835947901010513, + "rewards//mean": 0.76690673828125, + "rewards//std": 0.0314127616584301, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.546875, + "epoch": 0.0152, + "grad_norm": 1.5043230056762695, + "kl": 0.0011063858037232421, + "learning_rate": 9.999370638369376e-07, + "loss": -0.0053, + "num_tokens": 548478.0, + "reward": 0.78643798828125, + "reward_std": 0.021762218326330185, + "rewards//mean": 0.78643798828125, + "rewards//std": 0.028848500922322273, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.984375, + "epoch": 0.0154, + "grad_norm": 1.4076570272445679, + "kl": 0.0011196136911166832, + "learning_rate": 9.99931928362564e-07, + "loss": -0.0076, + "num_tokens": 555701.0, + "reward": 0.73419189453125, + "reward_std": 0.018441949039697647, + "rewards//mean": 0.73419189453125, + "rewards//std": 0.02551010437309742, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.453125, + "epoch": 0.0156, + "grad_norm": 1.3934000730514526, + "kl": 0.0011642847530310974, + "learning_rate": 9.999265915156696e-07, + "loss": -0.0043, + "num_tokens": 562866.0, + "reward": 0.76373291015625, + "reward_std": 0.02121182158589363, + "rewards//mean": 0.76373291015625, + "rewards//std": 0.02198711968958378, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.09375, + "epoch": 0.0158, + "grad_norm": 1.4244978427886963, + "kl": 0.0011330637280480005, + "learning_rate": 9.999210532984038e-07, + "loss": 0.0034, + "num_tokens": 570096.0, + "reward": 0.77587890625, + "reward_std": 0.029779508709907532, + "rewards//mean": 0.77587890625, + "rewards//std": 0.0324925072491169, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.15625, + "epoch": 0.016, + "grad_norm": 1.356224536895752, + "kl": 0.0011438406654633582, + "learning_rate": 9.999153137129977e-07, + "loss": 0.0013, + "num_tokens": 577354.0, + "reward": 0.79473876953125, + "reward_std": 0.020995885133743286, + "rewards//mean": 0.79473876953125, + "rewards//std": 0.025779275223612785, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.65625, + "epoch": 0.0162, + "grad_norm": 1.492012858390808, + "kl": 0.0011650765591184609, + "learning_rate": 9.999093727617628e-07, + "loss": -0.0014, + "num_tokens": 584540.0, + "reward": 0.7685546875, + "reward_std": 0.020711876451969147, + "rewards//mean": 0.7685546875, + "rewards//std": 0.022681251168251038, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.4375, + "epoch": 0.0164, + "grad_norm": 1.5772998332977295, + "kl": 0.0014679357263958082, + "learning_rate": 9.999032304470924e-07, + "loss": 0.0055, + "num_tokens": 591856.0, + "reward": 0.78533935546875, + "reward_std": 0.021423671394586563, + "rewards//mean": 0.78533935546875, + "rewards//std": 0.024591602385044098, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.4375, + "epoch": 0.0166, + "grad_norm": 1.3216415643692017, + "kl": 0.0012835140369134024, + "learning_rate": 9.998968867714608e-07, + "loss": 0.0274, + "num_tokens": 598916.0, + "reward": 0.76458740234375, + "reward_std": 0.021460019052028656, + "rewards//mean": 0.76458740234375, + "rewards//std": 0.02895064279437065, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.796875, + "epoch": 0.0168, + "grad_norm": 1.3197364807128906, + "kl": 0.0013353173126233742, + "learning_rate": 9.998903417374226e-07, + "loss": -0.0198, + "num_tokens": 606231.0, + "reward": 0.77667236328125, + "reward_std": 0.017462503165006638, + "rewards//mean": 0.77667236328125, + "rewards//std": 0.020464492961764336, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.1875, + "epoch": 0.017, + "grad_norm": 1.5649313926696777, + "kl": 0.0013015906297368929, + "learning_rate": 9.998835953476147e-07, + "loss": -0.001, + "num_tokens": 613515.0, + "reward": 0.7845458984375, + "reward_std": 0.019052714109420776, + "rewards//mean": 0.7845458984375, + "rewards//std": 0.025384364649653435, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.65625, + "epoch": 0.0172, + "grad_norm": 1.4362589120864868, + "kl": 0.0012978223530808464, + "learning_rate": 9.998766476047545e-07, + "loss": 0.0137, + "num_tokens": 620725.0, + "reward": 0.77783203125, + "reward_std": 0.02120601385831833, + "rewards//mean": 0.77783203125, + "rewards//std": 0.029011299833655357, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.0625, + "epoch": 0.0174, + "grad_norm": 1.5230056047439575, + "kl": 0.0013773388782283291, + "learning_rate": 9.998694985116404e-07, + "loss": -0.0237, + "num_tokens": 628017.0, + "reward": 0.79998779296875, + "reward_std": 0.025107817724347115, + "rewards//mean": 0.79998779296875, + "rewards//std": 0.03313232585787773, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.984375, + "epoch": 0.0176, + "grad_norm": 1.4740111827850342, + "kl": 0.0013570786832133308, + "learning_rate": 9.99862148071152e-07, + "loss": -0.0411, + "num_tokens": 635232.0, + "reward": 0.78900146484375, + "reward_std": 0.02586870640516281, + "rewards//mean": 0.78900146484375, + "rewards//std": 0.03901592269539833, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.421875, + "epoch": 0.0178, + "grad_norm": 1.4893147945404053, + "kl": 0.0014846853155177087, + "learning_rate": 9.998545962862501e-07, + "loss": 0.0281, + "num_tokens": 642387.0, + "reward": 0.77752685546875, + "reward_std": 0.021953551098704338, + "rewards//mean": 0.77752685546875, + "rewards//std": 0.030816294252872467, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.53125, + "epoch": 0.018, + "grad_norm": 1.5940765142440796, + "kl": 0.0012421840074239299, + "learning_rate": 9.998468431599767e-07, + "loss": -0.0432, + "num_tokens": 649517.0, + "reward": 0.75830078125, + "reward_std": 0.024636201560497284, + "rewards//mean": 0.75830078125, + "rewards//std": 0.02638813480734825, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.484375, + "epoch": 0.0182, + "grad_norm": 1.4425054788589478, + "kl": 0.0017800273490138352, + "learning_rate": 9.998388886954545e-07, + "loss": -0.0329, + "num_tokens": 656684.0, + "reward": 0.752197265625, + "reward_std": 0.021423064172267914, + "rewards//mean": 0.752197265625, + "rewards//std": 0.026999453082680702, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.0184, + "grad_norm": 1.3329654932022095, + "kl": 0.0014921495458111167, + "learning_rate": 9.998307328958877e-07, + "loss": 0.0009, + "num_tokens": 664026.0, + "reward": 0.79754638671875, + "reward_std": 0.020358750596642494, + "rewards//mean": 0.79754638671875, + "rewards//std": 0.02264443039894104, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.34375, + "epoch": 0.0186, + "grad_norm": 1.391867995262146, + "kl": 0.001664792449446395, + "learning_rate": 9.998223757645617e-07, + "loss": -0.0091, + "num_tokens": 671192.0, + "reward": 0.7958984375, + "reward_std": 0.02496184967458248, + "rewards//mean": 0.7958984375, + "rewards//std": 0.029487434774637222, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.015625, + "epoch": 0.0188, + "grad_norm": 1.37995445728302, + "kl": 0.0015712101012468338, + "learning_rate": 9.998138173048423e-07, + "loss": -0.0167, + "num_tokens": 678425.0, + "reward": 0.78204345703125, + "reward_std": 0.021268179640173912, + "rewards//mean": 0.78204345703125, + "rewards//std": 0.024101657792925835, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.671875, + "epoch": 0.019, + "grad_norm": 1.6874161958694458, + "kl": 0.0021500648581422865, + "learning_rate": 9.99805057520177e-07, + "loss": -0.0337, + "num_tokens": 685564.0, + "reward": 0.81964111328125, + "reward_std": 0.021292973309755325, + "rewards//mean": 0.81964111328125, + "rewards//std": 0.022375434637069702, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.21875, + "epoch": 0.0192, + "grad_norm": 1.6771905422210693, + "kl": 0.0020039084483869374, + "learning_rate": 9.997960964140945e-07, + "loss": -0.0439, + "num_tokens": 692658.0, + "reward": 0.7938232421875, + "reward_std": 0.023873912170529366, + "rewards//mean": 0.7938232421875, + "rewards//std": 0.028969265520572662, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.53125, + "epoch": 0.0194, + "grad_norm": 1.4371724128723145, + "kl": 0.0016009308164939284, + "learning_rate": 9.99786933990204e-07, + "loss": -0.0503, + "num_tokens": 699708.0, + "reward": 0.77197265625, + "reward_std": 0.023824818432331085, + "rewards//mean": 0.77197265625, + "rewards//std": 0.02614840492606163, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.796875, + "epoch": 0.0196, + "grad_norm": 1.3930429220199585, + "kl": 0.0020625491451937705, + "learning_rate": 9.997775702521965e-07, + "loss": -0.0372, + "num_tokens": 706823.0, + "reward": 0.78076171875, + "reward_std": 0.024393687024712563, + "rewards//mean": 0.78076171875, + "rewards//std": 0.03429839760065079, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.0198, + "grad_norm": 1.5262292623519897, + "kl": 0.0019297650724183768, + "learning_rate": 9.997680052038434e-07, + "loss": -0.0142, + "num_tokens": 714131.0, + "reward": 0.79205322265625, + "reward_std": 0.02311617322266102, + "rewards//mean": 0.79205322265625, + "rewards//std": 0.030135301873087883, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.1875, + "epoch": 0.02, + "grad_norm": 1.7183003425598145, + "kl": 0.002687897824216634, + "learning_rate": 9.997582388489973e-07, + "loss": -0.011, + "num_tokens": 721415.0, + "reward": 0.80328369140625, + "reward_std": 0.027876444160938263, + "rewards//mean": 0.80328369140625, + "rewards//std": 0.03732733055949211, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.1875, + "epoch": 0.0202, + "grad_norm": 1.4759055376052856, + "kl": 0.0021202935895416886, + "learning_rate": 9.997482711915925e-07, + "loss": -0.0065, + "num_tokens": 728651.0, + "reward": 0.799072265625, + "reward_std": 0.02536786161363125, + "rewards//mean": 0.799072265625, + "rewards//std": 0.030764734372496605, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.90625, + "epoch": 0.0204, + "grad_norm": 1.491334319114685, + "kl": 0.0023831506696296856, + "learning_rate": 9.99738102235644e-07, + "loss": -0.0304, + "num_tokens": 735893.0, + "reward": 0.806640625, + "reward_std": 0.02095945179462433, + "rewards//mean": 0.806640625, + "rewards//std": 0.028133956715464592, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.609375, + "epoch": 0.0206, + "grad_norm": 1.7631739377975464, + "kl": 0.0028153733583167195, + "learning_rate": 9.997277319852474e-07, + "loss": -0.0478, + "num_tokens": 742996.0, + "reward": 0.76043701171875, + "reward_std": 0.021906524896621704, + "rewards//mean": 0.76043701171875, + "rewards//std": 0.026748724281787872, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.890625, + "epoch": 0.0208, + "grad_norm": 1.4913345575332642, + "kl": 0.0028796039696317166, + "learning_rate": 9.997171604445802e-07, + "loss": 0.0228, + "num_tokens": 750341.0, + "reward": 0.76678466796875, + "reward_std": 0.022760892286896706, + "rewards//mean": 0.76678466796875, + "rewards//std": 0.02512022852897644, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.140625, + "epoch": 0.021, + "grad_norm": 1.3980119228363037, + "kl": 0.002873321092920378, + "learning_rate": 9.997063876179007e-07, + "loss": -0.0295, + "num_tokens": 757614.0, + "reward": 0.78289794921875, + "reward_std": 0.021695509552955627, + "rewards//mean": 0.78289794921875, + "rewards//std": 0.023631028831005096, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.359375, + "epoch": 0.0212, + "grad_norm": 1.3898428678512573, + "kl": 0.0026267130306223407, + "learning_rate": 9.996954135095478e-07, + "loss": -0.0193, + "num_tokens": 764741.0, + "reward": 0.76678466796875, + "reward_std": 0.020583078265190125, + "rewards//mean": 0.76678466796875, + "rewards//std": 0.02214832231402397, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.6875, + "epoch": 0.0214, + "grad_norm": 1.5119179487228394, + "kl": 0.003281464974861592, + "learning_rate": 9.996842381239422e-07, + "loss": -0.0035, + "num_tokens": 771905.0, + "reward": 0.78253173828125, + "reward_std": 0.019131232053041458, + "rewards//mean": 0.78253173828125, + "rewards//std": 0.02106519415974617, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.796875, + "epoch": 0.0216, + "grad_norm": 1.7645680904388428, + "kl": 0.002965506530017592, + "learning_rate": 9.996728614655853e-07, + "loss": -0.0244, + "num_tokens": 779084.0, + "reward": 0.76141357421875, + "reward_std": 0.01892857998609543, + "rewards//mean": 0.76141357421875, + "rewards//std": 0.02308666706085205, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.453125, + "epoch": 0.0218, + "grad_norm": 1.4755489826202393, + "kl": 0.002978647855343297, + "learning_rate": 9.996612835390594e-07, + "loss": 0.0073, + "num_tokens": 786305.0, + "reward": 0.783203125, + "reward_std": 0.014705033972859383, + "rewards//mean": 0.783203125, + "rewards//std": 0.02221732959151268, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.28125, + "epoch": 0.022, + "grad_norm": 1.631646990776062, + "kl": 0.0029858853085897863, + "learning_rate": 9.996495043490283e-07, + "loss": -0.084, + "num_tokens": 793427.0, + "reward": 0.76708984375, + "reward_std": 0.022133205085992813, + "rewards//mean": 0.76708984375, + "rewards//std": 0.024876268580555916, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.234375, + "epoch": 0.0222, + "grad_norm": 1.4675276279449463, + "kl": 0.0034049677196890116, + "learning_rate": 9.996375239002368e-07, + "loss": -0.0104, + "num_tokens": 800698.0, + "reward": 0.77325439453125, + "reward_std": 0.020833559334278107, + "rewards//mean": 0.77325439453125, + "rewards//std": 0.03465617820620537, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.296875, + "epoch": 0.0224, + "grad_norm": 1.4260845184326172, + "kl": 0.002962343249237165, + "learning_rate": 9.996253421975102e-07, + "loss": -0.0159, + "num_tokens": 807973.0, + "reward": 0.80657958984375, + "reward_std": 0.027838246896862984, + "rewards//mean": 0.80657958984375, + "rewards//std": 0.03495929762721062, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.5, + "epoch": 0.0226, + "grad_norm": 1.6338378190994263, + "kl": 0.004356921213911846, + "learning_rate": 9.996129592457556e-07, + "loss": 0.0104, + "num_tokens": 815045.0, + "reward": 0.80450439453125, + "reward_std": 0.024008864536881447, + "rewards//mean": 0.80450439453125, + "rewards//std": 0.02869170717895031, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.015625, + "epoch": 0.0228, + "grad_norm": 1.459762692451477, + "kl": 0.003733841178473085, + "learning_rate": 9.996003750499607e-07, + "loss": -0.0148, + "num_tokens": 822238.0, + "reward": 0.7955322265625, + "reward_std": 0.028848551213741302, + "rewards//mean": 0.7955322265625, + "rewards//std": 0.03032340109348297, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.140625, + "epoch": 0.023, + "grad_norm": 1.6857537031173706, + "kl": 0.0038129173626657575, + "learning_rate": 9.995875896151944e-07, + "loss": -0.0048, + "num_tokens": 829479.0, + "reward": 0.778564453125, + "reward_std": 0.02239256165921688, + "rewards//mean": 0.778564453125, + "rewards//std": 0.030320655554533005, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.703125, + "epoch": 0.0232, + "grad_norm": 1.624248743057251, + "kl": 0.004633707023458555, + "learning_rate": 9.99574602946607e-07, + "loss": 0.0231, + "num_tokens": 836556.0, + "reward": 0.80828857421875, + "reward_std": 0.022478384897112846, + "rewards//mean": 0.80828857421875, + "rewards//std": 0.031995274126529694, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.0234, + "grad_norm": 1.5513633489608765, + "kl": 0.004590423370245844, + "learning_rate": 9.99561415049429e-07, + "loss": -0.0105, + "num_tokens": 843812.0, + "reward": 0.7720947265625, + "reward_std": 0.025803670287132263, + "rewards//mean": 0.7720947265625, + "rewards//std": 0.03089313581585884, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.09375, + "epoch": 0.0236, + "grad_norm": 1.6353378295898438, + "kl": 0.003957089560572058, + "learning_rate": 9.99548025928973e-07, + "loss": 0.0014, + "num_tokens": 851058.0, + "reward": 0.78582763671875, + "reward_std": 0.029691316187381744, + "rewards//mean": 0.78582763671875, + "rewards//std": 0.031769268214702606, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.328125, + "epoch": 0.0238, + "grad_norm": 1.6492676734924316, + "kl": 0.003939666727092117, + "learning_rate": 9.995344355906318e-07, + "loss": -0.0233, + "num_tokens": 858199.0, + "reward": 0.77392578125, + "reward_std": 0.029359523206949234, + "rewards//mean": 0.77392578125, + "rewards//std": 0.032641246914863586, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.8125, + "epoch": 0.024, + "grad_norm": 1.7664073705673218, + "kl": 0.004003126407042146, + "learning_rate": 9.995206440398796e-07, + "loss": -0.0066, + "num_tokens": 865459.0, + "reward": 0.783203125, + "reward_std": 0.025128034874796867, + "rewards//mean": 0.783203125, + "rewards//std": 0.026721050962805748, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.828125, + "epoch": 0.0242, + "grad_norm": 1.9681915044784546, + "kl": 0.004875384824117646, + "learning_rate": 9.995066512822718e-07, + "loss": 0.0452, + "num_tokens": 872736.0, + "reward": 0.79119873046875, + "reward_std": 0.024004701524972916, + "rewards//mean": 0.79119873046875, + "rewards//std": 0.026428760960698128, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.28125, + "epoch": 0.0244, + "grad_norm": 1.558786153793335, + "kl": 0.004551505378913134, + "learning_rate": 9.994924573234446e-07, + "loss": 0.0026, + "num_tokens": 879986.0, + "reward": 0.79949951171875, + "reward_std": 0.026334920898079872, + "rewards//mean": 0.79949951171875, + "rewards//std": 0.03499435633420944, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.671875, + "epoch": 0.0246, + "grad_norm": 1.6079860925674438, + "kl": 0.0049999390612356365, + "learning_rate": 9.994780621691154e-07, + "loss": -0.0039, + "num_tokens": 887229.0, + "reward": 0.816650390625, + "reward_std": 0.01860572211444378, + "rewards//mean": 0.816650390625, + "rewards//std": 0.022281285375356674, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.640625, + "epoch": 0.0248, + "grad_norm": 1.5004476308822632, + "kl": 0.004701720463344827, + "learning_rate": 9.994634658250824e-07, + "loss": -0.0056, + "num_tokens": 894470.0, + "reward": 0.79791259765625, + "reward_std": 0.03110302798449993, + "rewards//mean": 0.79791259765625, + "rewards//std": 0.035209547728300095, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.4375, + "epoch": 0.025, + "grad_norm": 1.5459445714950562, + "kl": 0.005158779589692131, + "learning_rate": 9.994486682972252e-07, + "loss": 0.0037, + "num_tokens": 901730.0, + "reward": 0.80926513671875, + "reward_std": 0.02434464544057846, + "rewards//mean": 0.80926513671875, + "rewards//std": 0.02862515114247799, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.0, + "epoch": 0.0252, + "grad_norm": 1.634482502937317, + "kl": 0.005206895875744522, + "learning_rate": 9.99433669591504e-07, + "loss": 0.001, + "num_tokens": 908866.0, + "reward": 0.80218505859375, + "reward_std": 0.022825004532933235, + "rewards//mean": 0.80218505859375, + "rewards//std": 0.029768381267786026, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.3125, + "epoch": 0.0254, + "grad_norm": 1.5500982999801636, + "kl": 0.007514075085055083, + "learning_rate": 9.994184697139604e-07, + "loss": -0.0026, + "num_tokens": 916094.0, + "reward": 0.82110595703125, + "reward_std": 0.019578732550144196, + "rewards//mean": 0.82110595703125, + "rewards//std": 0.022885147482156754, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.421875, + "epoch": 0.0256, + "grad_norm": 1.6925216913223267, + "kl": 0.007458690612111241, + "learning_rate": 9.99403068670717e-07, + "loss": -0.053, + "num_tokens": 923233.0, + "reward": 0.799072265625, + "reward_std": 0.025864217430353165, + "rewards//mean": 0.799072265625, + "rewards//std": 0.030015574768185616, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.71875, + "epoch": 0.0258, + "grad_norm": 1.4078813791275024, + "kl": 0.005578697135206312, + "learning_rate": 9.993874664679772e-07, + "loss": -0.0381, + "num_tokens": 930319.0, + "reward": 0.79705810546875, + "reward_std": 0.02614477649331093, + "rewards//mean": 0.79705810546875, + "rewards//std": 0.029168905690312386, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.026, + "grad_norm": 1.331628680229187, + "kl": 0.005371542036300525, + "learning_rate": 9.993716631120258e-07, + "loss": 0.0023, + "num_tokens": 937579.0, + "reward": 0.77587890625, + "reward_std": 0.022140957415103912, + "rewards//mean": 0.77587890625, + "rewards//std": 0.026825085282325745, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.640625, + "epoch": 0.0262, + "grad_norm": 1.4089117050170898, + "kl": 0.006456497067119926, + "learning_rate": 9.99355658609228e-07, + "loss": 0.0027, + "num_tokens": 944860.0, + "reward": 0.78436279296875, + "reward_std": 0.019052622839808464, + "rewards//mean": 0.78436279296875, + "rewards//std": 0.026061931625008583, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.28125, + "epoch": 0.0264, + "grad_norm": 1.531468391418457, + "kl": 0.0071123561065178365, + "learning_rate": 9.993394529660306e-07, + "loss": -0.0321, + "num_tokens": 952038.0, + "reward": 0.78839111328125, + "reward_std": 0.021916473284363747, + "rewards//mean": 0.78839111328125, + "rewards//std": 0.029475560411810875, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.0266, + "grad_norm": 1.4498622417449951, + "kl": 0.006457396491896361, + "learning_rate": 9.993230461889615e-07, + "loss": -0.0054, + "num_tokens": 959256.0, + "reward": 0.74493408203125, + "reward_std": 0.028803564608097076, + "rewards//mean": 0.74493408203125, + "rewards//std": 0.032531093806028366, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.09375, + "epoch": 0.0268, + "grad_norm": 1.6773560047149658, + "kl": 0.007732725876849145, + "learning_rate": 9.993064382846289e-07, + "loss": 0.0029, + "num_tokens": 966454.0, + "reward": 0.7852783203125, + "reward_std": 0.025966495275497437, + "rewards//mean": 0.7852783203125, + "rewards//std": 0.03186377137899399, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.90625, + "epoch": 0.027, + "grad_norm": 1.5514166355133057, + "kl": 0.007403019466437399, + "learning_rate": 9.992896292597228e-07, + "loss": 0.0096, + "num_tokens": 973616.0, + "reward": 0.7889404296875, + "reward_std": 0.02649814635515213, + "rewards//mean": 0.7889404296875, + "rewards//std": 0.03094601072371006, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.640625, + "epoch": 0.0272, + "grad_norm": 1.679001808166504, + "kl": 0.009151361125987023, + "learning_rate": 9.992726191210137e-07, + "loss": 0.0071, + "num_tokens": 980857.0, + "reward": 0.75848388671875, + "reward_std": 0.01735023595392704, + "rewards//mean": 0.75848388671875, + "rewards//std": 0.021424314007163048, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.96875, + "epoch": 0.0274, + "grad_norm": 1.5797340869903564, + "kl": 0.00746469123987481, + "learning_rate": 9.992554078753533e-07, + "loss": -0.0386, + "num_tokens": 988047.0, + "reward": 0.81475830078125, + "reward_std": 0.019897248595952988, + "rewards//mean": 0.81475830078125, + "rewards//std": 0.02171904407441616, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.1875, + "epoch": 0.0276, + "grad_norm": 1.4600346088409424, + "kl": 0.007911627268185839, + "learning_rate": 9.992379955296745e-07, + "loss": 0.0017, + "num_tokens": 995331.0, + "reward": 0.77813720703125, + "reward_std": 0.02584485150873661, + "rewards//mean": 0.77813720703125, + "rewards//std": 0.032385583966970444, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.828125, + "epoch": 0.0278, + "grad_norm": 1.5442570447921753, + "kl": 0.01014052820391953, + "learning_rate": 9.992203820909905e-07, + "loss": -0.0155, + "num_tokens": 1002552.0, + "reward": 0.8055419921875, + "reward_std": 0.02394559234380722, + "rewards//mean": 0.8055419921875, + "rewards//std": 0.02583126351237297, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.234375, + "epoch": 0.028, + "grad_norm": 1.5903511047363281, + "kl": 0.007759003434330225, + "learning_rate": 9.992025675663965e-07, + "loss": -0.0182, + "num_tokens": 1009855.0, + "reward": 0.809814453125, + "reward_std": 0.02532828599214554, + "rewards//mean": 0.809814453125, + "rewards//std": 0.027089012786746025, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.03125, + "epoch": 0.0282, + "grad_norm": 2.042998790740967, + "kl": 0.00833955011330545, + "learning_rate": 9.991845519630676e-07, + "loss": -0.0741, + "num_tokens": 1017081.0, + "reward": 0.77001953125, + "reward_std": 0.024747565388679504, + "rewards//mean": 0.77001953125, + "rewards//std": 0.026369770988821983, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.546875, + "epoch": 0.0284, + "grad_norm": 1.5249228477478027, + "kl": 0.009552208299282938, + "learning_rate": 9.991663352882613e-07, + "loss": -0.0364, + "num_tokens": 1024396.0, + "reward": 0.80828857421875, + "reward_std": 0.023025671020150185, + "rewards//mean": 0.80828857421875, + "rewards//std": 0.02896789275109768, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.734375, + "epoch": 0.0286, + "grad_norm": 1.489943265914917, + "kl": 0.008483934332616627, + "learning_rate": 9.991479175493148e-07, + "loss": -0.0135, + "num_tokens": 1031603.0, + "reward": 0.7950439453125, + "reward_std": 0.02456018328666687, + "rewards//mean": 0.7950439453125, + "rewards//std": 0.027795907109975815, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.140625, + "epoch": 0.0288, + "grad_norm": 1.456337571144104, + "kl": 0.008895340200979263, + "learning_rate": 9.991292987536468e-07, + "loss": -0.0152, + "num_tokens": 1038908.0, + "reward": 0.8245849609375, + "reward_std": 0.02788938209414482, + "rewards//mean": 0.8245849609375, + "rewards//std": 0.032716985791921616, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.029, + "grad_norm": 1.4399168491363525, + "kl": 0.010077533253934234, + "learning_rate": 9.991104789087569e-07, + "loss": 0.0004, + "num_tokens": 1046116.0, + "reward": 0.79156494140625, + "reward_std": 0.026688657701015472, + "rewards//mean": 0.79156494140625, + "rewards//std": 0.030610768124461174, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.0292, + "grad_norm": 1.493910312652588, + "kl": 0.010284366551786661, + "learning_rate": 9.990914580222255e-07, + "loss": 0.008, + "num_tokens": 1053470.0, + "reward": 0.78533935546875, + "reward_std": 0.030629053711891174, + "rewards//mean": 0.78533935546875, + "rewards//std": 0.031074577942490578, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.203125, + "epoch": 0.0294, + "grad_norm": 1.5384007692337036, + "kl": 0.009934234491083771, + "learning_rate": 9.990722361017149e-07, + "loss": -0.0032, + "num_tokens": 1060763.0, + "reward": 0.77471923828125, + "reward_std": 0.019815947860479355, + "rewards//mean": 0.77471923828125, + "rewards//std": 0.02362333983182907, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.46875, + "epoch": 0.0296, + "grad_norm": 1.610908031463623, + "kl": 0.010053021425846964, + "learning_rate": 9.990528131549671e-07, + "loss": 0.0073, + "num_tokens": 1068057.0, + "reward": 0.76678466796875, + "reward_std": 0.02389853447675705, + "rewards//mean": 0.76678466796875, + "rewards//std": 0.02919588051736355, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.609375, + "epoch": 0.0298, + "grad_norm": 1.5590932369232178, + "kl": 0.011891809524968266, + "learning_rate": 9.990331891898058e-07, + "loss": -0.0211, + "num_tokens": 1075312.0, + "reward": 0.7979736328125, + "reward_std": 0.021271442994475365, + "rewards//mean": 0.7979736328125, + "rewards//std": 0.027156822383403778, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.03, + "grad_norm": 1.3702912330627441, + "kl": 0.010590280639007688, + "learning_rate": 9.990133642141357e-07, + "loss": 0.0069, + "num_tokens": 1082614.0, + "reward": 0.76043701171875, + "reward_std": 0.022752897813916206, + "rewards//mean": 0.76043701171875, + "rewards//std": 0.02701900154352188, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.3125, + "epoch": 0.0302, + "grad_norm": 1.3211010694503784, + "kl": 0.010557684814557433, + "learning_rate": 9.989933382359422e-07, + "loss": 0.0124, + "num_tokens": 1089826.0, + "reward": 0.77606201171875, + "reward_std": 0.0217263326048851, + "rewards//mean": 0.77606201171875, + "rewards//std": 0.02516237646341324, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.0304, + "grad_norm": 1.4291832447052002, + "kl": 0.011703187832608819, + "learning_rate": 9.989731112632916e-07, + "loss": 0.0001, + "num_tokens": 1097121.0, + "reward": 0.7823486328125, + "reward_std": 0.031302228569984436, + "rewards//mean": 0.7823486328125, + "rewards//std": 0.03432641178369522, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.359375, + "epoch": 0.0306, + "grad_norm": 1.6218584775924683, + "kl": 0.01123869139701128, + "learning_rate": 9.989526833043316e-07, + "loss": -0.0195, + "num_tokens": 1104424.0, + "reward": 0.80999755859375, + "reward_std": 0.02326120436191559, + "rewards//mean": 0.80999755859375, + "rewards//std": 0.02903522551059723, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.28125, + "epoch": 0.0308, + "grad_norm": 1.4991286993026733, + "kl": 0.01516896951943636, + "learning_rate": 9.989320543672903e-07, + "loss": 0.0355, + "num_tokens": 1111602.0, + "reward": 0.74688720703125, + "reward_std": 0.021862009540200233, + "rewards//mean": 0.74688720703125, + "rewards//std": 0.026704544201493263, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.031, + "grad_norm": 1.6447926759719849, + "kl": 0.012095957063138485, + "learning_rate": 9.989112244604771e-07, + "loss": -0.0086, + "num_tokens": 1118983.0, + "reward": 0.785400390625, + "reward_std": 0.02170182764530182, + "rewards//mean": 0.785400390625, + "rewards//std": 0.026318056508898735, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.796875, + "epoch": 0.0312, + "grad_norm": 1.4232521057128906, + "kl": 0.011578940029721707, + "learning_rate": 9.988901935922825e-07, + "loss": -0.0113, + "num_tokens": 1126226.0, + "reward": 0.78375244140625, + "reward_std": 0.021450551226735115, + "rewards//mean": 0.78375244140625, + "rewards//std": 0.023055829107761383, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.28125, + "epoch": 0.0314, + "grad_norm": 1.486119031906128, + "kl": 0.011666014383081347, + "learning_rate": 9.988689617711776e-07, + "loss": -0.0157, + "num_tokens": 1133356.0, + "reward": 0.7926025390625, + "reward_std": 0.023090731352567673, + "rewards//mean": 0.7926025390625, + "rewards//std": 0.025303132832050323, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.0316, + "grad_norm": 1.7912400960922241, + "kl": 0.012429868802428246, + "learning_rate": 9.988475290057143e-07, + "loss": 0.0053, + "num_tokens": 1140662.0, + "reward": 0.81085205078125, + "reward_std": 0.02675771713256836, + "rewards//mean": 0.81085205078125, + "rewards//std": 0.030732672661542892, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.109375, + "epoch": 0.0318, + "grad_norm": 1.8786247968673706, + "kl": 0.014571455540135503, + "learning_rate": 9.988258953045262e-07, + "loss": -0.0013, + "num_tokens": 1147869.0, + "reward": 0.7962646484375, + "reward_std": 0.02158692479133606, + "rewards//mean": 0.7962646484375, + "rewards//std": 0.023890087381005287, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.171875, + "epoch": 0.032, + "grad_norm": 1.594660758972168, + "kl": 0.014851250336505473, + "learning_rate": 9.988040606763272e-07, + "loss": -0.0474, + "num_tokens": 1155008.0, + "reward": 0.79156494140625, + "reward_std": 0.032177601009607315, + "rewards//mean": 0.79156494140625, + "rewards//std": 0.037515029311180115, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0, + "epoch": 0.0322, + "grad_norm": 1.416437029838562, + "kl": 0.013576600817032158, + "learning_rate": 9.98782025129912e-07, + "loss": -0.0191, + "num_tokens": 1162240.0, + "reward": 0.80682373046875, + "reward_std": 0.029878515750169754, + "rewards//mean": 0.80682373046875, + "rewards//std": 0.03749928995966911, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.703125, + "epoch": 0.0324, + "grad_norm": 1.617987871170044, + "kl": 0.01538213121239096, + "learning_rate": 9.987597886741568e-07, + "loss": -0.0062, + "num_tokens": 1169477.0, + "reward": 0.77435302734375, + "reward_std": 0.03178906440734863, + "rewards//mean": 0.77435302734375, + "rewards//std": 0.0345393568277359, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.09375, + "epoch": 0.0326, + "grad_norm": 1.6794630289077759, + "kl": 0.01766700088046491, + "learning_rate": 9.987373513180184e-07, + "loss": -0.0089, + "num_tokens": 1176627.0, + "reward": 0.789306640625, + "reward_std": 0.019825855270028114, + "rewards//mean": 0.789306640625, + "rewards//std": 0.021484375, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.53125, + "epoch": 0.0328, + "grad_norm": 1.6535617113113403, + "kl": 0.01556278788484633, + "learning_rate": 9.987147130705347e-07, + "loss": -0.017, + "num_tokens": 1183813.0, + "reward": 0.7869873046875, + "reward_std": 0.02414393052458763, + "rewards//mean": 0.7869873046875, + "rewards//std": 0.027612315490841866, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.390625, + "epoch": 0.033, + "grad_norm": 1.5248324871063232, + "kl": 0.014940478256903589, + "learning_rate": 9.98691873940824e-07, + "loss": -0.0339, + "num_tokens": 1191054.0, + "reward": 0.7880859375, + "reward_std": 0.027189800515770912, + "rewards//mean": 0.7880859375, + "rewards//std": 0.030449189245700836, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.09375, + "epoch": 0.0332, + "grad_norm": 1.5281543731689453, + "kl": 0.016499170335009694, + "learning_rate": 9.98668833938086e-07, + "loss": -0.0193, + "num_tokens": 1198260.0, + "reward": 0.78692626953125, + "reward_std": 0.027888746932148933, + "rewards//mean": 0.78692626953125, + "rewards//std": 0.027312107384204865, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.328125, + "epoch": 0.0334, + "grad_norm": 1.630226731300354, + "kl": 0.01790173863992095, + "learning_rate": 9.986455930716016e-07, + "loss": -0.0215, + "num_tokens": 1205481.0, + "reward": 0.80889892578125, + "reward_std": 0.027613570913672447, + "rewards//mean": 0.80889892578125, + "rewards//std": 0.030744491145014763, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.296875, + "epoch": 0.0336, + "grad_norm": 1.4432041645050049, + "kl": 0.016991431126371026, + "learning_rate": 9.986221513507318e-07, + "loss": -0.009, + "num_tokens": 1212748.0, + "reward": 0.7861328125, + "reward_std": 0.02175457403063774, + "rewards//mean": 0.7861328125, + "rewards//std": 0.02309396117925644, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "epoch": 0.0338, + "grad_norm": 1.5508756637573242, + "kl": 0.01776412082836032, + "learning_rate": 9.985985087849191e-07, + "loss": 0.0142, + "num_tokens": 1220012.0, + "reward": 0.78497314453125, + "reward_std": 0.029599878937005997, + "rewards//mean": 0.78497314453125, + "rewards//std": 0.03179118037223816, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.109375, + "epoch": 0.034, + "grad_norm": 1.436679720878601, + "kl": 0.01695575809571892, + "learning_rate": 9.985746653836866e-07, + "loss": -0.0135, + "num_tokens": 1227331.0, + "reward": 0.8001708984375, + "reward_std": 0.028251083567738533, + "rewards//mean": 0.8001708984375, + "rewards//std": 0.03772652894258499, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0342, + "grad_norm": 1.4549329280853271, + "kl": 0.0220373006304726, + "learning_rate": 9.985506211566386e-07, + "loss": 0.0009, + "num_tokens": 1234619.0, + "reward": 0.818603515625, + "reward_std": 0.025214456021785736, + "rewards//mean": 0.818603515625, + "rewards//std": 0.028132878243923187, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.296875, + "epoch": 0.0344, + "grad_norm": 1.4122838973999023, + "kl": 0.019923817832022905, + "learning_rate": 9.9852637611346e-07, + "loss": -0.0044, + "num_tokens": 1241886.0, + "reward": 0.80072021484375, + "reward_std": 0.03035944513976574, + "rewards//mean": 0.80072021484375, + "rewards//std": 0.036348503082990646, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5625, + "epoch": 0.0346, + "grad_norm": 1.4607577323913574, + "kl": 0.018885106197558343, + "learning_rate": 9.98501930263917e-07, + "loss": -0.0033, + "num_tokens": 1249130.0, + "reward": 0.776123046875, + "reward_std": 0.026938727125525475, + "rewards//mean": 0.776123046875, + "rewards//std": 0.030717460438609123, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.40625, + "epoch": 0.0348, + "grad_norm": 1.6858209371566772, + "kl": 0.020018818438984454, + "learning_rate": 9.984772836178556e-07, + "loss": -0.0072, + "num_tokens": 1256476.0, + "reward": 0.80841064453125, + "reward_std": 0.025222107768058777, + "rewards//mean": 0.80841064453125, + "rewards//std": 0.027493299916386604, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.035, + "grad_norm": 1.5073444843292236, + "kl": 0.01865451887715608, + "learning_rate": 9.984524361852043e-07, + "loss": 0.001, + "num_tokens": 1263784.0, + "reward": 0.791259765625, + "reward_std": 0.02491554245352745, + "rewards//mean": 0.791259765625, + "rewards//std": 0.03858760744333267, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.125, + "epoch": 0.0352, + "grad_norm": 1.8319169282913208, + "kl": 0.019719559466466308, + "learning_rate": 9.984273879759712e-07, + "loss": -0.0004, + "num_tokens": 1271064.0, + "reward": 0.79791259765625, + "reward_std": 0.030297227203845978, + "rewards//mean": 0.79791259765625, + "rewards//std": 0.03371970355510712, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.0354, + "grad_norm": 1.526073694229126, + "kl": 0.022273722803220153, + "learning_rate": 9.984021390002457e-07, + "loss": 0.0064, + "num_tokens": 1278448.0, + "reward": 0.8076171875, + "reward_std": 0.01769275963306427, + "rewards//mean": 0.8076171875, + "rewards//std": 0.023928390815854073, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.0356, + "grad_norm": 1.9743379354476929, + "kl": 0.024272599956020713, + "learning_rate": 9.983766892681985e-07, + "loss": 0.0136, + "num_tokens": 1285761.0, + "reward": 0.80108642578125, + "reward_std": 0.02730768546462059, + "rewards//mean": 0.80108642578125, + "rewards//std": 0.030035173520445824, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.0358, + "grad_norm": 1.857028841972351, + "kl": 0.023356027202680707, + "learning_rate": 9.983510387900802e-07, + "loss": 0.0152, + "num_tokens": 1293046.0, + "reward": 0.78533935546875, + "reward_std": 0.026074916124343872, + "rewards//mean": 0.78533935546875, + "rewards//std": 0.03679841011762619, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.71875, + "epoch": 0.036, + "grad_norm": 1.5747302770614624, + "kl": 0.024878773256205022, + "learning_rate": 9.983251875762232e-07, + "loss": -0.0009, + "num_tokens": 1300284.0, + "reward": 0.8048095703125, + "reward_std": 0.02907049097120762, + "rewards//mean": 0.8048095703125, + "rewards//std": 0.03675585985183716, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.640625, + "epoch": 0.0362, + "grad_norm": 1.5011318922042847, + "kl": 0.022421044763177633, + "learning_rate": 9.982991356370403e-07, + "loss": -0.0129, + "num_tokens": 1307549.0, + "reward": 0.79376220703125, + "reward_std": 0.02785402536392212, + "rewards//mean": 0.79376220703125, + "rewards//std": 0.02862197905778885, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.0364, + "grad_norm": 1.4354181289672852, + "kl": 0.022614846471697092, + "learning_rate": 9.98272882983025e-07, + "loss": -0.0002, + "num_tokens": 1314876.0, + "reward": 0.7977294921875, + "reward_std": 0.02392902411520481, + "rewards//mean": 0.7977294921875, + "rewards//std": 0.03299526497721672, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.171875, + "epoch": 0.0366, + "grad_norm": 1.5373140573501587, + "kl": 0.027508829720318317, + "learning_rate": 9.982464296247522e-07, + "loss": -0.0428, + "num_tokens": 1322183.0, + "reward": 0.77850341796875, + "reward_std": 0.022220859304070473, + "rewards//mean": 0.77850341796875, + "rewards//std": 0.027380747720599174, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.0368, + "grad_norm": 1.6750292778015137, + "kl": 0.027591626159846783, + "learning_rate": 9.98219775572877e-07, + "loss": -0.0058, + "num_tokens": 1329325.0, + "reward": 0.7962646484375, + "reward_std": 0.02700239047408104, + "rewards//mean": 0.7962646484375, + "rewards//std": 0.029125606641173363, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.328125, + "epoch": 0.037, + "grad_norm": 1.9082082509994507, + "kl": 0.025278224260546267, + "learning_rate": 9.981929208381357e-07, + "loss": -0.0064, + "num_tokens": 1336602.0, + "reward": 0.79962158203125, + "reward_std": 0.030599147081375122, + "rewards//mean": 0.79962158203125, + "rewards//std": 0.03483914956450462, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.625, + "epoch": 0.0372, + "grad_norm": 1.8531265258789062, + "kl": 0.026406396413221955, + "learning_rate": 9.981658654313456e-07, + "loss": -0.0369, + "num_tokens": 1343794.0, + "reward": 0.7950439453125, + "reward_std": 0.01601320505142212, + "rewards//mean": 0.7950439453125, + "rewards//std": 0.01822875253856182, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.9375, + "epoch": 0.0374, + "grad_norm": 1.6979440450668335, + "kl": 0.026424716925248504, + "learning_rate": 9.981386093634045e-07, + "loss": -0.0219, + "num_tokens": 1351062.0, + "reward": 0.791015625, + "reward_std": 0.019099269062280655, + "rewards//mean": 0.791015625, + "rewards//std": 0.019325554370880127, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.0376, + "grad_norm": 1.5942009687423706, + "kl": 0.02507302979938686, + "learning_rate": 9.98111152645291e-07, + "loss": -0.0017, + "num_tokens": 1358296.0, + "reward": 0.8072509765625, + "reward_std": 0.023885823786258698, + "rewards//mean": 0.8072509765625, + "rewards//std": 0.02815305069088936, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0378, + "grad_norm": 1.4508997201919556, + "kl": 0.029859249712899327, + "learning_rate": 9.98083495288065e-07, + "loss": 0.0012, + "num_tokens": 1365504.0, + "reward": 0.81005859375, + "reward_std": 0.019233308732509613, + "rewards//mean": 0.81005859375, + "rewards//std": 0.030950168147683144, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.038, + "grad_norm": 1.639853835105896, + "kl": 0.029251613188534975, + "learning_rate": 9.980556373028665e-07, + "loss": 0.0028, + "num_tokens": 1372759.0, + "reward": 0.78326416015625, + "reward_std": 0.023131363093852997, + "rewards//mean": 0.78326416015625, + "rewards//std": 0.030544430017471313, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.0382, + "grad_norm": 1.7138605117797852, + "kl": 0.025414355099201202, + "learning_rate": 9.98027578700917e-07, + "loss": -0.0167, + "num_tokens": 1380035.0, + "reward": 0.80084228515625, + "reward_std": 0.02559913694858551, + "rewards//mean": 0.80084228515625, + "rewards//std": 0.0346636027097702, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.0384, + "grad_norm": 1.4497588872909546, + "kl": 0.030320858815684915, + "learning_rate": 9.979993194935182e-07, + "loss": -0.0105, + "num_tokens": 1387285.0, + "reward": 0.81646728515625, + "reward_std": 0.02013261616230011, + "rewards//mean": 0.81646728515625, + "rewards//std": 0.025012129917740822, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.0386, + "grad_norm": 1.669048547744751, + "kl": 0.03284411155618727, + "learning_rate": 9.979708596920529e-07, + "loss": -0.0047, + "num_tokens": 1394562.0, + "reward": 0.78936767578125, + "reward_std": 0.02085341513156891, + "rewards//mean": 0.78936767578125, + "rewards//std": 0.026769088581204414, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.390625, + "epoch": 0.0388, + "grad_norm": 1.4441988468170166, + "kl": 0.026247155386954546, + "learning_rate": 9.97942199307985e-07, + "loss": -0.012, + "num_tokens": 1401835.0, + "reward": 0.7960205078125, + "reward_std": 0.025780964642763138, + "rewards//mean": 0.7960205078125, + "rewards//std": 0.03149295225739479, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.039, + "grad_norm": 1.6695730686187744, + "kl": 0.029477658914402127, + "learning_rate": 9.97913338352859e-07, + "loss": 0.003, + "num_tokens": 1409147.0, + "reward": 0.7889404296875, + "reward_std": 0.029297390952706337, + "rewards//mean": 0.7889404296875, + "rewards//std": 0.029919344931840897, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.671875, + "epoch": 0.0392, + "grad_norm": 1.6277669668197632, + "kl": 0.03377866349183023, + "learning_rate": 9.978842768382998e-07, + "loss": 0.0093, + "num_tokens": 1416398.0, + "reward": 0.7686767578125, + "reward_std": 0.019148029386997223, + "rewards//mean": 0.7686767578125, + "rewards//std": 0.02185981161892414, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.0394, + "grad_norm": 1.6557395458221436, + "kl": 0.03164317994378507, + "learning_rate": 9.978550147760131e-07, + "loss": -0.0067, + "num_tokens": 1423645.0, + "reward": 0.79510498046875, + "reward_std": 0.026034388691186905, + "rewards//mean": 0.79510498046875, + "rewards//std": 0.03262959420681, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0396, + "grad_norm": 1.5768955945968628, + "kl": 0.03248094697482884, + "learning_rate": 9.978255521777862e-07, + "loss": 0.0013, + "num_tokens": 1430917.0, + "reward": 0.80047607421875, + "reward_std": 0.036130838096141815, + "rewards//mean": 0.80047607421875, + "rewards//std": 0.04237736016511917, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0398, + "grad_norm": 1.3822791576385498, + "kl": 0.03165759355761111, + "learning_rate": 9.977958890554866e-07, + "loss": 0.0013, + "num_tokens": 1438173.0, + "reward": 0.76861572265625, + "reward_std": 0.021230751648545265, + "rewards//mean": 0.76861572265625, + "rewards//std": 0.0238548144698143, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.296875, + "epoch": 0.04, + "grad_norm": 1.5288805961608887, + "kl": 0.03266419074498117, + "learning_rate": 9.97766025421062e-07, + "loss": -0.0205, + "num_tokens": 1445400.0, + "reward": 0.79791259765625, + "reward_std": 0.03015642613172531, + "rewards//mean": 0.79791259765625, + "rewards//std": 0.030893320217728615, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.984375, + "epoch": 0.0402, + "grad_norm": 1.5779434442520142, + "kl": 0.03641755995340645, + "learning_rate": 9.977359612865422e-07, + "loss": -0.0587, + "num_tokens": 1452551.0, + "reward": 0.7823486328125, + "reward_std": 0.02429197169840336, + "rewards//mean": 0.7823486328125, + "rewards//std": 0.03314175084233284, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.0404, + "grad_norm": 1.5730459690093994, + "kl": 0.03551433398388326, + "learning_rate": 9.977056966640367e-07, + "loss": -0.0202, + "num_tokens": 1459795.0, + "reward": 0.80438232421875, + "reward_std": 0.029905686154961586, + "rewards//mean": 0.80438232421875, + "rewards//std": 0.03807217255234718, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.203125, + "epoch": 0.0406, + "grad_norm": 1.6034892797470093, + "kl": 0.04400835046544671, + "learning_rate": 9.976752315657359e-07, + "loss": -0.0253, + "num_tokens": 1467000.0, + "reward": 0.7659912109375, + "reward_std": 0.031159641221165657, + "rewards//mean": 0.7659912109375, + "rewards//std": 0.03381636366248131, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.0408, + "grad_norm": 1.4959354400634766, + "kl": 0.03899890510365367, + "learning_rate": 9.976445660039117e-07, + "loss": -0.0099, + "num_tokens": 1474232.0, + "reward": 0.810546875, + "reward_std": 0.024132370948791504, + "rewards//mean": 0.810546875, + "rewards//std": 0.029123786836862564, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.21875, + "epoch": 0.041, + "grad_norm": 1.551506757736206, + "kl": 0.03464899770915508, + "learning_rate": 9.976136999909155e-07, + "loss": -0.0012, + "num_tokens": 1481518.0, + "reward": 0.809814453125, + "reward_std": 0.025436121970415115, + "rewards//mean": 0.809814453125, + "rewards//std": 0.032814234495162964, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.140625, + "epoch": 0.0412, + "grad_norm": 1.570482611656189, + "kl": 0.03764410084113479, + "learning_rate": 9.975826335391805e-07, + "loss": 0.0073, + "num_tokens": 1488679.0, + "reward": 0.794677734375, + "reward_std": 0.029199369251728058, + "rewards//mean": 0.794677734375, + "rewards//std": 0.029902391135692596, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.015625, + "epoch": 0.0414, + "grad_norm": 1.4733949899673462, + "kl": 0.03423602762632072, + "learning_rate": 9.975513666612203e-07, + "loss": -0.0252, + "num_tokens": 1495984.0, + "reward": 0.8035888671875, + "reward_std": 0.020572178065776825, + "rewards//mean": 0.8035888671875, + "rewards//std": 0.029376082122325897, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.984375, + "epoch": 0.0416, + "grad_norm": 1.5248899459838867, + "kl": 0.04065468325279653, + "learning_rate": 9.975198993696291e-07, + "loss": -0.0326, + "num_tokens": 1503263.0, + "reward": 0.80517578125, + "reward_std": 0.028173374012112617, + "rewards//mean": 0.80517578125, + "rewards//std": 0.031277116388082504, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.0418, + "grad_norm": 1.6459959745407104, + "kl": 0.03681662306189537, + "learning_rate": 9.97488231677082e-07, + "loss": -0.005, + "num_tokens": 1510641.0, + "reward": 0.81072998046875, + "reward_std": 0.032815009355545044, + "rewards//mean": 0.81072998046875, + "rewards//std": 0.03888183459639549, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.078125, + "epoch": 0.042, + "grad_norm": 1.4369343519210815, + "kl": 0.04421597719192505, + "learning_rate": 9.974563635963347e-07, + "loss": -0.0198, + "num_tokens": 1517902.0, + "reward": 0.80340576171875, + "reward_std": 0.031175196170806885, + "rewards//mean": 0.80340576171875, + "rewards//std": 0.03857618197798729, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.515625, + "epoch": 0.0422, + "grad_norm": 1.7451955080032349, + "kl": 0.042173873633146286, + "learning_rate": 9.974242951402235e-07, + "loss": -0.0109, + "num_tokens": 1525103.0, + "reward": 0.803466796875, + "reward_std": 0.02319139800965786, + "rewards//mean": 0.803466796875, + "rewards//std": 0.024957675486803055, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.375, + "epoch": 0.0424, + "grad_norm": 1.976418375968933, + "kl": 0.04329403955489397, + "learning_rate": 9.973920263216657e-07, + "loss": 0.0159, + "num_tokens": 1532287.0, + "reward": 0.8135986328125, + "reward_std": 0.03098849207162857, + "rewards//mean": 0.8135986328125, + "rewards//std": 0.03476808965206146, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0426, + "grad_norm": 1.4984416961669922, + "kl": 0.03874999680556357, + "learning_rate": 9.97359557153659e-07, + "loss": 0.0015, + "num_tokens": 1539495.0, + "reward": 0.78582763671875, + "reward_std": 0.027528319507837296, + "rewards//mean": 0.78582763671875, + "rewards//std": 0.0313548818230629, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.3125, + "epoch": 0.0428, + "grad_norm": 1.6637729406356812, + "kl": 0.05149036552757025, + "learning_rate": 9.973268876492825e-07, + "loss": 0.0204, + "num_tokens": 1546691.0, + "reward": 0.78167724609375, + "reward_std": 0.025446917861700058, + "rewards//mean": 0.78167724609375, + "rewards//std": 0.0314132422208786, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "epoch": 0.043, + "grad_norm": 1.4455718994140625, + "kl": 0.04723980464041233, + "learning_rate": 9.972940178216952e-07, + "loss": -0.0136, + "num_tokens": 1553915.0, + "reward": 0.78741455078125, + "reward_std": 0.018912669271230698, + "rewards//mean": 0.78741455078125, + "rewards//std": 0.019609278067946434, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.0432, + "grad_norm": 1.4507062435150146, + "kl": 0.04037927486933768, + "learning_rate": 9.972609476841365e-07, + "loss": 0.001, + "num_tokens": 1561154.0, + "reward": 0.82684326171875, + "reward_std": 0.01909043826162815, + "rewards//mean": 0.82684326171875, + "rewards//std": 0.020100276917219162, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.515625, + "epoch": 0.0434, + "grad_norm": 1.6010550260543823, + "kl": 0.04940338432788849, + "learning_rate": 9.97227677249928e-07, + "loss": -0.0053, + "num_tokens": 1568395.0, + "reward": 0.81158447265625, + "reward_std": 0.023518329486250877, + "rewards//mean": 0.81158447265625, + "rewards//std": 0.030897239223122597, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.484375, + "epoch": 0.0436, + "grad_norm": 1.6384398937225342, + "kl": 0.05265101324766874, + "learning_rate": 9.971942065324702e-07, + "loss": -0.0142, + "num_tokens": 1575610.0, + "reward": 0.7926025390625, + "reward_std": 0.028944380581378937, + "rewards//mean": 0.7926025390625, + "rewards//std": 0.03034336306154728, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.484375, + "epoch": 0.0438, + "grad_norm": 1.540368676185608, + "kl": 0.046287836972624063, + "learning_rate": 9.971605355452457e-07, + "loss": -0.0273, + "num_tokens": 1582937.0, + "reward": 0.82208251953125, + "reward_std": 0.03292100876569748, + "rewards//mean": 0.82208251953125, + "rewards//std": 0.037948716431856155, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.044, + "grad_norm": 1.6212857961654663, + "kl": 0.046670813113451004, + "learning_rate": 9.97126664301817e-07, + "loss": 0.0034, + "num_tokens": 1590127.0, + "reward": 0.79840087890625, + "reward_std": 0.028314683586359024, + "rewards//mean": 0.79840087890625, + "rewards//std": 0.0348387137055397, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.21875, + "epoch": 0.0442, + "grad_norm": 1.5348947048187256, + "kl": 0.05362212611362338, + "learning_rate": 9.970925928158272e-07, + "loss": -0.0214, + "num_tokens": 1597445.0, + "reward": 0.7938232421875, + "reward_std": 0.027900366112589836, + "rewards//mean": 0.7938232421875, + "rewards//std": 0.03640826791524887, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0444, + "grad_norm": 1.4764069318771362, + "kl": 0.04575439915060997, + "learning_rate": 9.970583211010007e-07, + "loss": 0.0018, + "num_tokens": 1604861.0, + "reward": 0.7884521484375, + "reward_std": 0.030294381082057953, + "rewards//mean": 0.7884521484375, + "rewards//std": 0.036526158452034, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.8125, + "epoch": 0.0446, + "grad_norm": 1.5283159017562866, + "kl": 0.047826265916228294, + "learning_rate": 9.970238491711415e-07, + "loss": -0.0116, + "num_tokens": 1612081.0, + "reward": 0.7852783203125, + "reward_std": 0.024566255509853363, + "rewards//mean": 0.7852783203125, + "rewards//std": 0.03083624318242073, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.21875, + "epoch": 0.0448, + "grad_norm": 1.691720724105835, + "kl": 0.05117085622623563, + "learning_rate": 9.969891770401356e-07, + "loss": -0.0325, + "num_tokens": 1619455.0, + "reward": 0.81048583984375, + "reward_std": 0.024159129709005356, + "rewards//mean": 0.81048583984375, + "rewards//std": 0.025985730811953545, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.953125, + "epoch": 0.045, + "grad_norm": 1.5798157453536987, + "kl": 0.04884958220645785, + "learning_rate": 9.969543047219486e-07, + "loss": -0.0192, + "num_tokens": 1626636.0, + "reward": 0.7972412109375, + "reward_std": 0.027723845094442368, + "rewards//mean": 0.7972412109375, + "rewards//std": 0.034644220024347305, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.0452, + "grad_norm": 1.6214699745178223, + "kl": 0.04841682966798544, + "learning_rate": 9.96919232230627e-07, + "loss": 0.0042, + "num_tokens": 1633906.0, + "reward": 0.79058837890625, + "reward_std": 0.03260193020105362, + "rewards//mean": 0.79058837890625, + "rewards//std": 0.03864244371652603, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "epoch": 0.0454, + "grad_norm": 1.5493749380111694, + "kl": 0.04976038774475455, + "learning_rate": 9.968839595802981e-07, + "loss": -0.0141, + "num_tokens": 1641122.0, + "reward": 0.76904296875, + "reward_std": 0.017932549118995667, + "rewards//mean": 0.76904296875, + "rewards//std": 0.019369369372725487, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.34375, + "epoch": 0.0456, + "grad_norm": 1.7390168905258179, + "kl": 0.053597273072227836, + "learning_rate": 9.968484867851697e-07, + "loss": 0.0136, + "num_tokens": 1648408.0, + "reward": 0.815673828125, + "reward_std": 0.03216677904129028, + "rewards//mean": 0.815673828125, + "rewards//std": 0.038712941110134125, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.9375, + "epoch": 0.0458, + "grad_norm": 1.6401088237762451, + "kl": 0.04806722281500697, + "learning_rate": 9.968128138595302e-07, + "loss": -0.0013, + "num_tokens": 1655644.0, + "reward": 0.77716064453125, + "reward_std": 0.023288216441869736, + "rewards//mean": 0.77716064453125, + "rewards//std": 0.029284924268722534, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.046, + "grad_norm": 2.165658950805664, + "kl": 0.05592726566828787, + "learning_rate": 9.967769408177488e-07, + "loss": 0.0002, + "num_tokens": 1662881.0, + "reward": 0.80609130859375, + "reward_std": 0.024635300040245056, + "rewards//mean": 0.80609130859375, + "rewards//std": 0.03338130936026573, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.46875, + "epoch": 0.0462, + "grad_norm": 1.4398484230041504, + "kl": 0.059767965227365494, + "learning_rate": 9.967408676742751e-07, + "loss": 0.0067, + "num_tokens": 1670151.0, + "reward": 0.806396484375, + "reward_std": 0.023778801783919334, + "rewards//mean": 0.806396484375, + "rewards//std": 0.026235099881887436, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.0464, + "grad_norm": 1.3392302989959717, + "kl": 0.05588983464986086, + "learning_rate": 9.967045944436393e-07, + "loss": 0.0059, + "num_tokens": 1677532.0, + "reward": 0.81427001953125, + "reward_std": 0.025270797312259674, + "rewards//mean": 0.81427001953125, + "rewards//std": 0.030400864779949188, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.0466, + "grad_norm": 1.4796013832092285, + "kl": 0.05407287226989865, + "learning_rate": 9.96668121140452e-07, + "loss": 0.0031, + "num_tokens": 1684943.0, + "reward": 0.77972412109375, + "reward_std": 0.01995760016143322, + "rewards//mean": 0.77972412109375, + "rewards//std": 0.02175038494169712, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.4375, + "epoch": 0.0468, + "grad_norm": 1.6216585636138916, + "kl": 0.06411712523549795, + "learning_rate": 9.966314477794052e-07, + "loss": -0.0239, + "num_tokens": 1692147.0, + "reward": 0.8013916015625, + "reward_std": 0.02029525302350521, + "rewards//mean": 0.8013916015625, + "rewards//std": 0.024931885302066803, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.047, + "grad_norm": 1.58781099319458, + "kl": 0.060873673763126135, + "learning_rate": 9.965945743752705e-07, + "loss": -0.0082, + "num_tokens": 1699349.0, + "reward": 0.8201904296875, + "reward_std": 0.029398076236248016, + "rewards//mean": 0.8201904296875, + "rewards//std": 0.03649132698774338, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.671875, + "epoch": 0.0472, + "grad_norm": 1.5597407817840576, + "kl": 0.06513929460197687, + "learning_rate": 9.965575009429005e-07, + "loss": -0.0152, + "num_tokens": 1706624.0, + "reward": 0.81854248046875, + "reward_std": 0.027636567130684853, + "rewards//mean": 0.81854248046875, + "rewards//std": 0.03155028447508812, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.0474, + "grad_norm": 1.3907430171966553, + "kl": 0.05831611808389425, + "learning_rate": 9.965202274972286e-07, + "loss": -0.007, + "num_tokens": 1713794.0, + "reward": 0.8011474609375, + "reward_std": 0.0269140787422657, + "rewards//mean": 0.8011474609375, + "rewards//std": 0.03983438014984131, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0476, + "grad_norm": 1.3295928239822388, + "kl": 0.06068591773509979, + "learning_rate": 9.964827540532684e-07, + "loss": 0.0024, + "num_tokens": 1721066.0, + "reward": 0.8126220703125, + "reward_std": 0.025826364755630493, + "rewards//mean": 0.8126220703125, + "rewards//std": 0.03293281048536301, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.0478, + "grad_norm": 1.6099345684051514, + "kl": 0.07326604798436165, + "learning_rate": 9.964450806261144e-07, + "loss": 0.0089, + "num_tokens": 1728327.0, + "reward": 0.799560546875, + "reward_std": 0.026219502091407776, + "rewards//mean": 0.799560546875, + "rewards//std": 0.031031252816319466, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.048, + "grad_norm": 1.635419487953186, + "kl": 0.047440858324989676, + "learning_rate": 9.96407207230941e-07, + "loss": 0.0019, + "num_tokens": 1735671.0, + "reward": 0.7947998046875, + "reward_std": 0.029523316770792007, + "rewards//mean": 0.7947998046875, + "rewards//std": 0.03482029587030411, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.0482, + "grad_norm": 1.7200520038604736, + "kl": 0.05986652011051774, + "learning_rate": 9.963691338830042e-07, + "loss": -0.0112, + "num_tokens": 1742910.0, + "reward": 0.77484130859375, + "reward_std": 0.024928193539381027, + "rewards//mean": 0.77484130859375, + "rewards//std": 0.029637914150953293, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.0484, + "grad_norm": 1.694566249847412, + "kl": 0.05253846012055874, + "learning_rate": 9.963308605976396e-07, + "loss": 0.0066, + "num_tokens": 1750232.0, + "reward": 0.77337646484375, + "reward_std": 0.02274433709681034, + "rewards//mean": 0.77337646484375, + "rewards//std": 0.023862428963184357, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.0486, + "grad_norm": 1.6018421649932861, + "kl": 0.059752690605819225, + "learning_rate": 9.962923873902636e-07, + "loss": -0.0093, + "num_tokens": 1757398.0, + "reward": 0.8031005859375, + "reward_std": 0.03048805519938469, + "rewards//mean": 0.8031005859375, + "rewards//std": 0.03364402800798416, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.0488, + "grad_norm": 1.616485595703125, + "kl": 0.06559192668646574, + "learning_rate": 9.962537142763732e-07, + "loss": -0.0019, + "num_tokens": 1764661.0, + "reward": 0.80621337890625, + "reward_std": 0.025913584977388382, + "rewards//mean": 0.80621337890625, + "rewards//std": 0.029533537104725838, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.703125, + "epoch": 0.049, + "grad_norm": 1.4728548526763916, + "kl": 0.0603984734043479, + "learning_rate": 9.962148412715463e-07, + "loss": -0.0211, + "num_tokens": 1771970.0, + "reward": 0.77435302734375, + "reward_std": 0.026433032006025314, + "rewards//mean": 0.77435302734375, + "rewards//std": 0.03897399827837944, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.625, + "epoch": 0.0492, + "grad_norm": 1.5393234491348267, + "kl": 0.05948180053383112, + "learning_rate": 9.961757683914405e-07, + "loss": 0.006, + "num_tokens": 1779290.0, + "reward": 0.8189697265625, + "reward_std": 0.0313270166516304, + "rewards//mean": 0.8189697265625, + "rewards//std": 0.0348706915974617, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.3125, + "epoch": 0.0494, + "grad_norm": 1.8312506675720215, + "kl": 0.06438866350799799, + "learning_rate": 9.961364956517946e-07, + "loss": -0.0019, + "num_tokens": 1786510.0, + "reward": 0.8043212890625, + "reward_std": 0.027360614389181137, + "rewards//mean": 0.8043212890625, + "rewards//std": 0.03266512230038643, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.0496, + "grad_norm": 1.3763837814331055, + "kl": 0.07036868762224913, + "learning_rate": 9.960970230684275e-07, + "loss": 0.0031, + "num_tokens": 1793797.0, + "reward": 0.8291015625, + "reward_std": 0.029286861419677734, + "rewards//mean": 0.8291015625, + "rewards//std": 0.029512066394090652, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.0498, + "grad_norm": 1.426759123802185, + "kl": 0.0625599306076765, + "learning_rate": 9.960573506572389e-07, + "loss": -0.0051, + "num_tokens": 1801009.0, + "reward": 0.7716064453125, + "reward_std": 0.01923120953142643, + "rewards//mean": 0.7716064453125, + "rewards//std": 0.02165386639535427, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.05, + "grad_norm": 1.6642627716064453, + "kl": 0.06993492972105742, + "learning_rate": 9.960174784342087e-07, + "loss": 0.0162, + "num_tokens": 1808275.0, + "reward": 0.79901123046875, + "reward_std": 0.03090490773320198, + "rewards//mean": 0.79901123046875, + "rewards//std": 0.03377487510442734, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0502, + "grad_norm": 1.5413845777511597, + "kl": 0.062095348723232746, + "learning_rate": 9.959774064153975e-07, + "loss": 0.0025, + "num_tokens": 1815523.0, + "reward": 0.80596923828125, + "reward_std": 0.03636975586414337, + "rewards//mean": 0.80596923828125, + "rewards//std": 0.03950674086809158, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0504, + "grad_norm": 1.4786999225616455, + "kl": 0.06322549981996417, + "learning_rate": 9.959371346169465e-07, + "loss": 0.0025, + "num_tokens": 1822875.0, + "reward": 0.7974853515625, + "reward_std": 0.02372065559029579, + "rewards//mean": 0.7974853515625, + "rewards//std": 0.02650461718440056, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.0506, + "grad_norm": 1.4886751174926758, + "kl": 0.05433172173798084, + "learning_rate": 9.95896663055077e-07, + "loss": 0.0065, + "num_tokens": 1830072.0, + "reward": 0.82373046875, + "reward_std": 0.02426808699965477, + "rewards//mean": 0.82373046875, + "rewards//std": 0.02630539983510971, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.46875, + "epoch": 0.0508, + "grad_norm": 1.6090656518936157, + "kl": 0.05550272250548005, + "learning_rate": 9.958559917460907e-07, + "loss": -0.0016, + "num_tokens": 1837390.0, + "reward": 0.7802734375, + "reward_std": 0.02034963294863701, + "rewards//mean": 0.7802734375, + "rewards//std": 0.02700057625770569, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.640625, + "epoch": 0.051, + "grad_norm": 1.789724588394165, + "kl": 0.07605843339115381, + "learning_rate": 9.958151207063703e-07, + "loss": -0.0073, + "num_tokens": 1844703.0, + "reward": 0.78173828125, + "reward_std": 0.02103423699736595, + "rewards//mean": 0.78173828125, + "rewards//std": 0.02643398754298687, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.0512, + "grad_norm": 1.4726845026016235, + "kl": 0.07236371748149395, + "learning_rate": 9.957740499523785e-07, + "loss": -0.0006, + "num_tokens": 1851886.0, + "reward": 0.83453369140625, + "reward_std": 0.02898683026432991, + "rewards//mean": 0.83453369140625, + "rewards//std": 0.03314831480383873, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.40625, + "epoch": 0.0514, + "grad_norm": 1.557300329208374, + "kl": 0.06535837752744555, + "learning_rate": 9.957327795006588e-07, + "loss": 0.0078, + "num_tokens": 1859152.0, + "reward": 0.82232666015625, + "reward_std": 0.03251180052757263, + "rewards//mean": 0.82232666015625, + "rewards//std": 0.03681527450680733, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.484375, + "epoch": 0.0516, + "grad_norm": 1.7280945777893066, + "kl": 0.0672774245031178, + "learning_rate": 9.956913093678348e-07, + "loss": -0.0076, + "num_tokens": 1866375.0, + "reward": 0.79913330078125, + "reward_std": 0.031145265325903893, + "rewards//mean": 0.79913330078125, + "rewards//std": 0.034833066165447235, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0518, + "grad_norm": 1.7853983640670776, + "kl": 0.06827154662460089, + "learning_rate": 9.956496395706105e-07, + "loss": 0.0027, + "num_tokens": 1873743.0, + "reward": 0.829345703125, + "reward_std": 0.02869301475584507, + "rewards//mean": 0.829345703125, + "rewards//std": 0.04283537715673447, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.052, + "grad_norm": 1.522091031074524, + "kl": 0.0627308962866664, + "learning_rate": 9.956077701257707e-07, + "loss": 0.0014, + "num_tokens": 1881082.0, + "reward": 0.79583740234375, + "reward_std": 0.030068516731262207, + "rewards//mean": 0.79583740234375, + "rewards//std": 0.03479436784982681, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.0522, + "grad_norm": 1.8448858261108398, + "kl": 0.06319458270445466, + "learning_rate": 9.955657010501806e-07, + "loss": 0.0084, + "num_tokens": 1888388.0, + "reward": 0.7896728515625, + "reward_std": 0.020673716440796852, + "rewards//mean": 0.7896728515625, + "rewards//std": 0.023315435275435448, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.0524, + "grad_norm": 1.7232383489608765, + "kl": 0.07260536728426814, + "learning_rate": 9.955234323607851e-07, + "loss": 0.0061, + "num_tokens": 1895719.0, + "reward": 0.82928466796875, + "reward_std": 0.020161326974630356, + "rewards//mean": 0.82928466796875, + "rewards//std": 0.024485494941473007, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.34375, + "epoch": 0.0526, + "grad_norm": 1.3847827911376953, + "kl": 0.07560721784830093, + "learning_rate": 9.954809640746105e-07, + "loss": -0.0242, + "num_tokens": 1902989.0, + "reward": 0.8016357421875, + "reward_std": 0.027517035603523254, + "rewards//mean": 0.8016357421875, + "rewards//std": 0.028877150267362595, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "epoch": 0.0528, + "grad_norm": 1.6710200309753418, + "kl": 0.06560091534629464, + "learning_rate": 9.954382962087627e-07, + "loss": -0.0046, + "num_tokens": 1910253.0, + "reward": 0.82000732421875, + "reward_std": 0.03169260919094086, + "rewards//mean": 0.82000732421875, + "rewards//std": 0.04342183843255043, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.053, + "grad_norm": 1.4919801950454712, + "kl": 0.07869026716798544, + "learning_rate": 9.953954287804284e-07, + "loss": 0.0031, + "num_tokens": 1917437.0, + "reward": 0.80426025390625, + "reward_std": 0.0264921635389328, + "rewards//mean": 0.80426025390625, + "rewards//std": 0.03172778710722923, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.0532, + "grad_norm": 2.5178911685943604, + "kl": 0.08063567942008376, + "learning_rate": 9.953523618068748e-07, + "loss": -0.0075, + "num_tokens": 1924633.0, + "reward": 0.77545166015625, + "reward_std": 0.02118399553000927, + "rewards//mean": 0.77545166015625, + "rewards//std": 0.024250054731965065, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0534, + "grad_norm": 1.945655107498169, + "kl": 0.07817292865365744, + "learning_rate": 9.95309095305449e-07, + "loss": 0.0031, + "num_tokens": 1931865.0, + "reward": 0.82403564453125, + "reward_std": 0.031137293204665184, + "rewards//mean": 0.82403564453125, + "rewards//std": 0.03373540937900543, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.0536, + "grad_norm": 1.7948945760726929, + "kl": 0.07094687037169933, + "learning_rate": 9.952656292935788e-07, + "loss": -0.0047, + "num_tokens": 1939210.0, + "reward": 0.8138427734375, + "reward_std": 0.019506406038999557, + "rewards//mean": 0.8138427734375, + "rewards//std": 0.029677536338567734, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.0538, + "grad_norm": 1.7484893798828125, + "kl": 0.08202310046181083, + "learning_rate": 9.952219637887725e-07, + "loss": -0.0017, + "num_tokens": 1946425.0, + "reward": 0.8052978515625, + "reward_std": 0.021149149164557457, + "rewards//mean": 0.8052978515625, + "rewards//std": 0.026559388265013695, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.054, + "grad_norm": 1.4721801280975342, + "kl": 0.07391235465183854, + "learning_rate": 9.951780988086183e-07, + "loss": 0.0017, + "num_tokens": 1953676.0, + "reward": 0.80426025390625, + "reward_std": 0.01838582754135132, + "rewards//mean": 0.80426025390625, + "rewards//std": 0.019687091931700706, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.0542, + "grad_norm": 1.5606902837753296, + "kl": 0.08020484913140535, + "learning_rate": 9.95134034370785e-07, + "loss": 0.0074, + "num_tokens": 1960944.0, + "reward": 0.81658935546875, + "reward_std": 0.029150189831852913, + "rewards//mean": 0.81658935546875, + "rewards//std": 0.029193807393312454, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.671875, + "epoch": 0.0544, + "grad_norm": 1.8031209707260132, + "kl": 0.06984481122344732, + "learning_rate": 9.95089770493022e-07, + "loss": -0.0026, + "num_tokens": 1968251.0, + "reward": 0.807373046875, + "reward_std": 0.026371419429779053, + "rewards//mean": 0.807373046875, + "rewards//std": 0.031802188605070114, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.53125, + "epoch": 0.0546, + "grad_norm": 1.3756400346755981, + "kl": 0.07916021719574928, + "learning_rate": 9.950453071931588e-07, + "loss": -0.0167, + "num_tokens": 1975493.0, + "reward": 0.80126953125, + "reward_std": 0.028513655066490173, + "rewards//mean": 0.80126953125, + "rewards//std": 0.03184499964118004, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.390625, + "epoch": 0.0548, + "grad_norm": 1.450444221496582, + "kl": 0.0728410822339356, + "learning_rate": 9.950006444891048e-07, + "loss": 0.013, + "num_tokens": 1982702.0, + "reward": 0.7489013671875, + "reward_std": 0.024310847744345665, + "rewards//mean": 0.7489013671875, + "rewards//std": 0.026941923424601555, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.055, + "grad_norm": 1.384976863861084, + "kl": 0.0620033647865057, + "learning_rate": 9.949557823988506e-07, + "loss": 0.01, + "num_tokens": 1990010.0, + "reward": 0.8223876953125, + "reward_std": 0.02718181721866131, + "rewards//mean": 0.8223876953125, + "rewards//std": 0.03133489936590195, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.0552, + "grad_norm": 1.6794301271438599, + "kl": 0.06980185396969318, + "learning_rate": 9.949107209404663e-07, + "loss": -0.003, + "num_tokens": 1997294.0, + "reward": 0.79449462890625, + "reward_std": 0.02323540672659874, + "rewards//mean": 0.79449462890625, + "rewards//std": 0.026581251993775368, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.0554, + "grad_norm": 1.804860234260559, + "kl": 0.06426722509786487, + "learning_rate": 9.94865460132103e-07, + "loss": -0.0012, + "num_tokens": 2004560.0, + "reward": 0.75762939453125, + "reward_std": 0.02401154302060604, + "rewards//mean": 0.75762939453125, + "rewards//std": 0.02639666572213173, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0556, + "grad_norm": 1.746940016746521, + "kl": 0.0659959577023983, + "learning_rate": 9.948199999919912e-07, + "loss": 0.0038, + "num_tokens": 2011893.0, + "reward": 0.8238525390625, + "reward_std": 0.0246270839124918, + "rewards//mean": 0.8238525390625, + "rewards//std": 0.029583534225821495, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0558, + "grad_norm": 1.5128523111343384, + "kl": 0.08404629258438945, + "learning_rate": 9.947743405384428e-07, + "loss": 0.0034, + "num_tokens": 2019149.0, + "reward": 0.804443359375, + "reward_std": 0.02776503376662731, + "rewards//mean": 0.804443359375, + "rewards//std": 0.03473607450723648, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.056, + "grad_norm": 1.8058712482452393, + "kl": 0.07961097359657288, + "learning_rate": 9.947284817898492e-07, + "loss": 0.0056, + "num_tokens": 2026415.0, + "reward": 0.79730224609375, + "reward_std": 0.02299703285098076, + "rewards//mean": 0.79730224609375, + "rewards//std": 0.029798876494169235, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.25, + "epoch": 0.0562, + "grad_norm": 1.4775314331054688, + "kl": 0.07526570744812489, + "learning_rate": 9.946824237646824e-07, + "loss": -0.0131, + "num_tokens": 2033679.0, + "reward": 0.80029296875, + "reward_std": 0.03442797437310219, + "rewards//mean": 0.80029296875, + "rewards//std": 0.03983912989497185, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0564, + "grad_norm": 1.643812894821167, + "kl": 0.08360268408432603, + "learning_rate": 9.946361664814943e-07, + "loss": 0.0033, + "num_tokens": 2040903.0, + "reward": 0.80126953125, + "reward_std": 0.01987861841917038, + "rewards//mean": 0.80126953125, + "rewards//std": 0.024014273658394814, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5625, + "epoch": 0.0566, + "grad_norm": 1.5289838314056396, + "kl": 0.08868004847317934, + "learning_rate": 9.945897099589173e-07, + "loss": -0.0349, + "num_tokens": 2048067.0, + "reward": 0.80120849609375, + "reward_std": 0.025324847549200058, + "rewards//mean": 0.80120849609375, + "rewards//std": 0.025818588212132454, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0568, + "grad_norm": 1.6417237520217896, + "kl": 0.07433904567733407, + "learning_rate": 9.945430542156646e-07, + "loss": 0.003, + "num_tokens": 2055307.0, + "reward": 0.81390380859375, + "reward_std": 0.03387096896767616, + "rewards//mean": 0.81390380859375, + "rewards//std": 0.0400988943874836, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.057, + "grad_norm": 1.5993088483810425, + "kl": 0.08496549911797047, + "learning_rate": 9.944961992705286e-07, + "loss": -0.0006, + "num_tokens": 2062488.0, + "reward": 0.80126953125, + "reward_std": 0.0265166275203228, + "rewards//mean": 0.80126953125, + "rewards//std": 0.03157768025994301, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.140625, + "epoch": 0.0572, + "grad_norm": 1.3719289302825928, + "kl": 0.08071692194789648, + "learning_rate": 9.944491451423827e-07, + "loss": -0.0298, + "num_tokens": 2069753.0, + "reward": 0.79144287109375, + "reward_std": 0.018055181950330734, + "rewards//mean": 0.79144287109375, + "rewards//std": 0.021254774183034897, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.0574, + "grad_norm": 1.5391428470611572, + "kl": 0.06619712244719267, + "learning_rate": 9.944018918501805e-07, + "loss": 0.0063, + "num_tokens": 2077058.0, + "reward": 0.80419921875, + "reward_std": 0.027588527649641037, + "rewards//mean": 0.80419921875, + "rewards//std": 0.04236132651567459, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.0576, + "grad_norm": 1.583073616027832, + "kl": 0.07178678875789046, + "learning_rate": 9.94354439412955e-07, + "loss": -0.0061, + "num_tokens": 2084382.0, + "reward": 0.80010986328125, + "reward_std": 0.029259001836180687, + "rewards//mean": 0.80010986328125, + "rewards//std": 0.03131623566150665, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0578, + "grad_norm": 1.8364008665084839, + "kl": 0.07049432396888733, + "learning_rate": 9.943067878498209e-07, + "loss": 0.0028, + "num_tokens": 2091750.0, + "reward": 0.82452392578125, + "reward_std": 0.022959835827350616, + "rewards//mean": 0.82452392578125, + "rewards//std": 0.023595130071043968, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.058, + "grad_norm": 1.6003330945968628, + "kl": 0.06961082573980093, + "learning_rate": 9.942589371799714e-07, + "loss": 0.0028, + "num_tokens": 2099118.0, + "reward": 0.82110595703125, + "reward_std": 0.030794944614171982, + "rewards//mean": 0.82110595703125, + "rewards//std": 0.036206211894750595, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0582, + "grad_norm": 1.6063001155853271, + "kl": 0.07512943167239428, + "learning_rate": 9.94210887422681e-07, + "loss": 0.003, + "num_tokens": 2106430.0, + "reward": 0.815185546875, + "reward_std": 0.029485758394002914, + "rewards//mean": 0.815185546875, + "rewards//std": 0.03745369613170624, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0584, + "grad_norm": 1.6206570863723755, + "kl": 0.08722271770238876, + "learning_rate": 9.941626385973047e-07, + "loss": 0.0035, + "num_tokens": 2113638.0, + "reward": 0.83807373046875, + "reward_std": 0.03251531720161438, + "rewards//mean": 0.83807373046875, + "rewards//std": 0.0357060581445694, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.0586, + "grad_norm": 1.6328608989715576, + "kl": 0.07656494015827775, + "learning_rate": 9.941141907232763e-07, + "loss": -0.0187, + "num_tokens": 2120936.0, + "reward": 0.8416748046875, + "reward_std": 0.02875984087586403, + "rewards//mean": 0.8416748046875, + "rewards//std": 0.030072759836912155, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0588, + "grad_norm": 1.7066808938980103, + "kl": 0.08184460503980517, + "learning_rate": 9.94065543820111e-07, + "loss": 0.0033, + "num_tokens": 2128272.0, + "reward": 0.8231201171875, + "reward_std": 0.024384278804063797, + "rewards//mean": 0.8231201171875, + "rewards//std": 0.030317408964037895, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.059, + "grad_norm": 1.4360301494598389, + "kl": 0.08665785938501358, + "learning_rate": 9.94016697907404e-07, + "loss": 0.0035, + "num_tokens": 2135464.0, + "reward": 0.837890625, + "reward_std": 0.033705923706293106, + "rewards//mean": 0.837890625, + "rewards//std": 0.033954180777072906, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.0592, + "grad_norm": 1.6588941812515259, + "kl": 0.09392558131366968, + "learning_rate": 9.9396765300483e-07, + "loss": 0.0, + "num_tokens": 2142825.0, + "reward": 0.8082275390625, + "reward_std": 0.032454267144203186, + "rewards//mean": 0.8082275390625, + "rewards//std": 0.03880569711327553, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.625, + "epoch": 0.0594, + "grad_norm": 1.6167796850204468, + "kl": 0.08913951832801104, + "learning_rate": 9.939184091321444e-07, + "loss": 0.0055, + "num_tokens": 2150073.0, + "reward": 0.827880859375, + "reward_std": 0.02551415003836155, + "rewards//mean": 0.827880859375, + "rewards//std": 0.031764086335897446, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.0596, + "grad_norm": 1.6783093214035034, + "kl": 0.07079231878742576, + "learning_rate": 9.938689663091827e-07, + "loss": 0.011, + "num_tokens": 2157427.0, + "reward": 0.80712890625, + "reward_std": 0.028216544538736343, + "rewards//mean": 0.80712890625, + "rewards//std": 0.03543758764863014, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.0598, + "grad_norm": 1.3216148614883423, + "kl": 0.09520759619772434, + "learning_rate": 9.938193245558604e-07, + "loss": -0.002, + "num_tokens": 2164679.0, + "reward": 0.8228759765625, + "reward_std": 0.029928969219326973, + "rewards//mean": 0.8228759765625, + "rewards//std": 0.03966071456670761, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.546875, + "epoch": 0.06, + "grad_norm": 1.471260666847229, + "kl": 0.08411201182752848, + "learning_rate": 9.937694838921733e-07, + "loss": -0.0171, + "num_tokens": 2171930.0, + "reward": 0.76727294921875, + "reward_std": 0.02324344404041767, + "rewards//mean": 0.76727294921875, + "rewards//std": 0.03150322660803795, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0602, + "grad_norm": 1.4392378330230713, + "kl": 0.08016178570687771, + "learning_rate": 9.93719444338197e-07, + "loss": 0.0033, + "num_tokens": 2179271.0, + "reward": 0.79888916015625, + "reward_std": 0.028746172785758972, + "rewards//mean": 0.79888916015625, + "rewards//std": 0.04044546186923981, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.0604, + "grad_norm": 1.4478363990783691, + "kl": 0.1025152811780572, + "learning_rate": 9.936692059140878e-07, + "loss": 0.0042, + "num_tokens": 2186552.0, + "reward": 0.83416748046875, + "reward_std": 0.03392185643315315, + "rewards//mean": 0.83416748046875, + "rewards//std": 0.03493114188313484, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.515625, + "epoch": 0.0606, + "grad_norm": 1.512493371963501, + "kl": 0.07052132859826088, + "learning_rate": 9.936187686400814e-07, + "loss": -0.0047, + "num_tokens": 2193945.0, + "reward": 0.8206787109375, + "reward_std": 0.022190537303686142, + "rewards//mean": 0.8206787109375, + "rewards//std": 0.031232314184308052, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.515625, + "epoch": 0.0608, + "grad_norm": 1.8274962902069092, + "kl": 0.08366881450638175, + "learning_rate": 9.93568132536494e-07, + "loss": 0.0058, + "num_tokens": 2201210.0, + "reward": 0.7923583984375, + "reward_std": 0.025398828089237213, + "rewards//mean": 0.7923583984375, + "rewards//std": 0.028307482600212097, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.34375, + "epoch": 0.061, + "grad_norm": 1.810131311416626, + "kl": 0.08943480672314763, + "learning_rate": 9.935172976237217e-07, + "loss": -0.0402, + "num_tokens": 2208424.0, + "reward": 0.81231689453125, + "reward_std": 0.02159598469734192, + "rewards//mean": 0.81231689453125, + "rewards//std": 0.023024950176477432, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.0612, + "grad_norm": 1.5364056825637817, + "kl": 0.10298978537321091, + "learning_rate": 9.93466263922241e-07, + "loss": 0.0111, + "num_tokens": 2215681.0, + "reward": 0.80181884765625, + "reward_std": 0.02418678253889084, + "rewards//mean": 0.80181884765625, + "rewards//std": 0.03300094231963158, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0614, + "grad_norm": 1.6495087146759033, + "kl": 0.09287515468895435, + "learning_rate": 9.934150314526083e-07, + "loss": 0.0037, + "num_tokens": 2222937.0, + "reward": 0.8123779296875, + "reward_std": 0.025728987529873848, + "rewards//mean": 0.8123779296875, + "rewards//std": 0.029830165207386017, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0616, + "grad_norm": 1.589019536972046, + "kl": 0.0924870390444994, + "learning_rate": 9.933636002354599e-07, + "loss": 0.0037, + "num_tokens": 2230217.0, + "reward": 0.77069091796875, + "reward_std": 0.021310115233063698, + "rewards//mean": 0.77069091796875, + "rewards//std": 0.025645043700933456, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0618, + "grad_norm": 1.4731931686401367, + "kl": 0.10337489284574986, + "learning_rate": 9.933119702915124e-07, + "loss": 0.0041, + "num_tokens": 2237433.0, + "reward": 0.731689453125, + "reward_std": 0.021616047248244286, + "rewards//mean": 0.731689453125, + "rewards//std": 0.02629043348133564, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.062, + "grad_norm": 1.7002865076065063, + "kl": 0.09452310390770435, + "learning_rate": 9.93260141641562e-07, + "loss": 0.0048, + "num_tokens": 2244730.0, + "reward": 0.8046875, + "reward_std": 0.02696414291858673, + "rewards//mean": 0.8046875, + "rewards//std": 0.029273098334670067, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.40625, + "epoch": 0.0622, + "grad_norm": 1.6786231994628906, + "kl": 0.10590477380901575, + "learning_rate": 9.932081143064858e-07, + "loss": -0.0045, + "num_tokens": 2251900.0, + "reward": 0.8172607421875, + "reward_std": 0.03245352953672409, + "rewards//mean": 0.8172607421875, + "rewards//std": 0.03982069715857506, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0624, + "grad_norm": 1.4219293594360352, + "kl": 0.08819158561527729, + "learning_rate": 9.931558883072402e-07, + "loss": 0.0035, + "num_tokens": 2259244.0, + "reward": 0.8126220703125, + "reward_std": 0.021063588559627533, + "rewards//mean": 0.8126220703125, + "rewards//std": 0.021776555106043816, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.0626, + "grad_norm": 1.4592877626419067, + "kl": 0.09165953937917948, + "learning_rate": 9.931034636648616e-07, + "loss": 0.003, + "num_tokens": 2266487.0, + "reward": 0.68511962890625, + "reward_std": 0.01953062415122986, + "rewards//mean": 0.68511962890625, + "rewards//std": 0.025879492983222008, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "epoch": 0.0628, + "grad_norm": 1.6392055749893188, + "kl": 0.09163308702409267, + "learning_rate": 9.930508404004666e-07, + "loss": -0.0133, + "num_tokens": 2273687.0, + "reward": 0.83319091796875, + "reward_std": 0.030586615204811096, + "rewards//mean": 0.83319091796875, + "rewards//std": 0.041931357234716415, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.109375, + "epoch": 0.063, + "grad_norm": 1.7404165267944336, + "kl": 0.10751303471624851, + "learning_rate": 9.929980185352525e-07, + "loss": 0.0204, + "num_tokens": 2280862.0, + "reward": 0.79833984375, + "reward_std": 0.03080086037516594, + "rewards//mean": 0.79833984375, + "rewards//std": 0.03476830944418907, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.0632, + "grad_norm": 1.6569958925247192, + "kl": 0.08730357699096203, + "learning_rate": 9.929449980904951e-07, + "loss": -0.0082, + "num_tokens": 2288126.0, + "reward": 0.81494140625, + "reward_std": 0.025709809735417366, + "rewards//mean": 0.81494140625, + "rewards//std": 0.027748214080929756, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.0634, + "grad_norm": 1.4599896669387817, + "kl": 0.10084377322345972, + "learning_rate": 9.928917790875516e-07, + "loss": 0.0027, + "num_tokens": 2295465.0, + "reward": 0.7864990234375, + "reward_std": 0.024457935243844986, + "rewards//mean": 0.7864990234375, + "rewards//std": 0.030002206563949585, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.0636, + "grad_norm": 1.3552584648132324, + "kl": 0.09777896851301193, + "learning_rate": 9.928383615478586e-07, + "loss": 0.0032, + "num_tokens": 2302686.0, + "reward": 0.8309326171875, + "reward_std": 0.019726349040865898, + "rewards//mean": 0.8309326171875, + "rewards//std": 0.022909360006451607, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0638, + "grad_norm": 1.7792775630950928, + "kl": 0.09933721367269754, + "learning_rate": 9.927847454929322e-07, + "loss": 0.004, + "num_tokens": 2309966.0, + "reward": 0.819580078125, + "reward_std": 0.0268249474465847, + "rewards//mean": 0.819580078125, + "rewards//std": 0.033623550087213516, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.064, + "grad_norm": 1.505348563194275, + "kl": 0.10555843729525805, + "learning_rate": 9.927309309443695e-07, + "loss": 0.0042, + "num_tokens": 2317206.0, + "reward": 0.81256103515625, + "reward_std": 0.02722073160111904, + "rewards//mean": 0.81256103515625, + "rewards//std": 0.03628180921077728, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0642, + "grad_norm": 1.4116710424423218, + "kl": 0.10727399215102196, + "learning_rate": 9.926769179238464e-07, + "loss": 0.0043, + "num_tokens": 2324446.0, + "reward": 0.81182861328125, + "reward_std": 0.03335077315568924, + "rewards//mean": 0.81182861328125, + "rewards//std": 0.03583217412233353, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0644, + "grad_norm": 1.4713000059127808, + "kl": 0.10790205933153629, + "learning_rate": 9.926227064531199e-07, + "loss": 0.0043, + "num_tokens": 2331638.0, + "reward": 0.77813720703125, + "reward_std": 0.02476838231086731, + "rewards//mean": 0.77813720703125, + "rewards//std": 0.03471029922366142, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.375, + "epoch": 0.0646, + "grad_norm": 1.50894033908844, + "kl": 0.09074968658387661, + "learning_rate": 9.925682965540263e-07, + "loss": -0.0257, + "num_tokens": 2338846.0, + "reward": 0.8179931640625, + "reward_std": 0.0222804956138134, + "rewards//mean": 0.8179931640625, + "rewards//std": 0.03238961845636368, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0648, + "grad_norm": 1.4154258966445923, + "kl": 0.10706997383385897, + "learning_rate": 9.925136882484815e-07, + "loss": 0.0043, + "num_tokens": 2346102.0, + "reward": 0.8109130859375, + "reward_std": 0.03054163046181202, + "rewards//mean": 0.8109130859375, + "rewards//std": 0.03191503882408142, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.065, + "grad_norm": 1.491241693496704, + "kl": 0.10446293279528618, + "learning_rate": 9.92458881558482e-07, + "loss": 0.0012, + "num_tokens": 2353420.0, + "reward": 0.784912109375, + "reward_std": 0.027655068784952164, + "rewards//mean": 0.784912109375, + "rewards//std": 0.03102344647049904, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.0652, + "grad_norm": 1.7677568197250366, + "kl": 0.11717665521427989, + "learning_rate": 9.92403876506104e-07, + "loss": 0.0026, + "num_tokens": 2360608.0, + "reward": 0.83837890625, + "reward_std": 0.029619915410876274, + "rewards//mean": 0.83837890625, + "rewards//std": 0.0316389799118042, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.0654, + "grad_norm": 1.4867119789123535, + "kl": 0.0966028617694974, + "learning_rate": 9.923486731135033e-07, + "loss": 0.0032, + "num_tokens": 2367913.0, + "reward": 0.8125, + "reward_std": 0.024412449449300766, + "rewards//mean": 0.8125, + "rewards//std": 0.03506312146782875, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.0656, + "grad_norm": 1.5532718896865845, + "kl": 0.09632278513163328, + "learning_rate": 9.922932714029163e-07, + "loss": -0.0064, + "num_tokens": 2375092.0, + "reward": 0.81109619140625, + "reward_std": 0.023545071482658386, + "rewards//mean": 0.81109619140625, + "rewards//std": 0.029654255136847496, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.0658, + "grad_norm": 1.6438449621200562, + "kl": 0.08380890637636185, + "learning_rate": 9.92237671396658e-07, + "loss": 0.0052, + "num_tokens": 2382359.0, + "reward": 0.8331298828125, + "reward_std": 0.028803512454032898, + "rewards//mean": 0.8331298828125, + "rewards//std": 0.03002036362886429, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.066, + "grad_norm": 1.52018141746521, + "kl": 0.1005646362900734, + "learning_rate": 9.921818731171248e-07, + "loss": 0.0033, + "num_tokens": 2389609.0, + "reward": 0.80096435546875, + "reward_std": 0.019802164286375046, + "rewards//mean": 0.80096435546875, + "rewards//std": 0.024164384230971336, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.0662, + "grad_norm": 1.7980751991271973, + "kl": 0.09556509740650654, + "learning_rate": 9.921258765867919e-07, + "loss": -0.0017, + "num_tokens": 2396981.0, + "reward": 0.791259765625, + "reward_std": 0.02803277224302292, + "rewards//mean": 0.791259765625, + "rewards//std": 0.03380315378308296, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.0664, + "grad_norm": 1.4869153499603271, + "kl": 0.09881448466330767, + "learning_rate": 9.920696818282147e-07, + "loss": -0.002, + "num_tokens": 2404200.0, + "reward": 0.78851318359375, + "reward_std": 0.022936001420021057, + "rewards//mean": 0.78851318359375, + "rewards//std": 0.026198072358965874, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.0666, + "grad_norm": 1.9274632930755615, + "kl": 0.1076492234133184, + "learning_rate": 9.920132888640284e-07, + "loss": 0.0097, + "num_tokens": 2411415.0, + "reward": 0.81561279296875, + "reward_std": 0.03066486306488514, + "rewards//mean": 0.81561279296875, + "rewards//std": 0.03724166378378868, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.421875, + "epoch": 0.0668, + "grad_norm": 1.5303622484207153, + "kl": 0.11499751172959805, + "learning_rate": 9.919566977169485e-07, + "loss": -0.0246, + "num_tokens": 2418754.0, + "reward": 0.796875, + "reward_std": 0.01947757601737976, + "rewards//mean": 0.796875, + "rewards//std": 0.03083648905158043, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.067, + "grad_norm": 1.4922113418579102, + "kl": 0.10089922323822975, + "learning_rate": 9.918999084097694e-07, + "loss": 0.0058, + "num_tokens": 2426072.0, + "reward": 0.832275390625, + "reward_std": 0.018433287739753723, + "rewards//mean": 0.832275390625, + "rewards//std": 0.020924687385559082, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0672, + "grad_norm": 1.5714677572250366, + "kl": 0.10787924844771624, + "learning_rate": 9.91842920965366e-07, + "loss": 0.0063, + "num_tokens": 2433348.0, + "reward": 0.7786865234375, + "reward_std": 0.021855615079402924, + "rewards//mean": 0.7786865234375, + "rewards//std": 0.02383934147655964, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.0674, + "grad_norm": 1.6750205755233765, + "kl": 0.10377390682697296, + "learning_rate": 9.91785735406693e-07, + "loss": -0.0149, + "num_tokens": 2440702.0, + "reward": 0.8236083984375, + "reward_std": 0.03331586718559265, + "rewards//mean": 0.8236083984375, + "rewards//std": 0.04105306789278984, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.359375, + "epoch": 0.0676, + "grad_norm": 1.6918433904647827, + "kl": 0.09170649386942387, + "learning_rate": 9.917283517567843e-07, + "loss": -0.0212, + "num_tokens": 2447957.0, + "reward": 0.77142333984375, + "reward_std": 0.028777562081813812, + "rewards//mean": 0.77142333984375, + "rewards//std": 0.032166559249162674, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.0678, + "grad_norm": 1.8690557479858398, + "kl": 0.1041225865483284, + "learning_rate": 9.916707700387545e-07, + "loss": -0.0166, + "num_tokens": 2455275.0, + "reward": 0.83172607421875, + "reward_std": 0.027708154171705246, + "rewards//mean": 0.83172607421875, + "rewards//std": 0.031698670238256454, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.546875, + "epoch": 0.068, + "grad_norm": 1.6053203344345093, + "kl": 0.10950558818876743, + "learning_rate": 9.916129902757974e-07, + "loss": -0.0192, + "num_tokens": 2462582.0, + "reward": 0.8289794921875, + "reward_std": 0.021280745044350624, + "rewards//mean": 0.8289794921875, + "rewards//std": 0.02313031256198883, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.0682, + "grad_norm": 1.4908751249313354, + "kl": 0.10723921749740839, + "learning_rate": 9.915550124911866e-07, + "loss": -0.0235, + "num_tokens": 2470028.0, + "reward": 0.8028564453125, + "reward_std": 0.024182353168725967, + "rewards//mean": 0.8028564453125, + "rewards//std": 0.027804618701338768, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0684, + "grad_norm": 1.4062821865081787, + "kl": 0.11000932473689318, + "learning_rate": 9.914968367082755e-07, + "loss": 0.0044, + "num_tokens": 2477276.0, + "reward": 0.7999267578125, + "reward_std": 0.018666604533791542, + "rewards//mean": 0.7999267578125, + "rewards//std": 0.0222768671810627, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.640625, + "epoch": 0.0686, + "grad_norm": 1.4653537273406982, + "kl": 0.1032357681542635, + "learning_rate": 9.914384629504973e-07, + "loss": -0.0006, + "num_tokens": 2484549.0, + "reward": 0.83160400390625, + "reward_std": 0.02952004224061966, + "rewards//mean": 0.83160400390625, + "rewards//std": 0.03254179283976555, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0688, + "grad_norm": 1.942368507385254, + "kl": 0.12489230558276176, + "learning_rate": 9.913798912413652e-07, + "loss": 0.005, + "num_tokens": 2491829.0, + "reward": 0.815185546875, + "reward_std": 0.020390164107084274, + "rewards//mean": 0.815185546875, + "rewards//std": 0.03426572307944298, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.069, + "grad_norm": 1.5378228425979614, + "kl": 0.11281054466962814, + "learning_rate": 9.913211216044713e-07, + "loss": 0.0045, + "num_tokens": 2499085.0, + "reward": 0.80816650390625, + "reward_std": 0.027130484580993652, + "rewards//mean": 0.80816650390625, + "rewards//std": 0.034323930740356445, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0692, + "grad_norm": 1.5276501178741455, + "kl": 0.10910513158887625, + "learning_rate": 9.912621540634886e-07, + "loss": 0.0044, + "num_tokens": 2506461.0, + "reward": 0.847900390625, + "reward_std": 0.026252053678035736, + "rewards//mean": 0.847900390625, + "rewards//std": 0.03099220246076584, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0694, + "grad_norm": 1.558321475982666, + "kl": 0.10140563361346722, + "learning_rate": 9.91202988642169e-07, + "loss": 0.0041, + "num_tokens": 2513677.0, + "reward": 0.77545166015625, + "reward_std": 0.020047467201948166, + "rewards//mean": 0.77545166015625, + "rewards//std": 0.023292293772101402, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.0696, + "grad_norm": 1.3729301691055298, + "kl": 0.11031910311430693, + "learning_rate": 9.911436253643443e-07, + "loss": 0.0251, + "num_tokens": 2520961.0, + "reward": 0.799072265625, + "reward_std": 0.026244722306728363, + "rewards//mean": 0.799072265625, + "rewards//std": 0.030780475586652756, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0698, + "grad_norm": 1.446530818939209, + "kl": 0.11420840676873922, + "learning_rate": 9.91084064253926e-07, + "loss": 0.0046, + "num_tokens": 2528273.0, + "reward": 0.81829833984375, + "reward_std": 0.0247903224080801, + "rewards//mean": 0.81829833984375, + "rewards//std": 0.029757192358374596, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.07, + "grad_norm": 1.5888477563858032, + "kl": 0.10328076919540763, + "learning_rate": 9.910243053349055e-07, + "loss": 0.0041, + "num_tokens": 2535537.0, + "reward": 0.82122802734375, + "reward_std": 0.03141201660037041, + "rewards//mean": 0.82122802734375, + "rewards//std": 0.03762060031294823, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0702, + "grad_norm": 1.632536768913269, + "kl": 0.12693768553435802, + "learning_rate": 9.909643486313533e-07, + "loss": 0.0051, + "num_tokens": 2542825.0, + "reward": 0.7967529296875, + "reward_std": 0.024580225348472595, + "rewards//mean": 0.7967529296875, + "rewards//std": 0.02913184091448784, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0704, + "grad_norm": 1.7252479791641235, + "kl": 0.11625802051275969, + "learning_rate": 9.909041941674204e-07, + "loss": 0.0047, + "num_tokens": 2550113.0, + "reward": 0.8323974609375, + "reward_std": 0.023920735344290733, + "rewards//mean": 0.8323974609375, + "rewards//std": 0.029196204617619514, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0706, + "grad_norm": 1.6687239408493042, + "kl": 0.11438181716948748, + "learning_rate": 9.908438419673366e-07, + "loss": 0.0049, + "num_tokens": 2557341.0, + "reward": 0.81072998046875, + "reward_std": 0.02121851034462452, + "rewards//mean": 0.81072998046875, + "rewards//std": 0.025257250294089317, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.0708, + "grad_norm": 1.6752699613571167, + "kl": 0.12126925121992826, + "learning_rate": 9.90783292055412e-07, + "loss": -0.0013, + "num_tokens": 2564646.0, + "reward": 0.8099365234375, + "reward_std": 0.021628160029649734, + "rewards//mean": 0.8099365234375, + "rewards//std": 0.025772595778107643, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.071, + "grad_norm": 1.6468839645385742, + "kl": 0.12016522977501154, + "learning_rate": 9.907225444560361e-07, + "loss": 0.0048, + "num_tokens": 2571926.0, + "reward": 0.80303955078125, + "reward_std": 0.01961602084338665, + "rewards//mean": 0.80303955078125, + "rewards//std": 0.023835133761167526, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0712, + "grad_norm": 1.6054528951644897, + "kl": 0.10932362172752619, + "learning_rate": 9.90661599193678e-07, + "loss": 0.0044, + "num_tokens": 2579302.0, + "reward": 0.8026123046875, + "reward_std": 0.01971992291510105, + "rewards//mean": 0.8026123046875, + "rewards//std": 0.025103727355599403, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.0714, + "grad_norm": 1.6159863471984863, + "kl": 0.11917565017938614, + "learning_rate": 9.906004562928863e-07, + "loss": -0.0056, + "num_tokens": 2586562.0, + "reward": 0.8031005859375, + "reward_std": 0.016146648675203323, + "rewards//mean": 0.8031005859375, + "rewards//std": 0.018515486270189285, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.0716, + "grad_norm": 1.750967025756836, + "kl": 0.10188998281955719, + "learning_rate": 9.905391157782897e-07, + "loss": -0.0089, + "num_tokens": 2593719.0, + "reward": 0.85308837890625, + "reward_std": 0.027826478704810143, + "rewards//mean": 0.85308837890625, + "rewards//std": 0.035812314599752426, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0718, + "grad_norm": 1.8491488695144653, + "kl": 0.109468680806458, + "learning_rate": 9.904775776745956e-07, + "loss": 0.0044, + "num_tokens": 2600959.0, + "reward": 0.82391357421875, + "reward_std": 0.035552043467760086, + "rewards//mean": 0.82391357421875, + "rewards//std": 0.04450711980462074, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.072, + "grad_norm": 1.5489528179168701, + "kl": 0.13034156896173954, + "learning_rate": 9.904158420065922e-07, + "loss": 0.0007, + "num_tokens": 2608306.0, + "reward": 0.8170166015625, + "reward_std": 0.025740882381796837, + "rewards//mean": 0.8170166015625, + "rewards//std": 0.029250076040625572, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0722, + "grad_norm": 1.931219458580017, + "kl": 0.11153018847107887, + "learning_rate": 9.903539087991461e-07, + "loss": 0.0045, + "num_tokens": 2615474.0, + "reward": 0.8043212890625, + "reward_std": 0.028132067993283272, + "rewards//mean": 0.8043212890625, + "rewards//std": 0.03476286679506302, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0724, + "grad_norm": 1.9316530227661133, + "kl": 0.10586426313966513, + "learning_rate": 9.902917780772042e-07, + "loss": 0.0042, + "num_tokens": 2622730.0, + "reward": 0.82818603515625, + "reward_std": 0.01929824985563755, + "rewards//mean": 0.82818603515625, + "rewards//std": 0.024592217057943344, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0726, + "grad_norm": 1.6335597038269043, + "kl": 0.10098724346607924, + "learning_rate": 9.902294498657929e-07, + "loss": 0.004, + "num_tokens": 2630138.0, + "reward": 0.83599853515625, + "reward_std": 0.02150493860244751, + "rewards//mean": 0.83599853515625, + "rewards//std": 0.030076975002884865, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0728, + "grad_norm": 1.6724114418029785, + "kl": 0.12477605044841766, + "learning_rate": 9.901669241900176e-07, + "loss": 0.005, + "num_tokens": 2637346.0, + "reward": 0.8204345703125, + "reward_std": 0.02794221043586731, + "rewards//mean": 0.8204345703125, + "rewards//std": 0.03317461907863617, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.073, + "grad_norm": 1.4103883504867554, + "kl": 0.12068549729883671, + "learning_rate": 9.90104201075064e-07, + "loss": 0.0038, + "num_tokens": 2644643.0, + "reward": 0.82623291015625, + "reward_std": 0.028366103768348694, + "rewards//mean": 0.82623291015625, + "rewards//std": 0.03499392420053482, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0732, + "grad_norm": 1.270281195640564, + "kl": 0.1026424691081047, + "learning_rate": 9.900412805461966e-07, + "loss": 0.0041, + "num_tokens": 2652051.0, + "reward": 0.80810546875, + "reward_std": 0.028189770877361298, + "rewards//mean": 0.80810546875, + "rewards//std": 0.0413193553686142, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.0734, + "grad_norm": 1.559218406677246, + "kl": 0.13275473564863205, + "learning_rate": 9.899781626287602e-07, + "loss": 0.0057, + "num_tokens": 2659489.0, + "reward": 0.83856201171875, + "reward_std": 0.027918478474020958, + "rewards//mean": 0.83856201171875, + "rewards//std": 0.03803876042366028, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0736, + "grad_norm": 1.5817978382110596, + "kl": 0.12298816721886396, + "learning_rate": 9.899148473481784e-07, + "loss": 0.0049, + "num_tokens": 2666761.0, + "reward": 0.84918212890625, + "reward_std": 0.025500498712062836, + "rewards//mean": 0.84918212890625, + "rewards//std": 0.02743321843445301, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0738, + "grad_norm": 1.3800699710845947, + "kl": 0.12957536429166794, + "learning_rate": 9.898513347299547e-07, + "loss": 0.0052, + "num_tokens": 2674025.0, + "reward": 0.8475341796875, + "reward_std": 0.024528346955776215, + "rewards//mean": 0.8475341796875, + "rewards//std": 0.03245871141552925, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.074, + "grad_norm": 1.7134215831756592, + "kl": 0.13566438760608435, + "learning_rate": 9.89787624799672e-07, + "loss": 0.0054, + "num_tokens": 2681305.0, + "reward": 0.85260009765625, + "reward_std": 0.03374122455716133, + "rewards//mean": 0.85260009765625, + "rewards//std": 0.04085805267095566, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.0742, + "grad_norm": 1.4326939582824707, + "kl": 0.12100122775882483, + "learning_rate": 9.897237175829926e-07, + "loss": -0.0013, + "num_tokens": 2688513.0, + "reward": 0.8360595703125, + "reward_std": 0.018374403938651085, + "rewards//mean": 0.8360595703125, + "rewards//std": 0.019337691366672516, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0744, + "grad_norm": 1.6045618057250977, + "kl": 0.14925550762563944, + "learning_rate": 9.896596131056582e-07, + "loss": 0.006, + "num_tokens": 2695793.0, + "reward": 0.82598876953125, + "reward_std": 0.023545075207948685, + "rewards//mean": 0.82598876953125, + "rewards//std": 0.02716093324124813, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.375, + "epoch": 0.0746, + "grad_norm": 1.413653016090393, + "kl": 0.11870748270303011, + "learning_rate": 9.895953113934903e-07, + "loss": 0.0043, + "num_tokens": 2703025.0, + "reward": 0.8291015625, + "reward_std": 0.022527407854795456, + "rewards//mean": 0.8291015625, + "rewards//std": 0.033407654613256454, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0748, + "grad_norm": 1.6010768413543701, + "kl": 0.1172671364620328, + "learning_rate": 9.895308124723896e-07, + "loss": 0.0047, + "num_tokens": 2710425.0, + "reward": 0.8167724609375, + "reward_std": 0.020026205107569695, + "rewards//mean": 0.8167724609375, + "rewards//std": 0.025293558835983276, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.075, + "grad_norm": 2.000542402267456, + "kl": 0.11080891638994217, + "learning_rate": 9.89466116368336e-07, + "loss": 0.0044, + "num_tokens": 2717681.0, + "reward": 0.8143310546875, + "reward_std": 0.022553879767656326, + "rewards//mean": 0.8143310546875, + "rewards//std": 0.027427494525909424, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0752, + "grad_norm": 1.41792893409729, + "kl": 0.12116466090083122, + "learning_rate": 9.894012231073895e-07, + "loss": 0.0048, + "num_tokens": 2724977.0, + "reward": 0.81695556640625, + "reward_std": 0.020680002868175507, + "rewards//mean": 0.81695556640625, + "rewards//std": 0.02927199937403202, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0754, + "grad_norm": 1.5321438312530518, + "kl": 0.13543279841542244, + "learning_rate": 9.893361327156884e-07, + "loss": 0.0054, + "num_tokens": 2732337.0, + "reward": 0.84210205078125, + "reward_std": 0.0322381854057312, + "rewards//mean": 0.84210205078125, + "rewards//std": 0.03323177993297577, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0756, + "grad_norm": 1.3832063674926758, + "kl": 0.13975889794528484, + "learning_rate": 9.89270845219452e-07, + "loss": 0.0056, + "num_tokens": 2739609.0, + "reward": 0.83843994140625, + "reward_std": 0.02708376571536064, + "rewards//mean": 0.83843994140625, + "rewards//std": 0.0317830815911293, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.625, + "epoch": 0.0758, + "grad_norm": 1.4303878545761108, + "kl": 0.14658327773213387, + "learning_rate": 9.892053606449774e-07, + "loss": -0.0047, + "num_tokens": 2746857.0, + "reward": 0.860107421875, + "reward_std": 0.03132881969213486, + "rewards//mean": 0.860107421875, + "rewards//std": 0.03769220411777496, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.076, + "grad_norm": 1.39239501953125, + "kl": 0.1400539679452777, + "learning_rate": 9.891396790186422e-07, + "loss": -0.005, + "num_tokens": 2754091.0, + "reward": 0.84320068359375, + "reward_std": 0.030959809198975563, + "rewards//mean": 0.84320068359375, + "rewards//std": 0.036840759217739105, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0762, + "grad_norm": 1.3299005031585693, + "kl": 0.1411418654024601, + "learning_rate": 9.890738003669027e-07, + "loss": 0.0056, + "num_tokens": 2761355.0, + "reward": 0.83489990234375, + "reward_std": 0.030630648136138916, + "rewards//mean": 0.83489990234375, + "rewards//std": 0.033884502947330475, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0764, + "grad_norm": 1.7539607286453247, + "kl": 0.11807714402675629, + "learning_rate": 9.89007724716295e-07, + "loss": 0.0047, + "num_tokens": 2768771.0, + "reward": 0.82379150390625, + "reward_std": 0.028771081939339638, + "rewards//mean": 0.82379150390625, + "rewards//std": 0.041064269840717316, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.671875, + "epoch": 0.0766, + "grad_norm": 1.4371676445007324, + "kl": 0.12034825794398785, + "learning_rate": 9.889414520934343e-07, + "loss": 0.003, + "num_tokens": 2776038.0, + "reward": 0.824951171875, + "reward_std": 0.033065736293792725, + "rewards//mean": 0.824951171875, + "rewards//std": 0.03928431123495102, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0768, + "grad_norm": 1.4283788204193115, + "kl": 0.12488754093647003, + "learning_rate": 9.88874982525015e-07, + "loss": 0.005, + "num_tokens": 2783318.0, + "reward": 0.83544921875, + "reward_std": 0.02027050405740738, + "rewards//mean": 0.83544921875, + "rewards//std": 0.025765838101506233, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.077, + "grad_norm": 1.5762522220611572, + "kl": 0.13217007834464312, + "learning_rate": 9.888083160378112e-07, + "loss": 0.0053, + "num_tokens": 2790574.0, + "reward": 0.810302734375, + "reward_std": 0.015394829213619232, + "rewards//mean": 0.810302734375, + "rewards//std": 0.020098110660910606, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0772, + "grad_norm": 1.5726687908172607, + "kl": 0.13252857513725758, + "learning_rate": 9.887414526586763e-07, + "loss": 0.0053, + "num_tokens": 2797838.0, + "reward": 0.80389404296875, + "reward_std": 0.02258962020277977, + "rewards//mean": 0.80389404296875, + "rewards//std": 0.025894111022353172, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.0774, + "grad_norm": 1.4931820631027222, + "kl": 0.14040267188102007, + "learning_rate": 9.886743924145426e-07, + "loss": 0.0047, + "num_tokens": 2805093.0, + "reward": 0.83941650390625, + "reward_std": 0.023262690752744675, + "rewards//mean": 0.83941650390625, + "rewards//std": 0.03910660371184349, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.0776, + "grad_norm": 1.5451669692993164, + "kl": 0.14815572183579206, + "learning_rate": 9.886071353324222e-07, + "loss": -0.0027, + "num_tokens": 2812322.0, + "reward": 0.844482421875, + "reward_std": 0.025567756965756416, + "rewards//mean": 0.844482421875, + "rewards//std": 0.030748983845114708, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0778, + "grad_norm": 1.9579883813858032, + "kl": 0.16076608281582594, + "learning_rate": 9.88539681439406e-07, + "loss": 0.0064, + "num_tokens": 2819602.0, + "reward": 0.814697265625, + "reward_std": 0.024172095581889153, + "rewards//mean": 0.814697265625, + "rewards//std": 0.04030077904462814, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.078, + "grad_norm": 1.7950809001922607, + "kl": 0.11780598666518927, + "learning_rate": 9.884720307626646e-07, + "loss": 0.0053, + "num_tokens": 2826888.0, + "reward": 0.80804443359375, + "reward_std": 0.029416294768452644, + "rewards//mean": 0.80804443359375, + "rewards//std": 0.037063609808683395, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.0782, + "grad_norm": 1.553209900856018, + "kl": 0.15158303827047348, + "learning_rate": 9.884041833294475e-07, + "loss": 0.0033, + "num_tokens": 2834145.0, + "reward": 0.8553466796875, + "reward_std": 0.03083866275846958, + "rewards//mean": 0.8553466796875, + "rewards//std": 0.03866344317793846, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0784, + "grad_norm": 1.464160442352295, + "kl": 0.12380940560251474, + "learning_rate": 9.883361391670839e-07, + "loss": 0.005, + "num_tokens": 2841385.0, + "reward": 0.74993896484375, + "reward_std": 0.02266588807106018, + "rewards//mean": 0.74993896484375, + "rewards//std": 0.02488592639565468, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0786, + "grad_norm": 1.2679316997528076, + "kl": 0.12635519076138735, + "learning_rate": 9.882678983029817e-07, + "loss": 0.0051, + "num_tokens": 2848737.0, + "reward": 0.81170654296875, + "reward_std": 0.02779785543680191, + "rewards//mean": 0.81170654296875, + "rewards//std": 0.032069940119981766, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.0788, + "grad_norm": 1.4692590236663818, + "kl": 0.14963931869715452, + "learning_rate": 9.881994607646286e-07, + "loss": -0.0058, + "num_tokens": 2855972.0, + "reward": 0.77032470703125, + "reward_std": 0.017697036266326904, + "rewards//mean": 0.77032470703125, + "rewards//std": 0.027596617117524147, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.079, + "grad_norm": 1.7903512716293335, + "kl": 0.15648710262030363, + "learning_rate": 9.881308265795911e-07, + "loss": 0.002, + "num_tokens": 2863226.0, + "reward": 0.84954833984375, + "reward_std": 0.027770301327109337, + "rewards//mean": 0.84954833984375, + "rewards//std": 0.031110117211937904, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.0792, + "grad_norm": 1.5731680393218994, + "kl": 0.1341902781277895, + "learning_rate": 9.88061995775515e-07, + "loss": 0.0062, + "num_tokens": 2870601.0, + "reward": 0.8194580078125, + "reward_std": 0.020800434052944183, + "rewards//mean": 0.8194580078125, + "rewards//std": 0.026661783456802368, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0794, + "grad_norm": 1.5587303638458252, + "kl": 0.1492895595729351, + "learning_rate": 9.879929683801253e-07, + "loss": 0.006, + "num_tokens": 2877809.0, + "reward": 0.86578369140625, + "reward_std": 0.022067338228225708, + "rewards//mean": 0.86578369140625, + "rewards//std": 0.031105251982808113, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.0796, + "grad_norm": 1.7429847717285156, + "kl": 0.16400672402232885, + "learning_rate": 9.879237444212264e-07, + "loss": -0.003, + "num_tokens": 2885118.0, + "reward": 0.80010986328125, + "reward_std": 0.021756384521722794, + "rewards//mean": 0.80010986328125, + "rewards//std": 0.026680726557970047, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0798, + "grad_norm": 1.6535727977752686, + "kl": 0.15143904089927673, + "learning_rate": 9.878543239267014e-07, + "loss": 0.0061, + "num_tokens": 2892398.0, + "reward": 0.82879638671875, + "reward_std": 0.026628583669662476, + "rewards//mean": 0.82879638671875, + "rewards//std": 0.031231041997671127, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.08, + "grad_norm": 1.6625901460647583, + "kl": 0.16487631760537624, + "learning_rate": 9.877847069245133e-07, + "loss": 0.0066, + "num_tokens": 2899758.0, + "reward": 0.827880859375, + "reward_std": 0.023994656279683113, + "rewards//mean": 0.827880859375, + "rewards//std": 0.03258461877703667, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.0802, + "grad_norm": 1.3307304382324219, + "kl": 0.1426503686234355, + "learning_rate": 9.877148934427035e-07, + "loss": -0.0031, + "num_tokens": 2907013.0, + "reward": 0.810791015625, + "reward_std": 0.0229007750749588, + "rewards//mean": 0.810791015625, + "rewards//std": 0.027267245575785637, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0804, + "grad_norm": 1.5634301900863647, + "kl": 0.12798271048814058, + "learning_rate": 9.876448835093929e-07, + "loss": 0.0051, + "num_tokens": 2914349.0, + "reward": 0.81829833984375, + "reward_std": 0.017851322889328003, + "rewards//mean": 0.81829833984375, + "rewards//std": 0.020728345960378647, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0806, + "grad_norm": 1.8128896951675415, + "kl": 0.1618978213518858, + "learning_rate": 9.875746771527815e-07, + "loss": 0.0065, + "num_tokens": 2921605.0, + "reward": 0.82177734375, + "reward_std": 0.023464296013116837, + "rewards//mean": 0.82177734375, + "rewards//std": 0.028801828622817993, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0808, + "grad_norm": 1.4858336448669434, + "kl": 0.15265102963894606, + "learning_rate": 9.875042744011486e-07, + "loss": 0.0061, + "num_tokens": 2928821.0, + "reward": 0.8165283203125, + "reward_std": 0.01914503611624241, + "rewards//mean": 0.8165283203125, + "rewards//std": 0.020111288875341415, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.081, + "grad_norm": 1.896262526512146, + "kl": 0.15816643089056015, + "learning_rate": 9.874336752828522e-07, + "loss": 0.0044, + "num_tokens": 2936035.0, + "reward": 0.836669921875, + "reward_std": 0.01797148585319519, + "rewards//mean": 0.836669921875, + "rewards//std": 0.021551910787820816, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0812, + "grad_norm": 1.444067120552063, + "kl": 0.15769504848867655, + "learning_rate": 9.873628798263295e-07, + "loss": 0.0063, + "num_tokens": 2943347.0, + "reward": 0.85601806640625, + "reward_std": 0.027190091088414192, + "rewards//mean": 0.85601806640625, + "rewards//std": 0.036063358187675476, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0814, + "grad_norm": 1.9418094158172607, + "kl": 0.18293398898094893, + "learning_rate": 9.872918880600973e-07, + "loss": 0.0073, + "num_tokens": 2950739.0, + "reward": 0.84246826171875, + "reward_std": 0.01840265840291977, + "rewards//mean": 0.84246826171875, + "rewards//std": 0.02303152345120907, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0816, + "grad_norm": 1.525444507598877, + "kl": 0.18828759528696537, + "learning_rate": 9.87220700012751e-07, + "loss": 0.0075, + "num_tokens": 2958043.0, + "reward": 0.84906005859375, + "reward_std": 0.032291099429130554, + "rewards//mean": 0.84906005859375, + "rewards//std": 0.039953671395778656, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.0818, + "grad_norm": 1.316410779953003, + "kl": 0.1625326070934534, + "learning_rate": 9.871493157129647e-07, + "loss": 0.009, + "num_tokens": 2965347.0, + "reward": 0.8155517578125, + "reward_std": 0.021146081387996674, + "rewards//mean": 0.8155517578125, + "rewards//std": 0.026579899713397026, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.082, + "grad_norm": 1.4193910360336304, + "kl": 0.1670632939785719, + "learning_rate": 9.870777351894926e-07, + "loss": 0.0069, + "num_tokens": 2972664.0, + "reward": 0.8433837890625, + "reward_std": 0.028243370354175568, + "rewards//mean": 0.8433837890625, + "rewards//std": 0.036276642233133316, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.0822, + "grad_norm": 1.8903616666793823, + "kl": 0.16123722307384014, + "learning_rate": 9.870059584711668e-07, + "loss": 0.0121, + "num_tokens": 2979961.0, + "reward": 0.806640625, + "reward_std": 0.01915012113749981, + "rewards//mean": 0.806640625, + "rewards//std": 0.02463657222688198, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0824, + "grad_norm": 1.5448776483535767, + "kl": 0.15667866822332144, + "learning_rate": 9.869339855868991e-07, + "loss": 0.0063, + "num_tokens": 2987209.0, + "reward": 0.8502197265625, + "reward_std": 0.03483223170042038, + "rewards//mean": 0.8502197265625, + "rewards//std": 0.040416866540908813, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0826, + "grad_norm": 1.904563546180725, + "kl": 0.13585046213120222, + "learning_rate": 9.868618165656804e-07, + "loss": 0.0054, + "num_tokens": 2994513.0, + "reward": 0.8214111328125, + "reward_std": 0.015888184309005737, + "rewards//mean": 0.8214111328125, + "rewards//std": 0.025859376415610313, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.0828, + "grad_norm": 2.2323663234710693, + "kl": 0.15275530610233545, + "learning_rate": 9.8678945143658e-07, + "loss": 0.0046, + "num_tokens": 3001708.0, + "reward": 0.79852294921875, + "reward_std": 0.01690751314163208, + "rewards//mean": 0.79852294921875, + "rewards//std": 0.01928546652197838, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.083, + "grad_norm": 1.521886944770813, + "kl": 0.15038963593542576, + "learning_rate": 9.86716890228747e-07, + "loss": 0.006, + "num_tokens": 3009020.0, + "reward": 0.8148193359375, + "reward_std": 0.025958247482776642, + "rewards//mean": 0.8148193359375, + "rewards//std": 0.029763104394078255, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0832, + "grad_norm": 1.9094656705856323, + "kl": 0.14829275477677584, + "learning_rate": 9.866441329714087e-07, + "loss": 0.0059, + "num_tokens": 3016308.0, + "reward": 0.8538818359375, + "reward_std": 0.028644222766160965, + "rewards//mean": 0.8538818359375, + "rewards//std": 0.031709473580121994, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0834, + "grad_norm": 1.334229588508606, + "kl": 0.1554647758603096, + "learning_rate": 9.86571179693872e-07, + "loss": 0.0062, + "num_tokens": 3023644.0, + "reward": 0.8338623046875, + "reward_std": 0.01815095916390419, + "rewards//mean": 0.8338623046875, + "rewards//std": 0.023601945489645004, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.0836, + "grad_norm": 1.4114062786102295, + "kl": 0.19823346845805645, + "learning_rate": 9.86498030425522e-07, + "loss": 0.001, + "num_tokens": 3030875.0, + "reward": 0.83258056640625, + "reward_std": 0.017367083579301834, + "rewards//mean": 0.83258056640625, + "rewards//std": 0.02575989067554474, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0838, + "grad_norm": 1.7300703525543213, + "kl": 0.19561454467475414, + "learning_rate": 9.864246851958237e-07, + "loss": 0.0078, + "num_tokens": 3038131.0, + "reward": 0.8267822265625, + "reward_std": 0.016315139830112457, + "rewards//mean": 0.8267822265625, + "rewards//std": 0.02747381664812565, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.084, + "grad_norm": 2.6342854499816895, + "kl": 0.2521906867623329, + "learning_rate": 9.863511440343205e-07, + "loss": 0.0101, + "num_tokens": 3045347.0, + "reward": 0.858154296875, + "reward_std": 0.02520241215825081, + "rewards//mean": 0.858154296875, + "rewards//std": 0.039204079657793045, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.0842, + "grad_norm": 1.9959427118301392, + "kl": 0.17915515787899494, + "learning_rate": 9.862774069706345e-07, + "loss": -0.0025, + "num_tokens": 3052608.0, + "reward": 0.84759521484375, + "reward_std": 0.026869237422943115, + "rewards//mean": 0.84759521484375, + "rewards//std": 0.0367395393550396, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0844, + "grad_norm": 1.5117335319519043, + "kl": 0.17964705545455217, + "learning_rate": 9.862034740344671e-07, + "loss": 0.0072, + "num_tokens": 3059912.0, + "reward": 0.834716796875, + "reward_std": 0.023416858166456223, + "rewards//mean": 0.834716796875, + "rewards//std": 0.03128775954246521, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0846, + "grad_norm": 1.4761617183685303, + "kl": 0.14798900298774242, + "learning_rate": 9.861293452555986e-07, + "loss": 0.0059, + "num_tokens": 3067104.0, + "reward": 0.84942626953125, + "reward_std": 0.024361981078982353, + "rewards//mean": 0.84942626953125, + "rewards//std": 0.030969176441431046, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0848, + "grad_norm": 1.442857265472412, + "kl": 0.1658561434596777, + "learning_rate": 9.86055020663888e-07, + "loss": 0.0066, + "num_tokens": 3074360.0, + "reward": 0.83251953125, + "reward_std": 0.019875170662999153, + "rewards//mean": 0.83251953125, + "rewards//std": 0.024827538058161736, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.085, + "grad_norm": 1.4769834280014038, + "kl": 0.17314176633954048, + "learning_rate": 9.859805002892731e-07, + "loss": -0.0254, + "num_tokens": 3081550.0, + "reward": 0.8370361328125, + "reward_std": 0.031008243560791016, + "rewards//mean": 0.8370361328125, + "rewards//std": 0.04023217037320137, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0852, + "grad_norm": 1.7869426012039185, + "kl": 0.2089535454288125, + "learning_rate": 9.859057841617708e-07, + "loss": 0.0084, + "num_tokens": 3088758.0, + "reward": 0.79345703125, + "reward_std": 0.019341979175806046, + "rewards//mean": 0.79345703125, + "rewards//std": 0.03354151174426079, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0854, + "grad_norm": 1.633345365524292, + "kl": 0.14360905438661575, + "learning_rate": 9.858308723114768e-07, + "loss": 0.0057, + "num_tokens": 3096070.0, + "reward": 0.837158203125, + "reward_std": 0.03350232541561127, + "rewards//mean": 0.837158203125, + "rewards//std": 0.04615054652094841, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0856, + "grad_norm": 1.7984955310821533, + "kl": 0.14450997952371836, + "learning_rate": 9.857557647685655e-07, + "loss": 0.0058, + "num_tokens": 3103430.0, + "reward": 0.82598876953125, + "reward_std": 0.028221094980835915, + "rewards//mean": 0.82598876953125, + "rewards//std": 0.030088549479842186, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0858, + "grad_norm": 1.6806559562683105, + "kl": 0.1661527007818222, + "learning_rate": 9.856804615632901e-07, + "loss": 0.0066, + "num_tokens": 3110838.0, + "reward": 0.838134765625, + "reward_std": 0.020333271473646164, + "rewards//mean": 0.838134765625, + "rewards//std": 0.025561751797795296, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.086, + "grad_norm": 1.7124407291412354, + "kl": 0.17584913596510887, + "learning_rate": 9.856049627259832e-07, + "loss": 0.007, + "num_tokens": 3118094.0, + "reward": 0.8203125, + "reward_std": 0.03421751409769058, + "rewards//mean": 0.8203125, + "rewards//std": 0.03580813854932785, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.0862, + "grad_norm": 1.4816746711730957, + "kl": 0.17367327213287354, + "learning_rate": 9.85529268287055e-07, + "loss": 0.0106, + "num_tokens": 3125335.0, + "reward": 0.779296875, + "reward_std": 0.01608245074748993, + "rewards//mean": 0.779296875, + "rewards//std": 0.03030567243695259, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.0864, + "grad_norm": 1.6110529899597168, + "kl": 0.17724544554948807, + "learning_rate": 9.854533782769959e-07, + "loss": 0.0095, + "num_tokens": 3132534.0, + "reward": 0.8416748046875, + "reward_std": 0.018835583701729774, + "rewards//mean": 0.8416748046875, + "rewards//std": 0.029675496742129326, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.0866, + "grad_norm": 1.9000838994979858, + "kl": 0.15654443204402924, + "learning_rate": 9.853772927263739e-07, + "loss": 0.0025, + "num_tokens": 3139887.0, + "reward": 0.8204345703125, + "reward_std": 0.02088984288275242, + "rewards//mean": 0.8204345703125, + "rewards//std": 0.03573344647884369, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0868, + "grad_norm": 1.5283061265945435, + "kl": 0.1548478864133358, + "learning_rate": 9.853010116658366e-07, + "loss": 0.0062, + "num_tokens": 3147247.0, + "reward": 0.8291015625, + "reward_std": 0.021576430648565292, + "rewards//mean": 0.8291015625, + "rewards//std": 0.032128944993019104, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.087, + "grad_norm": 1.8227264881134033, + "kl": 0.16510541178286076, + "learning_rate": 9.852245351261097e-07, + "loss": 0.0066, + "num_tokens": 3154471.0, + "reward": 0.8187255859375, + "reward_std": 0.028329282999038696, + "rewards//mean": 0.8187255859375, + "rewards//std": 0.035343270748853683, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.0872, + "grad_norm": 1.5320924520492554, + "kl": 0.1771046295762062, + "learning_rate": 9.851478631379982e-07, + "loss": 0.002, + "num_tokens": 3161677.0, + "reward": 0.85235595703125, + "reward_std": 0.01989646814763546, + "rewards//mean": 0.85235595703125, + "rewards//std": 0.02190089039504528, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.640625, + "epoch": 0.0874, + "grad_norm": 1.5170859098434448, + "kl": 0.17804256081581116, + "learning_rate": 9.850709957323854e-07, + "loss": -0.0026, + "num_tokens": 3168934.0, + "reward": 0.82708740234375, + "reward_std": 0.023379258811473846, + "rewards//mean": 0.82708740234375, + "rewards//std": 0.025144923478364944, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.0876, + "grad_norm": 1.6697574853897095, + "kl": 0.17269239481538534, + "learning_rate": 9.849939329402336e-07, + "loss": 0.0102, + "num_tokens": 3176152.0, + "reward": 0.8074951171875, + "reward_std": 0.020199580118060112, + "rewards//mean": 0.8074951171875, + "rewards//std": 0.026779619976878166, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0878, + "grad_norm": 1.269140362739563, + "kl": 0.17004015669226646, + "learning_rate": 9.849166747925834e-07, + "loss": 0.0068, + "num_tokens": 3183464.0, + "reward": 0.82940673828125, + "reward_std": 0.01639626733958721, + "rewards//mean": 0.82940673828125, + "rewards//std": 0.018335873261094093, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.53125, + "epoch": 0.088, + "grad_norm": 1.4404964447021484, + "kl": 0.14840035885572433, + "learning_rate": 9.848392213205547e-07, + "loss": 0.0036, + "num_tokens": 3190658.0, + "reward": 0.86041259765625, + "reward_std": 0.020950615406036377, + "rewards//mean": 0.86041259765625, + "rewards//std": 0.02862250804901123, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.515625, + "epoch": 0.0882, + "grad_norm": 1.5204646587371826, + "kl": 0.17300505004823208, + "learning_rate": 9.847615725553455e-07, + "loss": -0.0021, + "num_tokens": 3197923.0, + "reward": 0.8392333984375, + "reward_std": 0.016896039247512817, + "rewards//mean": 0.8392333984375, + "rewards//std": 0.02293841540813446, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.0884, + "grad_norm": 1.524003267288208, + "kl": 0.1748319212347269, + "learning_rate": 9.84683728528233e-07, + "loss": 0.0063, + "num_tokens": 3205229.0, + "reward": 0.86236572265625, + "reward_std": 0.013934498652815819, + "rewards//mean": 0.86236572265625, + "rewards//std": 0.029138272628188133, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.0886, + "grad_norm": 1.6142150163650513, + "kl": 0.1701322691515088, + "learning_rate": 9.846056892705727e-07, + "loss": 0.0073, + "num_tokens": 3212627.0, + "reward": 0.822509765625, + "reward_std": 0.023265544325113297, + "rewards//mean": 0.822509765625, + "rewards//std": 0.033078886568546295, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.0888, + "grad_norm": 1.8413188457489014, + "kl": 0.18479791469871998, + "learning_rate": 9.845274548137985e-07, + "loss": 0.0086, + "num_tokens": 3219843.0, + "reward": 0.83636474609375, + "reward_std": 0.02935871109366417, + "rewards//mean": 0.83636474609375, + "rewards//std": 0.030186494812369347, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.089, + "grad_norm": 1.5492944717407227, + "kl": 0.1669162530452013, + "learning_rate": 9.844490251894236e-07, + "loss": 0.0067, + "num_tokens": 3227147.0, + "reward": 0.8551025390625, + "reward_std": 0.02708328142762184, + "rewards//mean": 0.8551025390625, + "rewards//std": 0.031107990071177483, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0892, + "grad_norm": 1.8131632804870605, + "kl": 0.15791757684201002, + "learning_rate": 9.843704004290392e-07, + "loss": 0.0063, + "num_tokens": 3234531.0, + "reward": 0.83380126953125, + "reward_std": 0.025960084050893784, + "rewards//mean": 0.83380126953125, + "rewards//std": 0.03683459386229515, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0894, + "grad_norm": 1.4264763593673706, + "kl": 0.18809033371508121, + "learning_rate": 9.842915805643156e-07, + "loss": 0.0075, + "num_tokens": 3241811.0, + "reward": 0.8023681640625, + "reward_std": 0.016801923513412476, + "rewards//mean": 0.8023681640625, + "rewards//std": 0.023268641903996468, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0896, + "grad_norm": 2.0073764324188232, + "kl": 0.18780111242085695, + "learning_rate": 9.84212565627001e-07, + "loss": 0.0075, + "num_tokens": 3249139.0, + "reward": 0.83599853515625, + "reward_std": 0.022723905742168427, + "rewards//mean": 0.83599853515625, + "rewards//std": 0.029655275866389275, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0898, + "grad_norm": 1.3451988697052002, + "kl": 0.18107646703720093, + "learning_rate": 9.841333556489232e-07, + "loss": 0.0072, + "num_tokens": 3256515.0, + "reward": 0.783203125, + "reward_std": 0.015557118691504002, + "rewards//mean": 0.783203125, + "rewards//std": 0.017758049070835114, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.09, + "grad_norm": 1.8686095476150513, + "kl": 0.18474428541958332, + "learning_rate": 9.840539506619872e-07, + "loss": 0.0106, + "num_tokens": 3263749.0, + "reward": 0.8028564453125, + "reward_std": 0.02446833997964859, + "rewards//mean": 0.8028564453125, + "rewards//std": 0.032518353313207626, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0902, + "grad_norm": 1.3886102437973022, + "kl": 0.18988430220633745, + "learning_rate": 9.83974350698178e-07, + "loss": 0.0076, + "num_tokens": 3271101.0, + "reward": 0.84210205078125, + "reward_std": 0.01767934113740921, + "rewards//mean": 0.84210205078125, + "rewards//std": 0.03706973418593407, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.46875, + "epoch": 0.0904, + "grad_norm": 1.42201566696167, + "kl": 0.18129398673772812, + "learning_rate": 9.838945557895584e-07, + "loss": -0.0067, + "num_tokens": 3278435.0, + "reward": 0.83447265625, + "reward_std": 0.021803248673677444, + "rewards//mean": 0.83447265625, + "rewards//std": 0.0284974854439497, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.0906, + "grad_norm": 1.1378215551376343, + "kl": 0.1409740997478366, + "learning_rate": 9.838145659682692e-07, + "loss": 0.0058, + "num_tokens": 3285805.0, + "reward": 0.81988525390625, + "reward_std": 0.012863392010331154, + "rewards//mean": 0.81988525390625, + "rewards//std": 0.016294516623020172, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0908, + "grad_norm": 1.6490195989608765, + "kl": 0.17553207464516163, + "learning_rate": 9.83734381266531e-07, + "loss": 0.007, + "num_tokens": 3292965.0, + "reward": 0.832763671875, + "reward_std": 0.023432031273841858, + "rewards//mean": 0.832763671875, + "rewards//std": 0.029666567221283913, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.091, + "grad_norm": 1.7721961736679077, + "kl": 0.19630361907184124, + "learning_rate": 9.836540017166419e-07, + "loss": 0.0079, + "num_tokens": 3300293.0, + "reward": 0.822021484375, + "reward_std": 0.01977444998919964, + "rewards//mean": 0.822021484375, + "rewards//std": 0.022161388769745827, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.0912, + "grad_norm": 1.5251160860061646, + "kl": 0.21747586503624916, + "learning_rate": 9.835734273509785e-07, + "loss": 0.0126, + "num_tokens": 3307534.0, + "reward": 0.82989501953125, + "reward_std": 0.02460447885096073, + "rewards//mean": 0.82989501953125, + "rewards//std": 0.03824751079082489, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0914, + "grad_norm": 1.6759788990020752, + "kl": 0.18464125599712133, + "learning_rate": 9.834926582019966e-07, + "loss": 0.0074, + "num_tokens": 3314854.0, + "reward": 0.8233642578125, + "reward_std": 0.02581986039876938, + "rewards//mean": 0.8233642578125, + "rewards//std": 0.02910272777080536, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0916, + "grad_norm": 1.7316563129425049, + "kl": 0.18390465341508389, + "learning_rate": 9.834116943022297e-07, + "loss": 0.0074, + "num_tokens": 3322182.0, + "reward": 0.8262939453125, + "reward_std": 0.02447674795985222, + "rewards//mean": 0.8262939453125, + "rewards//std": 0.03180290386080742, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0918, + "grad_norm": 1.612594485282898, + "kl": 0.2041511945426464, + "learning_rate": 9.8333053568429e-07, + "loss": 0.0082, + "num_tokens": 3329446.0, + "reward": 0.87054443359375, + "reward_std": 0.023778308182954788, + "rewards//mean": 0.87054443359375, + "rewards//std": 0.029764313250780106, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.515625, + "epoch": 0.092, + "grad_norm": 1.6016572713851929, + "kl": 0.1724289208650589, + "learning_rate": 9.832491823808686e-07, + "loss": 0.002, + "num_tokens": 3336815.0, + "reward": 0.8592529296875, + "reward_std": 0.02584010735154152, + "rewards//mean": 0.8592529296875, + "rewards//std": 0.030591784045100212, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.0922, + "grad_norm": 1.3772622346878052, + "kl": 0.1911499137058854, + "learning_rate": 9.831676344247342e-07, + "loss": 0.007, + "num_tokens": 3344099.0, + "reward": 0.824462890625, + "reward_std": 0.013022979721426964, + "rewards//mean": 0.824462890625, + "rewards//std": 0.01894424296915531, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.0924, + "grad_norm": 1.3538057804107666, + "kl": 0.16963309608399868, + "learning_rate": 9.830858918487346e-07, + "loss": 0.0092, + "num_tokens": 3351546.0, + "reward": 0.81390380859375, + "reward_std": 0.020388346165418625, + "rewards//mean": 0.81390380859375, + "rewards//std": 0.02539532072842121, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0926, + "grad_norm": 1.7561582326889038, + "kl": 0.1836804710328579, + "learning_rate": 9.830039546857952e-07, + "loss": 0.0073, + "num_tokens": 3358834.0, + "reward": 0.84869384765625, + "reward_std": 0.02540658973157406, + "rewards//mean": 0.84869384765625, + "rewards//std": 0.03624674305319786, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0928, + "grad_norm": 1.4902132749557495, + "kl": 0.18900141678750515, + "learning_rate": 9.829218229689209e-07, + "loss": 0.0076, + "num_tokens": 3366058.0, + "reward": 0.85137939453125, + "reward_std": 0.018896307796239853, + "rewards//mean": 0.85137939453125, + "rewards//std": 0.02656131237745285, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.093, + "grad_norm": 1.3676836490631104, + "kl": 0.18453174456954002, + "learning_rate": 9.828394967311938e-07, + "loss": 0.0074, + "num_tokens": 3373378.0, + "reward": 0.834716796875, + "reward_std": 0.02113497629761696, + "rewards//mean": 0.834716796875, + "rewards//std": 0.040138185024261475, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.0932, + "grad_norm": 1.5636066198349, + "kl": 0.24821490980684757, + "learning_rate": 9.827569760057754e-07, + "loss": 0.0041, + "num_tokens": 3380651.0, + "reward": 0.81805419921875, + "reward_std": 0.017224829643964767, + "rewards//mean": 0.81805419921875, + "rewards//std": 0.023049920797348022, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0934, + "grad_norm": 1.952860713005066, + "kl": 0.23367472551763058, + "learning_rate": 9.826742608259047e-07, + "loss": 0.0093, + "num_tokens": 3387979.0, + "reward": 0.84783935546875, + "reward_std": 0.018350636586546898, + "rewards//mean": 0.84783935546875, + "rewards//std": 0.020415615290403366, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.0936, + "grad_norm": 1.6941291093826294, + "kl": 0.24167053773999214, + "learning_rate": 9.825913512248995e-07, + "loss": 0.0021, + "num_tokens": 3395242.0, + "reward": 0.78765869140625, + "reward_std": 0.020818307995796204, + "rewards//mean": 0.78765869140625, + "rewards//std": 0.031614989042282104, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.0938, + "grad_norm": 1.4979290962219238, + "kl": 0.20042267069220543, + "learning_rate": 9.825082472361556e-07, + "loss": 0.0075, + "num_tokens": 3402572.0, + "reward": 0.84454345703125, + "reward_std": 0.020520757883787155, + "rewards//mean": 0.84454345703125, + "rewards//std": 0.029390186071395874, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.094, + "grad_norm": 1.3087490797042847, + "kl": 0.18437449354678392, + "learning_rate": 9.824249488931475e-07, + "loss": 0.0074, + "num_tokens": 3409852.0, + "reward": 0.883056640625, + "reward_std": 0.015540940687060356, + "rewards//mean": 0.883056640625, + "rewards//std": 0.022432951256632805, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0942, + "grad_norm": 1.3886988162994385, + "kl": 0.17169822938740253, + "learning_rate": 9.82341456229428e-07, + "loss": 0.0069, + "num_tokens": 3417132.0, + "reward": 0.8389892578125, + "reward_std": 0.022249974310398102, + "rewards//mean": 0.8389892578125, + "rewards//std": 0.028446180745959282, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.0944, + "grad_norm": 1.4124736785888672, + "kl": 0.20674669742584229, + "learning_rate": 9.822577692786272e-07, + "loss": 0.0106, + "num_tokens": 3424442.0, + "reward": 0.8475341796875, + "reward_std": 0.019612163305282593, + "rewards//mean": 0.8475341796875, + "rewards//std": 0.024675559252500534, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0946, + "grad_norm": 1.7601836919784546, + "kl": 0.20727793872356415, + "learning_rate": 9.821738880744547e-07, + "loss": 0.0083, + "num_tokens": 3431666.0, + "reward": 0.84716796875, + "reward_std": 0.02429209090769291, + "rewards//mean": 0.84716796875, + "rewards//std": 0.03289992734789848, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.0948, + "grad_norm": 1.4998639822006226, + "kl": 0.21500564366579056, + "learning_rate": 9.820898126506979e-07, + "loss": 0.0183, + "num_tokens": 3438873.0, + "reward": 0.7669677734375, + "reward_std": 0.0176587775349617, + "rewards//mean": 0.7669677734375, + "rewards//std": 0.02440165914595127, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.359375, + "epoch": 0.095, + "grad_norm": 1.4507863521575928, + "kl": 0.21139503829181194, + "learning_rate": 9.820055430412219e-07, + "loss": 0.0076, + "num_tokens": 3446096.0, + "reward": 0.85845947265625, + "reward_std": 0.02054494433104992, + "rewards//mean": 0.85845947265625, + "rewards//std": 0.023722456768155098, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.140625, + "epoch": 0.0952, + "grad_norm": 1.463124394416809, + "kl": 0.21864628233015537, + "learning_rate": 9.81921079279971e-07, + "loss": -0.0362, + "num_tokens": 3453401.0, + "reward": 0.80291748046875, + "reward_std": 0.023809876292943954, + "rewards//mean": 0.80291748046875, + "rewards//std": 0.030225085094571114, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0954, + "grad_norm": 1.437126874923706, + "kl": 0.19849344715476036, + "learning_rate": 9.81836421400967e-07, + "loss": 0.0079, + "num_tokens": 3460721.0, + "reward": 0.8096923828125, + "reward_std": 0.013462377712130547, + "rewards//mean": 0.8096923828125, + "rewards//std": 0.018746238201856613, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.0956, + "grad_norm": 1.7841358184814453, + "kl": 0.20555554516613483, + "learning_rate": 9.817515694383102e-07, + "loss": 0.0075, + "num_tokens": 3467983.0, + "reward": 0.84613037109375, + "reward_std": 0.02328665368258953, + "rewards//mean": 0.84613037109375, + "rewards//std": 0.03183828666806221, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.0958, + "grad_norm": 1.5066807270050049, + "kl": 0.19328912161290646, + "learning_rate": 9.816665234261786e-07, + "loss": 0.0067, + "num_tokens": 3475217.0, + "reward": 0.78515625, + "reward_std": 0.016146095469594002, + "rewards//mean": 0.78515625, + "rewards//std": 0.023385772481560707, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.096, + "grad_norm": 1.857603907585144, + "kl": 0.2151609156280756, + "learning_rate": 9.81581283398829e-07, + "loss": 0.0086, + "num_tokens": 3482385.0, + "reward": 0.85943603515625, + "reward_std": 0.02774648368358612, + "rewards//mean": 0.85943603515625, + "rewards//std": 0.039435792714357376, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.0962, + "grad_norm": 1.4472804069519043, + "kl": 0.1841878928244114, + "learning_rate": 9.814958493905962e-07, + "loss": -0.0086, + "num_tokens": 3489633.0, + "reward": 0.847900390625, + "reward_std": 0.017229320481419563, + "rewards//mean": 0.847900390625, + "rewards//std": 0.022270411252975464, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.0964, + "grad_norm": 1.3863857984542847, + "kl": 0.21273548901081085, + "learning_rate": 9.814102214358926e-07, + "loss": -0.0021, + "num_tokens": 3496892.0, + "reward": 0.83355712890625, + "reward_std": 0.022536665201187134, + "rewards//mean": 0.83355712890625, + "rewards//std": 0.03516394644975662, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.0966, + "grad_norm": 1.6862802505493164, + "kl": 0.19829517975449562, + "learning_rate": 9.813243995692097e-07, + "loss": 0.0048, + "num_tokens": 3504123.0, + "reward": 0.8621826171875, + "reward_std": 0.020389292389154434, + "rewards//mean": 0.8621826171875, + "rewards//std": 0.038434114307165146, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.0968, + "grad_norm": 1.3225797414779663, + "kl": 0.1994712334126234, + "learning_rate": 9.81238383825116e-07, + "loss": 0.0073, + "num_tokens": 3511337.0, + "reward": 0.84991455078125, + "reward_std": 0.01709773764014244, + "rewards//mean": 0.84991455078125, + "rewards//std": 0.021618446335196495, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.097, + "grad_norm": 1.4770441055297852, + "kl": 0.15754287876188755, + "learning_rate": 9.81152174238259e-07, + "loss": 0.0063, + "num_tokens": 3518657.0, + "reward": 0.8564453125, + "reward_std": 0.02225455455482006, + "rewards//mean": 0.8564453125, + "rewards//std": 0.0294627845287323, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.640625, + "epoch": 0.0972, + "grad_norm": 1.5787129402160645, + "kl": 0.22845343220978975, + "learning_rate": 9.810657708433635e-07, + "loss": -0.0115, + "num_tokens": 3525890.0, + "reward": 0.8292236328125, + "reward_std": 0.016015663743019104, + "rewards//mean": 0.8292236328125, + "rewards//std": 0.024885697290301323, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.0974, + "grad_norm": 1.6998800039291382, + "kl": 0.19748000986874104, + "learning_rate": 9.809791736752332e-07, + "loss": 0.0045, + "num_tokens": 3533141.0, + "reward": 0.79815673828125, + "reward_std": 0.01553129218518734, + "rewards//mean": 0.79815673828125, + "rewards//std": 0.019870012998580933, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.0976, + "grad_norm": 1.2416727542877197, + "kl": 0.2010064832866192, + "learning_rate": 9.808923827687492e-07, + "loss": 0.0067, + "num_tokens": 3540353.0, + "reward": 0.83953857421875, + "reward_std": 0.014634535647928715, + "rewards//mean": 0.83953857421875, + "rewards//std": 0.028110742568969727, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0978, + "grad_norm": 1.3425137996673584, + "kl": 0.20144413225352764, + "learning_rate": 9.80805398158871e-07, + "loss": 0.0081, + "num_tokens": 3547617.0, + "reward": 0.8194580078125, + "reward_std": 0.016890309751033783, + "rewards//mean": 0.8194580078125, + "rewards//std": 0.028060413897037506, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.098, + "grad_norm": 1.4901678562164307, + "kl": 0.18031947314739227, + "learning_rate": 9.80718219880636e-07, + "loss": 0.0072, + "num_tokens": 3554929.0, + "reward": 0.81402587890625, + "reward_std": 0.01487969420850277, + "rewards//mean": 0.81402587890625, + "rewards//std": 0.017366621643304825, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0982, + "grad_norm": 1.4823634624481201, + "kl": 0.1984155997633934, + "learning_rate": 9.806308479691594e-07, + "loss": 0.0079, + "num_tokens": 3562233.0, + "reward": 0.84271240234375, + "reward_std": 0.027244187891483307, + "rewards//mean": 0.84271240234375, + "rewards//std": 0.03586384654045105, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0984, + "grad_norm": 1.739272952079773, + "kl": 0.23042607866227627, + "learning_rate": 9.805432824596347e-07, + "loss": 0.0092, + "num_tokens": 3569433.0, + "reward": 0.85833740234375, + "reward_std": 0.025159426033496857, + "rewards//mean": 0.85833740234375, + "rewards//std": 0.028782835230231285, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0986, + "grad_norm": 1.3682690858840942, + "kl": 0.17667777836322784, + "learning_rate": 9.804555233873332e-07, + "loss": 0.0071, + "num_tokens": 3576689.0, + "reward": 0.82183837890625, + "reward_std": 0.014823075383901596, + "rewards//mean": 0.82183837890625, + "rewards//std": 0.025042371824383736, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0988, + "grad_norm": 1.412514567375183, + "kl": 0.19041667506098747, + "learning_rate": 9.803675707876048e-07, + "loss": 0.0076, + "num_tokens": 3583881.0, + "reward": 0.76177978515625, + "reward_std": 0.015923084691166878, + "rewards//mean": 0.76177978515625, + "rewards//std": 0.03100483864545822, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.099, + "grad_norm": 1.3923618793487549, + "kl": 0.19047866389155388, + "learning_rate": 9.80279424695876e-07, + "loss": 0.0132, + "num_tokens": 3591209.0, + "reward": 0.8466796875, + "reward_std": 0.024917714297771454, + "rewards//mean": 0.8466796875, + "rewards//std": 0.03626516833901405, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0992, + "grad_norm": 1.3233954906463623, + "kl": 0.2003224492073059, + "learning_rate": 9.801910851476524e-07, + "loss": 0.008, + "num_tokens": 3598481.0, + "reward": 0.86517333984375, + "reward_std": 0.022448252886533737, + "rewards//mean": 0.86517333984375, + "rewards//std": 0.029322635382413864, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.0994, + "grad_norm": 1.8993380069732666, + "kl": 0.18796506337821484, + "learning_rate": 9.80102552178517e-07, + "loss": 0.0075, + "num_tokens": 3605753.0, + "reward": 0.865478515625, + "reward_std": 0.022710110992193222, + "rewards//mean": 0.865478515625, + "rewards//std": 0.028644777834415436, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.0996, + "grad_norm": 1.412291169166565, + "kl": 0.2034222763031721, + "learning_rate": 9.800138258241309e-07, + "loss": -0.0036, + "num_tokens": 3613058.0, + "reward": 0.82293701171875, + "reward_std": 0.019907496869564056, + "rewards//mean": 0.82293701171875, + "rewards//std": 0.032741669565439224, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.0998, + "grad_norm": 1.6478419303894043, + "kl": 0.2054336741566658, + "learning_rate": 9.799249061202334e-07, + "loss": 0.0077, + "num_tokens": 3620321.0, + "reward": 0.83642578125, + "reward_std": 0.021415123715996742, + "rewards//mean": 0.83642578125, + "rewards//std": 0.029310306534171104, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1, + "grad_norm": 1.5549136400222778, + "kl": 0.17198997922241688, + "learning_rate": 9.798357931026412e-07, + "loss": 0.0069, + "num_tokens": 3627617.0, + "reward": 0.8480224609375, + "reward_std": 0.018366262316703796, + "rewards//mean": 0.8480224609375, + "rewards//std": 0.027069726958870888, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1002, + "grad_norm": 1.4385416507720947, + "kl": 0.23618987202644348, + "learning_rate": 9.797464868072486e-07, + "loss": 0.0094, + "num_tokens": 3634953.0, + "reward": 0.781982421875, + "reward_std": 0.02006053365767002, + "rewards//mean": 0.781982421875, + "rewards//std": 0.02338447794318199, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1004, + "grad_norm": 1.7219282388687134, + "kl": 0.21823828108608723, + "learning_rate": 9.796569872700287e-07, + "loss": 0.0087, + "num_tokens": 3642193.0, + "reward": 0.82672119140625, + "reward_std": 0.01792309805750847, + "rewards//mean": 0.82672119140625, + "rewards//std": 0.02505929209291935, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.1006, + "grad_norm": 1.7087880373001099, + "kl": 0.2020204607397318, + "learning_rate": 9.795672945270316e-07, + "loss": 0.0062, + "num_tokens": 3649558.0, + "reward": 0.85076904296875, + "reward_std": 0.025650355964899063, + "rewards//mean": 0.85076904296875, + "rewards//std": 0.03362574428319931, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.1008, + "grad_norm": 2.221937894821167, + "kl": 0.22387738339602947, + "learning_rate": 9.794774086143857e-07, + "loss": 0.0041, + "num_tokens": 3656884.0, + "reward": 0.8050537109375, + "reward_std": 0.017229700461030006, + "rewards//mean": 0.8050537109375, + "rewards//std": 0.021617483347654343, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.101, + "grad_norm": 1.2897640466690063, + "kl": 0.20451311022043228, + "learning_rate": 9.79387329568297e-07, + "loss": 0.0082, + "num_tokens": 3664164.0, + "reward": 0.8558349609375, + "reward_std": 0.016882937401533127, + "rewards//mean": 0.8558349609375, + "rewards//std": 0.018801068887114525, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.1012, + "grad_norm": 1.364809274673462, + "kl": 0.21001940313726664, + "learning_rate": 9.792970574250493e-07, + "loss": 0.0087, + "num_tokens": 3671391.0, + "reward": 0.85797119140625, + "reward_std": 0.016120195388793945, + "rewards//mean": 0.85797119140625, + "rewards//std": 0.024353455752134323, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1014, + "grad_norm": 1.4941961765289307, + "kl": 0.23177559114992619, + "learning_rate": 9.79206592221004e-07, + "loss": 0.0093, + "num_tokens": 3678719.0, + "reward": 0.83367919921875, + "reward_std": 0.019097069278359413, + "rewards//mean": 0.83367919921875, + "rewards//std": 0.025370867922902107, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1016, + "grad_norm": 1.5038299560546875, + "kl": 0.19437365792691708, + "learning_rate": 9.791159339926008e-07, + "loss": 0.0078, + "num_tokens": 3686087.0, + "reward": 0.779541015625, + "reward_std": 0.013362476602196693, + "rewards//mean": 0.779541015625, + "rewards//std": 0.016261154785752296, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1018, + "grad_norm": 1.3199973106384277, + "kl": 0.20985210686922073, + "learning_rate": 9.790250827763565e-07, + "loss": 0.0084, + "num_tokens": 3693359.0, + "reward": 0.88775634765625, + "reward_std": 0.020297124981880188, + "rewards//mean": 0.88775634765625, + "rewards//std": 0.025422723963856697, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.102, + "grad_norm": 1.7803027629852295, + "kl": 0.18566251173615456, + "learning_rate": 9.789340386088662e-07, + "loss": 0.0074, + "num_tokens": 3700575.0, + "reward": 0.85003662109375, + "reward_std": 0.019822798669338226, + "rewards//mean": 0.85003662109375, + "rewards//std": 0.03140167519450188, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1022, + "grad_norm": 1.5313018560409546, + "kl": 0.24413364008069038, + "learning_rate": 9.788428015268026e-07, + "loss": 0.0098, + "num_tokens": 3707895.0, + "reward": 0.78350830078125, + "reward_std": 0.012271279469132423, + "rewards//mean": 0.78350830078125, + "rewards//std": 0.02050292305648327, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1024, + "grad_norm": 1.6668771505355835, + "kl": 0.23631718568503857, + "learning_rate": 9.787513715669157e-07, + "loss": 0.0095, + "num_tokens": 3715255.0, + "reward": 0.78411865234375, + "reward_std": 0.01061212457716465, + "rewards//mean": 0.78411865234375, + "rewards//std": 0.020649325102567673, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1026, + "grad_norm": 1.4809651374816895, + "kl": 0.20683142356574535, + "learning_rate": 9.786597487660335e-07, + "loss": 0.0083, + "num_tokens": 3722495.0, + "reward": 0.81207275390625, + "reward_std": 0.016664912924170494, + "rewards//mean": 0.81207275390625, + "rewards//std": 0.031056547537446022, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.1028, + "grad_norm": 1.3975049257278442, + "kl": 0.20571527164429426, + "learning_rate": 9.78567933161062e-07, + "loss": 0.0064, + "num_tokens": 3729790.0, + "reward": 0.8411865234375, + "reward_std": 0.017523575574159622, + "rewards//mean": 0.8411865234375, + "rewards//std": 0.02468782663345337, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.103, + "grad_norm": 1.4212419986724854, + "kl": 0.16583594493567944, + "learning_rate": 9.78475924788984e-07, + "loss": 0.0066, + "num_tokens": 3737094.0, + "reward": 0.8177490234375, + "reward_std": 0.020504428073763847, + "rewards//mean": 0.8177490234375, + "rewards//std": 0.025412973016500473, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.1032, + "grad_norm": 1.6780552864074707, + "kl": 0.22582092881202698, + "learning_rate": 9.783837236868609e-07, + "loss": 0.0095, + "num_tokens": 3744316.0, + "reward": 0.81048583984375, + "reward_std": 0.013888959772884846, + "rewards//mean": 0.81048583984375, + "rewards//std": 0.017413625493645668, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.1034, + "grad_norm": 1.5219112634658813, + "kl": 0.20529437996447086, + "learning_rate": 9.782913298918308e-07, + "loss": 0.0095, + "num_tokens": 3751577.0, + "reward": 0.82568359375, + "reward_std": 0.021003328263759613, + "rewards//mean": 0.82568359375, + "rewards//std": 0.03176122531294823, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1036, + "grad_norm": 1.5033385753631592, + "kl": 0.20343627966940403, + "learning_rate": 9.781987434411106e-07, + "loss": 0.0081, + "num_tokens": 3758905.0, + "reward": 0.80133056640625, + "reward_std": 0.01603994145989418, + "rewards//mean": 0.80133056640625, + "rewards//std": 0.02639150433242321, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1038, + "grad_norm": 1.3465487957000732, + "kl": 0.20445391442626715, + "learning_rate": 9.781059643719936e-07, + "loss": 0.0082, + "num_tokens": 3766177.0, + "reward": 0.815673828125, + "reward_std": 0.020442292094230652, + "rewards//mean": 0.815673828125, + "rewards//std": 0.028175892308354378, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.104, + "grad_norm": 1.2565443515777588, + "kl": 0.2075634878128767, + "learning_rate": 9.780129927218511e-07, + "loss": 0.0083, + "num_tokens": 3773361.0, + "reward": 0.853759765625, + "reward_std": 0.02150179073214531, + "rewards//mean": 0.853759765625, + "rewards//std": 0.03502078726887703, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1042, + "grad_norm": 1.5656163692474365, + "kl": 0.23233303800225258, + "learning_rate": 9.779198285281326e-07, + "loss": 0.0093, + "num_tokens": 3780673.0, + "reward": 0.8231201171875, + "reward_std": 0.024818100035190582, + "rewards//mean": 0.8231201171875, + "rewards//std": 0.03267068415880203, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1044, + "grad_norm": 1.694799780845642, + "kl": 0.2141793742775917, + "learning_rate": 9.77826471828364e-07, + "loss": 0.0086, + "num_tokens": 3787937.0, + "reward": 0.8773193359375, + "reward_std": 0.028373070061206818, + "rewards//mean": 0.8773193359375, + "rewards//std": 0.03792821615934372, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.1046, + "grad_norm": 1.6323399543762207, + "kl": 0.2011316567659378, + "learning_rate": 9.777329226601501e-07, + "loss": 0.0082, + "num_tokens": 3795142.0, + "reward": 0.85009765625, + "reward_std": 0.02034963108599186, + "rewards//mean": 0.85009765625, + "rewards//std": 0.029850710183382034, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.1048, + "grad_norm": 1.3905750513076782, + "kl": 0.2209323551505804, + "learning_rate": 9.776391810611718e-07, + "loss": 0.0138, + "num_tokens": 3802355.0, + "reward": 0.7762451171875, + "reward_std": 0.015401996672153473, + "rewards//mean": 0.7762451171875, + "rewards//std": 0.0213099867105484, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.105, + "grad_norm": 1.5554221868515015, + "kl": 0.20317039266228676, + "learning_rate": 9.775452470691885e-07, + "loss": 0.0081, + "num_tokens": 3809603.0, + "reward": 0.7979736328125, + "reward_std": 0.01393861137330532, + "rewards//mean": 0.7979736328125, + "rewards//std": 0.02059325948357582, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.421875, + "epoch": 0.1052, + "grad_norm": 1.4136279821395874, + "kl": 0.23443794064223766, + "learning_rate": 9.774511207220368e-07, + "loss": 0.0039, + "num_tokens": 3816806.0, + "reward": 0.85528564453125, + "reward_std": 0.02261098474264145, + "rewards//mean": 0.85528564453125, + "rewards//std": 0.03417055308818817, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.625, + "epoch": 0.1054, + "grad_norm": 1.6402145624160767, + "kl": 0.2381672617048025, + "learning_rate": 9.77356802057631e-07, + "loss": -0.0098, + "num_tokens": 3824126.0, + "reward": 0.807861328125, + "reward_std": 0.018565889447927475, + "rewards//mean": 0.807861328125, + "rewards//std": 0.023559898138046265, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.1056, + "grad_norm": 1.5494813919067383, + "kl": 0.22814045008271933, + "learning_rate": 9.77262291113962e-07, + "loss": 0.0103, + "num_tokens": 3831412.0, + "reward": 0.850341796875, + "reward_std": 0.020727887749671936, + "rewards//mean": 0.850341796875, + "rewards//std": 0.023270267993211746, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1058, + "grad_norm": 1.6450270414352417, + "kl": 0.23497662506997585, + "learning_rate": 9.771675879290996e-07, + "loss": 0.0094, + "num_tokens": 3838748.0, + "reward": 0.830810546875, + "reward_std": 0.0223688967525959, + "rewards//mean": 0.830810546875, + "rewards//std": 0.03166480362415314, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.106, + "grad_norm": 1.6168323755264282, + "kl": 0.2142560686916113, + "learning_rate": 9.770726925411897e-07, + "loss": 0.0086, + "num_tokens": 3846076.0, + "reward": 0.85809326171875, + "reward_std": 0.01855452172458172, + "rewards//mean": 0.85809326171875, + "rewards//std": 0.027375219389796257, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.1062, + "grad_norm": 1.5788313150405884, + "kl": 0.27027702890336514, + "learning_rate": 9.769776049884563e-07, + "loss": 0.0104, + "num_tokens": 3853354.0, + "reward": 0.84130859375, + "reward_std": 0.01918921433389187, + "rewards//mean": 0.84130859375, + "rewards//std": 0.028835445642471313, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1064, + "grad_norm": 1.6106292009353638, + "kl": 0.2636375427246094, + "learning_rate": 9.768823253092008e-07, + "loss": 0.0105, + "num_tokens": 3860730.0, + "reward": 0.817138671875, + "reward_std": 0.016937367618083954, + "rewards//mean": 0.817138671875, + "rewards//std": 0.02472366951406002, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1066, + "grad_norm": 1.4660530090332031, + "kl": 0.17245667427778244, + "learning_rate": 9.767868535418014e-07, + "loss": 0.0069, + "num_tokens": 3868066.0, + "reward": 0.8486328125, + "reward_std": 0.01960989087820053, + "rewards//mean": 0.8486328125, + "rewards//std": 0.02426011860370636, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1068, + "grad_norm": 1.479331135749817, + "kl": 0.24362225271761417, + "learning_rate": 9.766911897247146e-07, + "loss": 0.0097, + "num_tokens": 3875338.0, + "reward": 0.85235595703125, + "reward_std": 0.018911894410848618, + "rewards//mean": 0.85235595703125, + "rewards//std": 0.02581917494535446, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.107, + "grad_norm": 1.7305238246917725, + "kl": 0.215004226192832, + "learning_rate": 9.765953338964734e-07, + "loss": 0.0086, + "num_tokens": 3882634.0, + "reward": 0.86322021484375, + "reward_std": 0.015228481031954288, + "rewards//mean": 0.86322021484375, + "rewards//std": 0.021012671291828156, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1072, + "grad_norm": 1.6360526084899902, + "kl": 0.20905968360602856, + "learning_rate": 9.76499286095689e-07, + "loss": 0.0084, + "num_tokens": 3889914.0, + "reward": 0.8377685546875, + "reward_std": 0.020736603066325188, + "rewards//mean": 0.8377685546875, + "rewards//std": 0.026419952511787415, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.1074, + "grad_norm": 1.4467660188674927, + "kl": 0.1840355582535267, + "learning_rate": 9.764030463610488e-07, + "loss": 0.0072, + "num_tokens": 3897233.0, + "reward": 0.84478759765625, + "reward_std": 0.015448033809661865, + "rewards//mean": 0.84478759765625, + "rewards//std": 0.020472628995776176, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1076, + "grad_norm": 1.3866578340530396, + "kl": 0.19887559860944748, + "learning_rate": 9.763066147313189e-07, + "loss": 0.008, + "num_tokens": 3904529.0, + "reward": 0.82110595703125, + "reward_std": 0.012983580119907856, + "rewards//mean": 0.82110595703125, + "rewards//std": 0.016472838819026947, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1078, + "grad_norm": 1.4183902740478516, + "kl": 0.21209913678467274, + "learning_rate": 9.762099912453412e-07, + "loss": 0.0085, + "num_tokens": 3911753.0, + "reward": 0.82537841796875, + "reward_std": 0.01152852363884449, + "rewards//mean": 0.82537841796875, + "rewards//std": 0.013381893746554852, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.108, + "grad_norm": 1.400371789932251, + "kl": 0.21787577494978905, + "learning_rate": 9.76113175942036e-07, + "loss": 0.0087, + "num_tokens": 3918985.0, + "reward": 0.79534912109375, + "reward_std": 0.01401433628052473, + "rewards//mean": 0.79534912109375, + "rewards//std": 0.01678507961332798, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.1082, + "grad_norm": 1.3207004070281982, + "kl": 0.1828410103917122, + "learning_rate": 9.760161688604007e-07, + "loss": 0.0073, + "num_tokens": 3926224.0, + "reward": 0.7786865234375, + "reward_std": 0.013527845032513142, + "rewards//mean": 0.7786865234375, + "rewards//std": 0.018138844519853592, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.1084, + "grad_norm": 1.874888300895691, + "kl": 0.21350181568413973, + "learning_rate": 9.759189700395095e-07, + "loss": 0.0079, + "num_tokens": 3933500.0, + "reward": 0.82672119140625, + "reward_std": 0.018555855378508568, + "rewards//mean": 0.82672119140625, + "rewards//std": 0.033038534224033356, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1086, + "grad_norm": 1.2065417766571045, + "kl": 0.23950578458607197, + "learning_rate": 9.758215795185138e-07, + "loss": 0.0096, + "num_tokens": 3940820.0, + "reward": 0.8455810546875, + "reward_std": 0.015421528369188309, + "rewards//mean": 0.8455810546875, + "rewards//std": 0.022914646193385124, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.1088, + "grad_norm": 1.3564627170562744, + "kl": 0.21350221149623394, + "learning_rate": 9.757239973366428e-07, + "loss": 0.0037, + "num_tokens": 3948088.0, + "reward": 0.837158203125, + "reward_std": 0.015461664646863937, + "rewards//mean": 0.837158203125, + "rewards//std": 0.02894757129251957, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.109, + "grad_norm": 1.5485527515411377, + "kl": 0.2302035242319107, + "learning_rate": 9.756262235332028e-07, + "loss": 0.0082, + "num_tokens": 3955352.0, + "reward": 0.81097412109375, + "reward_std": 0.015178712084889412, + "rewards//mean": 0.81097412109375, + "rewards//std": 0.01597174070775509, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1092, + "grad_norm": 2.625596046447754, + "kl": 0.37556471303105354, + "learning_rate": 9.755282581475767e-07, + "loss": 0.015, + "num_tokens": 3962592.0, + "reward": 0.8369140625, + "reward_std": 0.02894209511578083, + "rewards//mean": 0.8369140625, + "rewards//std": 0.04014497250318527, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.1094, + "grad_norm": 1.3898751735687256, + "kl": 0.27234448306262493, + "learning_rate": 9.754301012192253e-07, + "loss": 0.0084, + "num_tokens": 3969811.0, + "reward": 0.8544921875, + "reward_std": 0.01641753502190113, + "rewards//mean": 0.8544921875, + "rewards//std": 0.018778912723064423, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.1096, + "grad_norm": 1.5232088565826416, + "kl": 0.2135065719485283, + "learning_rate": 9.753317527876856e-07, + "loss": 0.0112, + "num_tokens": 3977062.0, + "reward": 0.86474609375, + "reward_std": 0.019630568102002144, + "rewards//mean": 0.86474609375, + "rewards//std": 0.023380594328045845, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1098, + "grad_norm": 1.8993120193481445, + "kl": 0.2557559497654438, + "learning_rate": 9.75233212892573e-07, + "loss": 0.0102, + "num_tokens": 3984286.0, + "reward": 0.82232666015625, + "reward_std": 0.01804221421480179, + "rewards//mean": 0.82232666015625, + "rewards//std": 0.02811666578054428, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.11, + "grad_norm": 1.3292995691299438, + "kl": 0.23580178245902061, + "learning_rate": 9.75134481573579e-07, + "loss": 0.0099, + "num_tokens": 3991476.0, + "reward": 0.84979248046875, + "reward_std": 0.016506148502230644, + "rewards//mean": 0.84979248046875, + "rewards//std": 0.02517079748213291, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.546875, + "epoch": 0.1102, + "grad_norm": 1.428501009941101, + "kl": 0.2185239251703024, + "learning_rate": 9.750355588704727e-07, + "loss": 0.0061, + "num_tokens": 3998911.0, + "reward": 0.8209228515625, + "reward_std": 0.015003788284957409, + "rewards//mean": 0.8209228515625, + "rewards//std": 0.020625578239560127, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1104, + "grad_norm": 1.266112208366394, + "kl": 0.24110298044979572, + "learning_rate": 9.749364448231e-07, + "loss": 0.0096, + "num_tokens": 4006223.0, + "reward": 0.77880859375, + "reward_std": 0.01799158565700054, + "rewards//mean": 0.77880859375, + "rewards//std": 0.02976946160197258, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.1106, + "grad_norm": 1.2793605327606201, + "kl": 0.21050270460546017, + "learning_rate": 9.748371394713841e-07, + "loss": 0.0084, + "num_tokens": 4013510.0, + "reward": 0.8658447265625, + "reward_std": 0.01686519756913185, + "rewards//mean": 0.8658447265625, + "rewards//std": 0.02182377316057682, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.1108, + "grad_norm": 1.33749520778656, + "kl": 0.23660637810826302, + "learning_rate": 9.747376428553253e-07, + "loss": 0.008, + "num_tokens": 4020747.0, + "reward": 0.82305908203125, + "reward_std": 0.017363104969263077, + "rewards//mean": 0.82305908203125, + "rewards//std": 0.024216946214437485, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.111, + "grad_norm": 1.647313117980957, + "kl": 0.23921755515038967, + "learning_rate": 9.746379550150008e-07, + "loss": 0.0091, + "num_tokens": 4028028.0, + "reward": 0.85882568359375, + "reward_std": 0.013506576418876648, + "rewards//mean": 0.85882568359375, + "rewards//std": 0.023784909397363663, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1112, + "grad_norm": 1.4423165321350098, + "kl": 0.20535465888679028, + "learning_rate": 9.745380759905647e-07, + "loss": 0.0082, + "num_tokens": 4035412.0, + "reward": 0.834228515625, + "reward_std": 0.01597372442483902, + "rewards//mean": 0.834228515625, + "rewards//std": 0.02752363495528698, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.1114, + "grad_norm": 1.5676689147949219, + "kl": 0.30294511280953884, + "learning_rate": 9.744380058222482e-07, + "loss": 0.0095, + "num_tokens": 4042692.0, + "reward": 0.8443603515625, + "reward_std": 0.016331685706973076, + "rewards//mean": 0.8443603515625, + "rewards//std": 0.026831572875380516, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.1116, + "grad_norm": 1.2689718008041382, + "kl": 0.22245273366570473, + "learning_rate": 9.743377445503597e-07, + "loss": 0.0047, + "num_tokens": 4050013.0, + "reward": 0.80108642578125, + "reward_std": 0.01341705396771431, + "rewards//mean": 0.80108642578125, + "rewards//std": 0.019836463034152985, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.1118, + "grad_norm": 1.4511510133743286, + "kl": 0.24876062013208866, + "learning_rate": 9.742372922152845e-07, + "loss": 0.0021, + "num_tokens": 4057280.0, + "reward": 0.85595703125, + "reward_std": 0.01890578307211399, + "rewards//mean": 0.85595703125, + "rewards//std": 0.027040911838412285, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.112, + "grad_norm": 1.4728403091430664, + "kl": 0.21314039267599583, + "learning_rate": 9.74136648857485e-07, + "loss": 0.0085, + "num_tokens": 4064544.0, + "reward": 0.86358642578125, + "reward_std": 0.019322963431477547, + "rewards//mean": 0.86358642578125, + "rewards//std": 0.027279941365122795, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.1122, + "grad_norm": 1.4698659181594849, + "kl": 0.20369108207523823, + "learning_rate": 9.740358145174997e-07, + "loss": 0.0011, + "num_tokens": 4071875.0, + "reward": 0.83551025390625, + "reward_std": 0.01598992943763733, + "rewards//mean": 0.83551025390625, + "rewards//std": 0.020088976249098778, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1124, + "grad_norm": 1.6563153266906738, + "kl": 0.2229784969240427, + "learning_rate": 9.73934789235945e-07, + "loss": 0.0089, + "num_tokens": 4079227.0, + "reward": 0.8485107421875, + "reward_std": 0.023097027093172073, + "rewards//mean": 0.8485107421875, + "rewards//std": 0.030524414032697678, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.1126, + "grad_norm": 1.5962547063827515, + "kl": 0.24841345474123955, + "learning_rate": 9.73833573053514e-07, + "loss": 0.0086, + "num_tokens": 4086549.0, + "reward": 0.876708984375, + "reward_std": 0.0186562892049551, + "rewards//mean": 0.876708984375, + "rewards//std": 0.032188259065151215, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.53125, + "epoch": 0.1128, + "grad_norm": 1.5110526084899902, + "kl": 0.2323047649115324, + "learning_rate": 9.737321660109766e-07, + "loss": -0.0179, + "num_tokens": 4093671.0, + "reward": 0.85150146484375, + "reward_std": 0.020474767312407494, + "rewards//mean": 0.85150146484375, + "rewards//std": 0.028582284227013588, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.113, + "grad_norm": 1.7403911352157593, + "kl": 0.25706607662141323, + "learning_rate": 9.73630568149179e-07, + "loss": 0.0103, + "num_tokens": 4100943.0, + "reward": 0.82818603515625, + "reward_std": 0.014532249420881271, + "rewards//mean": 0.82818603515625, + "rewards//std": 0.021502599120140076, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.1132, + "grad_norm": 1.6344367265701294, + "kl": 0.23651538230478764, + "learning_rate": 9.735287795090454e-07, + "loss": 0.0027, + "num_tokens": 4108222.0, + "reward": 0.81683349609375, + "reward_std": 0.014048009179532528, + "rewards//mean": 0.81683349609375, + "rewards//std": 0.021889137104153633, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.1134, + "grad_norm": 1.572940468788147, + "kl": 0.2526194714009762, + "learning_rate": 9.734268001315759e-07, + "loss": 0.0146, + "num_tokens": 4115432.0, + "reward": 0.8463134765625, + "reward_std": 0.0205571036785841, + "rewards//mean": 0.8463134765625, + "rewards//std": 0.03351781144738197, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1136, + "grad_norm": 1.790201187133789, + "kl": 0.28595544025301933, + "learning_rate": 9.733246300578482e-07, + "loss": 0.0114, + "num_tokens": 4122656.0, + "reward": 0.87384033203125, + "reward_std": 0.022713251411914825, + "rewards//mean": 0.87384033203125, + "rewards//std": 0.027653606608510017, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.1138, + "grad_norm": 1.4608973264694214, + "kl": 0.26624004915356636, + "learning_rate": 9.73222269329016e-07, + "loss": 0.006, + "num_tokens": 4129823.0, + "reward": 0.8485107421875, + "reward_std": 0.01878565177321434, + "rewards//mean": 0.8485107421875, + "rewards//std": 0.022951610386371613, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.114, + "grad_norm": 1.6956990957260132, + "kl": 0.21956810168921947, + "learning_rate": 9.731197179863103e-07, + "loss": 0.0088, + "num_tokens": 4137183.0, + "reward": 0.79852294921875, + "reward_std": 0.016127372160553932, + "rewards//mean": 0.79852294921875, + "rewards//std": 0.01714467443525791, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1142, + "grad_norm": 1.755094289779663, + "kl": 0.25584601983428, + "learning_rate": 9.730169760710385e-07, + "loss": 0.0102, + "num_tokens": 4144519.0, + "reward": 0.8583984375, + "reward_std": 0.021049918606877327, + "rewards//mean": 0.8583984375, + "rewards//std": 0.023396126925945282, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.1144, + "grad_norm": 1.3438835144042969, + "kl": 0.23935799859464169, + "learning_rate": 9.729140436245856e-07, + "loss": 0.0107, + "num_tokens": 4151781.0, + "reward": 0.85443115234375, + "reward_std": 0.015150908380746841, + "rewards//mean": 0.85443115234375, + "rewards//std": 0.017164969816803932, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1146, + "grad_norm": 1.5172195434570312, + "kl": 0.2080207597464323, + "learning_rate": 9.728109206884125e-07, + "loss": 0.0083, + "num_tokens": 4159045.0, + "reward": 0.8475341796875, + "reward_std": 0.01623227819800377, + "rewards//mean": 0.8475341796875, + "rewards//std": 0.021178876981139183, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.1148, + "grad_norm": 1.285882592201233, + "kl": 0.26950797997415066, + "learning_rate": 9.72707607304057e-07, + "loss": 0.0119, + "num_tokens": 4166377.0, + "reward": 0.8656005859375, + "reward_std": 0.017802193760871887, + "rewards//mean": 0.8656005859375, + "rewards//std": 0.028643455356359482, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.115, + "grad_norm": 1.5207585096359253, + "kl": 0.27615508809685707, + "learning_rate": 9.726041035131338e-07, + "loss": 0.0157, + "num_tokens": 4173619.0, + "reward": 0.81915283203125, + "reward_std": 0.018715931102633476, + "rewards//mean": 0.81915283203125, + "rewards//std": 0.025833241641521454, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1152, + "grad_norm": 1.4431794881820679, + "kl": 0.287325294688344, + "learning_rate": 9.72500409357334e-07, + "loss": 0.0115, + "num_tokens": 4180979.0, + "reward": 0.8311767578125, + "reward_std": 0.014137900434434414, + "rewards//mean": 0.8311767578125, + "rewards//std": 0.023203495889902115, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1154, + "grad_norm": 1.4161217212677002, + "kl": 0.21908073499798775, + "learning_rate": 9.723965248784262e-07, + "loss": 0.0088, + "num_tokens": 4188299.0, + "reward": 0.84759521484375, + "reward_std": 0.019543122500181198, + "rewards//mean": 0.84759521484375, + "rewards//std": 0.038352251052856445, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.1156, + "grad_norm": 1.4260485172271729, + "kl": 0.26179312728345394, + "learning_rate": 9.722924501182546e-07, + "loss": 0.0107, + "num_tokens": 4195566.0, + "reward": 0.8648681640625, + "reward_std": 0.02007567696273327, + "rewards//mean": 0.8648681640625, + "rewards//std": 0.04428023472428322, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1158, + "grad_norm": 1.4398216009140015, + "kl": 0.22768347896635532, + "learning_rate": 9.721881851187405e-07, + "loss": 0.0091, + "num_tokens": 4203030.0, + "reward": 0.82904052734375, + "reward_std": 0.013871956616640091, + "rewards//mean": 0.82904052734375, + "rewards//std": 0.021324453875422478, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.116, + "grad_norm": 1.3681715726852417, + "kl": 0.21595058031380177, + "learning_rate": 9.720837299218818e-07, + "loss": 0.0086, + "num_tokens": 4210254.0, + "reward": 0.84991455078125, + "reward_std": 0.014661135151982307, + "rewards//mean": 0.84991455078125, + "rewards//std": 0.023102398961782455, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1162, + "grad_norm": 2.1000239849090576, + "kl": 0.374788761138916, + "learning_rate": 9.719790845697532e-07, + "loss": 0.015, + "num_tokens": 4217470.0, + "reward": 0.81781005859375, + "reward_std": 0.019995862618088722, + "rewards//mean": 0.81781005859375, + "rewards//std": 0.027877680957317352, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1164, + "grad_norm": 1.3347816467285156, + "kl": 0.2373837698251009, + "learning_rate": 9.71874249104506e-07, + "loss": 0.0095, + "num_tokens": 4224758.0, + "reward": 0.8482666015625, + "reward_std": 0.015025592409074306, + "rewards//mean": 0.8482666015625, + "rewards//std": 0.023211320862174034, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1166, + "grad_norm": 1.280009388923645, + "kl": 0.2621693126857281, + "learning_rate": 9.717692235683674e-07, + "loss": 0.0105, + "num_tokens": 4232102.0, + "reward": 0.86102294921875, + "reward_std": 0.01413792371749878, + "rewards//mean": 0.86102294921875, + "rewards//std": 0.029885616153478622, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1168, + "grad_norm": 1.4690579175949097, + "kl": 0.3273140713572502, + "learning_rate": 9.716640080036423e-07, + "loss": 0.0131, + "num_tokens": 4239366.0, + "reward": 0.82611083984375, + "reward_std": 0.016715172678232193, + "rewards//mean": 0.82611083984375, + "rewards//std": 0.022978223860263824, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.117, + "grad_norm": 1.6104674339294434, + "kl": 0.25345082208514214, + "learning_rate": 9.715586024527109e-07, + "loss": -0.0014, + "num_tokens": 4246557.0, + "reward": 0.869384765625, + "reward_std": 0.018346967175602913, + "rewards//mean": 0.869384765625, + "rewards//std": 0.027873404324054718, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.1172, + "grad_norm": 1.4254887104034424, + "kl": 0.2513287588953972, + "learning_rate": 9.714530069580308e-07, + "loss": 0.0101, + "num_tokens": 4253796.0, + "reward": 0.8438720703125, + "reward_std": 0.02208077162504196, + "rewards//mean": 0.8438720703125, + "rewards//std": 0.02826252579689026, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1174, + "grad_norm": 1.4259045124053955, + "kl": 0.26038975082337856, + "learning_rate": 9.71347221562136e-07, + "loss": 0.0104, + "num_tokens": 4261092.0, + "reward": 0.84515380859375, + "reward_std": 0.012152329087257385, + "rewards//mean": 0.84515380859375, + "rewards//std": 0.018796538934111595, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.1176, + "grad_norm": 1.5936652421951294, + "kl": 0.26153737865388393, + "learning_rate": 9.712412463076367e-07, + "loss": 0.0089, + "num_tokens": 4268370.0, + "reward": 0.8511962890625, + "reward_std": 0.01291497703641653, + "rewards//mean": 0.8511962890625, + "rewards//std": 0.020666636526584625, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1178, + "grad_norm": 1.3843432664871216, + "kl": 0.2792710456997156, + "learning_rate": 9.711350812372196e-07, + "loss": 0.0112, + "num_tokens": 4275642.0, + "reward": 0.81683349609375, + "reward_std": 0.014620404690504074, + "rewards//mean": 0.81683349609375, + "rewards//std": 0.022338194772601128, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.118, + "grad_norm": 1.8111941814422607, + "kl": 0.32267764024436474, + "learning_rate": 9.710287263936483e-07, + "loss": 0.0129, + "num_tokens": 4282978.0, + "reward": 0.8712158203125, + "reward_std": 0.014023412019014359, + "rewards//mean": 0.8712158203125, + "rewards//std": 0.02387741208076477, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1182, + "grad_norm": 1.4193518161773682, + "kl": 0.23825886845588684, + "learning_rate": 9.709221818197623e-07, + "loss": 0.0095, + "num_tokens": 4290242.0, + "reward": 0.84539794921875, + "reward_std": 0.014532842673361301, + "rewards//mean": 0.84539794921875, + "rewards//std": 0.024536138400435448, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.1184, + "grad_norm": 1.2298650741577148, + "kl": 0.21300828363746405, + "learning_rate": 9.708154475584777e-07, + "loss": 0.0083, + "num_tokens": 4297485.0, + "reward": 0.83734130859375, + "reward_std": 0.012915708124637604, + "rewards//mean": 0.83734130859375, + "rewards//std": 0.01810072734951973, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.1186, + "grad_norm": 1.508862018585205, + "kl": 0.26324642449617386, + "learning_rate": 9.707085236527873e-07, + "loss": 0.0085, + "num_tokens": 4304654.0, + "reward": 0.8250732421875, + "reward_std": 0.016175588592886925, + "rewards//mean": 0.8250732421875, + "rewards//std": 0.028301063925027847, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1188, + "grad_norm": 1.4678316116333008, + "kl": 0.2510586064308882, + "learning_rate": 9.706014101457599e-07, + "loss": 0.01, + "num_tokens": 4311942.0, + "reward": 0.8155517578125, + "reward_std": 0.01565970852971077, + "rewards//mean": 0.8155517578125, + "rewards//std": 0.019605128094553947, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.119, + "grad_norm": 1.4916791915893555, + "kl": 0.2160958144813776, + "learning_rate": 9.704941070805405e-07, + "loss": 0.0071, + "num_tokens": 4319221.0, + "reward": 0.8341064453125, + "reward_std": 0.0165761336684227, + "rewards//mean": 0.8341064453125, + "rewards//std": 0.025910839438438416, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.1192, + "grad_norm": 1.6265201568603516, + "kl": 0.31768742576241493, + "learning_rate": 9.70386614500351e-07, + "loss": 0.0088, + "num_tokens": 4326531.0, + "reward": 0.83831787109375, + "reward_std": 0.016053302213549614, + "rewards//mean": 0.83831787109375, + "rewards//std": 0.0191444493830204, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.1194, + "grad_norm": 1.375156283378601, + "kl": 0.2465062364935875, + "learning_rate": 9.702789324484896e-07, + "loss": 0.007, + "num_tokens": 4333877.0, + "reward": 0.86041259765625, + "reward_std": 0.01806102693080902, + "rewards//mean": 0.86041259765625, + "rewards//std": 0.021307410672307014, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.1196, + "grad_norm": 1.5875588655471802, + "kl": 0.23962048068642616, + "learning_rate": 9.701710609683305e-07, + "loss": 0.0082, + "num_tokens": 4341128.0, + "reward": 0.779052734375, + "reward_std": 0.014892620034515858, + "rewards//mean": 0.779052734375, + "rewards//std": 0.029437081888318062, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1198, + "grad_norm": 1.4564008712768555, + "kl": 0.2906534746289253, + "learning_rate": 9.700630001033243e-07, + "loss": 0.0116, + "num_tokens": 4348416.0, + "reward": 0.80682373046875, + "reward_std": 0.017484918236732483, + "rewards//mean": 0.80682373046875, + "rewards//std": 0.031373221427202225, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.12, + "grad_norm": 1.563356876373291, + "kl": 0.27066104859113693, + "learning_rate": 9.699547498969978e-07, + "loss": 0.0103, + "num_tokens": 4355787.0, + "reward": 0.8310546875, + "reward_std": 0.014367376454174519, + "rewards//mean": 0.8310546875, + "rewards//std": 0.022304370999336243, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.1202, + "grad_norm": 1.6291840076446533, + "kl": 0.2773931007832289, + "learning_rate": 9.698463103929541e-07, + "loss": 0.0023, + "num_tokens": 4363076.0, + "reward": 0.7811279296875, + "reward_std": 0.015339871868491173, + "rewards//mean": 0.7811279296875, + "rewards//std": 0.023724768310785294, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1204, + "grad_norm": 1.3468503952026367, + "kl": 0.27318892627954483, + "learning_rate": 9.69737681634873e-07, + "loss": 0.0109, + "num_tokens": 4370388.0, + "reward": 0.81304931640625, + "reward_std": 0.017683546990156174, + "rewards//mean": 0.81304931640625, + "rewards//std": 0.019867727532982826, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.1206, + "grad_norm": 1.3680179119110107, + "kl": 0.20291493646800518, + "learning_rate": 9.696288636665097e-07, + "loss": 0.0142, + "num_tokens": 4377654.0, + "reward": 0.84527587890625, + "reward_std": 0.014360816217958927, + "rewards//mean": 0.84527587890625, + "rewards//std": 0.020933276042342186, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1208, + "grad_norm": 1.4160398244857788, + "kl": 0.24997320026159286, + "learning_rate": 9.695198565316964e-07, + "loss": 0.01, + "num_tokens": 4384870.0, + "reward": 0.828125, + "reward_std": 0.013068178668618202, + "rewards//mean": 0.828125, + "rewards//std": 0.023826956748962402, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.121, + "grad_norm": 1.5822477340698242, + "kl": 0.2210839670151472, + "learning_rate": 9.69410660274341e-07, + "loss": 0.0088, + "num_tokens": 4392230.0, + "reward": 0.82611083984375, + "reward_std": 0.015203042887151241, + "rewards//mean": 0.82611083984375, + "rewards//std": 0.02820267528295517, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1212, + "grad_norm": 1.112256407737732, + "kl": 0.23546110466122627, + "learning_rate": 9.693012749384277e-07, + "loss": 0.0094, + "num_tokens": 4399454.0, + "reward": 0.8709716796875, + "reward_std": 0.015119954943656921, + "rewards//mean": 0.8709716796875, + "rewards//std": 0.022477107122540474, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.1214, + "grad_norm": 1.3000038862228394, + "kl": 0.2205430008471012, + "learning_rate": 9.691917005680173e-07, + "loss": 0.0076, + "num_tokens": 4406619.0, + "reward": 0.8599853515625, + "reward_std": 0.01992976665496826, + "rewards//mean": 0.8599853515625, + "rewards//std": 0.027963140979409218, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.1216, + "grad_norm": 1.2919434309005737, + "kl": 0.21983695216476917, + "learning_rate": 9.690819372072456e-07, + "loss": 0.008, + "num_tokens": 4413873.0, + "reward": 0.79736328125, + "reward_std": 0.011510937474668026, + "rewards//mean": 0.79736328125, + "rewards//std": 0.01631505787372589, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1218, + "grad_norm": 1.2820463180541992, + "kl": 0.22580527886748314, + "learning_rate": 9.68971984900326e-07, + "loss": 0.009, + "num_tokens": 4421169.0, + "reward": 0.85992431640625, + "reward_std": 0.015025245025753975, + "rewards//mean": 0.85992431640625, + "rewards//std": 0.023082077503204346, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.122, + "grad_norm": 1.5311951637268066, + "kl": 0.2749636359512806, + "learning_rate": 9.688618436915468e-07, + "loss": 0.011, + "num_tokens": 4428449.0, + "reward": 0.83111572265625, + "reward_std": 0.018039608374238014, + "rewards//mean": 0.83111572265625, + "rewards//std": 0.02788962423801422, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.1222, + "grad_norm": 1.4111158847808838, + "kl": 0.24242276325821877, + "learning_rate": 9.68751513625273e-07, + "loss": 0.0121, + "num_tokens": 4435723.0, + "reward": 0.82843017578125, + "reward_std": 0.019997671246528625, + "rewards//mean": 0.82843017578125, + "rewards//std": 0.02381988801062107, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1224, + "grad_norm": 1.3911669254302979, + "kl": 0.24297418631613255, + "learning_rate": 9.686409947459457e-07, + "loss": 0.0097, + "num_tokens": 4443203.0, + "reward": 0.85296630859375, + "reward_std": 0.016692666336894035, + "rewards//mean": 0.85296630859375, + "rewards//std": 0.030043739825487137, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1226, + "grad_norm": 1.3881032466888428, + "kl": 0.2509396467357874, + "learning_rate": 9.685302870980817e-07, + "loss": 0.01, + "num_tokens": 4450475.0, + "reward": 0.88909912109375, + "reward_std": 0.01783701591193676, + "rewards//mean": 0.88909912109375, + "rewards//std": 0.023721180856227875, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1228, + "grad_norm": 1.4849498271942139, + "kl": 0.23676102049648762, + "learning_rate": 9.684193907262742e-07, + "loss": 0.0095, + "num_tokens": 4457747.0, + "reward": 0.82391357421875, + "reward_std": 0.015947047621011734, + "rewards//mean": 0.82391357421875, + "rewards//std": 0.023860525339841843, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.123, + "grad_norm": 1.4822955131530762, + "kl": 0.2171874176710844, + "learning_rate": 9.68308305675192e-07, + "loss": 0.0087, + "num_tokens": 4464939.0, + "reward": 0.8609619140625, + "reward_std": 0.022696157917380333, + "rewards//mean": 0.8609619140625, + "rewards//std": 0.03476983308792114, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.1232, + "grad_norm": 1.2652661800384521, + "kl": 0.21254402864724398, + "learning_rate": 9.681970319895802e-07, + "loss": 0.0087, + "num_tokens": 4472296.0, + "reward": 0.7696533203125, + "reward_std": 0.013492338359355927, + "rewards//mean": 0.7696533203125, + "rewards//std": 0.01808200590312481, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.1234, + "grad_norm": 1.439348816871643, + "kl": 0.24875340424478054, + "learning_rate": 9.6808556971426e-07, + "loss": 0.0086, + "num_tokens": 4479632.0, + "reward": 0.86590576171875, + "reward_std": 0.017376970499753952, + "rewards//mean": 0.86590576171875, + "rewards//std": 0.02646653726696968, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.1236, + "grad_norm": 1.1704462766647339, + "kl": 0.19574426300823689, + "learning_rate": 9.679739188941283e-07, + "loss": 0.0067, + "num_tokens": 4486922.0, + "reward": 0.85992431640625, + "reward_std": 0.016243521124124527, + "rewards//mean": 0.85992431640625, + "rewards//std": 0.02503511682152748, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1238, + "grad_norm": 1.292689561843872, + "kl": 0.23087471164762974, + "learning_rate": 9.678620795741582e-07, + "loss": 0.0092, + "num_tokens": 4494330.0, + "reward": 0.81842041015625, + "reward_std": 0.017513800412416458, + "rewards//mean": 0.81842041015625, + "rewards//std": 0.02275579608976841, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.124, + "grad_norm": 1.3336927890777588, + "kl": 0.21905435249209404, + "learning_rate": 9.677500517993982e-07, + "loss": 0.0088, + "num_tokens": 4501666.0, + "reward": 0.82958984375, + "reward_std": 0.016690319404006004, + "rewards//mean": 0.82958984375, + "rewards//std": 0.031454719603061676, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1242, + "grad_norm": 1.3386404514312744, + "kl": 0.23591567762196064, + "learning_rate": 9.676378356149732e-07, + "loss": 0.0094, + "num_tokens": 4508962.0, + "reward": 0.84478759765625, + "reward_std": 0.016449466347694397, + "rewards//mean": 0.84478759765625, + "rewards//std": 0.023676468059420586, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.1244, + "grad_norm": 1.4951586723327637, + "kl": 0.26255306228995323, + "learning_rate": 9.675254310660841e-07, + "loss": 0.0083, + "num_tokens": 4516223.0, + "reward": 0.86602783203125, + "reward_std": 0.017668340355157852, + "rewards//mean": 0.86602783203125, + "rewards//std": 0.02400599978864193, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1246, + "grad_norm": 1.4737026691436768, + "kl": 0.22438272833824158, + "learning_rate": 9.674128381980071e-07, + "loss": 0.009, + "num_tokens": 4523527.0, + "reward": 0.78594970703125, + "reward_std": 0.01212775893509388, + "rewards//mean": 0.78594970703125, + "rewards//std": 0.02384910359978676, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.1248, + "grad_norm": 1.6042585372924805, + "kl": 0.26614941470324993, + "learning_rate": 9.67300057056095e-07, + "loss": 0.0122, + "num_tokens": 4530754.0, + "reward": 0.84478759765625, + "reward_std": 0.017744850367307663, + "rewards//mean": 0.84478759765625, + "rewards//std": 0.024854276329278946, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.125, + "grad_norm": 1.3760559558868408, + "kl": 0.25109333731234074, + "learning_rate": 9.671870876857758e-07, + "loss": 0.01, + "num_tokens": 4538018.0, + "reward": 0.873046875, + "reward_std": 0.013843446038663387, + "rewards//mean": 0.873046875, + "rewards//std": 0.02525792457163334, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1252, + "grad_norm": 1.4263784885406494, + "kl": 0.2558713015168905, + "learning_rate": 9.670739301325534e-07, + "loss": 0.0102, + "num_tokens": 4545330.0, + "reward": 0.83013916015625, + "reward_std": 0.017814021557569504, + "rewards//mean": 0.83013916015625, + "rewards//std": 0.04186958074569702, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.1254, + "grad_norm": 2.1571412086486816, + "kl": 0.4312891364097595, + "learning_rate": 9.669605844420078e-07, + "loss": 0.0172, + "num_tokens": 4552624.0, + "reward": 0.81890869140625, + "reward_std": 0.013371115550398827, + "rewards//mean": 0.81890869140625, + "rewards//std": 0.022642426192760468, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1256, + "grad_norm": 1.3025541305541992, + "kl": 0.23278475552797318, + "learning_rate": 9.668470506597946e-07, + "loss": 0.0093, + "num_tokens": 4560008.0, + "reward": 0.82867431640625, + "reward_std": 0.015986859798431396, + "rewards//mean": 0.82867431640625, + "rewards//std": 0.022486785426735878, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1258, + "grad_norm": 1.3805891275405884, + "kl": 0.27833780087530613, + "learning_rate": 9.667333288316453e-07, + "loss": 0.0111, + "num_tokens": 4567264.0, + "reward": 0.84844970703125, + "reward_std": 0.014164179563522339, + "rewards//mean": 0.84844970703125, + "rewards//std": 0.021020593121647835, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.126, + "grad_norm": 1.697991967201233, + "kl": 0.24817977845668793, + "learning_rate": 9.66619419003367e-07, + "loss": 0.0099, + "num_tokens": 4574600.0, + "reward": 0.8365478515625, + "reward_std": 0.022679906338453293, + "rewards//mean": 0.8365478515625, + "rewards//std": 0.02596919611096382, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1262, + "grad_norm": 1.4315420389175415, + "kl": 0.2591031342744827, + "learning_rate": 9.665053212208426e-07, + "loss": 0.0104, + "num_tokens": 4581904.0, + "reward": 0.8402099609375, + "reward_std": 0.019068805500864983, + "rewards//mean": 0.8402099609375, + "rewards//std": 0.02748703584074974, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.078125, + "epoch": 0.1264, + "grad_norm": 1.5744013786315918, + "kl": 0.24432558380067348, + "learning_rate": 9.663910355300304e-07, + "loss": -0.0288, + "num_tokens": 4589133.0, + "reward": 0.80169677734375, + "reward_std": 0.014481700956821442, + "rewards//mean": 0.80169677734375, + "rewards//std": 0.02404695190489292, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1266, + "grad_norm": 1.385128378868103, + "kl": 0.24782867170870304, + "learning_rate": 9.66276561976965e-07, + "loss": 0.0099, + "num_tokens": 4596413.0, + "reward": 0.85614013671875, + "reward_std": 0.011864407919347286, + "rewards//mean": 0.85614013671875, + "rewards//std": 0.013736908324062824, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.1268, + "grad_norm": 1.4599612951278687, + "kl": 0.23942369781434536, + "learning_rate": 9.661619006077561e-07, + "loss": 0.0094, + "num_tokens": 4603684.0, + "reward": 0.86456298828125, + "reward_std": 0.015936974436044693, + "rewards//mean": 0.86456298828125, + "rewards//std": 0.01966170221567154, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.127, + "grad_norm": 1.4299299716949463, + "kl": 0.27197524160146713, + "learning_rate": 9.660470514685895e-07, + "loss": 0.0024, + "num_tokens": 4610906.0, + "reward": 0.7869873046875, + "reward_std": 0.018407925963401794, + "rewards//mean": 0.7869873046875, + "rewards//std": 0.027612315490841866, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1272, + "grad_norm": 1.632752776145935, + "kl": 0.242015628144145, + "learning_rate": 9.659320146057262e-07, + "loss": 0.0097, + "num_tokens": 4618170.0, + "reward": 0.83160400390625, + "reward_std": 0.018885979428887367, + "rewards//mean": 0.83160400390625, + "rewards//std": 0.02540247142314911, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1274, + "grad_norm": 1.436672568321228, + "kl": 0.25354658253490925, + "learning_rate": 9.65816790065503e-07, + "loss": 0.0101, + "num_tokens": 4625498.0, + "reward": 0.8321533203125, + "reward_std": 0.01545516774058342, + "rewards//mean": 0.8321533203125, + "rewards//std": 0.025901490822434425, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.1276, + "grad_norm": 1.5578771829605103, + "kl": 0.23480653390288353, + "learning_rate": 9.657013778943327e-07, + "loss": 0.0071, + "num_tokens": 4632743.0, + "reward": 0.84619140625, + "reward_std": 0.016961170360445976, + "rewards//mean": 0.84619140625, + "rewards//std": 0.03281146660447121, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1278, + "grad_norm": 1.4827038049697876, + "kl": 0.24101252853870392, + "learning_rate": 9.65585778138703e-07, + "loss": 0.0096, + "num_tokens": 4640023.0, + "reward": 0.80859375, + "reward_std": 0.01627952791750431, + "rewards//mean": 0.80859375, + "rewards//std": 0.024140017107129097, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.128, + "grad_norm": 1.3399971723556519, + "kl": 0.23521539568901062, + "learning_rate": 9.654699908451776e-07, + "loss": 0.0094, + "num_tokens": 4647319.0, + "reward": 0.82916259765625, + "reward_std": 0.02385783940553665, + "rewards//mean": 0.82916259765625, + "rewards//std": 0.03310352936387062, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1282, + "grad_norm": 1.3265960216522217, + "kl": 0.24275513365864754, + "learning_rate": 9.653540160603955e-07, + "loss": 0.0097, + "num_tokens": 4654599.0, + "reward": 0.80340576171875, + "reward_std": 0.016774959862232208, + "rewards//mean": 0.80340576171875, + "rewards//std": 0.03108724020421505, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.1284, + "grad_norm": 1.3954271078109741, + "kl": 0.2770957425236702, + "learning_rate": 9.652378538310713e-07, + "loss": -0.0121, + "num_tokens": 4661844.0, + "reward": 0.803466796875, + "reward_std": 0.017655905336141586, + "rewards//mean": 0.803466796875, + "rewards//std": 0.03198444843292236, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1286, + "grad_norm": 1.4829704761505127, + "kl": 0.288714736700058, + "learning_rate": 9.651215042039953e-07, + "loss": 0.0115, + "num_tokens": 4669132.0, + "reward": 0.86749267578125, + "reward_std": 0.016397299244999886, + "rewards//mean": 0.86749267578125, + "rewards//std": 0.02560368925333023, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1288, + "grad_norm": 1.429383397102356, + "kl": 0.2366076372563839, + "learning_rate": 9.650049672260333e-07, + "loss": 0.0095, + "num_tokens": 4676420.0, + "reward": 0.87847900390625, + "reward_std": 0.0138083565980196, + "rewards//mean": 0.87847900390625, + "rewards//std": 0.021351413801312447, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.129, + "grad_norm": 1.2424179315567017, + "kl": 0.22877271845936775, + "learning_rate": 9.648882429441256e-07, + "loss": 0.0092, + "num_tokens": 4683668.0, + "reward": 0.88201904296875, + "reward_std": 0.015774331986904144, + "rewards//mean": 0.88201904296875, + "rewards//std": 0.018151670694351196, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.1292, + "grad_norm": 1.8680589199066162, + "kl": 0.2535843625664711, + "learning_rate": 9.647713314052895e-07, + "loss": 0.0083, + "num_tokens": 4691079.0, + "reward": 0.84906005859375, + "reward_std": 0.015442351810634136, + "rewards//mean": 0.84906005859375, + "rewards//std": 0.02433355711400509, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1294, + "grad_norm": 1.3138562440872192, + "kl": 0.2017708197236061, + "learning_rate": 9.646542326566168e-07, + "loss": 0.0081, + "num_tokens": 4698375.0, + "reward": 0.83990478515625, + "reward_std": 0.014581156894564629, + "rewards//mean": 0.83990478515625, + "rewards//std": 0.02603694424033165, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.1296, + "grad_norm": 1.2162846326828003, + "kl": 0.24773931503295898, + "learning_rate": 9.645369467452745e-07, + "loss": 0.0095, + "num_tokens": 4705733.0, + "reward": 0.7908935546875, + "reward_std": 0.012166686356067657, + "rewards//mean": 0.7908935546875, + "rewards//std": 0.014763724990189075, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1298, + "grad_norm": 1.5626065731048584, + "kl": 0.29551476426422596, + "learning_rate": 9.644194737185057e-07, + "loss": 0.0118, + "num_tokens": 4712989.0, + "reward": 0.8660888671875, + "reward_std": 0.02191385254263878, + "rewards//mean": 0.8660888671875, + "rewards//std": 0.029119368642568588, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.13, + "grad_norm": 1.3650506734848022, + "kl": 0.2777904476970434, + "learning_rate": 9.643018136236286e-07, + "loss": 0.0111, + "num_tokens": 4720341.0, + "reward": 0.7977294921875, + "reward_std": 0.017009004950523376, + "rewards//mean": 0.7977294921875, + "rewards//std": 0.030690591782331467, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1302, + "grad_norm": 1.8164430856704712, + "kl": 0.2937076557427645, + "learning_rate": 9.641839665080363e-07, + "loss": 0.0117, + "num_tokens": 4727565.0, + "reward": 0.8084716796875, + "reward_std": 0.018054261803627014, + "rewards//mean": 0.8084716796875, + "rewards//std": 0.02061089314520359, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1304, + "grad_norm": 1.287145733833313, + "kl": 0.2742070984095335, + "learning_rate": 9.640659324191978e-07, + "loss": 0.011, + "num_tokens": 4734845.0, + "reward": 0.836669921875, + "reward_std": 0.012675769627094269, + "rewards//mean": 0.836669921875, + "rewards//std": 0.015498543158173561, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1306, + "grad_norm": 1.3430968523025513, + "kl": 0.25231581181287766, + "learning_rate": 9.639477114046572e-07, + "loss": 0.0101, + "num_tokens": 4742165.0, + "reward": 0.8792724609375, + "reward_std": 0.01634734310209751, + "rewards//mean": 0.8792724609375, + "rewards//std": 0.02135823667049408, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1308, + "grad_norm": 1.4489606618881226, + "kl": 0.27256356179714203, + "learning_rate": 9.63829303512034e-07, + "loss": 0.0109, + "num_tokens": 4749477.0, + "reward": 0.834716796875, + "reward_std": 0.015651090070605278, + "rewards//mean": 0.834716796875, + "rewards//std": 0.021393997594714165, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.131, + "grad_norm": 1.8556914329528809, + "kl": 0.40473814122378826, + "learning_rate": 9.63710708789023e-07, + "loss": 0.0162, + "num_tokens": 4756749.0, + "reward": 0.826904296875, + "reward_std": 0.014268776401877403, + "rewards//mean": 0.826904296875, + "rewards//std": 0.017769979313015938, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1312, + "grad_norm": 1.326011061668396, + "kl": 0.24312574230134487, + "learning_rate": 9.635919272833937e-07, + "loss": 0.0097, + "num_tokens": 4764021.0, + "reward": 0.86029052734375, + "reward_std": 0.017353812232613564, + "rewards//mean": 0.86029052734375, + "rewards//std": 0.02515936829149723, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1314, + "grad_norm": 1.3991453647613525, + "kl": 0.25947905518114567, + "learning_rate": 9.634729590429916e-07, + "loss": 0.0104, + "num_tokens": 4771333.0, + "reward": 0.75787353515625, + "reward_std": 0.01449662633240223, + "rewards//mean": 0.75787353515625, + "rewards//std": 0.0198509581387043, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.1316, + "grad_norm": 1.5432695150375366, + "kl": 0.28030410036444664, + "learning_rate": 9.63353804115737e-07, + "loss": -0.001, + "num_tokens": 4778695.0, + "reward": 0.86566162109375, + "reward_std": 0.022104142233729362, + "rewards//mean": 0.86566162109375, + "rewards//std": 0.03367927297949791, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1318, + "grad_norm": 1.3605427742004395, + "kl": 0.22488745488226414, + "learning_rate": 9.632344625496255e-07, + "loss": 0.009, + "num_tokens": 4785967.0, + "reward": 0.85797119140625, + "reward_std": 0.015202043578028679, + "rewards//mean": 0.85797119140625, + "rewards//std": 0.021022753790020943, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.132, + "grad_norm": 1.311663269996643, + "kl": 0.2491665855050087, + "learning_rate": 9.63114934392728e-07, + "loss": 0.01, + "num_tokens": 4793255.0, + "reward": 0.839111328125, + "reward_std": 0.015414560213685036, + "rewards//mean": 0.839111328125, + "rewards//std": 0.021074624732136726, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1322, + "grad_norm": 1.3590824604034424, + "kl": 0.25725651159882545, + "learning_rate": 9.6299521969319e-07, + "loss": 0.0103, + "num_tokens": 4800591.0, + "reward": 0.79925537109375, + "reward_std": 0.012323799543082714, + "rewards//mean": 0.79925537109375, + "rewards//std": 0.016017170622944832, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.1324, + "grad_norm": 1.3781718015670776, + "kl": 0.25953250378370285, + "learning_rate": 9.628753184992333e-07, + "loss": 0.017, + "num_tokens": 4807801.0, + "reward": 0.8447265625, + "reward_std": 0.01959945820271969, + "rewards//mean": 0.8447265625, + "rewards//std": 0.03210632503032684, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1326, + "grad_norm": 1.650534749031067, + "kl": 0.39711068011820316, + "learning_rate": 9.627552308591533e-07, + "loss": 0.0159, + "num_tokens": 4815065.0, + "reward": 0.8389892578125, + "reward_std": 0.014535531401634216, + "rewards//mean": 0.8389892578125, + "rewards//std": 0.021826548501849174, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1328, + "grad_norm": 1.340232491493225, + "kl": 0.2644961457699537, + "learning_rate": 9.62634956821322e-07, + "loss": 0.0106, + "num_tokens": 4822345.0, + "reward": 0.79595947265625, + "reward_std": 0.01335366815328598, + "rewards//mean": 0.79595947265625, + "rewards//std": 0.01693054847419262, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.133, + "grad_norm": 1.3279492855072021, + "kl": 0.3403146918863058, + "learning_rate": 9.625144964341852e-07, + "loss": 0.0136, + "num_tokens": 4829569.0, + "reward": 0.85009765625, + "reward_std": 0.019558165222406387, + "rewards//mean": 0.85009765625, + "rewards//std": 0.023943569511175156, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1332, + "grad_norm": 1.6522716283798218, + "kl": 0.316254872828722, + "learning_rate": 9.623938497462645e-07, + "loss": 0.0127, + "num_tokens": 4836881.0, + "reward": 0.819580078125, + "reward_std": 0.0184466689825058, + "rewards//mean": 0.819580078125, + "rewards//std": 0.023113617673516273, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.1334, + "grad_norm": 1.2923028469085693, + "kl": 0.2997368089854717, + "learning_rate": 9.622730168061567e-07, + "loss": -0.0025, + "num_tokens": 4844035.0, + "reward": 0.81982421875, + "reward_std": 0.01776699535548687, + "rewards//mean": 0.81982421875, + "rewards//std": 0.03035757690668106, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1336, + "grad_norm": 1.7030340433120728, + "kl": 0.33546494878828526, + "learning_rate": 9.621519976625326e-07, + "loss": 0.0134, + "num_tokens": 4851371.0, + "reward": 0.82135009765625, + "reward_std": 0.02076566219329834, + "rewards//mean": 0.82135009765625, + "rewards//std": 0.030107159167528152, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1338, + "grad_norm": 1.3087936639785767, + "kl": 0.2536047911271453, + "learning_rate": 9.620307923641392e-07, + "loss": 0.0101, + "num_tokens": 4858723.0, + "reward": 0.82330322265625, + "reward_std": 0.016149604693055153, + "rewards//mean": 0.82330322265625, + "rewards//std": 0.020204687491059303, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.134, + "grad_norm": 1.4218815565109253, + "kl": 0.2646583989262581, + "learning_rate": 9.61909400959798e-07, + "loss": 0.0102, + "num_tokens": 4866025.0, + "reward": 0.81494140625, + "reward_std": 0.017814066261053085, + "rewards//mean": 0.81494140625, + "rewards//std": 0.024155063554644585, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.1342, + "grad_norm": 1.4981968402862549, + "kl": 0.2479128260165453, + "learning_rate": 9.617878234984054e-07, + "loss": 0.0078, + "num_tokens": 4873173.0, + "reward": 0.82452392578125, + "reward_std": 0.01448520366102457, + "rewards//mean": 0.82452392578125, + "rewards//std": 0.025745194405317307, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.1344, + "grad_norm": 1.638085961341858, + "kl": 0.2598614189773798, + "learning_rate": 9.616660600289327e-07, + "loss": 0.0139, + "num_tokens": 4880417.0, + "reward": 0.8345947265625, + "reward_std": 0.016096865758299828, + "rewards//mean": 0.8345947265625, + "rewards//std": 0.025173578411340714, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1346, + "grad_norm": 1.4543349742889404, + "kl": 0.2761615477502346, + "learning_rate": 9.615441106004262e-07, + "loss": 0.011, + "num_tokens": 4887641.0, + "reward": 0.84112548828125, + "reward_std": 0.017716731876134872, + "rewards//mean": 0.84112548828125, + "rewards//std": 0.02178863063454628, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1348, + "grad_norm": 1.5879896879196167, + "kl": 0.33982468768954277, + "learning_rate": 9.614219752620072e-07, + "loss": 0.0136, + "num_tokens": 4895057.0, + "reward": 0.8607177734375, + "reward_std": 0.019637592136859894, + "rewards//mean": 0.8607177734375, + "rewards//std": 0.02799776755273342, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.135, + "grad_norm": 1.367310643196106, + "kl": 0.21709785889834166, + "learning_rate": 9.612996540628717e-07, + "loss": 0.0087, + "num_tokens": 4902297.0, + "reward": 0.8370361328125, + "reward_std": 0.01818048767745495, + "rewards//mean": 0.8370361328125, + "rewards//std": 0.03383605554699898, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.1352, + "grad_norm": 1.4379699230194092, + "kl": 0.2711914833635092, + "learning_rate": 9.611771470522907e-07, + "loss": 0.0077, + "num_tokens": 4909572.0, + "reward": 0.802734375, + "reward_std": 0.0174124576151371, + "rewards//mean": 0.802734375, + "rewards//std": 0.021179234609007835, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.1354, + "grad_norm": 1.6526342630386353, + "kl": 0.2870672307908535, + "learning_rate": 9.6105445427961e-07, + "loss": 0.0154, + "num_tokens": 4916797.0, + "reward": 0.81756591796875, + "reward_std": 0.013258553110063076, + "rewards//mean": 0.81756591796875, + "rewards//std": 0.021493446081876755, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1356, + "grad_norm": 1.3584483861923218, + "kl": 0.2460616324096918, + "learning_rate": 9.609315757942502e-07, + "loss": 0.0098, + "num_tokens": 4924053.0, + "reward": 0.85986328125, + "reward_std": 0.01636352762579918, + "rewards//mean": 0.85986328125, + "rewards//std": 0.02676180750131607, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1358, + "grad_norm": 1.4725096225738525, + "kl": 0.2520089130848646, + "learning_rate": 9.608085116457068e-07, + "loss": 0.0101, + "num_tokens": 4931325.0, + "reward": 0.8695068359375, + "reward_std": 0.02041422761976719, + "rewards//mean": 0.8695068359375, + "rewards//std": 0.027319103479385376, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.136, + "grad_norm": 1.5842911005020142, + "kl": 0.2424280010163784, + "learning_rate": 9.606852618835502e-07, + "loss": 0.0103, + "num_tokens": 4938621.0, + "reward": 0.8546142578125, + "reward_std": 0.01411538664251566, + "rewards//mean": 0.8546142578125, + "rewards//std": 0.021161716431379318, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1362, + "grad_norm": 1.4191067218780518, + "kl": 0.2489805780351162, + "learning_rate": 9.60561826557425e-07, + "loss": 0.01, + "num_tokens": 4945805.0, + "reward": 0.8818359375, + "reward_std": 0.019485600292682648, + "rewards//mean": 0.8818359375, + "rewards//std": 0.023714875802397728, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1364, + "grad_norm": 1.215956211090088, + "kl": 0.28247102349996567, + "learning_rate": 9.604382057170512e-07, + "loss": 0.0113, + "num_tokens": 4953093.0, + "reward": 0.82696533203125, + "reward_std": 0.014125103130936623, + "rewards//mean": 0.82696533203125, + "rewards//std": 0.017630470916628838, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.1366, + "grad_norm": 1.5108118057250977, + "kl": 0.2582235299050808, + "learning_rate": 9.603143994122232e-07, + "loss": -0.0062, + "num_tokens": 4960333.0, + "reward": 0.76837158203125, + "reward_std": 0.011715065687894821, + "rewards//mean": 0.76837158203125, + "rewards//std": 0.01896730251610279, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1368, + "grad_norm": 1.302272081375122, + "kl": 0.24193101562559605, + "learning_rate": 9.601904076928102e-07, + "loss": 0.0097, + "num_tokens": 4967589.0, + "reward": 0.79345703125, + "reward_std": 0.01731676422059536, + "rewards//mean": 0.79345703125, + "rewards//std": 0.0200815349817276, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.137, + "grad_norm": 1.4716485738754272, + "kl": 0.27866013906896114, + "learning_rate": 9.60066230608756e-07, + "loss": 0.0111, + "num_tokens": 4974893.0, + "reward": 0.83941650390625, + "reward_std": 0.01795273646712303, + "rewards//mean": 0.83941650390625, + "rewards//std": 0.02573402039706707, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1372, + "grad_norm": 1.316231608390808, + "kl": 0.24845713004469872, + "learning_rate": 9.599418682100792e-07, + "loss": 0.0099, + "num_tokens": 4982085.0, + "reward": 0.847412109375, + "reward_std": 0.02064218744635582, + "rewards//mean": 0.847412109375, + "rewards//std": 0.022976988926529884, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.1374, + "grad_norm": 1.3790525197982788, + "kl": 0.2902708277106285, + "learning_rate": 9.598173205468727e-07, + "loss": 0.0042, + "num_tokens": 4989322.0, + "reward": 0.834716796875, + "reward_std": 0.02007957547903061, + "rewards//mean": 0.834716796875, + "rewards//std": 0.034413836896419525, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1376, + "grad_norm": 1.8550821542739868, + "kl": 0.2601463682949543, + "learning_rate": 9.596925876693047e-07, + "loss": 0.0104, + "num_tokens": 4996682.0, + "reward": 0.83367919921875, + "reward_std": 0.02182183600962162, + "rewards//mean": 0.83367919921875, + "rewards//std": 0.023610521107912064, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1378, + "grad_norm": 1.6097463369369507, + "kl": 0.21962395682930946, + "learning_rate": 9.595676696276171e-07, + "loss": 0.0088, + "num_tokens": 5003906.0, + "reward": 0.8272705078125, + "reward_std": 0.019322998821735382, + "rewards//mean": 0.8272705078125, + "rewards//std": 0.031523704528808594, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.138, + "grad_norm": 1.996631383895874, + "kl": 0.2594760712236166, + "learning_rate": 9.594425664721274e-07, + "loss": 0.0104, + "num_tokens": 5011122.0, + "reward": 0.885498046875, + "reward_std": 0.019470717757940292, + "rewards//mean": 0.885498046875, + "rewards//std": 0.0339818112552166, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.1382, + "grad_norm": 1.614793300628662, + "kl": 0.24031787551939487, + "learning_rate": 9.593172782532267e-07, + "loss": 0.0071, + "num_tokens": 5018359.0, + "reward": 0.8060302734375, + "reward_std": 0.014451326802372932, + "rewards//mean": 0.8060302734375, + "rewards//std": 0.021934473887085915, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1384, + "grad_norm": 1.5257173776626587, + "kl": 0.23582077585160732, + "learning_rate": 9.591918050213813e-07, + "loss": 0.0094, + "num_tokens": 5025575.0, + "reward": 0.82586669921875, + "reward_std": 0.015665728598833084, + "rewards//mean": 0.82586669921875, + "rewards//std": 0.02754061110317707, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1386, + "grad_norm": 1.536577582359314, + "kl": 0.22275889478623867, + "learning_rate": 9.590661468271318e-07, + "loss": 0.0089, + "num_tokens": 5032879.0, + "reward": 0.832275390625, + "reward_std": 0.015319372527301311, + "rewards//mean": 0.832275390625, + "rewards//std": 0.017932793125510216, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1388, + "grad_norm": 1.367129921913147, + "kl": 0.25990950874984264, + "learning_rate": 9.589403037210931e-07, + "loss": 0.0104, + "num_tokens": 5040119.0, + "reward": 0.830810546875, + "reward_std": 0.012422814965248108, + "rewards//mean": 0.830810546875, + "rewards//std": 0.019548293203115463, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.139, + "grad_norm": 1.3941248655319214, + "kl": 0.2499676551669836, + "learning_rate": 9.58814275753955e-07, + "loss": 0.01, + "num_tokens": 5047391.0, + "reward": 0.82763671875, + "reward_std": 0.015820611268281937, + "rewards//mean": 0.82763671875, + "rewards//std": 0.02944222465157509, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1392, + "grad_norm": 1.4116902351379395, + "kl": 0.25469120033085346, + "learning_rate": 9.586880629764817e-07, + "loss": 0.0102, + "num_tokens": 5054639.0, + "reward": 0.8294677734375, + "reward_std": 0.01642146147787571, + "rewards//mean": 0.8294677734375, + "rewards//std": 0.024431418627500534, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1394, + "grad_norm": 1.2930734157562256, + "kl": 0.2627185024321079, + "learning_rate": 9.585616654395112e-07, + "loss": 0.0105, + "num_tokens": 5061903.0, + "reward": 0.85675048828125, + "reward_std": 0.024405503645539284, + "rewards//mean": 0.85675048828125, + "rewards//std": 0.03582879528403282, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.1396, + "grad_norm": 1.0822221040725708, + "kl": 0.2375397738069296, + "learning_rate": 9.584350831939569e-07, + "loss": 0.0083, + "num_tokens": 5069156.0, + "reward": 0.86175537109375, + "reward_std": 0.014787659049034119, + "rewards//mean": 0.86175537109375, + "rewards//std": 0.03408672288060188, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.1398, + "grad_norm": 1.4881244897842407, + "kl": 0.2753385603427887, + "learning_rate": 9.58308316290806e-07, + "loss": 0.0104, + "num_tokens": 5076422.0, + "reward": 0.84088134765625, + "reward_std": 0.01342055108398199, + "rewards//mean": 0.84088134765625, + "rewards//std": 0.032816946506500244, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.14, + "grad_norm": 1.4836786985397339, + "kl": 0.29045838862657547, + "learning_rate": 9.581813647811197e-07, + "loss": 0.0116, + "num_tokens": 5083654.0, + "reward": 0.8677978515625, + "reward_std": 0.015603265725076199, + "rewards//mean": 0.8677978515625, + "rewards//std": 0.020719308406114578, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1402, + "grad_norm": 1.246392846107483, + "kl": 0.25422425754368305, + "learning_rate": 9.580542287160346e-07, + "loss": 0.0102, + "num_tokens": 5090878.0, + "reward": 0.8719482421875, + "reward_std": 0.01202121376991272, + "rewards//mean": 0.8719482421875, + "rewards//std": 0.017069127410650253, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1404, + "grad_norm": 1.5046205520629883, + "kl": 0.2702752538025379, + "learning_rate": 9.579269081467613e-07, + "loss": 0.0108, + "num_tokens": 5098078.0, + "reward": 0.84271240234375, + "reward_std": 0.015805058181285858, + "rewards//mean": 0.84271240234375, + "rewards//std": 0.025875981897115707, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1406, + "grad_norm": 1.9072450399398804, + "kl": 0.2425725981593132, + "learning_rate": 9.57799403124584e-07, + "loss": 0.0097, + "num_tokens": 5105342.0, + "reward": 0.88330078125, + "reward_std": 0.022027714177966118, + "rewards//mean": 0.88330078125, + "rewards//std": 0.03171544149518013, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1408, + "grad_norm": 1.2111215591430664, + "kl": 0.3131725452840328, + "learning_rate": 9.576717137008617e-07, + "loss": 0.0125, + "num_tokens": 5112598.0, + "reward": 0.798095703125, + "reward_std": 0.015493860468268394, + "rewards//mean": 0.798095703125, + "rewards//std": 0.02676519937813282, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.141, + "grad_norm": 1.4246021509170532, + "kl": 0.24360130354762077, + "learning_rate": 9.575438399270278e-07, + "loss": 0.0104, + "num_tokens": 5119826.0, + "reward": 0.8612060546875, + "reward_std": 0.017781488597393036, + "rewards//mean": 0.8612060546875, + "rewards//std": 0.026657240465283394, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1412, + "grad_norm": 1.2498196363449097, + "kl": 0.2710053324699402, + "learning_rate": 9.5741578185459e-07, + "loss": 0.0108, + "num_tokens": 5127226.0, + "reward": 0.8543701171875, + "reward_std": 0.016477342694997787, + "rewards//mean": 0.8543701171875, + "rewards//std": 0.025868741795420647, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1414, + "grad_norm": 1.6700689792633057, + "kl": 0.25746254064142704, + "learning_rate": 9.572875395351301e-07, + "loss": 0.0103, + "num_tokens": 5134602.0, + "reward": 0.82275390625, + "reward_std": 0.01232250314205885, + "rewards//mean": 0.82275390625, + "rewards//std": 0.02235318347811699, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.1416, + "grad_norm": 1.4746822118759155, + "kl": 0.25787305645644665, + "learning_rate": 9.571591130203037e-07, + "loss": 0.0118, + "num_tokens": 5141857.0, + "reward": 0.86053466796875, + "reward_std": 0.015957240015268326, + "rewards//mean": 0.86053466796875, + "rewards//std": 0.028319979086518288, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.1418, + "grad_norm": 1.4526888132095337, + "kl": 0.30892943032085896, + "learning_rate": 9.570305023618415e-07, + "loss": 0.0125, + "num_tokens": 5149160.0, + "reward": 0.8236083984375, + "reward_std": 0.023305999115109444, + "rewards//mean": 0.8236083984375, + "rewards//std": 0.03374107554554939, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.142, + "grad_norm": 1.519028663635254, + "kl": 0.2618818134069443, + "learning_rate": 9.569017076115475e-07, + "loss": 0.0105, + "num_tokens": 5156336.0, + "reward": 0.853271484375, + "reward_std": 0.0109854806214571, + "rewards//mean": 0.853271484375, + "rewards//std": 0.020158275961875916, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.1422, + "grad_norm": 1.538443922996521, + "kl": 0.26174850575625896, + "learning_rate": 9.567727288213004e-07, + "loss": 0.0146, + "num_tokens": 5163644.0, + "reward": 0.84326171875, + "reward_std": 0.02229059673845768, + "rewards//mean": 0.84326171875, + "rewards//std": 0.027334891259670258, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1424, + "grad_norm": 1.3791813850402832, + "kl": 0.26337551698088646, + "learning_rate": 9.566435660430527e-07, + "loss": 0.0105, + "num_tokens": 5170884.0, + "reward": 0.83209228515625, + "reward_std": 0.012741691432893276, + "rewards//mean": 0.83209228515625, + "rewards//std": 0.024064572528004646, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1426, + "grad_norm": 1.424423336982727, + "kl": 0.269389770925045, + "learning_rate": 9.565142193288312e-07, + "loss": 0.0108, + "num_tokens": 5178188.0, + "reward": 0.87591552734375, + "reward_std": 0.017767131328582764, + "rewards//mean": 0.87591552734375, + "rewards//std": 0.027636632323265076, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.1428, + "grad_norm": 1.4919954538345337, + "kl": 0.25947343185544014, + "learning_rate": 9.563846887307368e-07, + "loss": 0.0063, + "num_tokens": 5185594.0, + "reward": 0.825439453125, + "reward_std": 0.026511969044804573, + "rewards//mean": 0.825439453125, + "rewards//std": 0.029255511239171028, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.143, + "grad_norm": 1.5382931232452393, + "kl": 0.2458555605262518, + "learning_rate": 9.562549743009442e-07, + "loss": 0.0098, + "num_tokens": 5192938.0, + "reward": 0.8428955078125, + "reward_std": 0.014166582375764847, + "rewards//mean": 0.8428955078125, + "rewards//std": 0.020733915269374847, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1432, + "grad_norm": 1.3980218172073364, + "kl": 0.26370267011225224, + "learning_rate": 9.561250760917025e-07, + "loss": 0.0105, + "num_tokens": 5200106.0, + "reward": 0.8592529296875, + "reward_std": 0.01639118790626526, + "rewards//mean": 0.8592529296875, + "rewards//std": 0.023785943165421486, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1434, + "grad_norm": 1.8345651626586914, + "kl": 0.23951243795454502, + "learning_rate": 9.55994994155335e-07, + "loss": 0.0096, + "num_tokens": 5207402.0, + "reward": 0.81036376953125, + "reward_std": 0.015995126217603683, + "rewards//mean": 0.81036376953125, + "rewards//std": 0.02355017699301243, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.1436, + "grad_norm": 1.4839280843734741, + "kl": 0.27861374989151955, + "learning_rate": 9.558647285442381e-07, + "loss": -0.0146, + "num_tokens": 5214663.0, + "reward": 0.8497314453125, + "reward_std": 0.024937253445386887, + "rewards//mean": 0.8497314453125, + "rewards//std": 0.030694536864757538, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.1438, + "grad_norm": 1.271552324295044, + "kl": 0.24503201246261597, + "learning_rate": 9.55734279310883e-07, + "loss": 0.0105, + "num_tokens": 5221905.0, + "reward": 0.837158203125, + "reward_std": 0.01341659389436245, + "rewards//mean": 0.837158203125, + "rewards//std": 0.028055289760231972, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.144, + "grad_norm": 1.3849937915802002, + "kl": 0.25117808394134045, + "learning_rate": 9.55603646507815e-07, + "loss": 0.01, + "num_tokens": 5229201.0, + "reward": 0.8719482421875, + "reward_std": 0.017070965841412544, + "rewards//mean": 0.8719482421875, + "rewards//std": 0.02059325948357582, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.1442, + "grad_norm": 1.4470452070236206, + "kl": 0.2840206492692232, + "learning_rate": 9.554728301876524e-07, + "loss": 0.0008, + "num_tokens": 5236645.0, + "reward": 0.8280029296875, + "reward_std": 0.01594052091240883, + "rewards//mean": 0.8280029296875, + "rewards//std": 0.027515659108757973, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.1444, + "grad_norm": 1.5926982164382935, + "kl": 0.2734318971633911, + "learning_rate": 9.553418304030885e-07, + "loss": 0.0106, + "num_tokens": 5244036.0, + "reward": 0.83538818359375, + "reward_std": 0.02225661091506481, + "rewards//mean": 0.83538818359375, + "rewards//std": 0.032681047916412354, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1446, + "grad_norm": 1.339241862297058, + "kl": 0.2535356767475605, + "learning_rate": 9.552106472068897e-07, + "loss": 0.0101, + "num_tokens": 5251276.0, + "reward": 0.83837890625, + "reward_std": 0.014229332096874714, + "rewards//mean": 0.83837890625, + "rewards//std": 0.02030939981341362, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1448, + "grad_norm": 1.8149484395980835, + "kl": 0.2477581389248371, + "learning_rate": 9.550792806518967e-07, + "loss": 0.0099, + "num_tokens": 5258484.0, + "reward": 0.8597412109375, + "reward_std": 0.023520473390817642, + "rewards//mean": 0.8597412109375, + "rewards//std": 0.02705182507634163, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.145, + "grad_norm": 1.3980729579925537, + "kl": 0.27849196642637253, + "learning_rate": 9.549477307910236e-07, + "loss": 0.0111, + "num_tokens": 5265780.0, + "reward": 0.8780517578125, + "reward_std": 0.02092761918902397, + "rewards//mean": 0.8780517578125, + "rewards//std": 0.024582134559750557, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.1452, + "grad_norm": 1.4455136060714722, + "kl": 0.2345968820154667, + "learning_rate": 9.548159976772592e-07, + "loss": 0.0106, + "num_tokens": 5273104.0, + "reward": 0.8238525390625, + "reward_std": 0.010945318266749382, + "rewards//mean": 0.8238525390625, + "rewards//std": 0.0191583801060915, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.1454, + "grad_norm": 1.5794117450714111, + "kl": 0.2771427910774946, + "learning_rate": 9.546840813636652e-07, + "loss": 0.0122, + "num_tokens": 5280355.0, + "reward": 0.84478759765625, + "reward_std": 0.0204919520765543, + "rewards//mean": 0.84478759765625, + "rewards//std": 0.030593950301408768, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1456, + "grad_norm": 1.6019476652145386, + "kl": 0.25622894056141376, + "learning_rate": 9.545519819033777e-07, + "loss": 0.0102, + "num_tokens": 5287675.0, + "reward": 0.87200927734375, + "reward_std": 0.019220631569623947, + "rewards//mean": 0.87200927734375, + "rewards//std": 0.02844850718975067, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.1458, + "grad_norm": 1.576643466949463, + "kl": 0.24839599058032036, + "learning_rate": 9.544196993496062e-07, + "loss": 0.0096, + "num_tokens": 5294945.0, + "reward": 0.8255615234375, + "reward_std": 0.018222805112600327, + "rewards//mean": 0.8255615234375, + "rewards//std": 0.02829892560839653, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.146, + "grad_norm": 1.7000409364700317, + "kl": 0.27550058625638485, + "learning_rate": 9.54287233755634e-07, + "loss": 0.011, + "num_tokens": 5302161.0, + "reward": 0.80548095703125, + "reward_std": 0.01413845457136631, + "rewards//mean": 0.80548095703125, + "rewards//std": 0.021215565502643585, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1462, + "grad_norm": 1.6156034469604492, + "kl": 0.303690692409873, + "learning_rate": 9.541545851748185e-07, + "loss": 0.0121, + "num_tokens": 5309561.0, + "reward": 0.8408203125, + "reward_std": 0.017603084444999695, + "rewards//mean": 0.8408203125, + "rewards//std": 0.030488936230540276, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1464, + "grad_norm": 1.5467205047607422, + "kl": 0.225168963894248, + "learning_rate": 9.540217536605905e-07, + "loss": 0.009, + "num_tokens": 5316905.0, + "reward": 0.79705810546875, + "reward_std": 0.014699389226734638, + "rewards//mean": 0.79705810546875, + "rewards//std": 0.021366296336054802, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1466, + "grad_norm": 1.4790050983428955, + "kl": 0.2845251467078924, + "learning_rate": 9.538887392664543e-07, + "loss": 0.0114, + "num_tokens": 5324185.0, + "reward": 0.863037109375, + "reward_std": 0.019770342856645584, + "rewards//mean": 0.863037109375, + "rewards//std": 0.029437081888318062, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1468, + "grad_norm": 1.6966058015823364, + "kl": 0.2415014784783125, + "learning_rate": 9.537555420459881e-07, + "loss": 0.0097, + "num_tokens": 5331569.0, + "reward": 0.8529052734375, + "reward_std": 0.015118865296244621, + "rewards//mean": 0.8529052734375, + "rewards//std": 0.026495477184653282, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.147, + "grad_norm": 1.39463210105896, + "kl": 0.22297674603760242, + "learning_rate": 9.53622162052844e-07, + "loss": 0.0089, + "num_tokens": 5338937.0, + "reward": 0.8663330078125, + "reward_std": 0.01199440099298954, + "rewards//mean": 0.8663330078125, + "rewards//std": 0.021715298295021057, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1472, + "grad_norm": 1.4083036184310913, + "kl": 0.25721851363778114, + "learning_rate": 9.534885993407474e-07, + "loss": 0.0103, + "num_tokens": 5346353.0, + "reward": 0.84906005859375, + "reward_std": 0.010814737528562546, + "rewards//mean": 0.84906005859375, + "rewards//std": 0.01750207133591175, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1474, + "grad_norm": 1.253272533416748, + "kl": 0.284226605668664, + "learning_rate": 9.53354853963497e-07, + "loss": 0.0114, + "num_tokens": 5353601.0, + "reward": 0.84716796875, + "reward_std": 0.014807086437940598, + "rewards//mean": 0.84716796875, + "rewards//std": 0.02382187359035015, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.1476, + "grad_norm": 1.5726064443588257, + "kl": 0.22898929752409458, + "learning_rate": 9.532209259749658e-07, + "loss": 0.0036, + "num_tokens": 5360920.0, + "reward": 0.8389892578125, + "reward_std": 0.01764056459069252, + "rewards//mean": 0.8389892578125, + "rewards//std": 0.032215289771556854, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.1478, + "grad_norm": 1.6352814435958862, + "kl": 0.31985520012676716, + "learning_rate": 9.530868154290996e-07, + "loss": -0.0089, + "num_tokens": 5368109.0, + "reward": 0.79730224609375, + "reward_std": 0.014678007923066616, + "rewards//mean": 0.79730224609375, + "rewards//std": 0.022273045033216476, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.148, + "grad_norm": 1.5722860097885132, + "kl": 0.24214277043938637, + "learning_rate": 9.529525223799184e-07, + "loss": 0.0119, + "num_tokens": 5375409.0, + "reward": 0.83380126953125, + "reward_std": 0.019366323947906494, + "rewards//mean": 0.83380126953125, + "rewards//std": 0.02677474357187748, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1482, + "grad_norm": 1.2450482845306396, + "kl": 0.270299281924963, + "learning_rate": 9.528180468815154e-07, + "loss": 0.0108, + "num_tokens": 5382753.0, + "reward": 0.79779052734375, + "reward_std": 0.013657946139574051, + "rewards//mean": 0.79779052734375, + "rewards//std": 0.021004023030400276, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1484, + "grad_norm": 1.2421083450317383, + "kl": 0.2535203378647566, + "learning_rate": 9.526833889880572e-07, + "loss": 0.0101, + "num_tokens": 5390073.0, + "reward": 0.8787841796875, + "reward_std": 0.01166562456637621, + "rewards//mean": 0.8787841796875, + "rewards//std": 0.024968286976218224, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1486, + "grad_norm": 1.316090703010559, + "kl": 0.28325750678777695, + "learning_rate": 9.525485487537841e-07, + "loss": 0.0113, + "num_tokens": 5397553.0, + "reward": 0.84161376953125, + "reward_std": 0.020754147320985794, + "rewards//mean": 0.84161376953125, + "rewards//std": 0.025209257379174232, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1488, + "grad_norm": 1.5138450860977173, + "kl": 0.25205797888338566, + "learning_rate": 9.524135262330098e-07, + "loss": 0.0101, + "num_tokens": 5404841.0, + "reward": 0.8465576171875, + "reward_std": 0.017185116186738014, + "rewards//mean": 0.8465576171875, + "rewards//std": 0.021524852141737938, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.149, + "grad_norm": 1.4740216732025146, + "kl": 0.2527584731578827, + "learning_rate": 9.522783214801211e-07, + "loss": 0.0101, + "num_tokens": 5412089.0, + "reward": 0.77484130859375, + "reward_std": 0.017248956486582756, + "rewards//mean": 0.77484130859375, + "rewards//std": 0.025632644072175026, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1492, + "grad_norm": 1.4028663635253906, + "kl": 0.27137179113924503, + "learning_rate": 9.521429345495786e-07, + "loss": 0.0109, + "num_tokens": 5419329.0, + "reward": 0.82623291015625, + "reward_std": 0.012156719341874123, + "rewards//mean": 0.82623291015625, + "rewards//std": 0.01774429902434349, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1494, + "grad_norm": 1.385488748550415, + "kl": 0.2584698870778084, + "learning_rate": 9.520073654959162e-07, + "loss": 0.0103, + "num_tokens": 5426569.0, + "reward": 0.85302734375, + "reward_std": 0.019889961928129196, + "rewards//mean": 0.85302734375, + "rewards//std": 0.028138259425759315, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.1496, + "grad_norm": 1.390080213546753, + "kl": 0.20847413316369057, + "learning_rate": 9.518716143737409e-07, + "loss": 0.0052, + "num_tokens": 5433829.0, + "reward": 0.83349609375, + "reward_std": 0.013686553575098515, + "rewards//mean": 0.83349609375, + "rewards//std": 0.029842594638466835, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1498, + "grad_norm": 1.6834651231765747, + "kl": 0.2663915064185858, + "learning_rate": 9.517356812377335e-07, + "loss": 0.0107, + "num_tokens": 5441205.0, + "reward": 0.862060546875, + "reward_std": 0.014729274436831474, + "rewards//mean": 0.862060546875, + "rewards//std": 0.02439824678003788, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.15, + "grad_norm": 1.534364104270935, + "kl": 0.2653078157454729, + "learning_rate": 9.515995661426477e-07, + "loss": 0.0106, + "num_tokens": 5448373.0, + "reward": 0.8411865234375, + "reward_std": 0.018849633634090424, + "rewards//mean": 0.8411865234375, + "rewards//std": 0.022371798753738403, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1502, + "grad_norm": 1.3128658533096313, + "kl": 0.24151846021413803, + "learning_rate": 9.514632691433106e-07, + "loss": 0.0097, + "num_tokens": 5455533.0, + "reward": 0.84521484375, + "reward_std": 0.017035547643899918, + "rewards//mean": 0.84521484375, + "rewards//std": 0.024729792028665543, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.1504, + "grad_norm": 1.4879817962646484, + "kl": 0.26529062166810036, + "learning_rate": 9.513267902946227e-07, + "loss": 0.0087, + "num_tokens": 5462840.0, + "reward": 0.797607421875, + "reward_std": 0.011784134432673454, + "rewards//mean": 0.797607421875, + "rewards//std": 0.016875134781003, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1506, + "grad_norm": 1.3137022256851196, + "kl": 0.27086566016077995, + "learning_rate": 9.511901296515576e-07, + "loss": 0.0108, + "num_tokens": 5470256.0, + "reward": 0.83795166015625, + "reward_std": 0.014822470024228096, + "rewards//mean": 0.83795166015625, + "rewards//std": 0.01904773712158203, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1508, + "grad_norm": 1.5186947584152222, + "kl": 0.26560186594724655, + "learning_rate": 9.510532872691623e-07, + "loss": 0.0106, + "num_tokens": 5477584.0, + "reward": 0.7607421875, + "reward_std": 0.014304333366453648, + "rewards//mean": 0.7607421875, + "rewards//std": 0.01952504925429821, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.151, + "grad_norm": 1.3725115060806274, + "kl": 0.25251931324601173, + "learning_rate": 9.509162632025569e-07, + "loss": 0.0101, + "num_tokens": 5484912.0, + "reward": 0.7637939453125, + "reward_std": 0.011827247217297554, + "rewards//mean": 0.7637939453125, + "rewards//std": 0.026684483513236046, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1512, + "grad_norm": 1.7933850288391113, + "kl": 0.26473294012248516, + "learning_rate": 9.507790575069345e-07, + "loss": 0.0106, + "num_tokens": 5492160.0, + "reward": 0.84393310546875, + "reward_std": 0.01770966313779354, + "rewards//mean": 0.84393310546875, + "rewards//std": 0.02114981971681118, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.546875, + "epoch": 0.1514, + "grad_norm": 1.7334754467010498, + "kl": 0.290832931175828, + "learning_rate": 9.506416702375617e-07, + "loss": 0.0158, + "num_tokens": 5499395.0, + "reward": 0.84332275390625, + "reward_std": 0.02369631640613079, + "rewards//mean": 0.84332275390625, + "rewards//std": 0.03212229162454605, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1516, + "grad_norm": 1.5930769443511963, + "kl": 0.3444139827042818, + "learning_rate": 9.505041014497779e-07, + "loss": 0.0138, + "num_tokens": 5506643.0, + "reward": 0.83270263671875, + "reward_std": 0.017027851194143295, + "rewards//mean": 0.83270263671875, + "rewards//std": 0.0231626033782959, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.1518, + "grad_norm": 1.3657084703445435, + "kl": 0.24776685424149036, + "learning_rate": 9.503663511989962e-07, + "loss": 0.0108, + "num_tokens": 5513977.0, + "reward": 0.853271484375, + "reward_std": 0.019617851823568344, + "rewards//mean": 0.853271484375, + "rewards//std": 0.023917002603411674, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.152, + "grad_norm": 1.3524253368377686, + "kl": 0.25760591216385365, + "learning_rate": 9.502284195407018e-07, + "loss": -0.0073, + "num_tokens": 5521257.0, + "reward": 0.833740234375, + "reward_std": 0.01802907884120941, + "rewards//mean": 0.833740234375, + "rewards//std": 0.029975200071930885, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.1522, + "grad_norm": 1.5107940435409546, + "kl": 0.26265592500567436, + "learning_rate": 9.500903065304539e-07, + "loss": 0.0112, + "num_tokens": 5528525.0, + "reward": 0.84417724609375, + "reward_std": 0.012757277116179466, + "rewards//mean": 0.84417724609375, + "rewards//std": 0.022403156384825706, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1524, + "grad_norm": 1.2899935245513916, + "kl": 0.25292489491403103, + "learning_rate": 9.499520122238845e-07, + "loss": 0.0101, + "num_tokens": 5535781.0, + "reward": 0.8477783203125, + "reward_std": 0.014628075994551182, + "rewards//mean": 0.8477783203125, + "rewards//std": 0.018014907836914062, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.1526, + "grad_norm": 1.5842009782791138, + "kl": 0.270897027105093, + "learning_rate": 9.498135366766982e-07, + "loss": 0.0095, + "num_tokens": 5543058.0, + "reward": 0.7877197265625, + "reward_std": 0.013303968124091625, + "rewards//mean": 0.7877197265625, + "rewards//std": 0.019969278946518898, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1528, + "grad_norm": 1.5608716011047363, + "kl": 0.24659856781363487, + "learning_rate": 9.496748799446732e-07, + "loss": 0.0099, + "num_tokens": 5550202.0, + "reward": 0.7679443359375, + "reward_std": 0.012128211557865143, + "rewards//mean": 0.7679443359375, + "rewards//std": 0.01690157689154148, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.153, + "grad_norm": 1.1622047424316406, + "kl": 0.2462945245206356, + "learning_rate": 9.495360420836602e-07, + "loss": 0.0067, + "num_tokens": 5557540.0, + "reward": 0.845703125, + "reward_std": 0.020662657916545868, + "rewards//mean": 0.845703125, + "rewards//std": 0.03390420973300934, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1532, + "grad_norm": 1.287394642829895, + "kl": 0.2930239364504814, + "learning_rate": 9.493970231495834e-07, + "loss": 0.0117, + "num_tokens": 5564964.0, + "reward": 0.84918212890625, + "reward_std": 0.013009208254516125, + "rewards//mean": 0.84918212890625, + "rewards//std": 0.018250642344355583, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1534, + "grad_norm": 1.4672991037368774, + "kl": 0.2660859916359186, + "learning_rate": 9.492578231984393e-07, + "loss": 0.0106, + "num_tokens": 5572244.0, + "reward": 0.8424072265625, + "reward_std": 0.01543719694018364, + "rewards//mean": 0.8424072265625, + "rewards//std": 0.022285019978880882, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1536, + "grad_norm": 1.4379223585128784, + "kl": 0.2430520337074995, + "learning_rate": 9.491184422862979e-07, + "loss": 0.0097, + "num_tokens": 5579532.0, + "reward": 0.8759765625, + "reward_std": 0.021908016875386238, + "rewards//mean": 0.8759765625, + "rewards//std": 0.02879762277007103, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.1538, + "grad_norm": 1.4573997259140015, + "kl": 0.2561649903655052, + "learning_rate": 9.489788804693015e-07, + "loss": 0.01, + "num_tokens": 5586886.0, + "reward": 0.839111328125, + "reward_std": 0.01324736699461937, + "rewards//mean": 0.839111328125, + "rewards//std": 0.021753255277872086, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.154, + "grad_norm": 1.3978241682052612, + "kl": 0.29851527325809, + "learning_rate": 9.488391378036659e-07, + "loss": 0.0119, + "num_tokens": 5594142.0, + "reward": 0.87994384765625, + "reward_std": 0.01433099526911974, + "rewards//mean": 0.87994384765625, + "rewards//std": 0.031659964472055435, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1542, + "grad_norm": 1.5487738847732544, + "kl": 0.22359062731266022, + "learning_rate": 9.486992143456791e-07, + "loss": 0.0089, + "num_tokens": 5601462.0, + "reward": 0.8194580078125, + "reward_std": 0.013948636129498482, + "rewards//mean": 0.8194580078125, + "rewards//std": 0.02445371448993683, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.1544, + "grad_norm": 1.347893238067627, + "kl": 0.23161467537283897, + "learning_rate": 9.485591101517026e-07, + "loss": 0.0079, + "num_tokens": 5608703.0, + "reward": 0.83782958984375, + "reward_std": 0.01695202849805355, + "rewards//mean": 0.83782958984375, + "rewards//std": 0.020681556314229965, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1546, + "grad_norm": 1.3476115465164185, + "kl": 0.271294292062521, + "learning_rate": 9.4841882527817e-07, + "loss": 0.0109, + "num_tokens": 5615879.0, + "reward": 0.80279541015625, + "reward_std": 0.01526473555713892, + "rewards//mean": 0.80279541015625, + "rewards//std": 0.020926043391227722, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1548, + "grad_norm": 1.2350199222564697, + "kl": 0.23909622244536877, + "learning_rate": 9.482783597815882e-07, + "loss": 0.0096, + "num_tokens": 5623319.0, + "reward": 0.85284423828125, + "reward_std": 0.016258811578154564, + "rewards//mean": 0.85284423828125, + "rewards//std": 0.02033984288573265, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.155, + "grad_norm": 1.6432431936264038, + "kl": 0.2526317238807678, + "learning_rate": 9.481377137185369e-07, + "loss": 0.0066, + "num_tokens": 5630587.0, + "reward": 0.8072509765625, + "reward_std": 0.010593057610094547, + "rewards//mean": 0.8072509765625, + "rewards//std": 0.01604137010872364, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1552, + "grad_norm": 1.4894939661026, + "kl": 0.2545596491545439, + "learning_rate": 9.479968871456679e-07, + "loss": 0.0102, + "num_tokens": 5637883.0, + "reward": 0.84124755859375, + "reward_std": 0.01732044294476509, + "rewards//mean": 0.84124755859375, + "rewards//std": 0.02505929209291935, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1554, + "grad_norm": 1.4693113565444946, + "kl": 0.23461300134658813, + "learning_rate": 9.478558801197064e-07, + "loss": 0.0094, + "num_tokens": 5645155.0, + "reward": 0.862548828125, + "reward_std": 0.019924219697713852, + "rewards//mean": 0.862548828125, + "rewards//std": 0.02651974745094776, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1556, + "grad_norm": 1.1065276861190796, + "kl": 0.2249538004398346, + "learning_rate": 9.4771469269745e-07, + "loss": 0.009, + "num_tokens": 5652427.0, + "reward": 0.8587646484375, + "reward_std": 0.010940843261778355, + "rewards//mean": 0.8587646484375, + "rewards//std": 0.016836969181895256, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1558, + "grad_norm": 1.6173698902130127, + "kl": 0.25361875258386135, + "learning_rate": 9.475733249357688e-07, + "loss": 0.0101, + "num_tokens": 5659723.0, + "reward": 0.81817626953125, + "reward_std": 0.011784767732024193, + "rewards//mean": 0.81817626953125, + "rewards//std": 0.015342336148023605, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.156, + "grad_norm": 1.6239455938339233, + "kl": 0.24895436689257622, + "learning_rate": 9.474317768916059e-07, + "loss": 0.0109, + "num_tokens": 5667001.0, + "reward": 0.777099609375, + "reward_std": 0.010955520905554295, + "rewards//mean": 0.777099609375, + "rewards//std": 0.015576484613120556, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1562, + "grad_norm": 3.33612322807312, + "kl": 0.22961002588272095, + "learning_rate": 9.472900486219768e-07, + "loss": 0.0092, + "num_tokens": 5674289.0, + "reward": 0.82183837890625, + "reward_std": 0.013438940979540348, + "rewards//mean": 0.82183837890625, + "rewards//std": 0.021163415163755417, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.1564, + "grad_norm": 1.3858630657196045, + "kl": 0.2467146534472704, + "learning_rate": 9.471481401839696e-07, + "loss": -0.006, + "num_tokens": 5681465.0, + "reward": 0.848876953125, + "reward_std": 0.018275512382388115, + "rewards//mean": 0.848876953125, + "rewards//std": 0.026133349165320396, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1566, + "grad_norm": 1.3140013217926025, + "kl": 0.27021588385105133, + "learning_rate": 9.470060516347449e-07, + "loss": 0.0108, + "num_tokens": 5688777.0, + "reward": 0.8284912109375, + "reward_std": 0.01595388352870941, + "rewards//mean": 0.8284912109375, + "rewards//std": 0.025692589581012726, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1568, + "grad_norm": 1.7464267015457153, + "kl": 0.249140452593565, + "learning_rate": 9.468637830315362e-07, + "loss": 0.01, + "num_tokens": 5696209.0, + "reward": 0.83343505859375, + "reward_std": 0.020671576261520386, + "rewards//mean": 0.83343505859375, + "rewards//std": 0.027650870382785797, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.157, + "grad_norm": 1.6009631156921387, + "kl": 0.23808876052498817, + "learning_rate": 9.467213344316491e-07, + "loss": 0.0095, + "num_tokens": 5703473.0, + "reward": 0.816162109375, + "reward_std": 0.01652839407324791, + "rewards//mean": 0.816162109375, + "rewards//std": 0.027856020256876945, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.1572, + "grad_norm": 1.650692105293274, + "kl": 0.25914822705090046, + "learning_rate": 9.465787058924619e-07, + "loss": 0.0115, + "num_tokens": 5710745.0, + "reward": 0.84466552734375, + "reward_std": 0.012202952988445759, + "rewards//mean": 0.84466552734375, + "rewards//std": 0.018929753452539444, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.1574, + "grad_norm": 1.4084111452102661, + "kl": 0.2458554431796074, + "learning_rate": 9.464358974714252e-07, + "loss": 0.0077, + "num_tokens": 5717900.0, + "reward": 0.84429931640625, + "reward_std": 0.016395380720496178, + "rewards//mean": 0.84429931640625, + "rewards//std": 0.026207314804196358, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1576, + "grad_norm": 1.6502057313919067, + "kl": 0.24977733753621578, + "learning_rate": 9.462929092260628e-07, + "loss": 0.01, + "num_tokens": 5725244.0, + "reward": 0.773193359375, + "reward_std": 0.014216196723282337, + "rewards//mean": 0.773193359375, + "rewards//std": 0.020773665979504585, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1578, + "grad_norm": 1.4039467573165894, + "kl": 0.20891019515693188, + "learning_rate": 9.461497412139696e-07, + "loss": 0.0084, + "num_tokens": 5732540.0, + "reward": 0.811279296875, + "reward_std": 0.016357604414224625, + "rewards//mean": 0.811279296875, + "rewards//std": 0.027249474078416824, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.158, + "grad_norm": 1.4908607006072998, + "kl": 0.24472160264849663, + "learning_rate": 9.460063934928141e-07, + "loss": 0.0098, + "num_tokens": 5739908.0, + "reward": 0.82696533203125, + "reward_std": 0.019522128626704216, + "rewards//mean": 0.82696533203125, + "rewards//std": 0.029386064037680626, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1582, + "grad_norm": 1.9192866086959839, + "kl": 0.2783154472708702, + "learning_rate": 9.458628661203366e-07, + "loss": 0.0111, + "num_tokens": 5747260.0, + "reward": 0.8206787109375, + "reward_std": 0.02153770625591278, + "rewards//mean": 0.8206787109375, + "rewards//std": 0.02873631939291954, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1584, + "grad_norm": 1.720473051071167, + "kl": 0.25573834776878357, + "learning_rate": 9.4571915915435e-07, + "loss": 0.0102, + "num_tokens": 5754500.0, + "reward": 0.78662109375, + "reward_std": 0.01824701949954033, + "rewards//mean": 0.78662109375, + "rewards//std": 0.022450489923357964, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1586, + "grad_norm": 1.629339575767517, + "kl": 0.22184279561042786, + "learning_rate": 9.455752726527392e-07, + "loss": 0.0089, + "num_tokens": 5761844.0, + "reward": 0.8572998046875, + "reward_std": 0.023047277703881264, + "rewards//mean": 0.8572998046875, + "rewards//std": 0.03714260831475258, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1588, + "grad_norm": 1.380544900894165, + "kl": 0.2809348274022341, + "learning_rate": 9.454312066734622e-07, + "loss": 0.0112, + "num_tokens": 5769052.0, + "reward": 0.85498046875, + "reward_std": 0.021646970883011818, + "rewards//mean": 0.85498046875, + "rewards//std": 0.024324923753738403, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.159, + "grad_norm": 1.536270022392273, + "kl": 0.2287327293306589, + "learning_rate": 9.452869612745483e-07, + "loss": 0.0091, + "num_tokens": 5776292.0, + "reward": 0.84027099609375, + "reward_std": 0.010791771113872528, + "rewards//mean": 0.84027099609375, + "rewards//std": 0.015888119116425514, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1592, + "grad_norm": 1.4697705507278442, + "kl": 0.26433999091386795, + "learning_rate": 9.451425365140994e-07, + "loss": 0.0106, + "num_tokens": 5783668.0, + "reward": 0.7711181640625, + "reward_std": 0.014557410962879658, + "rewards//mean": 0.7711181640625, + "rewards//std": 0.03029543161392212, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1594, + "grad_norm": 1.3547488451004028, + "kl": 0.23753996193408966, + "learning_rate": 9.449979324502903e-07, + "loss": 0.0095, + "num_tokens": 5790884.0, + "reward": 0.81219482421875, + "reward_std": 0.012803702615201473, + "rewards//mean": 0.81219482421875, + "rewards//std": 0.02519364096224308, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1596, + "grad_norm": 1.7047500610351562, + "kl": 0.3211044408380985, + "learning_rate": 9.448531491413672e-07, + "loss": 0.0128, + "num_tokens": 5798068.0, + "reward": 0.8653564453125, + "reward_std": 0.01457663718611002, + "rewards//mean": 0.8653564453125, + "rewards//std": 0.02006305754184723, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1598, + "grad_norm": 1.5758135318756104, + "kl": 0.27484776824712753, + "learning_rate": 9.447081866456487e-07, + "loss": 0.011, + "num_tokens": 5805348.0, + "reward": 0.84271240234375, + "reward_std": 0.02228953316807747, + "rewards//mean": 0.84271240234375, + "rewards//std": 0.030971620231866837, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.16, + "grad_norm": 1.4819086790084839, + "kl": 0.3045799024403095, + "learning_rate": 9.445630450215259e-07, + "loss": 0.0096, + "num_tokens": 5812571.0, + "reward": 0.838623046875, + "reward_std": 0.019571879878640175, + "rewards//mean": 0.838623046875, + "rewards//std": 0.02627200074493885, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1602, + "grad_norm": 1.578709602355957, + "kl": 0.24118798226118088, + "learning_rate": 9.444177243274617e-07, + "loss": 0.0096, + "num_tokens": 5819883.0, + "reward": 0.844970703125, + "reward_std": 0.013073444366455078, + "rewards//mean": 0.844970703125, + "rewards//std": 0.02001357637345791, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.1604, + "grad_norm": 1.601787805557251, + "kl": 0.2532284837216139, + "learning_rate": 9.442722246219913e-07, + "loss": 0.0103, + "num_tokens": 5827196.0, + "reward": 0.8560791015625, + "reward_std": 0.02183767780661583, + "rewards//mean": 0.8560791015625, + "rewards//std": 0.028418494388461113, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1606, + "grad_norm": 1.2509363889694214, + "kl": 0.2553468644618988, + "learning_rate": 9.441265459637219e-07, + "loss": 0.0102, + "num_tokens": 5834476.0, + "reward": 0.85809326171875, + "reward_std": 0.009726915508508682, + "rewards//mean": 0.85809326171875, + "rewards//std": 0.016762517392635345, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1608, + "grad_norm": 1.520261287689209, + "kl": 0.2615738958120346, + "learning_rate": 9.43980688411333e-07, + "loss": 0.0105, + "num_tokens": 5841716.0, + "reward": 0.8759765625, + "reward_std": 0.0181514173746109, + "rewards//mean": 0.8759765625, + "rewards//std": 0.022173680365085602, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.161, + "grad_norm": 2.5066535472869873, + "kl": 0.28283718414604664, + "learning_rate": 9.438346520235758e-07, + "loss": 0.0113, + "num_tokens": 5849108.0, + "reward": 0.818115234375, + "reward_std": 0.013364220038056374, + "rewards//mean": 0.818115234375, + "rewards//std": 0.018595842644572258, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.1612, + "grad_norm": 1.474459171295166, + "kl": 0.24693896435201168, + "learning_rate": 9.436884368592739e-07, + "loss": 0.0098, + "num_tokens": 5856402.0, + "reward": 0.849853515625, + "reward_std": 0.022694626823067665, + "rewards//mean": 0.849853515625, + "rewards//std": 0.027856020256876945, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1614, + "grad_norm": 1.4600752592086792, + "kl": 0.2550163306295872, + "learning_rate": 9.435420429773227e-07, + "loss": 0.0102, + "num_tokens": 5863706.0, + "reward": 0.85394287109375, + "reward_std": 0.016245879232883453, + "rewards//mean": 0.85394287109375, + "rewards//std": 0.02925337664783001, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1616, + "grad_norm": 1.6055666208267212, + "kl": 0.25367947667837143, + "learning_rate": 9.433954704366896e-07, + "loss": 0.0101, + "num_tokens": 5870962.0, + "reward": 0.86395263671875, + "reward_std": 0.017359502613544464, + "rewards//mean": 0.86395263671875, + "rewards//std": 0.032841384410858154, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1618, + "grad_norm": 1.3919883966445923, + "kl": 0.2671513333916664, + "learning_rate": 9.43248719296414e-07, + "loss": 0.0107, + "num_tokens": 5878314.0, + "reward": 0.8663330078125, + "reward_std": 0.022560913115739822, + "rewards//mean": 0.8663330078125, + "rewards//std": 0.028301063925027847, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.162, + "grad_norm": 2.780120611190796, + "kl": 0.2593911662697792, + "learning_rate": 9.431017896156073e-07, + "loss": 0.0104, + "num_tokens": 5885554.0, + "reward": 0.85809326171875, + "reward_std": 0.022016307339072227, + "rewards//mean": 0.85809326171875, + "rewards//std": 0.03847559541463852, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1622, + "grad_norm": 1.649478554725647, + "kl": 0.28215585835278034, + "learning_rate": 9.429546814534528e-07, + "loss": 0.0113, + "num_tokens": 5892874.0, + "reward": 0.8355712890625, + "reward_std": 0.017516594380140305, + "rewards//mean": 0.8355712890625, + "rewards//std": 0.025503354147076607, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1624, + "grad_norm": 1.401228427886963, + "kl": 0.2701694704592228, + "learning_rate": 9.428073948692054e-07, + "loss": 0.0108, + "num_tokens": 5900162.0, + "reward": 0.83251953125, + "reward_std": 0.017454421147704124, + "rewards//mean": 0.83251953125, + "rewards//std": 0.02323511429131031, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1626, + "grad_norm": 1.5418131351470947, + "kl": 0.2793327532708645, + "learning_rate": 9.426599299221924e-07, + "loss": 0.0112, + "num_tokens": 5907474.0, + "reward": 0.8048095703125, + "reward_std": 0.011554021388292313, + "rewards//mean": 0.8048095703125, + "rewards//std": 0.016351666301488876, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1628, + "grad_norm": 1.342590570449829, + "kl": 0.24666925705969334, + "learning_rate": 9.425122866718127e-07, + "loss": 0.0099, + "num_tokens": 5914770.0, + "reward": 0.84075927734375, + "reward_std": 0.014624981209635735, + "rewards//mean": 0.84075927734375, + "rewards//std": 0.034094713628292084, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.163, + "grad_norm": 1.4770463705062866, + "kl": 0.25065608508884907, + "learning_rate": 9.423644651775368e-07, + "loss": 0.01, + "num_tokens": 5922010.0, + "reward": 0.79608154296875, + "reward_std": 0.014442982152104378, + "rewards//mean": 0.79608154296875, + "rewards//std": 0.023665595799684525, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1632, + "grad_norm": 1.3567519187927246, + "kl": 0.26999335549771786, + "learning_rate": 9.422164654989071e-07, + "loss": 0.0108, + "num_tokens": 5929314.0, + "reward": 0.8465576171875, + "reward_std": 0.015238994732499123, + "rewards//mean": 0.8465576171875, + "rewards//std": 0.01772349327802658, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1634, + "grad_norm": 1.0919685363769531, + "kl": 0.2639268357306719, + "learning_rate": 9.420682876955381e-07, + "loss": 0.0106, + "num_tokens": 5936610.0, + "reward": 0.8280029296875, + "reward_std": 0.014155473560094833, + "rewards//mean": 0.8280029296875, + "rewards//std": 0.02401900105178356, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1636, + "grad_norm": 1.7981566190719604, + "kl": 0.2707471735775471, + "learning_rate": 9.419199318271156e-07, + "loss": 0.0108, + "num_tokens": 5943874.0, + "reward": 0.84722900390625, + "reward_std": 0.013559436425566673, + "rewards//mean": 0.84722900390625, + "rewards//std": 0.022789033129811287, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1638, + "grad_norm": 1.5497338771820068, + "kl": 0.26401237957179546, + "learning_rate": 9.417713979533974e-07, + "loss": 0.0106, + "num_tokens": 5951186.0, + "reward": 0.82366943359375, + "reward_std": 0.017271725460886955, + "rewards//mean": 0.82366943359375, + "rewards//std": 0.025984566658735275, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.164, + "grad_norm": 1.6230720281600952, + "kl": 0.28239046409726143, + "learning_rate": 9.41622686134213e-07, + "loss": 0.0113, + "num_tokens": 5958490.0, + "reward": 0.83453369140625, + "reward_std": 0.015524398535490036, + "rewards//mean": 0.83453369140625, + "rewards//std": 0.018484700471162796, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1642, + "grad_norm": 1.5261975526809692, + "kl": 0.25122480280697346, + "learning_rate": 9.414737964294634e-07, + "loss": 0.01, + "num_tokens": 5965802.0, + "reward": 0.79901123046875, + "reward_std": 0.01502363570034504, + "rewards//mean": 0.79901123046875, + "rewards//std": 0.020326443016529083, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.1644, + "grad_norm": 1.775661587715149, + "kl": 0.29149238020181656, + "learning_rate": 9.413247288991215e-07, + "loss": 0.0076, + "num_tokens": 5973145.0, + "reward": 0.8302001953125, + "reward_std": 0.016552936285734177, + "rewards//mean": 0.8302001953125, + "rewards//std": 0.020925771445035934, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1646, + "grad_norm": 1.5683794021606445, + "kl": 0.26188207790255547, + "learning_rate": 9.411754836032314e-07, + "loss": 0.0105, + "num_tokens": 5980417.0, + "reward": 0.83807373046875, + "reward_std": 0.016027413308620453, + "rewards//mean": 0.83807373046875, + "rewards//std": 0.02573225647211075, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.1648, + "grad_norm": 1.6140596866607666, + "kl": 0.3201983757317066, + "learning_rate": 9.410260606019094e-07, + "loss": 0.0077, + "num_tokens": 5987814.0, + "reward": 0.87823486328125, + "reward_std": 0.022849811241030693, + "rewards//mean": 0.87823486328125, + "rewards//std": 0.03230602666735649, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.165, + "grad_norm": 1.5142934322357178, + "kl": 0.26200552843511105, + "learning_rate": 9.408764599553428e-07, + "loss": 0.0105, + "num_tokens": 5995110.0, + "reward": 0.8505859375, + "reward_std": 0.01587473228573799, + "rewards//mean": 0.8505859375, + "rewards//std": 0.023499423637986183, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1652, + "grad_norm": 1.5083154439926147, + "kl": 0.24763718619942665, + "learning_rate": 9.40726681723791e-07, + "loss": 0.0099, + "num_tokens": 6002526.0, + "reward": 0.826904296875, + "reward_std": 0.014450838789343834, + "rewards//mean": 0.826904296875, + "rewards//std": 0.022346410900354385, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.1654, + "grad_norm": 1.5409547090530396, + "kl": 0.2738236114382744, + "learning_rate": 9.405767259675844e-07, + "loss": 0.011, + "num_tokens": 6009773.0, + "reward": 0.85546875, + "reward_std": 0.019870268180966377, + "rewards//mean": 0.85546875, + "rewards//std": 0.026602955535054207, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1656, + "grad_norm": 1.418137550354004, + "kl": 0.23964584805071354, + "learning_rate": 9.404265927471253e-07, + "loss": 0.0096, + "num_tokens": 6017029.0, + "reward": 0.814697265625, + "reward_std": 0.015745818614959717, + "rewards//mean": 0.814697265625, + "rewards//std": 0.028905706480145454, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1658, + "grad_norm": 1.657520055770874, + "kl": 0.27615330275148153, + "learning_rate": 9.402762821228874e-07, + "loss": 0.011, + "num_tokens": 6024365.0, + "reward": 0.863037109375, + "reward_std": 0.02402604930102825, + "rewards//mean": 0.863037109375, + "rewards//std": 0.03629103675484657, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.166, + "grad_norm": 1.4563051462173462, + "kl": 0.24008294194936752, + "learning_rate": 9.401257941554156e-07, + "loss": 0.0096, + "num_tokens": 6031613.0, + "reward": 0.87396240234375, + "reward_std": 0.018684500828385353, + "rewards//mean": 0.87396240234375, + "rewards//std": 0.029050862416625023, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1662, + "grad_norm": 2.0248889923095703, + "kl": 0.23671350441873074, + "learning_rate": 9.399751289053266e-07, + "loss": 0.0095, + "num_tokens": 6038909.0, + "reward": 0.85809326171875, + "reward_std": 0.017527006566524506, + "rewards//mean": 0.85809326171875, + "rewards//std": 0.029643533751368523, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1664, + "grad_norm": 2.218914747238159, + "kl": 0.3402737509459257, + "learning_rate": 9.398242864333083e-07, + "loss": 0.0136, + "num_tokens": 6046253.0, + "reward": 0.8580322265625, + "reward_std": 0.020087484270334244, + "rewards//mean": 0.8580322265625, + "rewards//std": 0.023788489401340485, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1666, + "grad_norm": 1.516398549079895, + "kl": 0.2592080645263195, + "learning_rate": 9.396732668001199e-07, + "loss": 0.0104, + "num_tokens": 6053629.0, + "reward": 0.82177734375, + "reward_std": 0.013778358697891235, + "rewards//mean": 0.82177734375, + "rewards//std": 0.025709375739097595, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.1668, + "grad_norm": 1.2439967393875122, + "kl": 0.21824480965733528, + "learning_rate": 9.395220700665922e-07, + "loss": 0.0082, + "num_tokens": 6061018.0, + "reward": 0.86065673828125, + "reward_std": 0.016340935602784157, + "rewards//mean": 0.86065673828125, + "rewards//std": 0.022413289174437523, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.167, + "grad_norm": 1.5312206745147705, + "kl": 0.29390639811754227, + "learning_rate": 9.393706962936274e-07, + "loss": -0.0151, + "num_tokens": 6068304.0, + "reward": 0.817626953125, + "reward_std": 0.01572204753756523, + "rewards//mean": 0.817626953125, + "rewards//std": 0.020855121314525604, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1672, + "grad_norm": 1.431575894355774, + "kl": 0.21382459998130798, + "learning_rate": 9.392191455421987e-07, + "loss": 0.0086, + "num_tokens": 6075496.0, + "reward": 0.81768798828125, + "reward_std": 0.011336727999150753, + "rewards//mean": 0.81768798828125, + "rewards//std": 0.023959290236234665, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.1674, + "grad_norm": 1.485544204711914, + "kl": 0.27903779223561287, + "learning_rate": 9.390674178733507e-07, + "loss": 0.0138, + "num_tokens": 6082784.0, + "reward": 0.84637451171875, + "reward_std": 0.01458788849413395, + "rewards//mean": 0.84637451171875, + "rewards//std": 0.02402994967997074, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.1676, + "grad_norm": 1.6491026878356934, + "kl": 0.26448546163737774, + "learning_rate": 9.389155133481992e-07, + "loss": 0.0062, + "num_tokens": 6089992.0, + "reward": 0.8778076171875, + "reward_std": 0.017530817538499832, + "rewards//mean": 0.8778076171875, + "rewards//std": 0.026222113519906998, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.1678, + "grad_norm": 1.4433629512786865, + "kl": 0.27931851521134377, + "learning_rate": 9.387634320279314e-07, + "loss": 0.01, + "num_tokens": 6097390.0, + "reward": 0.84735107421875, + "reward_std": 0.013316976837813854, + "rewards//mean": 0.84735107421875, + "rewards//std": 0.019970322027802467, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.168, + "grad_norm": 1.3577572107315063, + "kl": 0.2221212014555931, + "learning_rate": 9.386111739738056e-07, + "loss": 0.0089, + "num_tokens": 6104646.0, + "reward": 0.8768310546875, + "reward_std": 0.0203755684196949, + "rewards//mean": 0.8768310546875, + "rewards//std": 0.03325483202934265, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.1682, + "grad_norm": 1.2117210626602173, + "kl": 0.2548555787652731, + "learning_rate": 9.384587392471514e-07, + "loss": 0.0107, + "num_tokens": 6111892.0, + "reward": 0.81201171875, + "reward_std": 0.013897782191634178, + "rewards//mean": 0.81201171875, + "rewards//std": 0.024205146357417107, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1684, + "grad_norm": 1.5409685373306274, + "kl": 0.25204242393374443, + "learning_rate": 9.383061279093696e-07, + "loss": 0.0101, + "num_tokens": 6119188.0, + "reward": 0.83782958984375, + "reward_std": 0.01701168157160282, + "rewards//mean": 0.83782958984375, + "rewards//std": 0.021202005445957184, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1686, + "grad_norm": 2.154470443725586, + "kl": 0.2384718656539917, + "learning_rate": 9.381533400219317e-07, + "loss": 0.0095, + "num_tokens": 6126460.0, + "reward": 0.857666015625, + "reward_std": 0.015932051464915276, + "rewards//mean": 0.857666015625, + "rewards//std": 0.035814058035612106, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1688, + "grad_norm": 1.469427227973938, + "kl": 0.30962241254746914, + "learning_rate": 9.38000375646381e-07, + "loss": 0.0124, + "num_tokens": 6133748.0, + "reward": 0.86578369140625, + "reward_std": 0.020865220576524734, + "rewards//mean": 0.86578369140625, + "rewards//std": 0.024933326989412308, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.169, + "grad_norm": 1.527585506439209, + "kl": 0.24398992769420147, + "learning_rate": 9.378472348443314e-07, + "loss": 0.0098, + "num_tokens": 6141020.0, + "reward": 0.81219482421875, + "reward_std": 0.01909388229250908, + "rewards//mean": 0.81219482421875, + "rewards//std": 0.03233412653207779, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1692, + "grad_norm": 1.445777177810669, + "kl": 0.314063124358654, + "learning_rate": 9.376939176774677e-07, + "loss": 0.0126, + "num_tokens": 6148364.0, + "reward": 0.8594970703125, + "reward_std": 0.009349750354886055, + "rewards//mean": 0.8594970703125, + "rewards//std": 0.016307169571518898, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1694, + "grad_norm": 1.6229969263076782, + "kl": 0.29766186513006687, + "learning_rate": 9.375404242075466e-07, + "loss": 0.0119, + "num_tokens": 6155684.0, + "reward": 0.7679443359375, + "reward_std": 0.020312149077653885, + "rewards//mean": 0.7679443359375, + "rewards//std": 0.028148747980594635, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.1696, + "grad_norm": 1.4073107242584229, + "kl": 0.29593499936163425, + "learning_rate": 9.373867544963948e-07, + "loss": 0.013, + "num_tokens": 6162942.0, + "reward": 0.85003662109375, + "reward_std": 0.016311928629875183, + "rewards//mean": 0.85003662109375, + "rewards//std": 0.02723829261958599, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1698, + "grad_norm": 1.5788273811340332, + "kl": 0.2611276488751173, + "learning_rate": 9.372329086059107e-07, + "loss": 0.0104, + "num_tokens": 6170246.0, + "reward": 0.7969970703125, + "reward_std": 0.012450498528778553, + "rewards//mean": 0.7969970703125, + "rewards//std": 0.013954070396721363, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.17, + "grad_norm": 1.1014920473098755, + "kl": 0.24066632986068726, + "learning_rate": 9.370788865980632e-07, + "loss": 0.0096, + "num_tokens": 6177478.0, + "reward": 0.8575439453125, + "reward_std": 0.014558149501681328, + "rewards//mean": 0.8575439453125, + "rewards//std": 0.021743163466453552, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.1702, + "grad_norm": 1.3903484344482422, + "kl": 0.26153174228966236, + "learning_rate": 9.369246885348925e-07, + "loss": 0.0102, + "num_tokens": 6184765.0, + "reward": 0.8411865234375, + "reward_std": 0.014473306946456432, + "rewards//mean": 0.8411865234375, + "rewards//std": 0.02438179962337017, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1704, + "grad_norm": 1.2252346277236938, + "kl": 0.2827958147972822, + "learning_rate": 9.367703144785095e-07, + "loss": 0.0113, + "num_tokens": 6192125.0, + "reward": 0.83062744140625, + "reward_std": 0.011248480528593063, + "rewards//mean": 0.83062744140625, + "rewards//std": 0.01757112704217434, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1706, + "grad_norm": 1.3319724798202515, + "kl": 0.2555301357060671, + "learning_rate": 9.366157644910959e-07, + "loss": 0.0102, + "num_tokens": 6199421.0, + "reward": 0.84600830078125, + "reward_std": 0.01682605966925621, + "rewards//mean": 0.84600830078125, + "rewards//std": 0.02689996361732483, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1708, + "grad_norm": 1.5893418788909912, + "kl": 0.2497469075024128, + "learning_rate": 9.364610386349047e-07, + "loss": 0.01, + "num_tokens": 6206725.0, + "reward": 0.82891845703125, + "reward_std": 0.01692725345492363, + "rewards//mean": 0.82891845703125, + "rewards//std": 0.020974453538656235, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.171, + "grad_norm": 1.34995436668396, + "kl": 0.28051420487463474, + "learning_rate": 9.363061369722594e-07, + "loss": 0.0112, + "num_tokens": 6214037.0, + "reward": 0.82000732421875, + "reward_std": 0.009825940243899822, + "rewards//mean": 0.82000732421875, + "rewards//std": 0.013465343043208122, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.1712, + "grad_norm": 1.31371009349823, + "kl": 0.2544517554342747, + "learning_rate": 9.361510595655544e-07, + "loss": 0.0016, + "num_tokens": 6221301.0, + "reward": 0.83819580078125, + "reward_std": 0.01792732998728752, + "rewards//mean": 0.83819580078125, + "rewards//std": 0.02175247296690941, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1714, + "grad_norm": 1.6055275201797485, + "kl": 0.27129225991666317, + "learning_rate": 9.359958064772546e-07, + "loss": 0.0109, + "num_tokens": 6228621.0, + "reward": 0.81768798828125, + "reward_std": 0.017227141186594963, + "rewards//mean": 0.81768798828125, + "rewards//std": 0.028340283781290054, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1716, + "grad_norm": 1.4599229097366333, + "kl": 0.24865556322038174, + "learning_rate": 9.35840377769896e-07, + "loss": 0.0099, + "num_tokens": 6235837.0, + "reward": 0.8436279296875, + "reward_std": 0.017951738089323044, + "rewards//mean": 0.8436279296875, + "rewards//std": 0.03191693499684334, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1718, + "grad_norm": 1.5053385496139526, + "kl": 0.27313671447336674, + "learning_rate": 9.356847735060856e-07, + "loss": 0.0109, + "num_tokens": 6243141.0, + "reward": 0.86627197265625, + "reward_std": 0.015194113366305828, + "rewards//mean": 0.86627197265625, + "rewards//std": 0.022975590080022812, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.172, + "grad_norm": 1.6693018674850464, + "kl": 0.2759652491658926, + "learning_rate": 9.355289937485004e-07, + "loss": 0.011, + "num_tokens": 6250461.0, + "reward": 0.857177734375, + "reward_std": 0.013696461915969849, + "rewards//mean": 0.857177734375, + "rewards//std": 0.02309264987707138, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1722, + "grad_norm": 1.6792573928833008, + "kl": 0.29803698509931564, + "learning_rate": 9.353730385598886e-07, + "loss": 0.0119, + "num_tokens": 6257677.0, + "reward": 0.83258056640625, + "reward_std": 0.022679826244711876, + "rewards//mean": 0.83258056640625, + "rewards//std": 0.02582562156021595, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1724, + "grad_norm": 1.5576682090759277, + "kl": 0.2618778236210346, + "learning_rate": 9.35216908003069e-07, + "loss": 0.0105, + "num_tokens": 6264949.0, + "reward": 0.82928466796875, + "reward_std": 0.016643334180116653, + "rewards//mean": 0.82928466796875, + "rewards//std": 0.02009575627744198, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1726, + "grad_norm": 1.9229856729507446, + "kl": 0.29852112010121346, + "learning_rate": 9.350606021409308e-07, + "loss": 0.0119, + "num_tokens": 6272261.0, + "reward": 0.836669921875, + "reward_std": 0.015347281470894814, + "rewards//mean": 0.836669921875, + "rewards//std": 0.020409032702445984, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1728, + "grad_norm": 1.3199729919433594, + "kl": 0.27345328219234943, + "learning_rate": 9.349041210364341e-07, + "loss": 0.0109, + "num_tokens": 6279629.0, + "reward": 0.866455078125, + "reward_std": 0.014291839674115181, + "rewards//mean": 0.866455078125, + "rewards//std": 0.020796971395611763, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.173, + "grad_norm": 1.3758840560913086, + "kl": 0.2668046560138464, + "learning_rate": 9.347474647526095e-07, + "loss": 0.0051, + "num_tokens": 6286872.0, + "reward": 0.87811279296875, + "reward_std": 0.019996630027890205, + "rewards//mean": 0.87811279296875, + "rewards//std": 0.02216471917927265, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1732, + "grad_norm": 1.6928755044937134, + "kl": 0.3020530790090561, + "learning_rate": 9.34590633352558e-07, + "loss": 0.0121, + "num_tokens": 6294216.0, + "reward": 0.8521728515625, + "reward_std": 0.023942861706018448, + "rewards//mean": 0.8521728515625, + "rewards//std": 0.029661208391189575, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1734, + "grad_norm": 1.6322613954544067, + "kl": 0.34493105113506317, + "learning_rate": 9.344336268994515e-07, + "loss": 0.0138, + "num_tokens": 6301504.0, + "reward": 0.84368896484375, + "reward_std": 0.02713008038699627, + "rewards//mean": 0.84368896484375, + "rewards//std": 0.029931675642728806, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.1736, + "grad_norm": 1.6518305540084839, + "kl": 0.26213178783655167, + "learning_rate": 9.342764454565319e-07, + "loss": 0.0041, + "num_tokens": 6308776.0, + "reward": 0.885498046875, + "reward_std": 0.017489494755864143, + "rewards//mean": 0.885498046875, + "rewards//std": 0.022248649969697, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1738, + "grad_norm": 1.3987659215927124, + "kl": 0.27298332937061787, + "learning_rate": 9.341190890871121e-07, + "loss": 0.0109, + "num_tokens": 6316080.0, + "reward": 0.8447265625, + "reward_std": 0.014299525879323483, + "rewards//mean": 0.8447265625, + "rewards//std": 0.028151167556643486, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.174, + "grad_norm": 1.422455072402954, + "kl": 0.28332848846912384, + "learning_rate": 9.339615578545752e-07, + "loss": 0.0113, + "num_tokens": 6323376.0, + "reward": 0.85443115234375, + "reward_std": 0.0181420985609293, + "rewards//mean": 0.85443115234375, + "rewards//std": 0.03146043419837952, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.1742, + "grad_norm": 1.5425488948822021, + "kl": 0.2562232930213213, + "learning_rate": 9.338038518223745e-07, + "loss": 0.0118, + "num_tokens": 6330662.0, + "reward": 0.8369140625, + "reward_std": 0.016764743253588676, + "rewards//mean": 0.8369140625, + "rewards//std": 0.021190667524933815, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1744, + "grad_norm": 1.5780805349349976, + "kl": 0.24467191100120544, + "learning_rate": 9.336459710540343e-07, + "loss": 0.0098, + "num_tokens": 6338102.0, + "reward": 0.83721923828125, + "reward_std": 0.015912635251879692, + "rewards//mean": 0.83721923828125, + "rewards//std": 0.02318873070180416, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1746, + "grad_norm": 1.299098014831543, + "kl": 0.26970611698925495, + "learning_rate": 9.334879156131488e-07, + "loss": 0.0108, + "num_tokens": 6345414.0, + "reward": 0.84259033203125, + "reward_std": 0.01378169097006321, + "rewards//mean": 0.84259033203125, + "rewards//std": 0.01859329827129841, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1748, + "grad_norm": 1.3476893901824951, + "kl": 0.3001403249800205, + "learning_rate": 9.333296855633827e-07, + "loss": 0.012, + "num_tokens": 6352678.0, + "reward": 0.84027099609375, + "reward_std": 0.012851668521761894, + "rewards//mean": 0.84027099609375, + "rewards//std": 0.022852713242173195, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.175, + "grad_norm": 1.5192245244979858, + "kl": 0.31475691869854927, + "learning_rate": 9.331712809684711e-07, + "loss": 0.0126, + "num_tokens": 6359926.0, + "reward": 0.85223388671875, + "reward_std": 0.016872083768248558, + "rewards//mean": 0.85223388671875, + "rewards//std": 0.022526469081640244, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1752, + "grad_norm": 1.2745354175567627, + "kl": 0.2924560960382223, + "learning_rate": 9.330127018922193e-07, + "loss": 0.0117, + "num_tokens": 6367238.0, + "reward": 0.83197021484375, + "reward_std": 0.021466590464115143, + "rewards//mean": 0.83197021484375, + "rewards//std": 0.033309582620859146, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1754, + "grad_norm": 1.2694365978240967, + "kl": 0.3039685171097517, + "learning_rate": 9.32853948398503e-07, + "loss": 0.0122, + "num_tokens": 6374470.0, + "reward": 0.84844970703125, + "reward_std": 0.0206051804125309, + "rewards//mean": 0.84844970703125, + "rewards//std": 0.026939326897263527, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.1756, + "grad_norm": 1.433699131011963, + "kl": 0.27996223978698254, + "learning_rate": 9.32695020551268e-07, + "loss": 0.0113, + "num_tokens": 6381612.0, + "reward": 0.85357666015625, + "reward_std": 0.017497539520263672, + "rewards//mean": 0.85357666015625, + "rewards//std": 0.02736581675708294, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1758, + "grad_norm": 1.68576180934906, + "kl": 0.3288582041859627, + "learning_rate": 9.325359184145305e-07, + "loss": 0.0132, + "num_tokens": 6388892.0, + "reward": 0.83984375, + "reward_std": 0.014790777117013931, + "rewards//mean": 0.83984375, + "rewards//std": 0.021179234609007835, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.176, + "grad_norm": 1.8443819284439087, + "kl": 0.29126692563295364, + "learning_rate": 9.323766420523767e-07, + "loss": 0.0115, + "num_tokens": 6396203.0, + "reward": 0.85089111328125, + "reward_std": 0.012348340824246407, + "rewards//mean": 0.85089111328125, + "rewards//std": 0.019269762560725212, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1762, + "grad_norm": 1.6812399625778198, + "kl": 0.27706542052328587, + "learning_rate": 9.322171915289633e-07, + "loss": 0.0111, + "num_tokens": 6403491.0, + "reward": 0.83831787109375, + "reward_std": 0.018039263784885406, + "rewards//mean": 0.83831787109375, + "rewards//std": 0.02496487833559513, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1764, + "grad_norm": 1.7909225225448608, + "kl": 0.3524009734392166, + "learning_rate": 9.320575669085169e-07, + "loss": 0.0141, + "num_tokens": 6410691.0, + "reward": 0.8172607421875, + "reward_std": 0.016086801886558533, + "rewards//mean": 0.8172607421875, + "rewards//std": 0.021412033587694168, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1766, + "grad_norm": 1.5248249769210815, + "kl": 0.2820226922631264, + "learning_rate": 9.31897768255334e-07, + "loss": 0.0113, + "num_tokens": 6417979.0, + "reward": 0.8072509765625, + "reward_std": 0.014892157167196274, + "rewards//mean": 0.8072509765625, + "rewards//std": 0.027020471170544624, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1768, + "grad_norm": 1.601388692855835, + "kl": 0.27656582184135914, + "learning_rate": 9.317377956337818e-07, + "loss": 0.0111, + "num_tokens": 6425187.0, + "reward": 0.8260498046875, + "reward_std": 0.011920111253857613, + "rewards//mean": 0.8260498046875, + "rewards//std": 0.01813550479710102, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.177, + "grad_norm": 1.5032188892364502, + "kl": 0.26728659495711327, + "learning_rate": 9.315776491082972e-07, + "loss": 0.0107, + "num_tokens": 6432523.0, + "reward": 0.82269287109375, + "reward_std": 0.01926327683031559, + "rewards//mean": 0.82269287109375, + "rewards//std": 0.02802714891731739, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1772, + "grad_norm": 1.659440279006958, + "kl": 0.29828933626413345, + "learning_rate": 9.314173287433872e-07, + "loss": 0.0119, + "num_tokens": 6439811.0, + "reward": 0.8143310546875, + "reward_std": 0.018105288967490196, + "rewards//mean": 0.8143310546875, + "rewards//std": 0.028047464787960052, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.1774, + "grad_norm": 1.3873087167739868, + "kl": 0.2769951820373535, + "learning_rate": 9.312568346036287e-07, + "loss": 0.0108, + "num_tokens": 6446994.0, + "reward": 0.85546875, + "reward_std": 0.017127666622400284, + "rewards//mean": 0.85546875, + "rewards//std": 0.024498552083969116, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1776, + "grad_norm": 1.4745452404022217, + "kl": 0.31842252239584923, + "learning_rate": 9.310961667536688e-07, + "loss": 0.0127, + "num_tokens": 6454338.0, + "reward": 0.86053466796875, + "reward_std": 0.01627405360341072, + "rewards//mean": 0.86053466796875, + "rewards//std": 0.02074805460870266, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1778, + "grad_norm": 1.5704268217086792, + "kl": 0.2771360855549574, + "learning_rate": 9.309353252582245e-07, + "loss": 0.0111, + "num_tokens": 6461626.0, + "reward": 0.83966064453125, + "reward_std": 0.0231628380715847, + "rewards//mean": 0.83966064453125, + "rewards//std": 0.02884325385093689, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.178, + "grad_norm": 1.4102036952972412, + "kl": 0.23608792200684547, + "learning_rate": 9.307743101820827e-07, + "loss": 0.0094, + "num_tokens": 6468914.0, + "reward": 0.85906982421875, + "reward_std": 0.019042206928133965, + "rewards//mean": 0.85906982421875, + "rewards//std": 0.028153788298368454, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1782, + "grad_norm": 1.5858533382415771, + "kl": 0.3938771318644285, + "learning_rate": 9.306131215901003e-07, + "loss": 0.0158, + "num_tokens": 6476178.0, + "reward": 0.8394775390625, + "reward_std": 0.0172707699239254, + "rewards//mean": 0.8394775390625, + "rewards//std": 0.019446976482868195, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1784, + "grad_norm": 1.4378043413162231, + "kl": 0.2420545518398285, + "learning_rate": 9.304517595472039e-07, + "loss": 0.0097, + "num_tokens": 6483482.0, + "reward": 0.8682861328125, + "reward_std": 0.015598418191075325, + "rewards//mean": 0.8682861328125, + "rewards//std": 0.021893026307225227, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.1786, + "grad_norm": 1.469434380531311, + "kl": 0.2995621617883444, + "learning_rate": 9.302902241183903e-07, + "loss": -0.0062, + "num_tokens": 6490671.0, + "reward": 0.8270263671875, + "reward_std": 0.01273314468562603, + "rewards//mean": 0.8270263671875, + "rewards//std": 0.017378492280840874, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1788, + "grad_norm": 1.4540715217590332, + "kl": 0.275119299069047, + "learning_rate": 9.301285153687259e-07, + "loss": 0.011, + "num_tokens": 6498095.0, + "reward": 0.86859130859375, + "reward_std": 0.013033459894359112, + "rewards//mean": 0.86859130859375, + "rewards//std": 0.025670412927865982, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.179, + "grad_norm": 1.3786343336105347, + "kl": 0.2890021912753582, + "learning_rate": 9.29966633363347e-07, + "loss": 0.0046, + "num_tokens": 6505454.0, + "reward": 0.8602294921875, + "reward_std": 0.013028230518102646, + "rewards//mean": 0.8602294921875, + "rewards//std": 0.02222243882715702, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1792, + "grad_norm": 4.070776462554932, + "kl": 0.5062491334974766, + "learning_rate": 9.298045781674595e-07, + "loss": 0.0202, + "num_tokens": 6512774.0, + "reward": 0.8441162109375, + "reward_std": 0.017598960548639297, + "rewards//mean": 0.8441162109375, + "rewards//std": 0.04082529619336128, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1794, + "grad_norm": 1.718794584274292, + "kl": 0.27621921338140965, + "learning_rate": 9.296423498463395e-07, + "loss": 0.011, + "num_tokens": 6520078.0, + "reward": 0.8226318359375, + "reward_std": 0.015087373554706573, + "rewards//mean": 0.8226318359375, + "rewards//std": 0.0231956634670496, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.1796, + "grad_norm": 1.7182880640029907, + "kl": 0.291449585929513, + "learning_rate": 9.294799484653322e-07, + "loss": 0.0032, + "num_tokens": 6527299.0, + "reward": 0.8568115234375, + "reward_std": 0.012801086530089378, + "rewards//mean": 0.8568115234375, + "rewards//std": 0.015076219104230404, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1798, + "grad_norm": 1.5339016914367676, + "kl": 0.26360774226486683, + "learning_rate": 9.29317374089853e-07, + "loss": 0.0105, + "num_tokens": 6534603.0, + "reward": 0.8365478515625, + "reward_std": 0.01247391477227211, + "rewards//mean": 0.8365478515625, + "rewards//std": 0.02092866413295269, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.18, + "grad_norm": 1.6512507200241089, + "kl": 0.2602615859359503, + "learning_rate": 9.291546267853869e-07, + "loss": 0.0119, + "num_tokens": 6541836.0, + "reward": 0.80877685546875, + "reward_std": 0.02092839777469635, + "rewards//mean": 0.80877685546875, + "rewards//std": 0.026221752166748047, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1802, + "grad_norm": 1.230847716331482, + "kl": 0.3382634613662958, + "learning_rate": 9.289917066174885e-07, + "loss": 0.0135, + "num_tokens": 6549244.0, + "reward": 0.8668212890625, + "reward_std": 0.011406907811760902, + "rewards//mean": 0.8668212890625, + "rewards//std": 0.016829773783683777, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1804, + "grad_norm": 1.5082858800888062, + "kl": 0.2543829567730427, + "learning_rate": 9.288286136517819e-07, + "loss": 0.0102, + "num_tokens": 6556516.0, + "reward": 0.853759765625, + "reward_std": 0.022799383848905563, + "rewards//mean": 0.853759765625, + "rewards//std": 0.033652350306510925, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.1806, + "grad_norm": 1.5351184606552124, + "kl": 0.3044157661497593, + "learning_rate": 9.28665347953961e-07, + "loss": 0.0125, + "num_tokens": 6563867.0, + "reward": 0.8251953125, + "reward_std": 0.014988833107054234, + "rewards//mean": 0.8251953125, + "rewards//std": 0.024170098826289177, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1808, + "grad_norm": 1.5466413497924805, + "kl": 0.2977803945541382, + "learning_rate": 9.285019095897893e-07, + "loss": 0.0119, + "num_tokens": 6571163.0, + "reward": 0.8642578125, + "reward_std": 0.018962236121296883, + "rewards//mean": 0.8642578125, + "rewards//std": 0.02897370606660843, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.181, + "grad_norm": 1.3295753002166748, + "kl": 0.2926426362246275, + "learning_rate": 9.283382986250996e-07, + "loss": 0.0117, + "num_tokens": 6578291.0, + "reward": 0.85003662109375, + "reward_std": 0.01354585774242878, + "rewards//mean": 0.85003662109375, + "rewards//std": 0.019656311720609665, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1812, + "grad_norm": 1.4777607917785645, + "kl": 0.2794528938829899, + "learning_rate": 9.281745151257945e-07, + "loss": 0.0112, + "num_tokens": 6585507.0, + "reward": 0.85040283203125, + "reward_std": 0.01644163951277733, + "rewards//mean": 0.85040283203125, + "rewards//std": 0.02368094213306904, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1814, + "grad_norm": 1.5701440572738647, + "kl": 0.26727248914539814, + "learning_rate": 9.280105591578458e-07, + "loss": 0.0107, + "num_tokens": 6592771.0, + "reward": 0.84490966796875, + "reward_std": 0.013723326846957207, + "rewards//mean": 0.84490966796875, + "rewards//std": 0.01761672832071781, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1816, + "grad_norm": 1.435335636138916, + "kl": 0.2744126208126545, + "learning_rate": 9.278464307872951e-07, + "loss": 0.011, + "num_tokens": 6600083.0, + "reward": 0.793212890625, + "reward_std": 0.012483829632401466, + "rewards//mean": 0.793212890625, + "rewards//std": 0.02038528397679329, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1818, + "grad_norm": 1.510980248451233, + "kl": 0.33268991485238075, + "learning_rate": 9.276821300802533e-07, + "loss": 0.0133, + "num_tokens": 6607291.0, + "reward": 0.86053466796875, + "reward_std": 0.015695173293352127, + "rewards//mean": 0.86053466796875, + "rewards//std": 0.023299440741539, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.182, + "grad_norm": 1.8137496709823608, + "kl": 0.27948739752173424, + "learning_rate": 9.275176571029006e-07, + "loss": 0.0063, + "num_tokens": 6614489.0, + "reward": 0.80548095703125, + "reward_std": 0.013368627056479454, + "rewards//mean": 0.80548095703125, + "rewards//std": 0.02200019732117653, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1822, + "grad_norm": 1.629288673400879, + "kl": 0.3101373314857483, + "learning_rate": 9.273530119214867e-07, + "loss": 0.0124, + "num_tokens": 6621777.0, + "reward": 0.843994140625, + "reward_std": 0.015419116243720055, + "rewards//mean": 0.843994140625, + "rewards//std": 0.02437838539481163, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1824, + "grad_norm": 1.3753328323364258, + "kl": 0.2606661897152662, + "learning_rate": 9.271881946023308e-07, + "loss": 0.0104, + "num_tokens": 6629089.0, + "reward": 0.8587646484375, + "reward_std": 0.011638050898909569, + "rewards//mean": 0.8587646484375, + "rewards//std": 0.01413086149841547, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1826, + "grad_norm": 1.7198463678359985, + "kl": 0.27075505815446377, + "learning_rate": 9.270232052118212e-07, + "loss": 0.0108, + "num_tokens": 6636457.0, + "reward": 0.836181640625, + "reward_std": 0.017545603215694427, + "rewards//mean": 0.836181640625, + "rewards//std": 0.021753255277872086, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1828, + "grad_norm": 1.4734933376312256, + "kl": 0.35747941583395004, + "learning_rate": 9.268580438164155e-07, + "loss": 0.0143, + "num_tokens": 6643721.0, + "reward": 0.83917236328125, + "reward_std": 0.0142026636749506, + "rewards//mean": 0.83917236328125, + "rewards//std": 0.022623028606176376, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.183, + "grad_norm": 1.3824533224105835, + "kl": 0.2855211850255728, + "learning_rate": 9.266927104826408e-07, + "loss": 0.0114, + "num_tokens": 6651033.0, + "reward": 0.8616943359375, + "reward_std": 0.0119086392223835, + "rewards//mean": 0.8616943359375, + "rewards//std": 0.016597915440797806, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1832, + "grad_norm": 1.322574257850647, + "kl": 0.2680364288389683, + "learning_rate": 9.265272052770935e-07, + "loss": 0.0107, + "num_tokens": 6658385.0, + "reward": 0.86407470703125, + "reward_std": 0.014182406477630138, + "rewards//mean": 0.86407470703125, + "rewards//std": 0.02244703285396099, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1834, + "grad_norm": 1.6013283729553223, + "kl": 0.2930336557328701, + "learning_rate": 9.263615282664388e-07, + "loss": 0.0117, + "num_tokens": 6665697.0, + "reward": 0.8607177734375, + "reward_std": 0.02229662798345089, + "rewards//mean": 0.8607177734375, + "rewards//std": 0.035937897861003876, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1836, + "grad_norm": 1.4464163780212402, + "kl": 0.2766840700060129, + "learning_rate": 9.261956795174115e-07, + "loss": 0.0111, + "num_tokens": 6673297.0, + "reward": 0.86932373046875, + "reward_std": 0.016279064118862152, + "rewards//mean": 0.86932373046875, + "rewards//std": 0.029272517189383507, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1838, + "grad_norm": 1.374275803565979, + "kl": 0.2975800633430481, + "learning_rate": 9.260296590968156e-07, + "loss": 0.0119, + "num_tokens": 6680593.0, + "reward": 0.85986328125, + "reward_std": 0.012850064784288406, + "rewards//mean": 0.85986328125, + "rewards//std": 0.02489573322236538, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.184, + "grad_norm": 1.4675790071487427, + "kl": 0.2572833374142647, + "learning_rate": 9.258634670715237e-07, + "loss": 0.0103, + "num_tokens": 6687873.0, + "reward": 0.86492919921875, + "reward_std": 0.01854993775486946, + "rewards//mean": 0.86492919921875, + "rewards//std": 0.02271118201315403, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1842, + "grad_norm": 1.4902923107147217, + "kl": 0.27219419553875923, + "learning_rate": 9.256971035084784e-07, + "loss": 0.0109, + "num_tokens": 6695145.0, + "reward": 0.84527587890625, + "reward_std": 0.013368219137191772, + "rewards//mean": 0.84527587890625, + "rewards//std": 0.02740672044456005, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1844, + "grad_norm": 1.6446360349655151, + "kl": 0.2975337989628315, + "learning_rate": 9.255305684746907e-07, + "loss": 0.0119, + "num_tokens": 6702313.0, + "reward": 0.84918212890625, + "reward_std": 0.01459230575710535, + "rewards//mean": 0.84918212890625, + "rewards//std": 0.025071369484066963, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1846, + "grad_norm": 1.4976067543029785, + "kl": 0.2833384405821562, + "learning_rate": 9.253638620372408e-07, + "loss": 0.0113, + "num_tokens": 6709593.0, + "reward": 0.88641357421875, + "reward_std": 0.013253288343548775, + "rewards//mean": 0.88641357421875, + "rewards//std": 0.018731191754341125, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1848, + "grad_norm": 1.4137479066848755, + "kl": 0.2893131896853447, + "learning_rate": 9.251969842632783e-07, + "loss": 0.0116, + "num_tokens": 6716849.0, + "reward": 0.87054443359375, + "reward_std": 0.013486528769135475, + "rewards//mean": 0.87054443359375, + "rewards//std": 0.016126427799463272, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.185, + "grad_norm": 1.3857172727584839, + "kl": 0.3088892437517643, + "learning_rate": 9.250299352200212e-07, + "loss": 0.0124, + "num_tokens": 6724153.0, + "reward": 0.848876953125, + "reward_std": 0.01381350215524435, + "rewards//mean": 0.848876953125, + "rewards//std": 0.02582569606602192, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1852, + "grad_norm": 1.551288366317749, + "kl": 0.2794480510056019, + "learning_rate": 9.248627149747572e-07, + "loss": 0.0112, + "num_tokens": 6731449.0, + "reward": 0.86199951171875, + "reward_std": 0.024598252028226852, + "rewards//mean": 0.86199951171875, + "rewards//std": 0.0375581793487072, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.1854, + "grad_norm": 1.434441328048706, + "kl": 0.3836228381842375, + "learning_rate": 9.246953235948422e-07, + "loss": 0.017, + "num_tokens": 6738697.0, + "reward": 0.8013916015625, + "reward_std": 0.014746603555977345, + "rewards//mean": 0.8013916015625, + "rewards//std": 0.018377620726823807, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1856, + "grad_norm": 1.7647937536239624, + "kl": 0.29338979348540306, + "learning_rate": 9.245277611477018e-07, + "loss": 0.0117, + "num_tokens": 6745985.0, + "reward": 0.81048583984375, + "reward_std": 0.016923287883400917, + "rewards//mean": 0.81048583984375, + "rewards//std": 0.017385786399245262, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1858, + "grad_norm": 1.6195451021194458, + "kl": 0.29230377078056335, + "learning_rate": 9.2436002770083e-07, + "loss": 0.0117, + "num_tokens": 6753377.0, + "reward": 0.8641357421875, + "reward_std": 0.017138758674263954, + "rewards//mean": 0.8641357421875, + "rewards//std": 0.021815448999404907, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.186, + "grad_norm": 1.7394155263900757, + "kl": 0.32355763763189316, + "learning_rate": 9.241921233217897e-07, + "loss": 0.0129, + "num_tokens": 6760697.0, + "reward": 0.83935546875, + "reward_std": 0.017016177996993065, + "rewards//mean": 0.83935546875, + "rewards//std": 0.0254061222076416, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1862, + "grad_norm": 1.785834550857544, + "kl": 0.34111603535711765, + "learning_rate": 9.240240480782129e-07, + "loss": 0.0136, + "num_tokens": 6767961.0, + "reward": 0.79449462890625, + "reward_std": 0.011081939563155174, + "rewards//mean": 0.79449462890625, + "rewards//std": 0.025426296517252922, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1864, + "grad_norm": 1.6231780052185059, + "kl": 0.2730038370937109, + "learning_rate": 9.238558020378003e-07, + "loss": 0.0109, + "num_tokens": 6775257.0, + "reward": 0.82611083984375, + "reward_std": 0.014123712666332722, + "rewards//mean": 0.82611083984375, + "rewards//std": 0.027125800028443336, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.1866, + "grad_norm": 1.4732736349105835, + "kl": 0.25376254692673683, + "learning_rate": 9.236873852683212e-07, + "loss": 0.0106, + "num_tokens": 6782559.0, + "reward": 0.86553955078125, + "reward_std": 0.01672912761569023, + "rewards//mean": 0.86553955078125, + "rewards//std": 0.02648254670202732, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.1868, + "grad_norm": 1.9056611061096191, + "kl": 0.2815764117985964, + "learning_rate": 9.235187978376141e-07, + "loss": 0.0114, + "num_tokens": 6789877.0, + "reward": 0.84930419921875, + "reward_std": 0.019669581204652786, + "rewards//mean": 0.84930419921875, + "rewards//std": 0.02814948745071888, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.187, + "grad_norm": 1.5390018224716187, + "kl": 0.2624163944274187, + "learning_rate": 9.233500398135858e-07, + "loss": 0.0105, + "num_tokens": 6797133.0, + "reward": 0.85089111328125, + "reward_std": 0.018158117309212685, + "rewards//mean": 0.85089111328125, + "rewards//std": 0.03358520567417145, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1872, + "grad_norm": 1.0366404056549072, + "kl": 0.24765265174210072, + "learning_rate": 9.23181111264212e-07, + "loss": 0.0099, + "num_tokens": 6804485.0, + "reward": 0.84490966796875, + "reward_std": 0.010842327028512955, + "rewards//mean": 0.84490966796875, + "rewards//std": 0.0175340436398983, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1874, + "grad_norm": 1.3878121376037598, + "kl": 0.2721179574728012, + "learning_rate": 9.230120122575375e-07, + "loss": 0.0109, + "num_tokens": 6811797.0, + "reward": 0.87469482421875, + "reward_std": 0.021142028272151947, + "rewards//mean": 0.87469482421875, + "rewards//std": 0.026913465932011604, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1876, + "grad_norm": 1.5936589241027832, + "kl": 0.3385761044919491, + "learning_rate": 9.228427428616748e-07, + "loss": 0.0135, + "num_tokens": 6819077.0, + "reward": 0.83099365234375, + "reward_std": 0.012895422987639904, + "rewards//mean": 0.83099365234375, + "rewards//std": 0.022512352094054222, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1878, + "grad_norm": 1.3902446031570435, + "kl": 0.2676719222217798, + "learning_rate": 9.22673303144806e-07, + "loss": 0.0107, + "num_tokens": 6826317.0, + "reward": 0.8741455078125, + "reward_std": 0.014871601946651936, + "rewards//mean": 0.8741455078125, + "rewards//std": 0.03359179571270943, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.188, + "grad_norm": 1.3851964473724365, + "kl": 0.27285304106771946, + "learning_rate": 9.22503693175181e-07, + "loss": 0.0109, + "num_tokens": 6833533.0, + "reward": 0.8629150390625, + "reward_std": 0.01862572506070137, + "rewards//mean": 0.8629150390625, + "rewards//std": 0.026691291481256485, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1882, + "grad_norm": 1.4476288557052612, + "kl": 0.3337275367230177, + "learning_rate": 9.223339130211192e-07, + "loss": 0.0133, + "num_tokens": 6840757.0, + "reward": 0.86279296875, + "reward_std": 0.017014414072036743, + "rewards//mean": 0.86279296875, + "rewards//std": 0.024384593591094017, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1884, + "grad_norm": 1.3864424228668213, + "kl": 0.25808248668909073, + "learning_rate": 9.221639627510075e-07, + "loss": 0.0103, + "num_tokens": 6848037.0, + "reward": 0.867919921875, + "reward_std": 0.01657295599579811, + "rewards//mean": 0.867919921875, + "rewards//std": 0.0232806745916605, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.1886, + "grad_norm": 1.6590120792388916, + "kl": 0.39298348873853683, + "learning_rate": 9.219938424333023e-07, + "loss": 0.015, + "num_tokens": 6855426.0, + "reward": 0.83197021484375, + "reward_std": 0.01740080863237381, + "rewards//mean": 0.83197021484375, + "rewards//std": 0.02403435856103897, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1888, + "grad_norm": 1.6118922233581543, + "kl": 0.3280082009732723, + "learning_rate": 9.218235521365276e-07, + "loss": 0.0131, + "num_tokens": 6862730.0, + "reward": 0.81304931640625, + "reward_std": 0.01629365049302578, + "rewards//mean": 0.81304931640625, + "rewards//std": 0.022997979074716568, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.189, + "grad_norm": 1.6022790670394897, + "kl": 0.2535574957728386, + "learning_rate": 9.216530919292767e-07, + "loss": 0.0101, + "num_tokens": 6870026.0, + "reward": 0.8682861328125, + "reward_std": 0.018257353454828262, + "rewards//mean": 0.8682861328125, + "rewards//std": 0.022814013063907623, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.3125, + "epoch": 0.1892, + "grad_norm": 2.187138557434082, + "kl": 0.2769910991191864, + "learning_rate": 9.214824618802107e-07, + "loss": -0.0053, + "num_tokens": 6877270.0, + "reward": 0.8612060546875, + "reward_std": 0.01827901601791382, + "rewards//mean": 0.8612060546875, + "rewards//std": 0.026684483513236046, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1894, + "grad_norm": 1.5916674137115479, + "kl": 0.2682803329080343, + "learning_rate": 9.213116620580596e-07, + "loss": 0.0107, + "num_tokens": 6884454.0, + "reward": 0.80316162109375, + "reward_std": 0.012032775208353996, + "rewards//mean": 0.80316162109375, + "rewards//std": 0.022440964356064796, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1896, + "grad_norm": 1.3456374406814575, + "kl": 0.2993052005767822, + "learning_rate": 9.211406925316212e-07, + "loss": 0.012, + "num_tokens": 6891726.0, + "reward": 0.85113525390625, + "reward_std": 0.015091245993971825, + "rewards//mean": 0.85113525390625, + "rewards//std": 0.026658590883016586, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1898, + "grad_norm": 1.815134882926941, + "kl": 0.27939172834157944, + "learning_rate": 9.209695533697623e-07, + "loss": 0.0112, + "num_tokens": 6898918.0, + "reward": 0.8687744140625, + "reward_std": 0.020937826484441757, + "rewards//mean": 0.8687744140625, + "rewards//std": 0.03164065629243851, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.19, + "grad_norm": 1.7557371854782104, + "kl": 0.2593674473464489, + "learning_rate": 9.207982446414177e-07, + "loss": 0.0104, + "num_tokens": 6906142.0, + "reward": 0.8380126953125, + "reward_std": 0.014308642596006393, + "rewards//mean": 0.8380126953125, + "rewards//std": 0.016157962381839752, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1902, + "grad_norm": 1.3839820623397827, + "kl": 0.3047237992286682, + "learning_rate": 9.206267664155906e-07, + "loss": 0.0122, + "num_tokens": 6913454.0, + "reward": 0.8564453125, + "reward_std": 0.019890882074832916, + "rewards//mean": 0.8564453125, + "rewards//std": 0.032061029225587845, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1904, + "grad_norm": 1.4495247602462769, + "kl": 0.33429375290870667, + "learning_rate": 9.20455118761352e-07, + "loss": 0.0134, + "num_tokens": 6920718.0, + "reward": 0.85107421875, + "reward_std": 0.01951078325510025, + "rewards//mean": 0.85107421875, + "rewards//std": 0.023318355903029442, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.1906, + "grad_norm": 1.7163975238800049, + "kl": 0.2760246340185404, + "learning_rate": 9.202833017478421e-07, + "loss": 0.0163, + "num_tokens": 6928046.0, + "reward": 0.8294677734375, + "reward_std": 0.018185680732131004, + "rewards//mean": 0.8294677734375, + "rewards//std": 0.02636718936264515, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1908, + "grad_norm": 1.584498643875122, + "kl": 0.27105364575982094, + "learning_rate": 9.201113154442683e-07, + "loss": 0.0108, + "num_tokens": 6935358.0, + "reward": 0.82818603515625, + "reward_std": 0.0172556284815073, + "rewards//mean": 0.82818603515625, + "rewards//std": 0.01831522397696972, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.191, + "grad_norm": 1.3614630699157715, + "kl": 0.3561857156455517, + "learning_rate": 9.199391599199071e-07, + "loss": 0.0025, + "num_tokens": 6942594.0, + "reward": 0.84112548828125, + "reward_std": 0.01845856010913849, + "rewards//mean": 0.84112548828125, + "rewards//std": 0.02364383637905121, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.1912, + "grad_norm": 1.5932536125183105, + "kl": 0.21216821111738682, + "learning_rate": 9.197668352441023e-07, + "loss": 0.011, + "num_tokens": 6949934.0, + "reward": 0.858642578125, + "reward_std": 0.017951373010873795, + "rewards//mean": 0.858642578125, + "rewards//std": 0.03394615277647972, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.1914, + "grad_norm": 1.599956750869751, + "kl": 0.2627936005592346, + "learning_rate": 9.195943414862665e-07, + "loss": 0.0047, + "num_tokens": 6957197.0, + "reward": 0.87103271484375, + "reward_std": 0.02025160938501358, + "rewards//mean": 0.87103271484375, + "rewards//std": 0.03132348507642746, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1916, + "grad_norm": 1.3740862607955933, + "kl": 0.2925429530441761, + "learning_rate": 9.194216787158804e-07, + "loss": 0.0117, + "num_tokens": 6964549.0, + "reward": 0.8526611328125, + "reward_std": 0.016533691436052322, + "rewards//mean": 0.8526611328125, + "rewards//std": 0.022417763248085976, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1918, + "grad_norm": 1.472395420074463, + "kl": 0.2981392238289118, + "learning_rate": 9.192488470024919e-07, + "loss": 0.0119, + "num_tokens": 6971861.0, + "reward": 0.8155517578125, + "reward_std": 0.011286063119769096, + "rewards//mean": 0.8155517578125, + "rewards//std": 0.02036857046186924, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.192, + "grad_norm": 1.7898032665252686, + "kl": 0.27251823619008064, + "learning_rate": 9.190758464157182e-07, + "loss": 0.0109, + "num_tokens": 6979109.0, + "reward": 0.86431884765625, + "reward_std": 0.017122235149145126, + "rewards//mean": 0.86431884765625, + "rewards//std": 0.022000884637236595, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1922, + "grad_norm": 1.5181336402893066, + "kl": 0.27799420058727264, + "learning_rate": 9.189026770252436e-07, + "loss": 0.0111, + "num_tokens": 6986437.0, + "reward": 0.8226318359375, + "reward_std": 0.02269936539232731, + "rewards//mean": 0.8226318359375, + "rewards//std": 0.028208915144205093, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1924, + "grad_norm": 1.4149967432022095, + "kl": 0.27177889086306095, + "learning_rate": 9.187293389008208e-07, + "loss": 0.0109, + "num_tokens": 6993725.0, + "reward": 0.8143310546875, + "reward_std": 0.017867321148514748, + "rewards//mean": 0.8143310546875, + "rewards//std": 0.022627461701631546, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1926, + "grad_norm": 1.640693187713623, + "kl": 0.27410155162215233, + "learning_rate": 9.185558321122704e-07, + "loss": 0.011, + "num_tokens": 7001085.0, + "reward": 0.83868408203125, + "reward_std": 0.01781122200191021, + "rewards//mean": 0.83868408203125, + "rewards//std": 0.020666180178523064, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1928, + "grad_norm": 1.53792405128479, + "kl": 0.2535569379106164, + "learning_rate": 9.183821567294808e-07, + "loss": 0.0101, + "num_tokens": 7008365.0, + "reward": 0.83831787109375, + "reward_std": 0.01457600761204958, + "rewards//mean": 0.83831787109375, + "rewards//std": 0.021197721362113953, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.193, + "grad_norm": 1.4017175436019897, + "kl": 0.2707077134400606, + "learning_rate": 9.182083128224086e-07, + "loss": 0.0108, + "num_tokens": 7015693.0, + "reward": 0.8521728515625, + "reward_std": 0.015689954161643982, + "rewards//mean": 0.8521728515625, + "rewards//std": 0.02630280889570713, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1932, + "grad_norm": 2.366211175918579, + "kl": 0.26356519013643265, + "learning_rate": 9.180343004610779e-07, + "loss": 0.0105, + "num_tokens": 7022989.0, + "reward": 0.827392578125, + "reward_std": 0.012868167832493782, + "rewards//mean": 0.827392578125, + "rewards//std": 0.026792334392666817, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1934, + "grad_norm": 1.2503937482833862, + "kl": 0.25284795090556145, + "learning_rate": 9.178601197155811e-07, + "loss": 0.0101, + "num_tokens": 7030325.0, + "reward": 0.8345947265625, + "reward_std": 0.013466108590364456, + "rewards//mean": 0.8345947265625, + "rewards//std": 0.03241577744483948, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1936, + "grad_norm": 1.7107510566711426, + "kl": 0.28458115458488464, + "learning_rate": 9.176857706560779e-07, + "loss": 0.0114, + "num_tokens": 7037597.0, + "reward": 0.839599609375, + "reward_std": 0.014125293120741844, + "rewards//mean": 0.839599609375, + "rewards//std": 0.030031709000468254, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1938, + "grad_norm": 1.3375592231750488, + "kl": 0.23313883505761623, + "learning_rate": 9.175112533527963e-07, + "loss": 0.0093, + "num_tokens": 7044901.0, + "reward": 0.75390625, + "reward_std": 0.010236109606921673, + "rewards//mean": 0.75390625, + "rewards//std": 0.015453549101948738, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.194, + "grad_norm": 1.8244404792785645, + "kl": 0.33404130674898624, + "learning_rate": 9.173365678760317e-07, + "loss": 0.0134, + "num_tokens": 7052285.0, + "reward": 0.8258056640625, + "reward_std": 0.026936283335089684, + "rewards//mean": 0.8258056640625, + "rewards//std": 0.03547323867678642, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1942, + "grad_norm": 1.4581828117370605, + "kl": 0.27170366793870926, + "learning_rate": 9.171617142961476e-07, + "loss": 0.0109, + "num_tokens": 7059565.0, + "reward": 0.821044921875, + "reward_std": 0.010899350047111511, + "rewards//mean": 0.821044921875, + "rewards//std": 0.017973264679312706, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.1944, + "grad_norm": 1.3672025203704834, + "kl": 0.3223393093794584, + "learning_rate": 9.169866926835747e-07, + "loss": 0.0096, + "num_tokens": 7066808.0, + "reward": 0.8253173828125, + "reward_std": 0.02105821669101715, + "rewards//mean": 0.8253173828125, + "rewards//std": 0.023780852556228638, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1946, + "grad_norm": 1.723407506942749, + "kl": 0.25767615996301174, + "learning_rate": 9.16811503108812e-07, + "loss": 0.0103, + "num_tokens": 7074240.0, + "reward": 0.80816650390625, + "reward_std": 0.01845592074096203, + "rewards//mean": 0.80816650390625, + "rewards//std": 0.0317964181303978, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1948, + "grad_norm": 1.6233340501785278, + "kl": 0.2723458744585514, + "learning_rate": 9.166361456424257e-07, + "loss": 0.0109, + "num_tokens": 7081456.0, + "reward": 0.88922119140625, + "reward_std": 0.015013474971055984, + "rewards//mean": 0.88922119140625, + "rewards//std": 0.02096506953239441, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.195, + "grad_norm": 1.9742152690887451, + "kl": 0.24852723628282547, + "learning_rate": 9.164606203550497e-07, + "loss": 0.008, + "num_tokens": 7088864.0, + "reward": 0.7955322265625, + "reward_std": 0.011826101690530777, + "rewards//mean": 0.7955322265625, + "rewards//std": 0.01604137010872364, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1952, + "grad_norm": 1.76129150390625, + "kl": 0.25620356388390064, + "learning_rate": 9.162849273173856e-07, + "loss": 0.0102, + "num_tokens": 7096288.0, + "reward": 0.8555908203125, + "reward_std": 0.022052085027098656, + "rewards//mean": 0.8555908203125, + "rewards//std": 0.028956720605492592, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1954, + "grad_norm": 1.5876294374465942, + "kl": 0.28523839078843594, + "learning_rate": 9.161090666002027e-07, + "loss": 0.0114, + "num_tokens": 7103528.0, + "reward": 0.84820556640625, + "reward_std": 0.010693060234189034, + "rewards//mean": 0.84820556640625, + "rewards//std": 0.021875301375985146, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1956, + "grad_norm": 1.3972828388214111, + "kl": 0.25338258128613234, + "learning_rate": 9.159330382743373e-07, + "loss": 0.0101, + "num_tokens": 7110864.0, + "reward": 0.87408447265625, + "reward_std": 0.01629462279379368, + "rewards//mean": 0.87408447265625, + "rewards//std": 0.02395613305270672, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1958, + "grad_norm": 1.6152536869049072, + "kl": 0.2625191677361727, + "learning_rate": 9.157568424106941e-07, + "loss": 0.0105, + "num_tokens": 7118176.0, + "reward": 0.84112548828125, + "reward_std": 0.016102679073810577, + "rewards//mean": 0.84112548828125, + "rewards//std": 0.01834907941520214, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.196, + "grad_norm": 1.3965927362442017, + "kl": 0.32916584238409996, + "learning_rate": 9.155804790802443e-07, + "loss": -0.0167, + "num_tokens": 7125485.0, + "reward": 0.82159423828125, + "reward_std": 0.013638921082019806, + "rewards//mean": 0.82159423828125, + "rewards//std": 0.018441244959831238, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1962, + "grad_norm": 1.3552148342132568, + "kl": 0.2809001952409744, + "learning_rate": 9.154039483540272e-07, + "loss": 0.0112, + "num_tokens": 7132757.0, + "reward": 0.849853515625, + "reward_std": 0.021047620102763176, + "rewards//mean": 0.849853515625, + "rewards//std": 0.02768157795071602, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.1964, + "grad_norm": 1.714436411857605, + "kl": 0.2717098630964756, + "learning_rate": 9.152272503031495e-07, + "loss": -0.0048, + "num_tokens": 7140044.0, + "reward": 0.79296875, + "reward_std": 0.011610167101025581, + "rewards//mean": 0.79296875, + "rewards//std": 0.01862349919974804, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1966, + "grad_norm": 1.365868091583252, + "kl": 0.39328265376389027, + "learning_rate": 9.150503849987851e-07, + "loss": 0.0157, + "num_tokens": 7147284.0, + "reward": 0.86962890625, + "reward_std": 0.01820511370897293, + "rewards//mean": 0.86962890625, + "rewards//std": 0.030325647443532944, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1968, + "grad_norm": 1.6765756607055664, + "kl": 0.3309708647429943, + "learning_rate": 9.14873352512175e-07, + "loss": 0.0132, + "num_tokens": 7154580.0, + "reward": 0.84259033203125, + "reward_std": 0.013551724143326283, + "rewards//mean": 0.84259033203125, + "rewards//std": 0.019170524552464485, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.197, + "grad_norm": 1.2856684923171997, + "kl": 0.29120177403092384, + "learning_rate": 9.146961529146284e-07, + "loss": 0.0116, + "num_tokens": 7161844.0, + "reward": 0.8553466796875, + "reward_std": 0.018655089661478996, + "rewards//mean": 0.8553466796875, + "rewards//std": 0.024428939446806908, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.1972, + "grad_norm": 28.28349494934082, + "kl": 1.72380980104208, + "learning_rate": 9.145187862775208e-07, + "loss": 0.0686, + "num_tokens": 7169075.0, + "reward": 0.81488037109375, + "reward_std": 0.018393974751234055, + "rewards//mean": 0.81488037109375, + "rewards//std": 0.029311275109648705, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1974, + "grad_norm": 1.29963219165802, + "kl": 0.3083583880215883, + "learning_rate": 9.143412526722958e-07, + "loss": 0.0123, + "num_tokens": 7176267.0, + "reward": 0.82635498046875, + "reward_std": 0.013727284036576748, + "rewards//mean": 0.82635498046875, + "rewards//std": 0.027934638783335686, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1976, + "grad_norm": 1.6141998767852783, + "kl": 0.2906055115163326, + "learning_rate": 9.141635521704636e-07, + "loss": 0.0116, + "num_tokens": 7183619.0, + "reward": 0.8060302734375, + "reward_std": 0.015219033695757389, + "rewards//mean": 0.8060302734375, + "rewards//std": 0.023043762892484665, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1978, + "grad_norm": 1.3118774890899658, + "kl": 0.2931559681892395, + "learning_rate": 9.139856848436023e-07, + "loss": 0.0117, + "num_tokens": 7190955.0, + "reward": 0.86358642578125, + "reward_std": 0.0149539764970541, + "rewards//mean": 0.86358642578125, + "rewards//std": 0.02364639937877655, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.198, + "grad_norm": 1.298476219177246, + "kl": 0.2458500899374485, + "learning_rate": 9.138076507633565e-07, + "loss": 0.0098, + "num_tokens": 7198371.0, + "reward": 0.87738037109375, + "reward_std": 0.018017351627349854, + "rewards//mean": 0.87738037109375, + "rewards//std": 0.02545485831797123, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1982, + "grad_norm": 2.52919340133667, + "kl": 0.522140403278172, + "learning_rate": 9.136294500014385e-07, + "loss": 0.0209, + "num_tokens": 7205683.0, + "reward": 0.798583984375, + "reward_std": 0.014768850058317184, + "rewards//mean": 0.798583984375, + "rewards//std": 0.023363754153251648, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1984, + "grad_norm": 1.394331455230713, + "kl": 0.27060199715197086, + "learning_rate": 9.134510826296276e-07, + "loss": 0.0108, + "num_tokens": 7213043.0, + "reward": 0.80615234375, + "reward_std": 0.013855718076229095, + "rewards//mean": 0.80615234375, + "rewards//std": 0.01581757515668869, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1986, + "grad_norm": 1.8186156749725342, + "kl": 0.29676372557878494, + "learning_rate": 9.1327254871977e-07, + "loss": 0.0119, + "num_tokens": 7220291.0, + "reward": 0.89068603515625, + "reward_std": 0.018382033333182335, + "rewards//mean": 0.89068603515625, + "rewards//std": 0.026689235121011734, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1988, + "grad_norm": 1.4798511266708374, + "kl": 0.2885011751204729, + "learning_rate": 9.130938483437791e-07, + "loss": 0.0115, + "num_tokens": 7227611.0, + "reward": 0.808837890625, + "reward_std": 0.0156979039311409, + "rewards//mean": 0.808837890625, + "rewards//std": 0.02639157697558403, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.199, + "grad_norm": 1.5187963247299194, + "kl": 0.28327991627156734, + "learning_rate": 9.129149815736357e-07, + "loss": 0.0113, + "num_tokens": 7234939.0, + "reward": 0.79534912109375, + "reward_std": 0.010961196385324001, + "rewards//mean": 0.79534912109375, + "rewards//std": 0.01618265174329281, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1992, + "grad_norm": 1.5058679580688477, + "kl": 0.2557656653225422, + "learning_rate": 9.12735948481387e-07, + "loss": 0.0102, + "num_tokens": 7242251.0, + "reward": 0.847900390625, + "reward_std": 0.01865149475634098, + "rewards//mean": 0.847900390625, + "rewards//std": 0.021786632016301155, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1994, + "grad_norm": 1.6553444862365723, + "kl": 0.2805487122386694, + "learning_rate": 9.125567491391475e-07, + "loss": 0.0112, + "num_tokens": 7249619.0, + "reward": 0.79541015625, + "reward_std": 0.020988380536437035, + "rewards//mean": 0.79541015625, + "rewards//std": 0.02696916088461876, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1996, + "grad_norm": 1.1820542812347412, + "kl": 0.24605071917176247, + "learning_rate": 9.123773836190989e-07, + "loss": 0.0098, + "num_tokens": 7257059.0, + "reward": 0.850341796875, + "reward_std": 0.016207000240683556, + "rewards//mean": 0.850341796875, + "rewards//std": 0.027890779078006744, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.1998, + "grad_norm": 1.5638903379440308, + "kl": 0.2712193988263607, + "learning_rate": 9.121978519934895e-07, + "loss": 0.0108, + "num_tokens": 7264307.0, + "reward": 0.84893798828125, + "reward_std": 0.016866926103830338, + "rewards//mean": 0.84893798828125, + "rewards//std": 0.02816024050116539, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2, + "grad_norm": 1.5844074487686157, + "kl": 0.2647052016109228, + "learning_rate": 9.120181543346346e-07, + "loss": 0.0106, + "num_tokens": 7271627.0, + "reward": 0.83416748046875, + "reward_std": 0.018513580784201622, + "rewards//mean": 0.83416748046875, + "rewards//std": 0.02366303652524948, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.2002, + "grad_norm": 1.6165317296981812, + "kl": 0.2896607667207718, + "learning_rate": 9.118382907149163e-07, + "loss": 0.0126, + "num_tokens": 7278849.0, + "reward": 0.8670654296875, + "reward_std": 0.01939752697944641, + "rewards//mean": 0.8670654296875, + "rewards//std": 0.022282304242253304, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2004, + "grad_norm": 1.5347421169281006, + "kl": 0.23404448851943016, + "learning_rate": 9.116582612067838e-07, + "loss": 0.0094, + "num_tokens": 7286185.0, + "reward": 0.8526611328125, + "reward_std": 0.017334189265966415, + "rewards//mean": 0.8526611328125, + "rewards//std": 0.030690591782331467, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2006, + "grad_norm": 1.6108916997909546, + "kl": 0.2646257672458887, + "learning_rate": 9.11478065882753e-07, + "loss": 0.0106, + "num_tokens": 7293593.0, + "reward": 0.85601806640625, + "reward_std": 0.01590588688850403, + "rewards//mean": 0.85601806640625, + "rewards//std": 0.03519320860505104, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2008, + "grad_norm": 1.4960722923278809, + "kl": 0.2693611662834883, + "learning_rate": 9.112977048154064e-07, + "loss": 0.0108, + "num_tokens": 7300889.0, + "reward": 0.83465576171875, + "reward_std": 0.014412742108106613, + "rewards//mean": 0.83465576171875, + "rewards//std": 0.027709385380148888, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.201, + "grad_norm": 1.5316067934036255, + "kl": 0.3284708745777607, + "learning_rate": 9.111171780773936e-07, + "loss": 0.0131, + "num_tokens": 7308089.0, + "reward": 0.835205078125, + "reward_std": 0.017600711435079575, + "rewards//mean": 0.835205078125, + "rewards//std": 0.02845815010368824, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.2012, + "grad_norm": 1.5380256175994873, + "kl": 0.287580793723464, + "learning_rate": 9.109364857414305e-07, + "loss": 0.0095, + "num_tokens": 7315383.0, + "reward": 0.82647705078125, + "reward_std": 0.022789932787418365, + "rewards//mean": 0.82647705078125, + "rewards//std": 0.02705259434878826, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2014, + "grad_norm": 1.3942116498947144, + "kl": 0.31265921890735626, + "learning_rate": 9.107556278803002e-07, + "loss": 0.0125, + "num_tokens": 7322671.0, + "reward": 0.81341552734375, + "reward_std": 0.014851601794362068, + "rewards//mean": 0.81341552734375, + "rewards//std": 0.020865188911557198, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2016, + "grad_norm": 2.0492236614227295, + "kl": 0.29369689524173737, + "learning_rate": 9.10574604566852e-07, + "loss": 0.0117, + "num_tokens": 7329975.0, + "reward": 0.8499755859375, + "reward_std": 0.015038689598441124, + "rewards//mean": 0.8499755859375, + "rewards//std": 0.026380963623523712, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2018, + "grad_norm": 1.4978117942810059, + "kl": 0.28624070435762405, + "learning_rate": 9.103934158740022e-07, + "loss": 0.0114, + "num_tokens": 7337335.0, + "reward": 0.87945556640625, + "reward_std": 0.021770721301436424, + "rewards//mean": 0.87945556640625, + "rewards//std": 0.03032059222459793, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.202, + "grad_norm": 1.5066719055175781, + "kl": 0.2469142135232687, + "learning_rate": 9.102120618747336e-07, + "loss": 0.0099, + "num_tokens": 7344663.0, + "reward": 0.84490966796875, + "reward_std": 0.014753270894289017, + "rewards//mean": 0.84490966796875, + "rewards//std": 0.01562003418803215, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2022, + "grad_norm": 1.519261360168457, + "kl": 0.30959648825228214, + "learning_rate": 9.100305426420956e-07, + "loss": 0.0124, + "num_tokens": 7352039.0, + "reward": 0.83526611328125, + "reward_std": 0.017698127776384354, + "rewards//mean": 0.83526611328125, + "rewards//std": 0.028760211542248726, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.2024, + "grad_norm": 1.4792993068695068, + "kl": 0.2664998099207878, + "learning_rate": 9.098488582492039e-07, + "loss": 0.007, + "num_tokens": 7359299.0, + "reward": 0.84307861328125, + "reward_std": 0.014557499438524246, + "rewards//mean": 0.84307861328125, + "rewards//std": 0.02354631945490837, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2026, + "grad_norm": 1.5726137161254883, + "kl": 0.2845254521816969, + "learning_rate": 9.096670087692411e-07, + "loss": 0.0114, + "num_tokens": 7366579.0, + "reward": 0.84796142578125, + "reward_std": 0.019523650407791138, + "rewards//mean": 0.84796142578125, + "rewards//std": 0.030901648104190826, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.2028, + "grad_norm": 1.3448067903518677, + "kl": 0.29310030303895473, + "learning_rate": 9.094849942754563e-07, + "loss": 0.0181, + "num_tokens": 7373982.0, + "reward": 0.8836669921875, + "reward_std": 0.015273809432983398, + "rewards//mean": 0.8836669921875, + "rewards//std": 0.029565107077360153, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.203, + "grad_norm": 1.548168659210205, + "kl": 0.26194378547370434, + "learning_rate": 9.093028148411648e-07, + "loss": -0.0128, + "num_tokens": 7381274.0, + "reward": 0.83612060546875, + "reward_std": 0.01924586296081543, + "rewards//mean": 0.83612060546875, + "rewards//std": 0.025812137871980667, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2032, + "grad_norm": 1.5940430164337158, + "kl": 0.3436625264585018, + "learning_rate": 9.091204705397483e-07, + "loss": 0.0137, + "num_tokens": 7388602.0, + "reward": 0.85821533203125, + "reward_std": 0.018193718045949936, + "rewards//mean": 0.85821533203125, + "rewards//std": 0.03421482443809509, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2034, + "grad_norm": 1.6026945114135742, + "kl": 0.28586328960955143, + "learning_rate": 9.089379614446553e-07, + "loss": 0.0114, + "num_tokens": 7395914.0, + "reward": 0.870361328125, + "reward_std": 0.017290808260440826, + "rewards//mean": 0.870361328125, + "rewards//std": 0.02493825927376747, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2036, + "grad_norm": 1.4881082773208618, + "kl": 0.30238936841487885, + "learning_rate": 9.087552876294002e-07, + "loss": 0.0121, + "num_tokens": 7403098.0, + "reward": 0.7813720703125, + "reward_std": 0.014119284227490425, + "rewards//mean": 0.7813720703125, + "rewards//std": 0.026747945696115494, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2038, + "grad_norm": 1.261290431022644, + "kl": 0.24675209447741508, + "learning_rate": 9.085724491675642e-07, + "loss": 0.0099, + "num_tokens": 7410354.0, + "reward": 0.85223388671875, + "reward_std": 0.014088775962591171, + "rewards//mean": 0.85223388671875, + "rewards//std": 0.023463886231184006, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.204, + "grad_norm": 1.3138515949249268, + "kl": 0.3298809230327606, + "learning_rate": 9.083894461327945e-07, + "loss": 0.0132, + "num_tokens": 7417674.0, + "reward": 0.86517333984375, + "reward_std": 0.020085645839571953, + "rewards//mean": 0.86517333984375, + "rewards//std": 0.03316018357872963, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2042, + "grad_norm": 1.667799949645996, + "kl": 0.30730950832366943, + "learning_rate": 9.082062785988048e-07, + "loss": 0.0123, + "num_tokens": 7424906.0, + "reward": 0.8472900390625, + "reward_std": 0.022130627185106277, + "rewards//mean": 0.8472900390625, + "rewards//std": 0.032374657690525055, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2044, + "grad_norm": 1.3660027980804443, + "kl": 0.27062366530299187, + "learning_rate": 9.080229466393749e-07, + "loss": 0.0108, + "num_tokens": 7432266.0, + "reward": 0.8419189453125, + "reward_std": 0.013266797177493572, + "rewards//mean": 0.8419189453125, + "rewards//std": 0.022107703611254692, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.2046, + "grad_norm": 1.5199333429336548, + "kl": 0.3231301326304674, + "learning_rate": 9.078394503283508e-07, + "loss": 0.0125, + "num_tokens": 7439593.0, + "reward": 0.82769775390625, + "reward_std": 0.01583663932979107, + "rewards//mean": 0.82769775390625, + "rewards//std": 0.018598996102809906, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2048, + "grad_norm": 1.6428008079528809, + "kl": 0.298568993806839, + "learning_rate": 9.076557897396451e-07, + "loss": 0.0119, + "num_tokens": 7446897.0, + "reward": 0.81787109375, + "reward_std": 0.016144737601280212, + "rewards//mean": 0.81787109375, + "rewards//std": 0.026443149894475937, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.205, + "grad_norm": 1.4515444040298462, + "kl": 0.2804217264056206, + "learning_rate": 9.074719649472357e-07, + "loss": 0.0112, + "num_tokens": 7454113.0, + "reward": 0.86553955078125, + "reward_std": 0.012626571580767632, + "rewards//mean": 0.86553955078125, + "rewards//std": 0.018737656995654106, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2052, + "grad_norm": 1.6939901113510132, + "kl": 0.29491949267685413, + "learning_rate": 9.072879760251679e-07, + "loss": 0.0118, + "num_tokens": 7461441.0, + "reward": 0.8297119140625, + "reward_std": 0.012870442122220993, + "rewards//mean": 0.8297119140625, + "rewards//std": 0.017030060291290283, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2054, + "grad_norm": 1.3032807111740112, + "kl": 0.34082455560564995, + "learning_rate": 9.071038230475519e-07, + "loss": 0.0136, + "num_tokens": 7468673.0, + "reward": 0.87371826171875, + "reward_std": 0.01984982192516327, + "rewards//mean": 0.87371826171875, + "rewards//std": 0.031558919697999954, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2056, + "grad_norm": 1.442789912223816, + "kl": 0.2694656252861023, + "learning_rate": 9.069195060885646e-07, + "loss": 0.0108, + "num_tokens": 7475937.0, + "reward": 0.86962890625, + "reward_std": 0.017983518540859222, + "rewards//mean": 0.86962890625, + "rewards//std": 0.02448371797800064, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2058, + "grad_norm": 1.1434813737869263, + "kl": 0.28347601741552353, + "learning_rate": 9.067350252224489e-07, + "loss": 0.0113, + "num_tokens": 7483281.0, + "reward": 0.83282470703125, + "reward_std": 0.0182790644466877, + "rewards//mean": 0.83282470703125, + "rewards//std": 0.029141908511519432, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.206, + "grad_norm": 1.3003278970718384, + "kl": 0.2543511278927326, + "learning_rate": 9.065503805235137e-07, + "loss": 0.0096, + "num_tokens": 7490501.0, + "reward": 0.86126708984375, + "reward_std": 0.01618809811770916, + "rewards//mean": 0.86126708984375, + "rewards//std": 0.034910768270492554, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2062, + "grad_norm": 2.059946298599243, + "kl": 0.3952889982610941, + "learning_rate": 9.06365572066134e-07, + "loss": 0.0158, + "num_tokens": 7497837.0, + "reward": 0.78021240234375, + "reward_std": 0.012896540574729443, + "rewards//mean": 0.78021240234375, + "rewards//std": 0.024961238726973534, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.2064, + "grad_norm": 1.4446262121200562, + "kl": 0.2953635323792696, + "learning_rate": 9.061805999247503e-07, + "loss": 0.0019, + "num_tokens": 7505214.0, + "reward": 0.81060791015625, + "reward_std": 0.01568615436553955, + "rewards//mean": 0.81060791015625, + "rewards//std": 0.018714213743805885, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2066, + "grad_norm": 1.4640775918960571, + "kl": 0.3072448931634426, + "learning_rate": 9.059954641738697e-07, + "loss": 0.0123, + "num_tokens": 7512446.0, + "reward": 0.83837890625, + "reward_std": 0.012560788542032242, + "rewards//mean": 0.83837890625, + "rewards//std": 0.020117685198783875, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2068, + "grad_norm": 1.4690897464752197, + "kl": 0.3276218958199024, + "learning_rate": 9.058101648880645e-07, + "loss": 0.0131, + "num_tokens": 7519798.0, + "reward": 0.80084228515625, + "reward_std": 0.013164764270186424, + "rewards//mean": 0.80084228515625, + "rewards//std": 0.018590856343507767, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.207, + "grad_norm": 2.429396152496338, + "kl": 0.33461112529039383, + "learning_rate": 9.056247021419734e-07, + "loss": 0.0134, + "num_tokens": 7527150.0, + "reward": 0.8338623046875, + "reward_std": 0.012573663145303726, + "rewards//mean": 0.8338623046875, + "rewards//std": 0.018387503921985626, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2072, + "grad_norm": 1.1946641206741333, + "kl": 0.249336589127779, + "learning_rate": 9.054390760103009e-07, + "loss": 0.01, + "num_tokens": 7534510.0, + "reward": 0.79833984375, + "reward_std": 0.017235033214092255, + "rewards//mean": 0.79833984375, + "rewards//std": 0.03064345009624958, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2074, + "grad_norm": 1.3922507762908936, + "kl": 0.292334858328104, + "learning_rate": 9.052532865678171e-07, + "loss": 0.0117, + "num_tokens": 7541846.0, + "reward": 0.85931396484375, + "reward_std": 0.014980427920818329, + "rewards//mean": 0.85931396484375, + "rewards//std": 0.023750517517328262, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.2076, + "grad_norm": 1.2582674026489258, + "kl": 0.2931496873497963, + "learning_rate": 9.050673338893577e-07, + "loss": -0.0225, + "num_tokens": 7549171.0, + "reward": 0.8665771484375, + "reward_std": 0.01506099198013544, + "rewards//mean": 0.8665771484375, + "rewards//std": 0.020847497507929802, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2078, + "grad_norm": 1.6514098644256592, + "kl": 0.2772949393838644, + "learning_rate": 9.04881218049825e-07, + "loss": 0.0111, + "num_tokens": 7556475.0, + "reward": 0.8660888671875, + "reward_std": 0.019441697746515274, + "rewards//mean": 0.8660888671875, + "rewards//std": 0.02706748992204666, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.208, + "grad_norm": 1.5099669694900513, + "kl": 0.27753408811986446, + "learning_rate": 9.046949391241858e-07, + "loss": 0.0149, + "num_tokens": 7563769.0, + "reward": 0.84716796875, + "reward_std": 0.01377116795629263, + "rewards//mean": 0.84716796875, + "rewards//std": 0.022298941388726234, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.2082, + "grad_norm": 1.366972804069519, + "kl": 0.31633927673101425, + "learning_rate": 9.045084971874737e-07, + "loss": 0.0127, + "num_tokens": 7571111.0, + "reward": 0.86407470703125, + "reward_std": 0.01603882387280464, + "rewards//mean": 0.86407470703125, + "rewards//std": 0.021745512261986732, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.2084, + "grad_norm": 2.048292636871338, + "kl": 0.30588189885020256, + "learning_rate": 9.043218923147873e-07, + "loss": 0.0062, + "num_tokens": 7578397.0, + "reward": 0.84222412109375, + "reward_std": 0.01903456449508667, + "rewards//mean": 0.84222412109375, + "rewards//std": 0.024355942383408546, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2086, + "grad_norm": 1.5283973217010498, + "kl": 0.32866689935326576, + "learning_rate": 9.04135124581291e-07, + "loss": 0.0131, + "num_tokens": 7585757.0, + "reward": 0.83349609375, + "reward_std": 0.013524653390049934, + "rewards//mean": 0.83349609375, + "rewards//std": 0.022707931697368622, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2088, + "grad_norm": 1.3643848896026611, + "kl": 0.28969121538102627, + "learning_rate": 9.039481940622146e-07, + "loss": 0.0116, + "num_tokens": 7592989.0, + "reward": 0.80816650390625, + "reward_std": 0.018874652683734894, + "rewards//mean": 0.80816650390625, + "rewards//std": 0.02530694752931595, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.209, + "grad_norm": 1.436395287513733, + "kl": 0.27974518947303295, + "learning_rate": 9.037611008328543e-07, + "loss": 0.0085, + "num_tokens": 7600279.0, + "reward": 0.87335205078125, + "reward_std": 0.022406145930290222, + "rewards//mean": 0.87335205078125, + "rewards//std": 0.03162600100040436, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2092, + "grad_norm": 1.2557544708251953, + "kl": 0.30518285371363163, + "learning_rate": 9.035738449685706e-07, + "loss": 0.0122, + "num_tokens": 7607623.0, + "reward": 0.80047607421875, + "reward_std": 0.016110483556985855, + "rewards//mean": 0.80047607421875, + "rewards//std": 0.030155889689922333, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2094, + "grad_norm": 1.2854658365249634, + "kl": 0.3165763318538666, + "learning_rate": 9.033864265447906e-07, + "loss": 0.0127, + "num_tokens": 7614911.0, + "reward": 0.83148193359375, + "reward_std": 0.016706043854355812, + "rewards//mean": 0.83148193359375, + "rewards//std": 0.020715197548270226, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2096, + "grad_norm": 1.52437162399292, + "kl": 0.30344888009130955, + "learning_rate": 9.031988456370061e-07, + "loss": 0.0121, + "num_tokens": 7622175.0, + "reward": 0.853515625, + "reward_std": 0.015016037970781326, + "rewards//mean": 0.853515625, + "rewards//std": 0.028679635375738144, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2098, + "grad_norm": 1.2589610815048218, + "kl": 0.289528576657176, + "learning_rate": 9.030111023207749e-07, + "loss": 0.0116, + "num_tokens": 7629543.0, + "reward": 0.840576171875, + "reward_std": 0.013808859512209892, + "rewards//mean": 0.840576171875, + "rewards//std": 0.026592710986733437, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.21, + "grad_norm": 1.3315870761871338, + "kl": 0.31426358968019485, + "learning_rate": 9.028231966717198e-07, + "loss": 0.0126, + "num_tokens": 7636847.0, + "reward": 0.84564208984375, + "reward_std": 0.0196281298995018, + "rewards//mean": 0.84564208984375, + "rewards//std": 0.024557722732424736, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2102, + "grad_norm": 1.369013786315918, + "kl": 0.3085191883146763, + "learning_rate": 9.026351287655293e-07, + "loss": 0.0123, + "num_tokens": 7644151.0, + "reward": 0.81195068359375, + "reward_std": 0.01285467203706503, + "rewards//mean": 0.81195068359375, + "rewards//std": 0.0199285876005888, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2104, + "grad_norm": 1.239999771118164, + "kl": 0.34454499185085297, + "learning_rate": 9.02446898677957e-07, + "loss": 0.0138, + "num_tokens": 7651455.0, + "reward": 0.79156494140625, + "reward_std": 0.01320042833685875, + "rewards//mean": 0.79156494140625, + "rewards//std": 0.017404930666089058, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2106, + "grad_norm": 1.430853247642517, + "kl": 0.29095567390322685, + "learning_rate": 9.02258506484822e-07, + "loss": 0.0116, + "num_tokens": 7658719.0, + "reward": 0.8624267578125, + "reward_std": 0.01779588870704174, + "rewards//mean": 0.8624267578125, + "rewards//std": 0.023689011111855507, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2108, + "grad_norm": 1.334028959274292, + "kl": 0.3011806830763817, + "learning_rate": 9.02069952262009e-07, + "loss": 0.012, + "num_tokens": 7665999.0, + "reward": 0.83502197265625, + "reward_std": 0.01098968181759119, + "rewards//mean": 0.83502197265625, + "rewards//std": 0.025293784216046333, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.211, + "grad_norm": 1.3110787868499756, + "kl": 0.32636628299951553, + "learning_rate": 9.018812360854671e-07, + "loss": 0.0131, + "num_tokens": 7673287.0, + "reward": 0.81622314453125, + "reward_std": 0.01039394736289978, + "rewards//mean": 0.81622314453125, + "rewards//std": 0.016299160197377205, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2112, + "grad_norm": 1.273958683013916, + "kl": 0.3023297358304262, + "learning_rate": 9.016923580312113e-07, + "loss": 0.0121, + "num_tokens": 7680591.0, + "reward": 0.85113525390625, + "reward_std": 0.01080024242401123, + "rewards//mean": 0.85113525390625, + "rewards//std": 0.023158026859164238, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2114, + "grad_norm": 1.4694616794586182, + "kl": 0.311909569427371, + "learning_rate": 9.015033181753218e-07, + "loss": 0.0125, + "num_tokens": 7687791.0, + "reward": 0.87469482421875, + "reward_std": 0.02377953752875328, + "rewards//mean": 0.87469482421875, + "rewards//std": 0.03833211585879326, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.2116, + "grad_norm": 1.2298214435577393, + "kl": 0.31344976648688316, + "learning_rate": 9.013141165939438e-07, + "loss": 0.0097, + "num_tokens": 7695036.0, + "reward": 0.8516845703125, + "reward_std": 0.013471393845975399, + "rewards//mean": 0.8516845703125, + "rewards//std": 0.0197466891258955, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2118, + "grad_norm": 1.39145827293396, + "kl": 0.3645864836871624, + "learning_rate": 9.011247533632875e-07, + "loss": 0.0146, + "num_tokens": 7702244.0, + "reward": 0.80853271484375, + "reward_std": 0.017626021057367325, + "rewards//mean": 0.80853271484375, + "rewards//std": 0.024503417313098907, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.212, + "grad_norm": 1.2642346620559692, + "kl": 0.2661507651209831, + "learning_rate": 9.009352285596285e-07, + "loss": 0.0106, + "num_tokens": 7709516.0, + "reward": 0.8245849609375, + "reward_std": 0.01296839490532875, + "rewards//mean": 0.8245849609375, + "rewards//std": 0.016199130564928055, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2122, + "grad_norm": 1.413506031036377, + "kl": 0.29754213988780975, + "learning_rate": 9.007455422593075e-07, + "loss": 0.0119, + "num_tokens": 7716820.0, + "reward": 0.8409423828125, + "reward_std": 0.01037822850048542, + "rewards//mean": 0.8409423828125, + "rewards//std": 0.013287244364619255, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2124, + "grad_norm": 1.6234136819839478, + "kl": 0.3709472268819809, + "learning_rate": 9.0055569453873e-07, + "loss": 0.0148, + "num_tokens": 7724044.0, + "reward": 0.86541748046875, + "reward_std": 0.016948748379945755, + "rewards//mean": 0.86541748046875, + "rewards//std": 0.024821974337100983, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2126, + "grad_norm": 1.575308084487915, + "kl": 0.3151492718607187, + "learning_rate": 9.003656854743666e-07, + "loss": 0.0126, + "num_tokens": 7731340.0, + "reward": 0.79278564453125, + "reward_std": 0.014859853312373161, + "rewards//mean": 0.79278564453125, + "rewards//std": 0.025079218670725822, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2128, + "grad_norm": 1.6109012365341187, + "kl": 0.32406527549028397, + "learning_rate": 9.00175515142753e-07, + "loss": 0.013, + "num_tokens": 7738636.0, + "reward": 0.83648681640625, + "reward_std": 0.015054719522595406, + "rewards//mean": 0.83648681640625, + "rewards//std": 0.01718788407742977, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.213, + "grad_norm": 1.361842393875122, + "kl": 0.3084038719534874, + "learning_rate": 8.9998518362049e-07, + "loss": 0.0123, + "num_tokens": 7746004.0, + "reward": 0.8382568359375, + "reward_std": 0.016675379127264023, + "rewards//mean": 0.8382568359375, + "rewards//std": 0.025303132832050323, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2132, + "grad_norm": 1.4300802946090698, + "kl": 0.27536091208457947, + "learning_rate": 8.997946909842424e-07, + "loss": 0.011, + "num_tokens": 7753340.0, + "reward": 0.85003662109375, + "reward_std": 0.014984278939664364, + "rewards//mean": 0.85003662109375, + "rewards//std": 0.020286934450268745, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2134, + "grad_norm": 1.589374303817749, + "kl": 0.3029318284243345, + "learning_rate": 8.996040373107414e-07, + "loss": 0.0121, + "num_tokens": 7760772.0, + "reward": 0.761474609375, + "reward_std": 0.01079709455370903, + "rewards//mean": 0.761474609375, + "rewards//std": 0.019160354509949684, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2136, + "grad_norm": 1.479730486869812, + "kl": 0.34336921758949757, + "learning_rate": 8.994132226767819e-07, + "loss": 0.0137, + "num_tokens": 7768092.0, + "reward": 0.841552734375, + "reward_std": 0.018693240359425545, + "rewards//mean": 0.841552734375, + "rewards//std": 0.02722279727458954, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.2138, + "grad_norm": 1.534416675567627, + "kl": 0.30263084918260574, + "learning_rate": 8.992222471592239e-07, + "loss": 0.0114, + "num_tokens": 7775318.0, + "reward": 0.85723876953125, + "reward_std": 0.01667502149939537, + "rewards//mean": 0.85723876953125, + "rewards//std": 0.022723177447915077, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.214, + "grad_norm": 1.3195987939834595, + "kl": 0.25270831771194935, + "learning_rate": 8.990311108349926e-07, + "loss": 0.0101, + "num_tokens": 7782678.0, + "reward": 0.84722900390625, + "reward_std": 0.015624535270035267, + "rewards//mean": 0.84722900390625, + "rewards//std": 0.027082236483693123, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2142, + "grad_norm": 1.305699110031128, + "kl": 0.2860517054796219, + "learning_rate": 8.988398137810776e-07, + "loss": 0.0114, + "num_tokens": 7790038.0, + "reward": 0.79638671875, + "reward_std": 0.012477382086217403, + "rewards//mean": 0.79638671875, + "rewards//std": 0.015740828588604927, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2144, + "grad_norm": 1.4604891538619995, + "kl": 0.34655385091900826, + "learning_rate": 8.986483560745333e-07, + "loss": 0.0139, + "num_tokens": 7797334.0, + "reward": 0.8167724609375, + "reward_std": 0.012890180572867393, + "rewards//mean": 0.8167724609375, + "rewards//std": 0.015667088329792023, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2146, + "grad_norm": 1.4661251306533813, + "kl": 0.35321579687297344, + "learning_rate": 8.984567377924789e-07, + "loss": 0.0141, + "num_tokens": 7804646.0, + "reward": 0.77490234375, + "reward_std": 0.01513337716460228, + "rewards//mean": 0.77490234375, + "rewards//std": 0.021276216953992844, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2148, + "grad_norm": 1.281410813331604, + "kl": 0.32096279226243496, + "learning_rate": 8.982649590120981e-07, + "loss": 0.0128, + "num_tokens": 7811846.0, + "reward": 0.86956787109375, + "reward_std": 0.021586943417787552, + "rewards//mean": 0.86956787109375, + "rewards//std": 0.033368609845638275, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.215, + "grad_norm": 1.2725930213928223, + "kl": 0.28765570744872093, + "learning_rate": 8.980730198106394e-07, + "loss": 0.0115, + "num_tokens": 7819206.0, + "reward": 0.85986328125, + "reward_std": 0.01563686691224575, + "rewards//mean": 0.85986328125, + "rewards//std": 0.018262352794408798, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2152, + "grad_norm": 1.2307754755020142, + "kl": 0.26919612288475037, + "learning_rate": 8.97880920265416e-07, + "loss": 0.0108, + "num_tokens": 7826654.0, + "reward": 0.85174560546875, + "reward_std": 0.014455176889896393, + "rewards//mean": 0.85174560546875, + "rewards//std": 0.03143877163529396, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.2154, + "grad_norm": 1.4141573905944824, + "kl": 0.28113503381609917, + "learning_rate": 8.976886604538055e-07, + "loss": 0.0107, + "num_tokens": 7833971.0, + "reward": 0.86590576171875, + "reward_std": 0.02066919580101967, + "rewards//mean": 0.86590576171875, + "rewards//std": 0.029708825051784515, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2156, + "grad_norm": 1.4310346841812134, + "kl": 0.35123853757977486, + "learning_rate": 8.974962404532501e-07, + "loss": 0.014, + "num_tokens": 7841299.0, + "reward": 0.84088134765625, + "reward_std": 0.01990894228219986, + "rewards//mean": 0.84088134765625, + "rewards//std": 0.022350389510393143, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.2158, + "grad_norm": 1.431424617767334, + "kl": 0.2781920749694109, + "learning_rate": 8.973036603412566e-07, + "loss": 0.0092, + "num_tokens": 7848533.0, + "reward": 0.8221435546875, + "reward_std": 0.013412028551101685, + "rewards//mean": 0.8221435546875, + "rewards//std": 0.022347426041960716, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.216, + "grad_norm": 1.3146693706512451, + "kl": 0.33246543258428574, + "learning_rate": 8.971109201953962e-07, + "loss": 0.0133, + "num_tokens": 7855765.0, + "reward": 0.84844970703125, + "reward_std": 0.010926431976258755, + "rewards//mean": 0.84844970703125, + "rewards//std": 0.023869404569268227, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2162, + "grad_norm": 1.5433149337768555, + "kl": 0.2973218187689781, + "learning_rate": 8.969180200933047e-07, + "loss": 0.0119, + "num_tokens": 7862997.0, + "reward": 0.82781982421875, + "reward_std": 0.01542243268340826, + "rewards//mean": 0.82781982421875, + "rewards//std": 0.02515515685081482, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2164, + "grad_norm": 1.3194642066955566, + "kl": 0.30874940007925034, + "learning_rate": 8.967249601126821e-07, + "loss": 0.0123, + "num_tokens": 7870213.0, + "reward": 0.85650634765625, + "reward_std": 0.01950845494866371, + "rewards//mean": 0.85650634765625, + "rewards//std": 0.029799891635775566, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.2166, + "grad_norm": 1.5168063640594482, + "kl": 0.29404761269688606, + "learning_rate": 8.96531740331293e-07, + "loss": 0.0002, + "num_tokens": 7877489.0, + "reward": 0.83428955078125, + "reward_std": 0.017388880252838135, + "rewards//mean": 0.83428955078125, + "rewards//std": 0.023610521107912064, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2168, + "grad_norm": 1.368632197380066, + "kl": 0.29596048779785633, + "learning_rate": 8.963383608269663e-07, + "loss": 0.0118, + "num_tokens": 7884817.0, + "reward": 0.8475341796875, + "reward_std": 0.013293050229549408, + "rewards//mean": 0.8475341796875, + "rewards//std": 0.020410144701600075, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.217, + "grad_norm": 1.3989510536193848, + "kl": 0.33459964767098427, + "learning_rate": 8.961448216775953e-07, + "loss": 0.0134, + "num_tokens": 7892137.0, + "reward": 0.8648681640625, + "reward_std": 0.022088980302214622, + "rewards//mean": 0.8648681640625, + "rewards//std": 0.03071622923016548, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2172, + "grad_norm": 1.4184560775756836, + "kl": 0.35629044100642204, + "learning_rate": 8.959511229611375e-07, + "loss": 0.0143, + "num_tokens": 7899465.0, + "reward": 0.86383056640625, + "reward_std": 0.020550712943077087, + "rewards//mean": 0.86383056640625, + "rewards//std": 0.024328580126166344, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2174, + "grad_norm": 1.1670403480529785, + "kl": 0.26582225039601326, + "learning_rate": 8.957572647556147e-07, + "loss": 0.0106, + "num_tokens": 7906657.0, + "reward": 0.873291015625, + "reward_std": 0.011837629601359367, + "rewards//mean": 0.873291015625, + "rewards//std": 0.026105530560016632, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2176, + "grad_norm": 1.921682596206665, + "kl": 0.3731733709573746, + "learning_rate": 8.95563247139113e-07, + "loss": 0.0149, + "num_tokens": 7913993.0, + "reward": 0.85595703125, + "reward_std": 0.014214186929166317, + "rewards//mean": 0.85595703125, + "rewards//std": 0.03211763873696327, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2178, + "grad_norm": 1.1656925678253174, + "kl": 0.3070422876626253, + "learning_rate": 8.953690701897827e-07, + "loss": 0.0123, + "num_tokens": 7921225.0, + "reward": 0.83544921875, + "reward_std": 0.010457788594067097, + "rewards//mean": 0.83544921875, + "rewards//std": 0.01721087656915188, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.218, + "grad_norm": 1.511762261390686, + "kl": 0.38593145832419395, + "learning_rate": 8.951747339858382e-07, + "loss": 0.0154, + "num_tokens": 7928473.0, + "reward": 0.83453369140625, + "reward_std": 0.01064581423997879, + "rewards//mean": 0.83453369140625, + "rewards//std": 0.017871806398034096, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2182, + "grad_norm": 1.1775896549224854, + "kl": 0.2766279149800539, + "learning_rate": 8.94980238605558e-07, + "loss": 0.0111, + "num_tokens": 7935721.0, + "reward": 0.864990234375, + "reward_std": 0.011889766901731491, + "rewards//mean": 0.864990234375, + "rewards//std": 0.0271693617105484, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2184, + "grad_norm": 1.4358397722244263, + "kl": 0.310285622254014, + "learning_rate": 8.947855841272851e-07, + "loss": 0.0124, + "num_tokens": 7942961.0, + "reward": 0.83905029296875, + "reward_std": 0.015601323917508125, + "rewards//mean": 0.83905029296875, + "rewards//std": 0.02035323530435562, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2186, + "grad_norm": 1.1140609979629517, + "kl": 0.31539697386324406, + "learning_rate": 8.94590770629426e-07, + "loss": 0.0126, + "num_tokens": 7950289.0, + "reward": 0.87353515625, + "reward_std": 0.012624558061361313, + "rewards//mean": 0.87353515625, + "rewards//std": 0.029573554173111916, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2188, + "grad_norm": 1.0853230953216553, + "kl": 0.24161593616008759, + "learning_rate": 8.943957981904517e-07, + "loss": 0.0097, + "num_tokens": 7957657.0, + "reward": 0.86395263671875, + "reward_std": 0.011656182818114758, + "rewards//mean": 0.86395263671875, + "rewards//std": 0.02065299078822136, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.219, + "grad_norm": 1.2128750085830688, + "kl": 0.25720201805233955, + "learning_rate": 8.942006668888971e-07, + "loss": 0.0103, + "num_tokens": 7964833.0, + "reward": 0.7919921875, + "reward_std": 0.014743391424417496, + "rewards//mean": 0.7919921875, + "rewards//std": 0.031125744804739952, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2192, + "grad_norm": 2.7204067707061768, + "kl": 0.3773266337811947, + "learning_rate": 8.940053768033608e-07, + "loss": 0.0151, + "num_tokens": 7972169.0, + "reward": 0.77239990234375, + "reward_std": 0.01440650224685669, + "rewards//mean": 0.77239990234375, + "rewards//std": 0.02604391984641552, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.2194, + "grad_norm": 1.5218347311019897, + "kl": 0.2826386373490095, + "learning_rate": 8.938099280125062e-07, + "loss": 0.0098, + "num_tokens": 7979542.0, + "reward": 0.83575439453125, + "reward_std": 0.014548620209097862, + "rewards//mean": 0.83575439453125, + "rewards//std": 0.02564268186688423, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.2196, + "grad_norm": 1.8756946325302124, + "kl": 0.29326434805989265, + "learning_rate": 8.936143205950595e-07, + "loss": 0.0161, + "num_tokens": 7986729.0, + "reward": 0.82025146484375, + "reward_std": 0.01551084779202938, + "rewards//mean": 0.82025146484375, + "rewards//std": 0.02061704359948635, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.2198, + "grad_norm": 1.7793521881103516, + "kl": 0.2732016555964947, + "learning_rate": 8.934185546298115e-07, + "loss": 0.0113, + "num_tokens": 7993960.0, + "reward": 0.83099365234375, + "reward_std": 0.014071241021156311, + "rewards//mean": 0.83099365234375, + "rewards//std": 0.024292465299367905, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.22, + "grad_norm": 2.632420063018799, + "kl": 0.5108823738992214, + "learning_rate": 8.932226301956169e-07, + "loss": 0.0204, + "num_tokens": 8001280.0, + "reward": 0.85565185546875, + "reward_std": 0.024661105126142502, + "rewards//mean": 0.85565185546875, + "rewards//std": 0.030245110392570496, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.2202, + "grad_norm": 1.3002866506576538, + "kl": 0.30508384853601456, + "learning_rate": 8.930265473713937e-07, + "loss": 0.0044, + "num_tokens": 8008572.0, + "reward": 0.854736328125, + "reward_std": 0.012034866027534008, + "rewards//mean": 0.854736328125, + "rewards//std": 0.017173897475004196, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2204, + "grad_norm": 1.5157485008239746, + "kl": 0.2944438625127077, + "learning_rate": 8.928303062361243e-07, + "loss": 0.0118, + "num_tokens": 8015876.0, + "reward": 0.86737060546875, + "reward_std": 0.02427748218178749, + "rewards//mean": 0.86737060546875, + "rewards//std": 0.03499003127217293, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2206, + "grad_norm": 1.481295108795166, + "kl": 0.3486741669476032, + "learning_rate": 8.926339068688545e-07, + "loss": 0.0139, + "num_tokens": 8023164.0, + "reward": 0.83270263671875, + "reward_std": 0.012031204998493195, + "rewards//mean": 0.83270263671875, + "rewards//std": 0.01750466600060463, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2208, + "grad_norm": 1.5600775480270386, + "kl": 0.30140966176986694, + "learning_rate": 8.924373493486941e-07, + "loss": 0.0121, + "num_tokens": 8030460.0, + "reward": 0.79052734375, + "reward_std": 0.017214424908161163, + "rewards//mean": 0.79052734375, + "rewards//std": 0.03596000373363495, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.221, + "grad_norm": 1.4044548273086548, + "kl": 0.3456531874835491, + "learning_rate": 8.922406337548161e-07, + "loss": 0.0115, + "num_tokens": 8037706.0, + "reward": 0.87078857421875, + "reward_std": 0.018264906480908394, + "rewards//mean": 0.87078857421875, + "rewards//std": 0.02772904559969902, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2212, + "grad_norm": 1.5318219661712646, + "kl": 0.28124709241092205, + "learning_rate": 8.920437601664579e-07, + "loss": 0.0112, + "num_tokens": 8045018.0, + "reward": 0.82269287109375, + "reward_std": 0.011908026412129402, + "rewards//mean": 0.82269287109375, + "rewards//std": 0.017366621643304825, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.2214, + "grad_norm": 1.618323564529419, + "kl": 0.26674538291990757, + "learning_rate": 8.918467286629198e-07, + "loss": 0.0055, + "num_tokens": 8052275.0, + "reward": 0.8511962890625, + "reward_std": 0.01498610619455576, + "rewards//mean": 0.8511962890625, + "rewards//std": 0.03094405308365822, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2216, + "grad_norm": 1.503462553024292, + "kl": 0.342885285615921, + "learning_rate": 8.916495393235665e-07, + "loss": 0.0137, + "num_tokens": 8059451.0, + "reward": 0.7884521484375, + "reward_std": 0.010113345459103584, + "rewards//mean": 0.7884521484375, + "rewards//std": 0.0155818285420537, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2218, + "grad_norm": 1.3841408491134644, + "kl": 0.3379524201154709, + "learning_rate": 8.914521922278255e-07, + "loss": 0.0135, + "num_tokens": 8066731.0, + "reward": 0.80328369140625, + "reward_std": 0.018625400960445404, + "rewards//mean": 0.80328369140625, + "rewards//std": 0.03392825648188591, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.222, + "grad_norm": 1.4049842357635498, + "kl": 0.31494373455643654, + "learning_rate": 8.912546874551882e-07, + "loss": 0.0155, + "num_tokens": 8074048.0, + "reward": 0.82574462890625, + "reward_std": 0.016314435750246048, + "rewards//mean": 0.82574462890625, + "rewards//std": 0.023873845115303993, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2222, + "grad_norm": 1.66502845287323, + "kl": 0.34005986899137497, + "learning_rate": 8.910570250852096e-07, + "loss": 0.0136, + "num_tokens": 8081320.0, + "reward": 0.808837890625, + "reward_std": 0.017721248790621758, + "rewards//mean": 0.808837890625, + "rewards//std": 0.031825028359889984, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.2224, + "grad_norm": 1.5225322246551514, + "kl": 0.3060644008219242, + "learning_rate": 8.908592051975081e-07, + "loss": 0.0072, + "num_tokens": 8088636.0, + "reward": 0.86212158203125, + "reward_std": 0.01942976377904415, + "rewards//mean": 0.86212158203125, + "rewards//std": 0.0341084748506546, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2226, + "grad_norm": 1.2494603395462036, + "kl": 0.2984900660812855, + "learning_rate": 8.906612278717655e-07, + "loss": 0.0119, + "num_tokens": 8095900.0, + "reward": 0.81378173828125, + "reward_std": 0.012197243049740791, + "rewards//mean": 0.81378173828125, + "rewards//std": 0.02101915329694748, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2228, + "grad_norm": 1.2255491018295288, + "kl": 0.33436625823378563, + "learning_rate": 8.90463093187727e-07, + "loss": 0.0134, + "num_tokens": 8103148.0, + "reward": 0.821044921875, + "reward_std": 0.012956248596310616, + "rewards//mean": 0.821044921875, + "rewards//std": 0.01424449309706688, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.223, + "grad_norm": 1.2374465465545654, + "kl": 0.293248163536191, + "learning_rate": 8.902648012252012e-07, + "loss": 0.0117, + "num_tokens": 8110460.0, + "reward": 0.8382568359375, + "reward_std": 0.01120686810463667, + "rewards//mean": 0.8382568359375, + "rewards//std": 0.016347963362932205, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2232, + "grad_norm": 2.056161403656006, + "kl": 0.5049624368548393, + "learning_rate": 8.900663520640603e-07, + "loss": 0.0202, + "num_tokens": 8117724.0, + "reward": 0.85833740234375, + "reward_std": 0.019626203924417496, + "rewards//mean": 0.85833740234375, + "rewards//std": 0.024322357028722763, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.2234, + "grad_norm": 1.4653035402297974, + "kl": 0.3318171575665474, + "learning_rate": 8.898677457842394e-07, + "loss": 0.0153, + "num_tokens": 8124989.0, + "reward": 0.84033203125, + "reward_std": 0.012051420286297798, + "rewards//mean": 0.84033203125, + "rewards//std": 0.018062321469187737, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2236, + "grad_norm": 1.5163328647613525, + "kl": 0.35273227840662, + "learning_rate": 8.896689824657371e-07, + "loss": 0.0141, + "num_tokens": 8132197.0, + "reward": 0.8260498046875, + "reward_std": 0.014245107769966125, + "rewards//mean": 0.8260498046875, + "rewards//std": 0.01877851039171219, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.2238, + "grad_norm": 1.5443251132965088, + "kl": 0.3266940098255873, + "learning_rate": 8.894700621886152e-07, + "loss": 0.0134, + "num_tokens": 8139491.0, + "reward": 0.80047607421875, + "reward_std": 0.014119492843747139, + "rewards//mean": 0.80047607421875, + "rewards//std": 0.020449694246053696, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.224, + "grad_norm": 1.2869195938110352, + "kl": 0.333605395630002, + "learning_rate": 8.892709850329989e-07, + "loss": 0.0133, + "num_tokens": 8146731.0, + "reward": 0.86297607421875, + "reward_std": 0.01802264340221882, + "rewards//mean": 0.86297607421875, + "rewards//std": 0.03212370350956917, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2242, + "grad_norm": 1.2888580560684204, + "kl": 0.3217974379658699, + "learning_rate": 8.890717510790762e-07, + "loss": 0.0129, + "num_tokens": 8153995.0, + "reward": 0.7994384765625, + "reward_std": 0.009287133812904358, + "rewards//mean": 0.7994384765625, + "rewards//std": 0.018535098060965538, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2244, + "grad_norm": 1.37937331199646, + "kl": 0.28130376897752285, + "learning_rate": 8.888723604070989e-07, + "loss": 0.0113, + "num_tokens": 8161283.0, + "reward": 0.83953857421875, + "reward_std": 0.013689241372048855, + "rewards//mean": 0.83953857421875, + "rewards//std": 0.0181132685393095, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.2246, + "grad_norm": 1.7337329387664795, + "kl": 0.3053101394325495, + "learning_rate": 8.886728130973813e-07, + "loss": 0.0106, + "num_tokens": 8168513.0, + "reward": 0.84613037109375, + "reward_std": 0.017252976074814796, + "rewards//mean": 0.84613037109375, + "rewards//std": 0.027653058990836143, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2248, + "grad_norm": 1.5498358011245728, + "kl": 0.30053524300456047, + "learning_rate": 8.884731092303011e-07, + "loss": 0.012, + "num_tokens": 8175905.0, + "reward": 0.8685302734375, + "reward_std": 0.020171385258436203, + "rewards//mean": 0.8685302734375, + "rewards//std": 0.02867303602397442, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.225, + "grad_norm": 1.3682861328125, + "kl": 0.31818437203764915, + "learning_rate": 8.882732488862987e-07, + "loss": 0.002, + "num_tokens": 8183189.0, + "reward": 0.8345947265625, + "reward_std": 0.011721896938979626, + "rewards//mean": 0.8345947265625, + "rewards//std": 0.023501677438616753, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2252, + "grad_norm": 1.563463568687439, + "kl": 0.28169108368456364, + "learning_rate": 8.880732321458784e-07, + "loss": 0.0113, + "num_tokens": 8190413.0, + "reward": 0.80096435546875, + "reward_std": 0.015571564435958862, + "rewards//mean": 0.80096435546875, + "rewards//std": 0.025319505482912064, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2254, + "grad_norm": 1.5872894525527954, + "kl": 0.31593018025159836, + "learning_rate": 8.878730590896065e-07, + "loss": 0.0126, + "num_tokens": 8197757.0, + "reward": 0.8604736328125, + "reward_std": 0.019493814557790756, + "rewards//mean": 0.8604736328125, + "rewards//std": 0.03284812346100807, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2256, + "grad_norm": 1.741148829460144, + "kl": 0.378243088722229, + "learning_rate": 8.876727297981127e-07, + "loss": 0.0151, + "num_tokens": 8204981.0, + "reward": 0.867431640625, + "reward_std": 0.019384153187274933, + "rewards//mean": 0.867431640625, + "rewards//std": 0.028321649879217148, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.2258, + "grad_norm": 1.231563687324524, + "kl": 0.27389721386134624, + "learning_rate": 8.874722443520898e-07, + "loss": 0.0153, + "num_tokens": 8212262.0, + "reward": 0.82318115234375, + "reward_std": 0.012972738593816757, + "rewards//mean": 0.82318115234375, + "rewards//std": 0.019025472924113274, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.226, + "grad_norm": 1.555290937423706, + "kl": 0.27070917934179306, + "learning_rate": 8.872716028322931e-07, + "loss": 0.0108, + "num_tokens": 8219638.0, + "reward": 0.8057861328125, + "reward_std": 0.025014152750372887, + "rewards//mean": 0.8057861328125, + "rewards//std": 0.0312613807618618, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2262, + "grad_norm": 1.5296005010604858, + "kl": 0.3477272242307663, + "learning_rate": 8.870708053195413e-07, + "loss": 0.0139, + "num_tokens": 8226926.0, + "reward": 0.8238525390625, + "reward_std": 0.012625312432646751, + "rewards//mean": 0.8238525390625, + "rewards//std": 0.02233387529850006, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2264, + "grad_norm": 1.4045249223709106, + "kl": 0.34977005794644356, + "learning_rate": 8.868698518947151e-07, + "loss": 0.014, + "num_tokens": 8234166.0, + "reward": 0.82476806640625, + "reward_std": 0.014361133798956871, + "rewards//mean": 0.82476806640625, + "rewards//std": 0.02639150433242321, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2266, + "grad_norm": 1.283003330230713, + "kl": 0.273545129224658, + "learning_rate": 8.866687426387591e-07, + "loss": 0.0109, + "num_tokens": 8241438.0, + "reward": 0.8485107421875, + "reward_std": 0.010255584493279457, + "rewards//mean": 0.8485107421875, + "rewards//std": 0.024749066680669785, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2268, + "grad_norm": 1.6926970481872559, + "kl": 0.3778489287942648, + "learning_rate": 8.864674776326797e-07, + "loss": 0.0151, + "num_tokens": 8248742.0, + "reward": 0.83349609375, + "reward_std": 0.013870935887098312, + "rewards//mean": 0.83349609375, + "rewards//std": 0.01899053156375885, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5625, + "epoch": 0.227, + "grad_norm": 1.259641408920288, + "kl": 0.26174345053732395, + "learning_rate": 8.862660569575464e-07, + "loss": -0.0252, + "num_tokens": 8256050.0, + "reward": 0.86279296875, + "reward_std": 0.016032755374908447, + "rewards//mean": 0.86279296875, + "rewards//std": 0.02952437475323677, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2272, + "grad_norm": 1.3456966876983643, + "kl": 0.3078604191541672, + "learning_rate": 8.860644806944917e-07, + "loss": 0.0123, + "num_tokens": 8263322.0, + "reward": 0.8463134765625, + "reward_std": 0.011766590178012848, + "rewards//mean": 0.8463134765625, + "rewards//std": 0.027429701760411263, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2274, + "grad_norm": 1.1828416585922241, + "kl": 0.31726289354264736, + "learning_rate": 8.858627489247104e-07, + "loss": 0.0127, + "num_tokens": 8270570.0, + "reward": 0.8587646484375, + "reward_std": 0.013768864795565605, + "rewards//mean": 0.8587646484375, + "rewards//std": 0.020234353840351105, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2276, + "grad_norm": 1.335329294204712, + "kl": 0.3098258413374424, + "learning_rate": 8.856608617294599e-07, + "loss": 0.0124, + "num_tokens": 8277938.0, + "reward": 0.81439208984375, + "reward_std": 0.016306539997458458, + "rewards//mean": 0.81439208984375, + "rewards//std": 0.033013783395290375, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.2278, + "grad_norm": 1.4928950071334839, + "kl": 0.3215921800583601, + "learning_rate": 8.854588191900604e-07, + "loss": 0.0168, + "num_tokens": 8285229.0, + "reward": 0.87628173828125, + "reward_std": 0.011933600530028343, + "rewards//mean": 0.87628173828125, + "rewards//std": 0.024881668388843536, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.228, + "grad_norm": 1.401386022567749, + "kl": 0.2702289242297411, + "learning_rate": 8.852566213878946e-07, + "loss": 0.0108, + "num_tokens": 8292469.0, + "reward": 0.863525390625, + "reward_std": 0.01262308843433857, + "rewards//mean": 0.863525390625, + "rewards//std": 0.021506911143660545, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2282, + "grad_norm": 1.2952547073364258, + "kl": 0.3330863583832979, + "learning_rate": 8.850542684044078e-07, + "loss": 0.0133, + "num_tokens": 8299733.0, + "reward": 0.86529541015625, + "reward_std": 0.012894053012132645, + "rewards//mean": 0.86529541015625, + "rewards//std": 0.027197135612368584, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2284, + "grad_norm": 1.2976689338684082, + "kl": 0.32321260310709476, + "learning_rate": 8.848517603211078e-07, + "loss": 0.0129, + "num_tokens": 8306957.0, + "reward": 0.83489990234375, + "reward_std": 0.016241595149040222, + "rewards//mean": 0.83489990234375, + "rewards//std": 0.024192556738853455, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2286, + "grad_norm": 1.321282982826233, + "kl": 0.3078153170645237, + "learning_rate": 8.846490972195646e-07, + "loss": 0.0123, + "num_tokens": 8314237.0, + "reward": 0.846435546875, + "reward_std": 0.018259583041071892, + "rewards//mean": 0.846435546875, + "rewards//std": 0.030780475586652756, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2288, + "grad_norm": 1.5316067934036255, + "kl": 0.305793359875679, + "learning_rate": 8.844462791814112e-07, + "loss": 0.0122, + "num_tokens": 8321501.0, + "reward": 0.86651611328125, + "reward_std": 0.014890076592564583, + "rewards//mean": 0.86651611328125, + "rewards//std": 0.024886533617973328, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.229, + "grad_norm": 1.3614609241485596, + "kl": 0.3125270791351795, + "learning_rate": 8.842433062883425e-07, + "loss": 0.0125, + "num_tokens": 8328709.0, + "reward": 0.88909912109375, + "reward_std": 0.011840015649795532, + "rewards//mean": 0.88909912109375, + "rewards//std": 0.018250642344355583, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2292, + "grad_norm": 1.481597661972046, + "kl": 0.29813105799257755, + "learning_rate": 8.840401786221159e-07, + "loss": 0.0119, + "num_tokens": 8335949.0, + "reward": 0.80731201171875, + "reward_std": 0.01880635693669319, + "rewards//mean": 0.80731201171875, + "rewards//std": 0.022596247494220734, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2294, + "grad_norm": 1.1973108053207397, + "kl": 0.2666045781224966, + "learning_rate": 8.838368962645513e-07, + "loss": 0.0107, + "num_tokens": 8343333.0, + "reward": 0.8406982421875, + "reward_std": 0.016185544431209564, + "rewards//mean": 0.8406982421875, + "rewards//std": 0.022178800776600838, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2296, + "grad_norm": 1.3184343576431274, + "kl": 0.2574116624891758, + "learning_rate": 8.836334592975308e-07, + "loss": 0.0103, + "num_tokens": 8350645.0, + "reward": 0.852294921875, + "reward_std": 0.015836235135793686, + "rewards//mean": 0.852294921875, + "rewards//std": 0.02073865942656994, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2298, + "grad_norm": 1.200423240661621, + "kl": 0.2781838271766901, + "learning_rate": 8.834298678029988e-07, + "loss": 0.0111, + "num_tokens": 8357885.0, + "reward": 0.849853515625, + "reward_std": 0.016728466376662254, + "rewards//mean": 0.849853515625, + "rewards//std": 0.02747960016131401, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.23, + "grad_norm": 1.485186219215393, + "kl": 0.2595775779336691, + "learning_rate": 8.83226121862962e-07, + "loss": 0.0045, + "num_tokens": 8365087.0, + "reward": 0.85595703125, + "reward_std": 0.014451934024691582, + "rewards//mean": 0.85595703125, + "rewards//std": 0.029011299833655357, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.2302, + "grad_norm": 1.3524093627929688, + "kl": 0.34412404522299767, + "learning_rate": 8.83022221559489e-07, + "loss": 0.0145, + "num_tokens": 8372316.0, + "reward": 0.84796142578125, + "reward_std": 0.01545743364840746, + "rewards//mean": 0.84796142578125, + "rewards//std": 0.020366618409752846, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2304, + "grad_norm": 1.5702593326568604, + "kl": 0.3125195614993572, + "learning_rate": 8.82818166974711e-07, + "loss": 0.0125, + "num_tokens": 8379644.0, + "reward": 0.869384765625, + "reward_std": 0.020595047622919083, + "rewards//mean": 0.869384765625, + "rewards//std": 0.026882583275437355, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2306, + "grad_norm": 1.1991804838180542, + "kl": 0.24202877841889858, + "learning_rate": 8.826139581908211e-07, + "loss": 0.0097, + "num_tokens": 8386900.0, + "reward": 0.8179931640625, + "reward_std": 0.011535374447703362, + "rewards//mean": 0.8179931640625, + "rewards//std": 0.018826814368367195, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2308, + "grad_norm": 1.4068957567214966, + "kl": 0.27145761251449585, + "learning_rate": 8.824095952900746e-07, + "loss": 0.0109, + "num_tokens": 8394164.0, + "reward": 0.83843994140625, + "reward_std": 0.012582596391439438, + "rewards//mean": 0.83843994140625, + "rewards//std": 0.021297462284564972, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.231, + "grad_norm": 1.3848906755447388, + "kl": 0.3069807067513466, + "learning_rate": 8.822050783547889e-07, + "loss": 0.0123, + "num_tokens": 8401452.0, + "reward": 0.84722900390625, + "reward_std": 0.019517958164215088, + "rewards//mean": 0.84722900390625, + "rewards//std": 0.025249456986784935, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2312, + "grad_norm": 1.4061836004257202, + "kl": 0.30647798627614975, + "learning_rate": 8.820004074673433e-07, + "loss": 0.0123, + "num_tokens": 8408708.0, + "reward": 0.835693359375, + "reward_std": 0.013824371621012688, + "rewards//mean": 0.835693359375, + "rewards//std": 0.018815957009792328, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2314, + "grad_norm": 1.305436134338379, + "kl": 0.2950445506721735, + "learning_rate": 8.817955827101792e-07, + "loss": 0.0118, + "num_tokens": 8415956.0, + "reward": 0.8265380859375, + "reward_std": 0.011674957349896431, + "rewards//mean": 0.8265380859375, + "rewards//std": 0.022964797914028168, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2316, + "grad_norm": 1.3825528621673584, + "kl": 0.2580115906894207, + "learning_rate": 8.815906041658001e-07, + "loss": 0.0103, + "num_tokens": 8423220.0, + "reward": 0.82080078125, + "reward_std": 0.009966205805540085, + "rewards//mean": 0.82080078125, + "rewards//std": 0.02051113359630108, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.2318, + "grad_norm": 1.555238962173462, + "kl": 0.29306414164602757, + "learning_rate": 8.813854719167712e-07, + "loss": 0.0028, + "num_tokens": 8430480.0, + "reward": 0.79876708984375, + "reward_std": 0.015370631590485573, + "rewards//mean": 0.79876708984375, + "rewards//std": 0.02114480920135975, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.232, + "grad_norm": 1.6780948638916016, + "kl": 0.45330084301531315, + "learning_rate": 8.8118018604572e-07, + "loss": 0.0192, + "num_tokens": 8437709.0, + "reward": 0.85675048828125, + "reward_std": 0.012600554153323174, + "rewards//mean": 0.85675048828125, + "rewards//std": 0.020172445103526115, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2322, + "grad_norm": 1.504480004310608, + "kl": 0.2946792468428612, + "learning_rate": 8.809747466353355e-07, + "loss": 0.0118, + "num_tokens": 8444957.0, + "reward": 0.873046875, + "reward_std": 0.012377694249153137, + "rewards//mean": 0.873046875, + "rewards//std": 0.025477521121501923, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.2324, + "grad_norm": 1.447669506072998, + "kl": 0.29421080090105534, + "learning_rate": 8.807691537683684e-07, + "loss": 0.0055, + "num_tokens": 8452156.0, + "reward": 0.80511474609375, + "reward_std": 0.014060543850064278, + "rewards//mean": 0.80511474609375, + "rewards//std": 0.018164174631237984, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.2326, + "grad_norm": 1.4648672342300415, + "kl": 0.34358642622828484, + "learning_rate": 8.805634075276317e-07, + "loss": 0.0072, + "num_tokens": 8459433.0, + "reward": 0.84088134765625, + "reward_std": 0.014779163524508476, + "rewards//mean": 0.84088134765625, + "rewards//std": 0.02418254315853119, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2328, + "grad_norm": 1.475757360458374, + "kl": 0.3273986931890249, + "learning_rate": 8.80357507996e-07, + "loss": 0.0131, + "num_tokens": 8466681.0, + "reward": 0.8597412109375, + "reward_std": 0.013744980096817017, + "rewards//mean": 0.8597412109375, + "rewards//std": 0.019938934594392776, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.640625, + "epoch": 0.233, + "grad_norm": 1.507431983947754, + "kl": 0.31308015063405037, + "learning_rate": 8.801514552564095e-07, + "loss": -0.007, + "num_tokens": 8474058.0, + "reward": 0.86181640625, + "reward_std": 0.016959045082330704, + "rewards//mean": 0.86181640625, + "rewards//std": 0.026259321719408035, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2332, + "grad_norm": 1.7107517719268799, + "kl": 0.32240357249975204, + "learning_rate": 8.799452493918585e-07, + "loss": 0.0129, + "num_tokens": 8481266.0, + "reward": 0.8057861328125, + "reward_std": 0.012468172237277031, + "rewards//mean": 0.8057861328125, + "rewards//std": 0.019493624567985535, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2334, + "grad_norm": 1.5541225671768188, + "kl": 0.3173799216747284, + "learning_rate": 8.797388904854063e-07, + "loss": 0.0127, + "num_tokens": 8488578.0, + "reward": 0.8326416015625, + "reward_std": 0.01738959550857544, + "rewards//mean": 0.8326416015625, + "rewards//std": 0.021386567503213882, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2336, + "grad_norm": 1.2953616380691528, + "kl": 0.26051969453692436, + "learning_rate": 8.795323786201745e-07, + "loss": 0.0104, + "num_tokens": 8495810.0, + "reward": 0.851318359375, + "reward_std": 0.013006538152694702, + "rewards//mean": 0.851318359375, + "rewards//std": 0.02043275348842144, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2338, + "grad_norm": 1.675406813621521, + "kl": 0.30533958598971367, + "learning_rate": 8.79325713879346e-07, + "loss": 0.0122, + "num_tokens": 8503026.0, + "reward": 0.83746337890625, + "reward_std": 0.015183142386376858, + "rewards//mean": 0.83746337890625, + "rewards//std": 0.01846175454556942, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.234, + "grad_norm": 1.650717854499817, + "kl": 0.2768800873309374, + "learning_rate": 8.791188963461652e-07, + "loss": 0.0111, + "num_tokens": 8510258.0, + "reward": 0.84149169921875, + "reward_std": 0.012370424345135689, + "rewards//mean": 0.84149169921875, + "rewards//std": 0.015822241082787514, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2342, + "grad_norm": 1.558935284614563, + "kl": 0.2880551740527153, + "learning_rate": 8.789119261039384e-07, + "loss": 0.0115, + "num_tokens": 8517594.0, + "reward": 0.86016845703125, + "reward_std": 0.026043182238936424, + "rewards//mean": 0.86016845703125, + "rewards//std": 0.039544276893138885, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2344, + "grad_norm": 1.5427850484848022, + "kl": 0.32242418453097343, + "learning_rate": 8.78704803236033e-07, + "loss": 0.0129, + "num_tokens": 8524978.0, + "reward": 0.72216796875, + "reward_std": 0.014576987363398075, + "rewards//mean": 0.72216796875, + "rewards//std": 0.022686591371893883, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2346, + "grad_norm": 1.3698527812957764, + "kl": 0.2934680553153157, + "learning_rate": 8.784975278258782e-07, + "loss": 0.0117, + "num_tokens": 8532258.0, + "reward": 0.8466796875, + "reward_std": 0.015440032817423344, + "rewards//mean": 0.8466796875, + "rewards//std": 0.024379627779126167, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.2348, + "grad_norm": 1.355126142501831, + "kl": 0.33492540568113327, + "learning_rate": 8.782900999569645e-07, + "loss": 0.0125, + "num_tokens": 8539532.0, + "reward": 0.88055419921875, + "reward_std": 0.02368714101612568, + "rewards//mean": 0.88055419921875, + "rewards//std": 0.03215008229017258, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.235, + "grad_norm": 1.2978743314743042, + "kl": 0.2971823550760746, + "learning_rate": 8.780825197128437e-07, + "loss": 0.0119, + "num_tokens": 8546844.0, + "reward": 0.849609375, + "reward_std": 0.01594850793480873, + "rewards//mean": 0.849609375, + "rewards//std": 0.030449189245700836, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2352, + "grad_norm": 1.725576400756836, + "kl": 0.30804870650172234, + "learning_rate": 8.778747871771291e-07, + "loss": 0.0123, + "num_tokens": 8554084.0, + "reward": 0.81195068359375, + "reward_std": 0.01313888281583786, + "rewards//mean": 0.81195068359375, + "rewards//std": 0.021764300763607025, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2354, + "grad_norm": 1.239060878753662, + "kl": 0.2764011360704899, + "learning_rate": 8.776669024334955e-07, + "loss": 0.0111, + "num_tokens": 8561404.0, + "reward": 0.79052734375, + "reward_std": 0.014276277273893356, + "rewards//mean": 0.79052734375, + "rewards//std": 0.020862378180027008, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2356, + "grad_norm": 1.19170343875885, + "kl": 0.25033535435795784, + "learning_rate": 8.774588655656787e-07, + "loss": 0.01, + "num_tokens": 8568724.0, + "reward": 0.792236328125, + "reward_std": 0.016274308785796165, + "rewards//mean": 0.792236328125, + "rewards//std": 0.032044973224401474, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2358, + "grad_norm": 1.3729695081710815, + "kl": 0.29489726200699806, + "learning_rate": 8.772506766574761e-07, + "loss": 0.0118, + "num_tokens": 8576060.0, + "reward": 0.84515380859375, + "reward_std": 0.01753777638077736, + "rewards//mean": 0.84515380859375, + "rewards//std": 0.019958948716521263, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.236, + "grad_norm": 1.3475689888000488, + "kl": 0.3221146948635578, + "learning_rate": 8.770423357927462e-07, + "loss": 0.0129, + "num_tokens": 8583444.0, + "reward": 0.7652587890625, + "reward_std": 0.014738639816641808, + "rewards//mean": 0.7652587890625, + "rewards//std": 0.018158862367272377, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2362, + "grad_norm": 1.5659760236740112, + "kl": 0.2656094413250685, + "learning_rate": 8.768338430554082e-07, + "loss": 0.0106, + "num_tokens": 8590708.0, + "reward": 0.8642578125, + "reward_std": 0.016988035291433334, + "rewards//mean": 0.8642578125, + "rewards//std": 0.03189440071582794, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2364, + "grad_norm": 1.381804347038269, + "kl": 0.31498219817876816, + "learning_rate": 8.766251985294434e-07, + "loss": 0.0126, + "num_tokens": 8598044.0, + "reward": 0.82763671875, + "reward_std": 0.012066368013620377, + "rewards//mean": 0.82763671875, + "rewards//std": 0.022080639377236366, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2366, + "grad_norm": 1.4412140846252441, + "kl": 0.3065001852810383, + "learning_rate": 8.764164022988937e-07, + "loss": 0.0123, + "num_tokens": 8605324.0, + "reward": 0.83099365234375, + "reward_std": 0.015787649899721146, + "rewards//mean": 0.83099365234375, + "rewards//std": 0.023138409480452538, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2368, + "grad_norm": 1.6529051065444946, + "kl": 0.3023077305406332, + "learning_rate": 8.762074544478621e-07, + "loss": 0.0121, + "num_tokens": 8612540.0, + "reward": 0.85284423828125, + "reward_std": 0.02080358937382698, + "rewards//mean": 0.85284423828125, + "rewards//std": 0.030781889334321022, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.237, + "grad_norm": 1.316893219947815, + "kl": 0.3417473938316107, + "learning_rate": 8.75998355060513e-07, + "loss": 0.0137, + "num_tokens": 8619772.0, + "reward": 0.8280029296875, + "reward_std": 0.015342580154538155, + "rewards//mean": 0.8280029296875, + "rewards//std": 0.02511337399482727, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2372, + "grad_norm": 1.4206535816192627, + "kl": 0.26095854863524437, + "learning_rate": 8.757891042210712e-07, + "loss": 0.0104, + "num_tokens": 8627084.0, + "reward": 0.828857421875, + "reward_std": 0.015587522648274899, + "rewards//mean": 0.828857421875, + "rewards//std": 0.018595842644572258, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.2374, + "grad_norm": 1.4381815195083618, + "kl": 0.31584757193922997, + "learning_rate": 8.755797020138234e-07, + "loss": 0.0204, + "num_tokens": 8634359.0, + "reward": 0.84942626953125, + "reward_std": 0.014363951981067657, + "rewards//mean": 0.84942626953125, + "rewards//std": 0.023927679285407066, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2376, + "grad_norm": 1.4054882526397705, + "kl": 0.3100094720721245, + "learning_rate": 8.753701485231164e-07, + "loss": 0.0124, + "num_tokens": 8641599.0, + "reward": 0.8016357421875, + "reward_std": 0.011107858270406723, + "rewards//mean": 0.8016357421875, + "rewards//std": 0.01832813210785389, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2378, + "grad_norm": 1.4722260236740112, + "kl": 0.2997556235641241, + "learning_rate": 8.751604438333586e-07, + "loss": 0.012, + "num_tokens": 8648927.0, + "reward": 0.85186767578125, + "reward_std": 0.01570052281022072, + "rewards//mean": 0.85186767578125, + "rewards//std": 0.023625904694199562, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.238, + "grad_norm": 1.676213264465332, + "kl": 0.29651119001209736, + "learning_rate": 8.749505880290188e-07, + "loss": 0.0119, + "num_tokens": 8656199.0, + "reward": 0.89105224609375, + "reward_std": 0.014070548117160797, + "rewards//mean": 0.89105224609375, + "rewards//std": 0.020706426352262497, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2382, + "grad_norm": 1.6067500114440918, + "kl": 0.29450749419629574, + "learning_rate": 8.74740581194627e-07, + "loss": 0.0118, + "num_tokens": 8663415.0, + "reward": 0.85040283203125, + "reward_std": 0.017494406551122665, + "rewards//mean": 0.85040283203125, + "rewards//std": 0.022093577310442924, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2384, + "grad_norm": 1.2899130582809448, + "kl": 0.33176747895777225, + "learning_rate": 8.745304234147739e-07, + "loss": 0.0133, + "num_tokens": 8670743.0, + "reward": 0.87103271484375, + "reward_std": 0.016929296776652336, + "rewards//mean": 0.87103271484375, + "rewards//std": 0.02598107047379017, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2386, + "grad_norm": 1.6087414026260376, + "kl": 0.2965226024389267, + "learning_rate": 8.743201147741111e-07, + "loss": 0.0119, + "num_tokens": 8678039.0, + "reward": 0.87353515625, + "reward_std": 0.016652971506118774, + "rewards//mean": 0.87353515625, + "rewards//std": 0.022536631673574448, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2388, + "grad_norm": 1.5180723667144775, + "kl": 0.2919251322746277, + "learning_rate": 8.741096553573506e-07, + "loss": 0.0117, + "num_tokens": 8685303.0, + "reward": 0.84442138671875, + "reward_std": 0.019744152203202248, + "rewards//mean": 0.84442138671875, + "rewards//std": 0.026241373270750046, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.239, + "grad_norm": 1.571536660194397, + "kl": 0.315290167927742, + "learning_rate": 8.73899045249266e-07, + "loss": 0.0126, + "num_tokens": 8692599.0, + "reward": 0.84033203125, + "reward_std": 0.017980795353651047, + "rewards//mean": 0.84033203125, + "rewards//std": 0.024384593591094017, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2392, + "grad_norm": 1.8673691749572754, + "kl": 0.3074193261563778, + "learning_rate": 8.736882845346905e-07, + "loss": 0.0123, + "num_tokens": 8699951.0, + "reward": 0.88671875, + "reward_std": 0.02119932323694229, + "rewards//mean": 0.88671875, + "rewards//std": 0.023928390815854073, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2394, + "grad_norm": 1.4556435346603394, + "kl": 0.2899591401219368, + "learning_rate": 8.734773732985185e-07, + "loss": 0.0116, + "num_tokens": 8707231.0, + "reward": 0.84039306640625, + "reward_std": 0.018389727920293808, + "rewards//mean": 0.84039306640625, + "rewards//std": 0.028081107884645462, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2396, + "grad_norm": 1.6215529441833496, + "kl": 0.38078857585787773, + "learning_rate": 8.732663116257055e-07, + "loss": 0.0152, + "num_tokens": 8714487.0, + "reward": 0.85919189453125, + "reward_std": 0.016233356669545174, + "rewards//mean": 0.85919189453125, + "rewards//std": 0.02672494389116764, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2398, + "grad_norm": 1.2332844734191895, + "kl": 0.3037409335374832, + "learning_rate": 8.730550996012667e-07, + "loss": 0.0121, + "num_tokens": 8721751.0, + "reward": 0.861083984375, + "reward_std": 0.018080413341522217, + "rewards//mean": 0.861083984375, + "rewards//std": 0.029031120240688324, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.24, + "grad_norm": 1.5333460569381714, + "kl": 0.29680389910936356, + "learning_rate": 8.728437373102784e-07, + "loss": 0.0119, + "num_tokens": 8728935.0, + "reward": 0.86285400390625, + "reward_std": 0.015476769767701626, + "rewards//mean": 0.86285400390625, + "rewards//std": 0.0292575154453516, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2402, + "grad_norm": 1.8121824264526367, + "kl": 0.25693375431001186, + "learning_rate": 8.726322248378774e-07, + "loss": 0.0103, + "num_tokens": 8736183.0, + "reward": 0.82293701171875, + "reward_std": 0.013597310520708561, + "rewards//mean": 0.82293701171875, + "rewards//std": 0.016084132716059685, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.2404, + "grad_norm": 1.4376623630523682, + "kl": 0.3169674798846245, + "learning_rate": 8.724205622692606e-07, + "loss": 0.0128, + "num_tokens": 8743486.0, + "reward": 0.837890625, + "reward_std": 0.014063305221498013, + "rewards//mean": 0.837890625, + "rewards//std": 0.028856437653303146, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2406, + "grad_norm": 1.443190574645996, + "kl": 0.31739574298262596, + "learning_rate": 8.72208749689686e-07, + "loss": 0.0127, + "num_tokens": 8750670.0, + "reward": 0.85064697265625, + "reward_std": 0.012559673748910427, + "rewards//mean": 0.85064697265625, + "rewards//std": 0.01354492548853159, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2408, + "grad_norm": 1.3598047494888306, + "kl": 0.28114689514040947, + "learning_rate": 8.719967871844715e-07, + "loss": 0.0112, + "num_tokens": 8757902.0, + "reward": 0.81402587890625, + "reward_std": 0.011684499680995941, + "rewards//mean": 0.81402587890625, + "rewards//std": 0.024681923910975456, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.241, + "grad_norm": 1.2504719495773315, + "kl": 0.25765062868595123, + "learning_rate": 8.717846748389955e-07, + "loss": 0.0103, + "num_tokens": 8765142.0, + "reward": 0.86083984375, + "reward_std": 0.014264484867453575, + "rewards//mean": 0.86083984375, + "rewards//std": 0.021196382120251656, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2412, + "grad_norm": 11.551108360290527, + "kl": 0.9893203899264336, + "learning_rate": 8.71572412738697e-07, + "loss": 0.0396, + "num_tokens": 8772470.0, + "reward": 0.85101318359375, + "reward_std": 0.01461751852184534, + "rewards//mean": 0.85101318359375, + "rewards//std": 0.02321808785200119, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2414, + "grad_norm": 1.2581733465194702, + "kl": 0.31145558319985867, + "learning_rate": 8.713600009690751e-07, + "loss": 0.0125, + "num_tokens": 8779742.0, + "reward": 0.83544921875, + "reward_std": 0.01706441305577755, + "rewards//mean": 0.83544921875, + "rewards//std": 0.022877952083945274, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2416, + "grad_norm": 1.275691270828247, + "kl": 0.27063408121466637, + "learning_rate": 8.711474396156892e-07, + "loss": 0.0108, + "num_tokens": 8786918.0, + "reward": 0.86297607421875, + "reward_std": 0.013752002269029617, + "rewards//mean": 0.86297607421875, + "rewards//std": 0.023595130071043968, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2418, + "grad_norm": 1.3203303813934326, + "kl": 0.33405005000531673, + "learning_rate": 8.709347287641592e-07, + "loss": 0.0134, + "num_tokens": 8794182.0, + "reward": 0.88311767578125, + "reward_std": 0.016967788338661194, + "rewards//mean": 0.88311767578125, + "rewards//std": 0.02986534871160984, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.242, + "grad_norm": 1.2664909362792969, + "kl": 0.3152100630104542, + "learning_rate": 8.707218685001646e-07, + "loss": 0.0142, + "num_tokens": 8801348.0, + "reward": 0.8419189453125, + "reward_std": 0.013620372861623764, + "rewards//mean": 0.8419189453125, + "rewards//std": 0.018623093143105507, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2422, + "grad_norm": 1.6501415967941284, + "kl": 0.33215282298624516, + "learning_rate": 8.705088589094458e-07, + "loss": 0.0133, + "num_tokens": 8808556.0, + "reward": 0.82000732421875, + "reward_std": 0.01609901338815689, + "rewards//mean": 0.82000732421875, + "rewards//std": 0.02698536403477192, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2424, + "grad_norm": 1.905149221420288, + "kl": 0.3167665880173445, + "learning_rate": 8.702957000778029e-07, + "loss": 0.0127, + "num_tokens": 8815940.0, + "reward": 0.85137939453125, + "reward_std": 0.014738207682967186, + "rewards//mean": 0.85137939453125, + "rewards//std": 0.02443351037800312, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2426, + "grad_norm": 1.3100833892822266, + "kl": 0.2855425775051117, + "learning_rate": 8.700823920910963e-07, + "loss": 0.0114, + "num_tokens": 8823220.0, + "reward": 0.8416748046875, + "reward_std": 0.011013327166438103, + "rewards//mean": 0.8416748046875, + "rewards//std": 0.02373497560620308, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2428, + "grad_norm": 1.3750017881393433, + "kl": 0.24206598289310932, + "learning_rate": 8.698689350352464e-07, + "loss": 0.0097, + "num_tokens": 8830516.0, + "reward": 0.78753662109375, + "reward_std": 0.012219125404953957, + "rewards//mean": 0.78753662109375, + "rewards//std": 0.015556913800537586, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.243, + "grad_norm": 1.5141162872314453, + "kl": 0.31533854082226753, + "learning_rate": 8.696553289962337e-07, + "loss": 0.0126, + "num_tokens": 8837780.0, + "reward": 0.84210205078125, + "reward_std": 0.013525916263461113, + "rewards//mean": 0.84210205078125, + "rewards//std": 0.021830275654792786, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2432, + "grad_norm": 1.3857293128967285, + "kl": 0.2650327570736408, + "learning_rate": 8.694415740600988e-07, + "loss": 0.0106, + "num_tokens": 8844948.0, + "reward": 0.85736083984375, + "reward_std": 0.015982141718268394, + "rewards//mean": 0.85736083984375, + "rewards//std": 0.020243609324097633, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2434, + "grad_norm": 1.5561374425888062, + "kl": 0.27168528363108635, + "learning_rate": 8.69227670312942e-07, + "loss": 0.0109, + "num_tokens": 8852228.0, + "reward": 0.8778076171875, + "reward_std": 0.01986616477370262, + "rewards//mean": 0.8778076171875, + "rewards//std": 0.03355572372674942, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2436, + "grad_norm": 1.4583102464675903, + "kl": 0.3086981922388077, + "learning_rate": 8.690136178409235e-07, + "loss": 0.0123, + "num_tokens": 8859524.0, + "reward": 0.80645751953125, + "reward_std": 0.01742030493915081, + "rewards//mean": 0.80645751953125, + "rewards//std": 0.03376052901148796, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2438, + "grad_norm": 1.3936412334442139, + "kl": 0.3012086134403944, + "learning_rate": 8.687994167302641e-07, + "loss": 0.012, + "num_tokens": 8866860.0, + "reward": 0.84271240234375, + "reward_std": 0.01771920546889305, + "rewards//mean": 0.84271240234375, + "rewards//std": 0.033293217420578, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.244, + "grad_norm": 2.0340206623077393, + "kl": 0.35549681074917316, + "learning_rate": 8.685850670672438e-07, + "loss": 0.0142, + "num_tokens": 8874124.0, + "reward": 0.76434326171875, + "reward_std": 0.016297735273838043, + "rewards//mean": 0.76434326171875, + "rewards//std": 0.02817743830382824, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2442, + "grad_norm": 1.7135554552078247, + "kl": 0.2999767381697893, + "learning_rate": 8.683705689382024e-07, + "loss": 0.012, + "num_tokens": 8881524.0, + "reward": 0.85626220703125, + "reward_std": 0.016818776726722717, + "rewards//mean": 0.85626220703125, + "rewards//std": 0.02404128573834896, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2444, + "grad_norm": 1.5737414360046387, + "kl": 0.2855926752090454, + "learning_rate": 8.6815592242954e-07, + "loss": 0.0114, + "num_tokens": 8888788.0, + "reward": 0.83746337890625, + "reward_std": 0.014662383124232292, + "rewards//mean": 0.83746337890625, + "rewards//std": 0.027836382389068604, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2446, + "grad_norm": 1.2603435516357422, + "kl": 0.29573289304971695, + "learning_rate": 8.67941127627716e-07, + "loss": 0.0118, + "num_tokens": 8896204.0, + "reward": 0.833984375, + "reward_std": 0.013072097674012184, + "rewards//mean": 0.833984375, + "rewards//std": 0.019661027938127518, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2448, + "grad_norm": 1.4424082040786743, + "kl": 0.31074803322553635, + "learning_rate": 8.677261846192499e-07, + "loss": 0.0124, + "num_tokens": 8903460.0, + "reward": 0.84844970703125, + "reward_std": 0.016001654788851738, + "rewards//mean": 0.84844970703125, + "rewards//std": 0.03426080569624901, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.245, + "grad_norm": 1.4225530624389648, + "kl": 0.27809068746864796, + "learning_rate": 8.675110934907204e-07, + "loss": 0.0111, + "num_tokens": 8910740.0, + "reward": 0.78497314453125, + "reward_std": 0.012241804972290993, + "rewards//mean": 0.78497314453125, + "rewards//std": 0.021936804056167603, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2452, + "grad_norm": 1.4516700506210327, + "kl": 0.32299651950597763, + "learning_rate": 8.672958543287666e-07, + "loss": 0.0129, + "num_tokens": 8917940.0, + "reward": 0.78472900390625, + "reward_std": 0.011119918897747993, + "rewards//mean": 0.78472900390625, + "rewards//std": 0.0195760540664196, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2454, + "grad_norm": 1.582103967666626, + "kl": 0.3145412616431713, + "learning_rate": 8.670804672200865e-07, + "loss": 0.0126, + "num_tokens": 8925212.0, + "reward": 0.863525390625, + "reward_std": 0.020832661539316177, + "rewards//mean": 0.863525390625, + "rewards//std": 0.023249443620443344, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2456, + "grad_norm": 1.2434687614440918, + "kl": 0.3140351288020611, + "learning_rate": 8.668649322514381e-07, + "loss": 0.0126, + "num_tokens": 8932508.0, + "reward": 0.8743896484375, + "reward_std": 0.014243105426430702, + "rewards//mean": 0.8743896484375, + "rewards//std": 0.019465647637844086, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.2458, + "grad_norm": 1.5369316339492798, + "kl": 0.31203255988657475, + "learning_rate": 8.666492495096389e-07, + "loss": 0.0202, + "num_tokens": 8939755.0, + "reward": 0.85546875, + "reward_std": 0.015223393216729164, + "rewards//mean": 0.85546875, + "rewards//std": 0.01806902512907982, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.246, + "grad_norm": 2.2139594554901123, + "kl": 0.5603203922510147, + "learning_rate": 8.664334190815659e-07, + "loss": 0.0224, + "num_tokens": 8947059.0, + "reward": 0.81903076171875, + "reward_std": 0.019142575562000275, + "rewards//mean": 0.81903076171875, + "rewards//std": 0.029708825051784515, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2462, + "grad_norm": 1.5945507287979126, + "kl": 0.2623530402779579, + "learning_rate": 8.662174410541554e-07, + "loss": 0.0105, + "num_tokens": 8954307.0, + "reward": 0.81109619140625, + "reward_std": 0.02148263156414032, + "rewards//mean": 0.81109619140625, + "rewards//std": 0.029490450397133827, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2464, + "grad_norm": 1.6207854747772217, + "kl": 0.2951295617967844, + "learning_rate": 8.660013155144035e-07, + "loss": 0.0118, + "num_tokens": 8961491.0, + "reward": 0.872314453125, + "reward_std": 0.01367340236902237, + "rewards//mean": 0.872314453125, + "rewards//std": 0.020194290205836296, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2466, + "grad_norm": 1.6710872650146484, + "kl": 0.3132718615233898, + "learning_rate": 8.657850425493654e-07, + "loss": 0.0125, + "num_tokens": 8968755.0, + "reward": 0.87890625, + "reward_std": 0.017590492963790894, + "rewards//mean": 0.87890625, + "rewards//std": 0.03378971666097641, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2468, + "grad_norm": 1.8344796895980835, + "kl": 0.3137147631496191, + "learning_rate": 8.65568622246156e-07, + "loss": 0.0125, + "num_tokens": 8976059.0, + "reward": 0.82037353515625, + "reward_std": 0.012071283534169197, + "rewards//mean": 0.82037353515625, + "rewards//std": 0.018926555290818214, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.247, + "grad_norm": 1.303460955619812, + "kl": 0.30274322628974915, + "learning_rate": 8.653520546919493e-07, + "loss": 0.0118, + "num_tokens": 8983270.0, + "reward": 0.86602783203125, + "reward_std": 0.012999658472836018, + "rewards//mean": 0.86602783203125, + "rewards//std": 0.02664552628993988, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.2472, + "grad_norm": 1.190699815750122, + "kl": 0.3147462122142315, + "learning_rate": 8.651353399739787e-07, + "loss": 0.0069, + "num_tokens": 8990537.0, + "reward": 0.824462890625, + "reward_std": 0.01757073774933815, + "rewards//mean": 0.824462890625, + "rewards//std": 0.03502078726887703, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2474, + "grad_norm": 1.6580536365509033, + "kl": 0.29574711062014103, + "learning_rate": 8.649184781795367e-07, + "loss": 0.0118, + "num_tokens": 8997833.0, + "reward": 0.82720947265625, + "reward_std": 0.015309564769268036, + "rewards//mean": 0.82720947265625, + "rewards//std": 0.01878848299384117, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2476, + "grad_norm": 1.2368236780166626, + "kl": 0.291087593883276, + "learning_rate": 8.647014693959753e-07, + "loss": 0.0116, + "num_tokens": 9005113.0, + "reward": 0.867431640625, + "reward_std": 0.014261826872825623, + "rewards//mean": 0.867431640625, + "rewards//std": 0.02527589723467827, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.2478, + "grad_norm": 1.2148113250732422, + "kl": 0.35841636173427105, + "learning_rate": 8.644843137107057e-07, + "loss": 0.0153, + "num_tokens": 9012439.0, + "reward": 0.8499755859375, + "reward_std": 0.0180840902030468, + "rewards//mean": 0.8499755859375, + "rewards//std": 0.03329668566584587, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.248, + "grad_norm": 2.3757529258728027, + "kl": 0.49072660878300667, + "learning_rate": 8.642670112111981e-07, + "loss": 0.0196, + "num_tokens": 9019823.0, + "reward": 0.82098388671875, + "reward_std": 0.013396857306361198, + "rewards//mean": 0.82098388671875, + "rewards//std": 0.028017966076731682, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.2482, + "grad_norm": 1.3467384576797485, + "kl": 0.35098395869135857, + "learning_rate": 8.64049561984982e-07, + "loss": 0.0139, + "num_tokens": 9027165.0, + "reward": 0.83892822265625, + "reward_std": 0.017711324617266655, + "rewards//mean": 0.83892822265625, + "rewards//std": 0.023589355871081352, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2484, + "grad_norm": 1.2288347482681274, + "kl": 0.33751652762293816, + "learning_rate": 8.638319661196459e-07, + "loss": 0.0135, + "num_tokens": 9034405.0, + "reward": 0.8299560546875, + "reward_std": 0.012196116149425507, + "rewards//mean": 0.8299560546875, + "rewards//std": 0.020481223240494728, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2486, + "grad_norm": 1.348368525505066, + "kl": 0.3787542413920164, + "learning_rate": 8.636142237028372e-07, + "loss": 0.0152, + "num_tokens": 9041725.0, + "reward": 0.8475341796875, + "reward_std": 0.012821504846215248, + "rewards//mean": 0.8475341796875, + "rewards//std": 0.019917665049433708, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.2488, + "grad_norm": 1.1636168956756592, + "kl": 0.30466122925281525, + "learning_rate": 8.633963348222628e-07, + "loss": 0.0008, + "num_tokens": 9049031.0, + "reward": 0.86236572265625, + "reward_std": 0.01548854447901249, + "rewards//mean": 0.86236572265625, + "rewards//std": 0.020011968910694122, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.249, + "grad_norm": 1.4366421699523926, + "kl": 0.3067665994167328, + "learning_rate": 8.631782995656882e-07, + "loss": 0.0163, + "num_tokens": 9056302.0, + "reward": 0.85333251953125, + "reward_std": 0.015026215463876724, + "rewards//mean": 0.85333251953125, + "rewards//std": 0.027152014896273613, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.2492, + "grad_norm": 1.7424943447113037, + "kl": 0.2948855571448803, + "learning_rate": 8.62960118020938e-07, + "loss": 0.0121, + "num_tokens": 9063645.0, + "reward": 0.8763427734375, + "reward_std": 0.018422730267047882, + "rewards//mean": 0.8763427734375, + "rewards//std": 0.028401443734765053, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.2494, + "grad_norm": 1.6004081964492798, + "kl": 0.33925825729966164, + "learning_rate": 8.627417902758956e-07, + "loss": 0.0104, + "num_tokens": 9071015.0, + "reward": 0.82867431640625, + "reward_std": 0.015576413832604885, + "rewards//mean": 0.82867431640625, + "rewards//std": 0.025400089100003242, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2496, + "grad_norm": 1.1693512201309204, + "kl": 0.3038679752498865, + "learning_rate": 8.625233164185034e-07, + "loss": 0.0122, + "num_tokens": 9078255.0, + "reward": 0.82281494140625, + "reward_std": 0.010882077738642693, + "rewards//mean": 0.82281494140625, + "rewards//std": 0.016795897856354713, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2498, + "grad_norm": 1.2860006093978882, + "kl": 0.30532761849462986, + "learning_rate": 8.623046965367628e-07, + "loss": 0.0122, + "num_tokens": 9085487.0, + "reward": 0.84210205078125, + "reward_std": 0.01517038606107235, + "rewards//mean": 0.84210205078125, + "rewards//std": 0.027593325823545456, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.25, + "grad_norm": 1.2357232570648193, + "kl": 0.2483038306236267, + "learning_rate": 8.620859307187338e-07, + "loss": 0.0099, + "num_tokens": 9092743.0, + "reward": 0.7994384765625, + "reward_std": 0.011469470337033272, + "rewards//mean": 0.7994384765625, + "rewards//std": 0.023253023624420166, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2502, + "grad_norm": 3.4494106769561768, + "kl": 0.5381257887929678, + "learning_rate": 8.61867019052535e-07, + "loss": 0.0215, + "num_tokens": 9100199.0, + "reward": 0.84185791015625, + "reward_std": 0.015931783244013786, + "rewards//mean": 0.84185791015625, + "rewards//std": 0.02548695169389248, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2504, + "grad_norm": 1.3210333585739136, + "kl": 0.317903870716691, + "learning_rate": 8.616479616263444e-07, + "loss": 0.0127, + "num_tokens": 9107495.0, + "reward": 0.80755615234375, + "reward_std": 0.012488706037402153, + "rewards//mean": 0.80755615234375, + "rewards//std": 0.02500002272427082, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.296875, + "epoch": 0.2506, + "grad_norm": 1.5485681295394897, + "kl": 0.2921069413423538, + "learning_rate": 8.61428758528398e-07, + "loss": 0.026, + "num_tokens": 9114618.0, + "reward": 0.84027099609375, + "reward_std": 0.015359066426753998, + "rewards//mean": 0.84027099609375, + "rewards//std": 0.02352123335003853, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2508, + "grad_norm": 1.4636845588684082, + "kl": 0.2986578904092312, + "learning_rate": 8.612094098469909e-07, + "loss": 0.0119, + "num_tokens": 9121890.0, + "reward": 0.8017578125, + "reward_std": 0.015112556517124176, + "rewards//mean": 0.8017578125, + "rewards//std": 0.01876601204276085, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.251, + "grad_norm": 1.45558500289917, + "kl": 0.3055503573268652, + "learning_rate": 8.609899156704767e-07, + "loss": 0.0122, + "num_tokens": 9129258.0, + "reward": 0.8428955078125, + "reward_std": 0.012153634801506996, + "rewards//mean": 0.8428955078125, + "rewards//std": 0.01607530564069748, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2512, + "grad_norm": 1.5923055410385132, + "kl": 0.28121004067361355, + "learning_rate": 8.607702760872677e-07, + "loss": 0.0112, + "num_tokens": 9136698.0, + "reward": 0.88311767578125, + "reward_std": 0.01590283401310444, + "rewards//mean": 0.88311767578125, + "rewards//std": 0.02727106213569641, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2514, + "grad_norm": 1.2232272624969482, + "kl": 0.251688988879323, + "learning_rate": 8.605504911858346e-07, + "loss": 0.0101, + "num_tokens": 9143898.0, + "reward": 0.8297119140625, + "reward_std": 0.011857141740620136, + "rewards//mean": 0.8297119140625, + "rewards//std": 0.01757597178220749, + "step": 1257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2516, + "grad_norm": 1.6961383819580078, + "kl": 0.3216768726706505, + "learning_rate": 8.603305610547069e-07, + "loss": 0.0129, + "num_tokens": 9151266.0, + "reward": 0.8424072265625, + "reward_std": 0.013951857574284077, + "rewards//mean": 0.8424072265625, + "rewards//std": 0.02047235146164894, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.2518, + "grad_norm": 1.3832769393920898, + "kl": 0.3723264019936323, + "learning_rate": 8.601104857824722e-07, + "loss": 0.0182, + "num_tokens": 9158521.0, + "reward": 0.8731689453125, + "reward_std": 0.0191444493830204, + "rewards//mean": 0.8731689453125, + "rewards//std": 0.03592272847890854, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.252, + "grad_norm": 1.288241982460022, + "kl": 0.24963012896478176, + "learning_rate": 8.598902654577768e-07, + "loss": 0.01, + "num_tokens": 9165777.0, + "reward": 0.855712890625, + "reward_std": 0.02114582061767578, + "rewards//mean": 0.855712890625, + "rewards//std": 0.027249474078416824, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2522, + "grad_norm": 1.3279056549072266, + "kl": 0.2848063390702009, + "learning_rate": 8.596699001693255e-07, + "loss": 0.0114, + "num_tokens": 9173089.0, + "reward": 0.82159423828125, + "reward_std": 0.014413061551749706, + "rewards//mean": 0.82159423828125, + "rewards//std": 0.02006409503519535, + "step": 1261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2524, + "grad_norm": 1.6296544075012207, + "kl": 0.2529169414192438, + "learning_rate": 8.594493900058816e-07, + "loss": 0.0101, + "num_tokens": 9180313.0, + "reward": 0.85003662109375, + "reward_std": 0.0121697299182415, + "rewards//mean": 0.85003662109375, + "rewards//std": 0.022059977054595947, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2526, + "grad_norm": 1.2738933563232422, + "kl": 0.30445025488734245, + "learning_rate": 8.592287350562663e-07, + "loss": 0.0122, + "num_tokens": 9187537.0, + "reward": 0.86383056640625, + "reward_std": 0.018176648765802383, + "rewards//mean": 0.86383056640625, + "rewards//std": 0.03170296922326088, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2528, + "grad_norm": 1.258620262145996, + "kl": 0.2691894620656967, + "learning_rate": 8.590079354093593e-07, + "loss": 0.0108, + "num_tokens": 9194825.0, + "reward": 0.8466796875, + "reward_std": 0.01278956513851881, + "rewards//mean": 0.8466796875, + "rewards//std": 0.016962816938757896, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.253, + "grad_norm": 1.6550928354263306, + "kl": 0.31179968640208244, + "learning_rate": 8.587869911540992e-07, + "loss": 0.014, + "num_tokens": 9202126.0, + "reward": 0.87493896484375, + "reward_std": 0.019884789362549782, + "rewards//mean": 0.87493896484375, + "rewards//std": 0.03490035980939865, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2532, + "grad_norm": 1.1539061069488525, + "kl": 0.34029836393892765, + "learning_rate": 8.585659023794818e-07, + "loss": 0.0136, + "num_tokens": 9209446.0, + "reward": 0.8056640625, + "reward_std": 0.011066862381994724, + "rewards//mean": 0.8056640625, + "rewards//std": 0.016233202069997787, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.2534, + "grad_norm": 1.5260175466537476, + "kl": 0.2907567583024502, + "learning_rate": 8.583446691745617e-07, + "loss": 0.0136, + "num_tokens": 9216823.0, + "reward": 0.82708740234375, + "reward_std": 0.01721060276031494, + "rewards//mean": 0.82708740234375, + "rewards//std": 0.024560188874602318, + "step": 1267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2536, + "grad_norm": 1.6303738355636597, + "kl": 0.334762305021286, + "learning_rate": 8.581232916284517e-07, + "loss": 0.0134, + "num_tokens": 9224135.0, + "reward": 0.752197265625, + "reward_std": 0.016469206660985947, + "rewards//mean": 0.752197265625, + "rewards//std": 0.028038017451763153, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2538, + "grad_norm": 1.8616774082183838, + "kl": 0.29583064280450344, + "learning_rate": 8.579017698303228e-07, + "loss": 0.0118, + "num_tokens": 9231439.0, + "reward": 0.83929443359375, + "reward_std": 0.016346191987395287, + "rewards//mean": 0.83929443359375, + "rewards//std": 0.02150682359933853, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.254, + "grad_norm": 1.4960848093032837, + "kl": 0.3784705139696598, + "learning_rate": 8.576801038694039e-07, + "loss": 0.0151, + "num_tokens": 9238735.0, + "reward": 0.89794921875, + "reward_std": 0.011985618621110916, + "rewards//mean": 0.89794921875, + "rewards//std": 0.018512625247240067, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.2542, + "grad_norm": 1.60847008228302, + "kl": 0.31339624151587486, + "learning_rate": 8.574582938349817e-07, + "loss": 0.0131, + "num_tokens": 9246076.0, + "reward": 0.83062744140625, + "reward_std": 0.014313199557363987, + "rewards//mean": 0.83062744140625, + "rewards//std": 0.02180321514606476, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2544, + "grad_norm": 1.7307155132293701, + "kl": 0.29840651899576187, + "learning_rate": 8.572363398164016e-07, + "loss": 0.0119, + "num_tokens": 9253340.0, + "reward": 0.87579345703125, + "reward_std": 0.020710190758109093, + "rewards//mean": 0.87579345703125, + "rewards//std": 0.02718099020421505, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.2546, + "grad_norm": 1.6250442266464233, + "kl": 0.303670734167099, + "learning_rate": 8.570142419030666e-07, + "loss": -0.0198, + "num_tokens": 9260658.0, + "reward": 0.860107421875, + "reward_std": 0.026295488700270653, + "rewards//mean": 0.860107421875, + "rewards//std": 0.030288685113191605, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2548, + "grad_norm": 1.6929569244384766, + "kl": 0.291608152911067, + "learning_rate": 8.567920001844375e-07, + "loss": 0.0117, + "num_tokens": 9267914.0, + "reward": 0.851806640625, + "reward_std": 0.016382914036512375, + "rewards//mean": 0.851806640625, + "rewards//std": 0.02633645571768284, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.255, + "grad_norm": 1.5195263624191284, + "kl": 0.2807927783578634, + "learning_rate": 8.565696147500337e-07, + "loss": 0.0112, + "num_tokens": 9275202.0, + "reward": 0.84014892578125, + "reward_std": 0.01341167464852333, + "rewards//mean": 0.84014892578125, + "rewards//std": 0.024102913215756416, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.2552, + "grad_norm": 1.3883553743362427, + "kl": 0.2923179157078266, + "learning_rate": 8.563470856894314e-07, + "loss": 0.0036, + "num_tokens": 9282471.0, + "reward": 0.80902099609375, + "reward_std": 0.012350053526461124, + "rewards//mean": 0.80902099609375, + "rewards//std": 0.017035726457834244, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2554, + "grad_norm": 1.330381989479065, + "kl": 0.2846367433667183, + "learning_rate": 8.561244130922657e-07, + "loss": 0.0114, + "num_tokens": 9289783.0, + "reward": 0.8594970703125, + "reward_std": 0.015482347458600998, + "rewards//mean": 0.8594970703125, + "rewards//std": 0.02453775703907013, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2556, + "grad_norm": 1.3704814910888672, + "kl": 0.34872109442949295, + "learning_rate": 8.559015970482291e-07, + "loss": 0.0139, + "num_tokens": 9297095.0, + "reward": 0.85906982421875, + "reward_std": 0.018443649634718895, + "rewards//mean": 0.85906982421875, + "rewards//std": 0.033038076013326645, + "step": 1278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2558, + "grad_norm": 1.2474561929702759, + "kl": 0.28718893602490425, + "learning_rate": 8.556786376470716e-07, + "loss": 0.0115, + "num_tokens": 9304479.0, + "reward": 0.8646240234375, + "reward_std": 0.018941236659884453, + "rewards//mean": 0.8646240234375, + "rewards//std": 0.029000600799918175, + "step": 1279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.256, + "grad_norm": 1.5414234399795532, + "kl": 0.39564596861600876, + "learning_rate": 8.554555349786015e-07, + "loss": 0.0129, + "num_tokens": 9311715.0, + "reward": 0.82513427734375, + "reward_std": 0.01568378508090973, + "rewards//mean": 0.82513427734375, + "rewards//std": 0.025711657479405403, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2562, + "grad_norm": 1.3702397346496582, + "kl": 0.3313169293105602, + "learning_rate": 8.552322891326844e-07, + "loss": 0.0133, + "num_tokens": 9319011.0, + "reward": 0.8096923828125, + "reward_std": 0.01569448783993721, + "rewards//mean": 0.8096923828125, + "rewards//std": 0.018380915746092796, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2564, + "grad_norm": 1.3578155040740967, + "kl": 0.2382032684981823, + "learning_rate": 8.550089001992437e-07, + "loss": 0.0095, + "num_tokens": 9326307.0, + "reward": 0.81011962890625, + "reward_std": 0.009478038176894188, + "rewards//mean": 0.81011962890625, + "rewards//std": 0.018250642344355583, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2566, + "grad_norm": 1.3703792095184326, + "kl": 0.32240070402622223, + "learning_rate": 8.547853682682604e-07, + "loss": 0.0129, + "num_tokens": 9333491.0, + "reward": 0.85870361328125, + "reward_std": 0.01892610639333725, + "rewards//mean": 0.85870361328125, + "rewards//std": 0.02586018294095993, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2568, + "grad_norm": 1.6003445386886597, + "kl": 0.3205294981598854, + "learning_rate": 8.545616934297733e-07, + "loss": 0.0128, + "num_tokens": 9340739.0, + "reward": 0.87933349609375, + "reward_std": 0.020694326609373093, + "rewards//mean": 0.87933349609375, + "rewards//std": 0.025893526151776314, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.257, + "grad_norm": 1.3282372951507568, + "kl": 0.2804751433432102, + "learning_rate": 8.543378757738784e-07, + "loss": 0.0112, + "num_tokens": 9348019.0, + "reward": 0.79278564453125, + "reward_std": 0.012204921804368496, + "rewards//mean": 0.79278564453125, + "rewards//std": 0.01853785291314125, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2572, + "grad_norm": 1.4039361476898193, + "kl": 0.2796861231327057, + "learning_rate": 8.541139153907295e-07, + "loss": 0.0112, + "num_tokens": 9355251.0, + "reward": 0.8974609375, + "reward_std": 0.01179260853677988, + "rewards//mean": 0.8974609375, + "rewards//std": 0.014959799125790596, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2574, + "grad_norm": 1.1078616380691528, + "kl": 0.27726874500513077, + "learning_rate": 8.538898123705379e-07, + "loss": 0.0111, + "num_tokens": 9362507.0, + "reward": 0.86846923828125, + "reward_std": 0.012940380722284317, + "rewards//mean": 0.86846923828125, + "rewards//std": 0.019674016162753105, + "step": 1287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2576, + "grad_norm": 1.4511327743530273, + "kl": 0.32931919023394585, + "learning_rate": 8.536655668035721e-07, + "loss": 0.0132, + "num_tokens": 9369811.0, + "reward": 0.884521484375, + "reward_std": 0.018337156623601913, + "rewards//mean": 0.884521484375, + "rewards//std": 0.029404152184724808, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2578, + "grad_norm": 1.7291276454925537, + "kl": 0.4174620360136032, + "learning_rate": 8.534411787801586e-07, + "loss": 0.0167, + "num_tokens": 9377099.0, + "reward": 0.84381103515625, + "reward_std": 0.01891905441880226, + "rewards//mean": 0.84381103515625, + "rewards//std": 0.022803641855716705, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.258, + "grad_norm": 1.3968030214309692, + "kl": 0.310411911457777, + "learning_rate": 8.532166483906802e-07, + "loss": 0.0124, + "num_tokens": 9384379.0, + "reward": 0.83929443359375, + "reward_std": 0.011200999841094017, + "rewards//mean": 0.83929443359375, + "rewards//std": 0.01964706741273403, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.2582, + "grad_norm": 1.3861591815948486, + "kl": 0.3202430810779333, + "learning_rate": 8.529919757255781e-07, + "loss": 0.0018, + "num_tokens": 9391597.0, + "reward": 0.80767822265625, + "reward_std": 0.01412142999470234, + "rewards//mean": 0.80767822265625, + "rewards//std": 0.017410147935152054, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2584, + "grad_norm": 1.9131174087524414, + "kl": 0.5694746039807796, + "learning_rate": 8.527671608753506e-07, + "loss": 0.0228, + "num_tokens": 9398861.0, + "reward": 0.8253173828125, + "reward_std": 0.015128182247281075, + "rewards//mean": 0.8253173828125, + "rewards//std": 0.017709821462631226, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2586, + "grad_norm": 1.4270175695419312, + "kl": 0.3101390190422535, + "learning_rate": 8.525422039305528e-07, + "loss": 0.0124, + "num_tokens": 9406213.0, + "reward": 0.83587646484375, + "reward_std": 0.013689063489437103, + "rewards//mean": 0.83587646484375, + "rewards//std": 0.02081579715013504, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.2588, + "grad_norm": 1.1977941989898682, + "kl": 0.34167974814772606, + "learning_rate": 8.523171049817973e-07, + "loss": 0.0221, + "num_tokens": 9413429.0, + "reward": 0.8326416015625, + "reward_std": 0.013358555734157562, + "rewards//mean": 0.8326416015625, + "rewards//std": 0.01688006892800331, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.259, + "grad_norm": 1.4974557161331177, + "kl": 0.2813408561050892, + "learning_rate": 8.520918641197541e-07, + "loss": 0.0113, + "num_tokens": 9420725.0, + "reward": 0.840576171875, + "reward_std": 0.019028451293706894, + "rewards//mean": 0.840576171875, + "rewards//std": 0.025993958115577698, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2592, + "grad_norm": 1.2182297706604004, + "kl": 0.25497711077332497, + "learning_rate": 8.518664814351502e-07, + "loss": 0.0102, + "num_tokens": 9428061.0, + "reward": 0.76654052734375, + "reward_std": 0.01415950432419777, + "rewards//mean": 0.76654052734375, + "rewards//std": 0.027451975271105766, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2594, + "grad_norm": 1.3261966705322266, + "kl": 0.369146253913641, + "learning_rate": 8.516409570187696e-07, + "loss": 0.0148, + "num_tokens": 9435453.0, + "reward": 0.82598876953125, + "reward_std": 0.01952804997563362, + "rewards//mean": 0.82598876953125, + "rewards//std": 0.027664553374052048, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2596, + "grad_norm": 1.3804298639297485, + "kl": 0.33276091516017914, + "learning_rate": 8.514152909614535e-07, + "loss": 0.0133, + "num_tokens": 9442805.0, + "reward": 0.88421630859375, + "reward_std": 0.01722046546638012, + "rewards//mean": 0.88421630859375, + "rewards//std": 0.021902963519096375, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2598, + "grad_norm": 1.2417479753494263, + "kl": 0.2742691170424223, + "learning_rate": 8.511894833541005e-07, + "loss": 0.011, + "num_tokens": 9450117.0, + "reward": 0.865478515625, + "reward_std": 0.013272881507873535, + "rewards//mean": 0.865478515625, + "rewards//std": 0.01832028478384018, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.26, + "grad_norm": 1.5830901861190796, + "kl": 0.3008856400847435, + "learning_rate": 8.509635342876654e-07, + "loss": 0.012, + "num_tokens": 9457341.0, + "reward": 0.87310791015625, + "reward_std": 0.01998218707740307, + "rewards//mean": 0.87310791015625, + "rewards//std": 0.025732843205332756, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2602, + "grad_norm": 1.2729382514953613, + "kl": 0.31642600521445274, + "learning_rate": 8.507374438531606e-07, + "loss": 0.0127, + "num_tokens": 9464589.0, + "reward": 0.8216552734375, + "reward_std": 0.014192678034305573, + "rewards//mean": 0.8216552734375, + "rewards//std": 0.027667082846164703, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2604, + "grad_norm": 1.4583390951156616, + "kl": 0.3295265845954418, + "learning_rate": 8.505112121416553e-07, + "loss": 0.0132, + "num_tokens": 9471933.0, + "reward": 0.829345703125, + "reward_std": 0.014674220234155655, + "rewards//mean": 0.829345703125, + "rewards//std": 0.02301911450922489, + "step": 1302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2606, + "grad_norm": 1.3957315683364868, + "kl": 0.31831588596105576, + "learning_rate": 8.502848392442758e-07, + "loss": 0.0127, + "num_tokens": 9479197.0, + "reward": 0.8623046875, + "reward_std": 0.0190432071685791, + "rewards//mean": 0.8623046875, + "rewards//std": 0.024079743772745132, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.2608, + "grad_norm": 1.1940209865570068, + "kl": 0.2899571806192398, + "learning_rate": 8.500583252522052e-07, + "loss": 0.0066, + "num_tokens": 9486426.0, + "reward": 0.83203125, + "reward_std": 0.016773520037531853, + "rewards//mean": 0.83203125, + "rewards//std": 0.02410989999771118, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.261, + "grad_norm": 1.424069881439209, + "kl": 0.29884066991508007, + "learning_rate": 8.498316702566826e-07, + "loss": 0.0114, + "num_tokens": 9493696.0, + "reward": 0.78338623046875, + "reward_std": 0.01333322748541832, + "rewards//mean": 0.78338623046875, + "rewards//std": 0.016306588426232338, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2612, + "grad_norm": 1.2925097942352295, + "kl": 0.2644070740789175, + "learning_rate": 8.496048743490053e-07, + "loss": 0.0106, + "num_tokens": 9501088.0, + "reward": 0.8876953125, + "reward_std": 0.016714494675397873, + "rewards//mean": 0.8876953125, + "rewards//std": 0.028595058247447014, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2614, + "grad_norm": 1.5411168336868286, + "kl": 0.44226009026169777, + "learning_rate": 8.493779376205264e-07, + "loss": 0.0177, + "num_tokens": 9508368.0, + "reward": 0.82806396484375, + "reward_std": 0.014918161556124687, + "rewards//mean": 0.82806396484375, + "rewards//std": 0.02985875867307186, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2616, + "grad_norm": 1.310579776763916, + "kl": 0.3400305397808552, + "learning_rate": 8.491508601626561e-07, + "loss": 0.0136, + "num_tokens": 9515688.0, + "reward": 0.86932373046875, + "reward_std": 0.013851284980773926, + "rewards//mean": 0.86932373046875, + "rewards//std": 0.02931385673582554, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2618, + "grad_norm": 1.4292532205581665, + "kl": 0.2925645560026169, + "learning_rate": 8.489236420668608e-07, + "loss": 0.0117, + "num_tokens": 9523152.0, + "reward": 0.85003662109375, + "reward_std": 0.01441037654876709, + "rewards//mean": 0.85003662109375, + "rewards//std": 0.018902545794844627, + "step": 1309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.262, + "grad_norm": 1.1385201215744019, + "kl": 0.2836003676056862, + "learning_rate": 8.486962834246645e-07, + "loss": 0.0113, + "num_tokens": 9530400.0, + "reward": 0.82476806640625, + "reward_std": 0.013421183452010155, + "rewards//mean": 0.82476806640625, + "rewards//std": 0.03189339116215706, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2622, + "grad_norm": 1.3815991878509521, + "kl": 0.3041307833045721, + "learning_rate": 8.484687843276468e-07, + "loss": 0.0122, + "num_tokens": 9537648.0, + "reward": 0.84405517578125, + "reward_std": 0.012508604675531387, + "rewards//mean": 0.84405517578125, + "rewards//std": 0.01932937279343605, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2624, + "grad_norm": 1.5587464570999146, + "kl": 0.3041359707713127, + "learning_rate": 8.482411448674445e-07, + "loss": 0.0122, + "num_tokens": 9544936.0, + "reward": 0.85723876953125, + "reward_std": 0.01713060401380062, + "rewards//mean": 0.85723876953125, + "rewards//std": 0.032651856541633606, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2626, + "grad_norm": 1.310192346572876, + "kl": 0.34027083218097687, + "learning_rate": 8.480133651357505e-07, + "loss": 0.0136, + "num_tokens": 9552248.0, + "reward": 0.8404541015625, + "reward_std": 0.01475224643945694, + "rewards//mean": 0.8404541015625, + "rewards//std": 0.023971056565642357, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.2628, + "grad_norm": 1.6217435598373413, + "kl": 0.3402005899697542, + "learning_rate": 8.477854452243147e-07, + "loss": 0.0154, + "num_tokens": 9559564.0, + "reward": 0.83221435546875, + "reward_std": 0.017295021563768387, + "rewards//mean": 0.83221435546875, + "rewards//std": 0.025376835837960243, + "step": 1314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.263, + "grad_norm": 1.3412684202194214, + "kl": 0.2878578454256058, + "learning_rate": 8.475573852249434e-07, + "loss": 0.0115, + "num_tokens": 9566828.0, + "reward": 0.770263671875, + "reward_std": 0.012887135148048401, + "rewards//mean": 0.770263671875, + "rewards//std": 0.024957675486803055, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2632, + "grad_norm": 1.5512804985046387, + "kl": 0.3410915844142437, + "learning_rate": 8.473291852294986e-07, + "loss": 0.0136, + "num_tokens": 9574052.0, + "reward": 0.84814453125, + "reward_std": 0.01907038316130638, + "rewards//mean": 0.84814453125, + "rewards//std": 0.025281885638833046, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2634, + "grad_norm": 1.5569106340408325, + "kl": 0.39782155118882656, + "learning_rate": 8.471008453298996e-07, + "loss": 0.0159, + "num_tokens": 9581308.0, + "reward": 0.879638671875, + "reward_std": 0.011672201566398144, + "rewards//mean": 0.879638671875, + "rewards//std": 0.023186853155493736, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.2636, + "grad_norm": 1.6246870756149292, + "kl": 0.3101568967103958, + "learning_rate": 8.468723656181218e-07, + "loss": 0.0029, + "num_tokens": 9588483.0, + "reward": 0.84637451171875, + "reward_std": 0.011541333049535751, + "rewards//mean": 0.84637451171875, + "rewards//std": 0.018689120188355446, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2638, + "grad_norm": 1.2183343172073364, + "kl": 0.3000669050961733, + "learning_rate": 8.466437461861964e-07, + "loss": 0.012, + "num_tokens": 9595787.0, + "reward": 0.862060546875, + "reward_std": 0.011477585881948471, + "rewards//mean": 0.862060546875, + "rewards//std": 0.014580593444406986, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.264, + "grad_norm": 1.3851430416107178, + "kl": 0.27157880552113056, + "learning_rate": 8.464149871262116e-07, + "loss": 0.0109, + "num_tokens": 9603179.0, + "reward": 0.842529296875, + "reward_std": 0.014080485329031944, + "rewards//mean": 0.842529296875, + "rewards//std": 0.03970134258270264, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2642, + "grad_norm": 1.3442597389221191, + "kl": 0.29818442463874817, + "learning_rate": 8.461860885303113e-07, + "loss": 0.0119, + "num_tokens": 9610499.0, + "reward": 0.86761474609375, + "reward_std": 0.019340991973876953, + "rewards//mean": 0.86761474609375, + "rewards//std": 0.028840629383921623, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2644, + "grad_norm": 1.3825781345367432, + "kl": 0.30397532880306244, + "learning_rate": 8.459570504906961e-07, + "loss": 0.0122, + "num_tokens": 9617803.0, + "reward": 0.8355712890625, + "reward_std": 0.015740733593702316, + "rewards//mean": 0.8355712890625, + "rewards//std": 0.02713228575885296, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2646, + "grad_norm": 1.517054557800293, + "kl": 0.299238009378314, + "learning_rate": 8.457278730996222e-07, + "loss": 0.012, + "num_tokens": 9625059.0, + "reward": 0.85430908203125, + "reward_std": 0.017855513840913773, + "rewards//mean": 0.85430908203125, + "rewards//std": 0.021403105929493904, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2648, + "grad_norm": 1.1943415403366089, + "kl": 0.28625962510704994, + "learning_rate": 8.454985564494024e-07, + "loss": 0.0115, + "num_tokens": 9632355.0, + "reward": 0.81573486328125, + "reward_std": 0.012475917115807533, + "rewards//mean": 0.81573486328125, + "rewards//std": 0.021333681419491768, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.265, + "grad_norm": 1.478379249572754, + "kl": 0.3246854394674301, + "learning_rate": 8.452691006324054e-07, + "loss": 0.013, + "num_tokens": 9639627.0, + "reward": 0.85479736328125, + "reward_std": 0.015554528683423996, + "rewards//mean": 0.85479736328125, + "rewards//std": 0.024145582690835, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2652, + "grad_norm": 1.3699034452438354, + "kl": 0.27305834740400314, + "learning_rate": 8.45039505741056e-07, + "loss": 0.0109, + "num_tokens": 9646923.0, + "reward": 0.851806640625, + "reward_std": 0.011158410459756851, + "rewards//mean": 0.851806640625, + "rewards//std": 0.02437838539481163, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2654, + "grad_norm": 1.2348827123641968, + "kl": 0.33284727670252323, + "learning_rate": 8.448097718678348e-07, + "loss": 0.0133, + "num_tokens": 9654275.0, + "reward": 0.81268310546875, + "reward_std": 0.01029951497912407, + "rewards//mean": 0.81268310546875, + "rewards//std": 0.01882309652864933, + "step": 1327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.2656, + "grad_norm": 1.4178707599639893, + "kl": 0.2773618772625923, + "learning_rate": 8.44579899105279e-07, + "loss": 0.0116, + "num_tokens": 9661546.0, + "reward": 0.82867431640625, + "reward_std": 0.014012651517987251, + "rewards//mean": 0.82867431640625, + "rewards//std": 0.022551320493221283, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2658, + "grad_norm": 1.3786417245864868, + "kl": 0.27097526006400585, + "learning_rate": 8.443498875459808e-07, + "loss": 0.0108, + "num_tokens": 9668738.0, + "reward": 0.854248046875, + "reward_std": 0.01760711334645748, + "rewards//mean": 0.854248046875, + "rewards//std": 0.022722594439983368, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.266, + "grad_norm": 1.2865365743637085, + "kl": 0.30301662907004356, + "learning_rate": 8.441197372825892e-07, + "loss": 0.0121, + "num_tokens": 9675970.0, + "reward": 0.81390380859375, + "reward_std": 0.012572163715958595, + "rewards//mean": 0.81390380859375, + "rewards//std": 0.02014015056192875, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2662, + "grad_norm": 1.384590983390808, + "kl": 0.3184867240488529, + "learning_rate": 8.438894484078085e-07, + "loss": 0.0127, + "num_tokens": 9683250.0, + "reward": 0.84661865234375, + "reward_std": 0.015659991651773453, + "rewards//mean": 0.84661865234375, + "rewards//std": 0.025212259963154793, + "step": 1331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2664, + "grad_norm": 1.3851872682571411, + "kl": 0.31294592283666134, + "learning_rate": 8.43659021014399e-07, + "loss": 0.0125, + "num_tokens": 9690522.0, + "reward": 0.8648681640625, + "reward_std": 0.016254518181085587, + "rewards//mean": 0.8648681640625, + "rewards//std": 0.025322269648313522, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.2666, + "grad_norm": 1.1616060733795166, + "kl": 0.2988252807408571, + "learning_rate": 8.434284551951772e-07, + "loss": 0.0095, + "num_tokens": 9697836.0, + "reward": 0.79876708984375, + "reward_std": 0.011973205022513866, + "rewards//mean": 0.79876708984375, + "rewards//std": 0.019954398274421692, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2668, + "grad_norm": 1.2754782438278198, + "kl": 0.3003023099154234, + "learning_rate": 8.431977510430145e-07, + "loss": 0.012, + "num_tokens": 9705028.0, + "reward": 0.80242919921875, + "reward_std": 0.016111014410853386, + "rewards//mean": 0.80242919921875, + "rewards//std": 0.021929901093244553, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.267, + "grad_norm": 1.4447665214538574, + "kl": 0.33098744601011276, + "learning_rate": 8.429669086508389e-07, + "loss": 0.0132, + "num_tokens": 9712396.0, + "reward": 0.82696533203125, + "reward_std": 0.018968671560287476, + "rewards//mean": 0.82696533203125, + "rewards//std": 0.02358871139585972, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.2672, + "grad_norm": 1.4388426542282104, + "kl": 0.3069209735840559, + "learning_rate": 8.427359281116333e-07, + "loss": 0.0125, + "num_tokens": 9719667.0, + "reward": 0.83648681640625, + "reward_std": 0.014880568720400333, + "rewards//mean": 0.83648681640625, + "rewards//std": 0.0199285876005888, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2674, + "grad_norm": 1.5635889768600464, + "kl": 0.39606883376836777, + "learning_rate": 8.42504809518437e-07, + "loss": 0.0158, + "num_tokens": 9726947.0, + "reward": 0.856201171875, + "reward_std": 0.015940863639116287, + "rewards//mean": 0.856201171875, + "rewards//std": 0.02938767336308956, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2676, + "grad_norm": 1.7180397510528564, + "kl": 0.3465915732085705, + "learning_rate": 8.422735529643443e-07, + "loss": 0.0139, + "num_tokens": 9734235.0, + "reward": 0.8582763671875, + "reward_std": 0.012179205194115639, + "rewards//mean": 0.8582763671875, + "rewards//std": 0.018512215465307236, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2678, + "grad_norm": 1.4561457633972168, + "kl": 0.2978605777025223, + "learning_rate": 8.420421585425055e-07, + "loss": 0.0119, + "num_tokens": 9741443.0, + "reward": 0.85357666015625, + "reward_std": 0.015225725248456001, + "rewards//mean": 0.85357666015625, + "rewards//std": 0.031688641756772995, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.268, + "grad_norm": 1.6379729509353638, + "kl": 0.3191088940948248, + "learning_rate": 8.41810626346126e-07, + "loss": 0.0128, + "num_tokens": 9748651.0, + "reward": 0.8450927734375, + "reward_std": 0.01722337305545807, + "rewards//mean": 0.8450927734375, + "rewards//std": 0.024550093337893486, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2682, + "grad_norm": 1.4406367540359497, + "kl": 0.2940732892602682, + "learning_rate": 8.415789564684673e-07, + "loss": 0.0118, + "num_tokens": 9755843.0, + "reward": 0.82086181640625, + "reward_std": 0.013545993715524673, + "rewards//mean": 0.82086181640625, + "rewards//std": 0.018790094181895256, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.2684, + "grad_norm": 1.5731678009033203, + "kl": 0.3199227787554264, + "learning_rate": 8.413471490028455e-07, + "loss": 0.0087, + "num_tokens": 9763152.0, + "reward": 0.87078857421875, + "reward_std": 0.014333833009004593, + "rewards//mean": 0.87078857421875, + "rewards//std": 0.03336634114384651, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2686, + "grad_norm": 1.588686227798462, + "kl": 0.3444179594516754, + "learning_rate": 8.41115204042633e-07, + "loss": 0.0138, + "num_tokens": 9770392.0, + "reward": 0.851318359375, + "reward_std": 0.017083728685975075, + "rewards//mean": 0.851318359375, + "rewards//std": 0.03323958069086075, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2688, + "grad_norm": 1.520098090171814, + "kl": 0.32035438157618046, + "learning_rate": 8.408831216812573e-07, + "loss": 0.0128, + "num_tokens": 9777728.0, + "reward": 0.86236572265625, + "reward_std": 0.01009269617497921, + "rewards//mean": 0.86236572265625, + "rewards//std": 0.01734045147895813, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.269, + "grad_norm": 1.2243731021881104, + "kl": 0.29103656113147736, + "learning_rate": 8.406509020122008e-07, + "loss": 0.0116, + "num_tokens": 9785016.0, + "reward": 0.8411865234375, + "reward_std": 0.011368529871106148, + "rewards//mean": 0.8411865234375, + "rewards//std": 0.012987658381462097, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2692, + "grad_norm": 1.3014568090438843, + "kl": 0.251448854804039, + "learning_rate": 8.404185451290017e-07, + "loss": 0.0101, + "num_tokens": 9792368.0, + "reward": 0.8499755859375, + "reward_std": 0.010735771618783474, + "rewards//mean": 0.8499755859375, + "rewards//std": 0.018645837903022766, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2694, + "grad_norm": 1.2110536098480225, + "kl": 0.2933613359928131, + "learning_rate": 8.401860511252533e-07, + "loss": 0.0117, + "num_tokens": 9799592.0, + "reward": 0.85296630859375, + "reward_std": 0.014508984982967377, + "rewards//mean": 0.85296630859375, + "rewards//std": 0.021501190960407257, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.2696, + "grad_norm": 1.3742611408233643, + "kl": 0.34128476679325104, + "learning_rate": 8.399534200946043e-07, + "loss": 0.01, + "num_tokens": 9806812.0, + "reward": 0.84613037109375, + "reward_std": 0.015074972994625568, + "rewards//mean": 0.84613037109375, + "rewards//std": 0.019495468586683273, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2698, + "grad_norm": 1.3933212757110596, + "kl": 0.324620159342885, + "learning_rate": 8.397206521307583e-07, + "loss": 0.013, + "num_tokens": 9814100.0, + "reward": 0.85675048828125, + "reward_std": 0.01348080299794674, + "rewards//mean": 0.85675048828125, + "rewards//std": 0.023396695032715797, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.27, + "grad_norm": 1.8278355598449707, + "kl": 0.3997762966901064, + "learning_rate": 8.394877473274741e-07, + "loss": 0.016, + "num_tokens": 9821436.0, + "reward": 0.81500244140625, + "reward_std": 0.018473554402589798, + "rewards//mean": 0.81500244140625, + "rewards//std": 0.031198550015687943, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5625, + "epoch": 0.2702, + "grad_norm": 1.2337459325790405, + "kl": 0.3142358437180519, + "learning_rate": 8.392547057785661e-07, + "loss": 0.0207, + "num_tokens": 9828712.0, + "reward": 0.80633544921875, + "reward_std": 0.008002669550478458, + "rewards//mean": 0.80633544921875, + "rewards//std": 0.014209247194230556, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2704, + "grad_norm": 1.184089183807373, + "kl": 0.2971717566251755, + "learning_rate": 8.39021527577903e-07, + "loss": 0.0119, + "num_tokens": 9835992.0, + "reward": 0.83843994140625, + "reward_std": 0.010883670300245285, + "rewards//mean": 0.83843994140625, + "rewards//std": 0.018154172226786613, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2706, + "grad_norm": 1.605528473854065, + "kl": 0.3450569286942482, + "learning_rate": 8.387882128194092e-07, + "loss": 0.0138, + "num_tokens": 9843328.0, + "reward": 0.859619140625, + "reward_std": 0.019211186096072197, + "rewards//mean": 0.859619140625, + "rewards//std": 0.028729207813739777, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.2708, + "grad_norm": 1.2870677709579468, + "kl": 0.33200591430068016, + "learning_rate": 8.385547615970638e-07, + "loss": 0.0128, + "num_tokens": 9850654.0, + "reward": 0.8270263671875, + "reward_std": 0.01494554802775383, + "rewards//mean": 0.8270263671875, + "rewards//std": 0.02482723444700241, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.271, + "grad_norm": 1.4272429943084717, + "kl": 0.33825743570923805, + "learning_rate": 8.38321174004901e-07, + "loss": 0.0153, + "num_tokens": 9857923.0, + "reward": 0.80279541015625, + "reward_std": 0.01634499616920948, + "rewards//mean": 0.80279541015625, + "rewards//std": 0.022118913009762764, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2712, + "grad_norm": 1.433832049369812, + "kl": 0.3653715178370476, + "learning_rate": 8.380874501370097e-07, + "loss": 0.0146, + "num_tokens": 9865195.0, + "reward": 0.8536376953125, + "reward_std": 0.012346497736871243, + "rewards//mean": 0.8536376953125, + "rewards//std": 0.016381263732910156, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.2714, + "grad_norm": 1.407943606376648, + "kl": 0.32267457991838455, + "learning_rate": 8.378535900875338e-07, + "loss": 0.0143, + "num_tokens": 9872499.0, + "reward": 0.8197021484375, + "reward_std": 0.013561130501329899, + "rewards//mean": 0.8197021484375, + "rewards//std": 0.028316037729382515, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.2716, + "grad_norm": 1.8667653799057007, + "kl": 0.6009446363896132, + "learning_rate": 8.376195939506725e-07, + "loss": 0.0247, + "num_tokens": 9879872.0, + "reward": 0.81671142578125, + "reward_std": 0.011660019867122173, + "rewards//mean": 0.81671142578125, + "rewards//std": 0.022012578323483467, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2718, + "grad_norm": 1.462983250617981, + "kl": 0.29316066578030586, + "learning_rate": 8.373854618206789e-07, + "loss": 0.0117, + "num_tokens": 9887224.0, + "reward": 0.8431396484375, + "reward_std": 0.013425862416625023, + "rewards//mean": 0.8431396484375, + "rewards//std": 0.018338041380047798, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.272, + "grad_norm": 1.5521855354309082, + "kl": 0.34069391153752804, + "learning_rate": 8.371511937918617e-07, + "loss": 0.0136, + "num_tokens": 9894560.0, + "reward": 0.80712890625, + "reward_std": 0.01235443539917469, + "rewards//mean": 0.80712890625, + "rewards//std": 0.01567916013300419, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2722, + "grad_norm": 1.3533636331558228, + "kl": 0.30860453099012375, + "learning_rate": 8.369167899585839e-07, + "loss": 0.0123, + "num_tokens": 9901904.0, + "reward": 0.88067626953125, + "reward_std": 0.013875782489776611, + "rewards//mean": 0.88067626953125, + "rewards//std": 0.030890868976712227, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.2724, + "grad_norm": 1.8221584558486938, + "kl": 0.413870295509696, + "learning_rate": 8.366822504152636e-07, + "loss": 0.018, + "num_tokens": 9909179.0, + "reward": 0.8800048828125, + "reward_std": 0.01501530222594738, + "rewards//mean": 0.8800048828125, + "rewards//std": 0.023933134973049164, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2726, + "grad_norm": 1.4522680044174194, + "kl": 0.2892884574830532, + "learning_rate": 8.364475752563728e-07, + "loss": 0.0116, + "num_tokens": 9916459.0, + "reward": 0.82696533203125, + "reward_std": 0.01126411184668541, + "rewards//mean": 0.82696533203125, + "rewards//std": 0.02081797830760479, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2728, + "grad_norm": 1.5168336629867554, + "kl": 0.3844279944896698, + "learning_rate": 8.362127645764389e-07, + "loss": 0.0154, + "num_tokens": 9923683.0, + "reward": 0.802978515625, + "reward_std": 0.0170002244412899, + "rewards//mean": 0.802978515625, + "rewards//std": 0.021109074354171753, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.273, + "grad_norm": 1.4382659196853638, + "kl": 0.32855346612632275, + "learning_rate": 8.359778184700439e-07, + "loss": 0.0135, + "num_tokens": 9930978.0, + "reward": 0.83807373046875, + "reward_std": 0.014422155916690826, + "rewards//mean": 0.83807373046875, + "rewards//std": 0.01793353259563446, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2732, + "grad_norm": 1.408332109451294, + "kl": 0.24870331771671772, + "learning_rate": 8.357427370318238e-07, + "loss": 0.0099, + "num_tokens": 9938258.0, + "reward": 0.85479736328125, + "reward_std": 0.01876802369952202, + "rewards//mean": 0.85479736328125, + "rewards//std": 0.030880087986588478, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2734, + "grad_norm": 1.247920274734497, + "kl": 0.31013052724301815, + "learning_rate": 8.355075203564692e-07, + "loss": 0.0124, + "num_tokens": 9945578.0, + "reward": 0.8355712890625, + "reward_std": 0.011087839491665363, + "rewards//mean": 0.8355712890625, + "rewards//std": 0.01850239932537079, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2736, + "grad_norm": 1.4381861686706543, + "kl": 0.29335867054760456, + "learning_rate": 8.352721685387256e-07, + "loss": 0.0117, + "num_tokens": 9952858.0, + "reward": 0.8131103515625, + "reward_std": 0.015941249206662178, + "rewards//mean": 0.8131103515625, + "rewards//std": 0.021409206092357635, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2738, + "grad_norm": 1.2757803201675415, + "kl": 0.32787470519542694, + "learning_rate": 8.350366816733926e-07, + "loss": 0.0131, + "num_tokens": 9960122.0, + "reward": 0.85601806640625, + "reward_std": 0.016312718391418457, + "rewards//mean": 0.85601806640625, + "rewards//std": 0.027133610099554062, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.274, + "grad_norm": 1.5735647678375244, + "kl": 0.28119746036827564, + "learning_rate": 8.348010598553243e-07, + "loss": 0.0112, + "num_tokens": 9967450.0, + "reward": 0.8243408203125, + "reward_std": 0.015382321551442146, + "rewards//mean": 0.8243408203125, + "rewards//std": 0.02236909233033657, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.2742, + "grad_norm": 1.9144037961959839, + "kl": 0.42534390464425087, + "learning_rate": 8.34565303179429e-07, + "loss": -0.0126, + "num_tokens": 9974823.0, + "reward": 0.81298828125, + "reward_std": 0.014700992964208126, + "rewards//mean": 0.81298828125, + "rewards//std": 0.021276216953992844, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2744, + "grad_norm": 1.6017062664031982, + "kl": 0.28615082800388336, + "learning_rate": 8.343294117406698e-07, + "loss": 0.0114, + "num_tokens": 9982175.0, + "reward": 0.87677001953125, + "reward_std": 0.019899921491742134, + "rewards//mean": 0.87677001953125, + "rewards//std": 0.02500668168067932, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2746, + "grad_norm": 1.4394991397857666, + "kl": 0.2930687926709652, + "learning_rate": 8.340933856340635e-07, + "loss": 0.0117, + "num_tokens": 9989423.0, + "reward": 0.81005859375, + "reward_std": 0.014619136229157448, + "rewards//mean": 0.81005859375, + "rewards//std": 0.0310361310839653, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2748, + "grad_norm": 1.7185091972351074, + "kl": 0.35454847663640976, + "learning_rate": 8.338572249546812e-07, + "loss": 0.0142, + "num_tokens": 9996775.0, + "reward": 0.84637451171875, + "reward_std": 0.020303290337324142, + "rewards//mean": 0.84637451171875, + "rewards//std": 0.027883654460310936, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.275, + "grad_norm": 1.885239601135254, + "kl": 0.3814831003546715, + "learning_rate": 8.336209297976489e-07, + "loss": 0.0153, + "num_tokens": 10004063.0, + "reward": 0.84161376953125, + "reward_std": 0.015640629455447197, + "rewards//mean": 0.84161376953125, + "rewards//std": 0.025486357510089874, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2752, + "grad_norm": 1.4953336715698242, + "kl": 0.3014150392264128, + "learning_rate": 8.333845002581458e-07, + "loss": 0.0121, + "num_tokens": 10011319.0, + "reward": 0.8553466796875, + "reward_std": 0.016689199954271317, + "rewards//mean": 0.8553466796875, + "rewards//std": 0.02469518408179283, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2754, + "grad_norm": 1.4381823539733887, + "kl": 0.3371537998318672, + "learning_rate": 8.331479364314059e-07, + "loss": 0.0135, + "num_tokens": 10018543.0, + "reward": 0.81134033203125, + "reward_std": 0.011821608990430832, + "rewards//mean": 0.81134033203125, + "rewards//std": 0.014309038408100605, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2756, + "grad_norm": 1.311785101890564, + "kl": 0.2978538889437914, + "learning_rate": 8.32911238412717e-07, + "loss": 0.0119, + "num_tokens": 10025799.0, + "reward": 0.83892822265625, + "reward_std": 0.015360351651906967, + "rewards//mean": 0.83892822265625, + "rewards//std": 0.020216671749949455, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.2758, + "grad_norm": 1.3470689058303833, + "kl": 0.32623889297246933, + "learning_rate": 8.326744062974211e-07, + "loss": 0.0074, + "num_tokens": 10033125.0, + "reward": 0.78448486328125, + "reward_std": 0.01471803244203329, + "rewards//mean": 0.78448486328125, + "rewards//std": 0.018408380448818207, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.276, + "grad_norm": 1.07109534740448, + "kl": 0.29266275465488434, + "learning_rate": 8.324374401809142e-07, + "loss": 0.0124, + "num_tokens": 10040355.0, + "reward": 0.8775634765625, + "reward_std": 0.014457919634878635, + "rewards//mean": 0.8775634765625, + "rewards//std": 0.023117218166589737, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2762, + "grad_norm": 1.4012113809585571, + "kl": 0.368148859590292, + "learning_rate": 8.322003401586461e-07, + "loss": 0.0147, + "num_tokens": 10047635.0, + "reward": 0.86492919921875, + "reward_std": 0.02174787037074566, + "rewards//mean": 0.86492919921875, + "rewards//std": 0.032591529190540314, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.2764, + "grad_norm": 1.6198135614395142, + "kl": 0.25480523332953453, + "learning_rate": 8.319631063261207e-07, + "loss": 0.0067, + "num_tokens": 10054875.0, + "reward": 0.85601806640625, + "reward_std": 0.019441165030002594, + "rewards//mean": 0.85601806640625, + "rewards//std": 0.027873337268829346, + "step": 1382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2766, + "grad_norm": 1.136506199836731, + "kl": 0.2601087102666497, + "learning_rate": 8.317257387788958e-07, + "loss": 0.0104, + "num_tokens": 10062091.0, + "reward": 0.8548583984375, + "reward_std": 0.01225061435252428, + "rewards//mean": 0.8548583984375, + "rewards//std": 0.016965940594673157, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.2768, + "grad_norm": 1.675748348236084, + "kl": 0.29624835029244423, + "learning_rate": 8.314882376125831e-07, + "loss": 0.0138, + "num_tokens": 10069414.0, + "reward": 0.87188720703125, + "reward_std": 0.016756437718868256, + "rewards//mean": 0.87188720703125, + "rewards//std": 0.030402857810258865, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.277, + "grad_norm": 1.6237905025482178, + "kl": 0.29457212798297405, + "learning_rate": 8.312506029228477e-07, + "loss": 0.0118, + "num_tokens": 10076750.0, + "reward": 0.77099609375, + "reward_std": 0.013287574052810669, + "rewards//mean": 0.77099609375, + "rewards//std": 0.0196425411850214, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.2772, + "grad_norm": 1.1796571016311646, + "kl": 0.30765629187226295, + "learning_rate": 8.310128348054093e-07, + "loss": -0.0017, + "num_tokens": 10084005.0, + "reward": 0.8238525390625, + "reward_std": 0.011908927001059055, + "rewards//mean": 0.8238525390625, + "rewards//std": 0.026763787493109703, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2774, + "grad_norm": 1.6734696626663208, + "kl": 0.3300440236926079, + "learning_rate": 8.307749333560404e-07, + "loss": 0.0132, + "num_tokens": 10091253.0, + "reward": 0.79681396484375, + "reward_std": 0.020186766982078552, + "rewards//mean": 0.79681396484375, + "rewards//std": 0.023668793961405754, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2776, + "grad_norm": 1.3826664686203003, + "kl": 0.323578592389822, + "learning_rate": 8.305368986705681e-07, + "loss": 0.0129, + "num_tokens": 10098469.0, + "reward": 0.8587646484375, + "reward_std": 0.014831777662038803, + "rewards//mean": 0.8587646484375, + "rewards//std": 0.026431409642100334, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2778, + "grad_norm": 1.3982044458389282, + "kl": 0.3438527137041092, + "learning_rate": 8.302987308448723e-07, + "loss": 0.0138, + "num_tokens": 10105733.0, + "reward": 0.8599853515625, + "reward_std": 0.014751886948943138, + "rewards//mean": 0.8599853515625, + "rewards//std": 0.018743008375167847, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.278, + "grad_norm": 1.370172142982483, + "kl": 0.30728005059063435, + "learning_rate": 8.300604299748874e-07, + "loss": 0.0123, + "num_tokens": 10113149.0, + "reward": 0.8792724609375, + "reward_std": 0.015479329973459244, + "rewards//mean": 0.8792724609375, + "rewards//std": 0.020288147032260895, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2782, + "grad_norm": 1.7385305166244507, + "kl": 0.30826626904308796, + "learning_rate": 8.298219961566008e-07, + "loss": 0.0123, + "num_tokens": 10120461.0, + "reward": 0.83203125, + "reward_std": 0.016622185707092285, + "rewards//mean": 0.83203125, + "rewards//std": 0.022520504891872406, + "step": 1391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.2784, + "grad_norm": 1.3337079286575317, + "kl": 0.3106526620686054, + "learning_rate": 8.295834294860534e-07, + "loss": 0.0126, + "num_tokens": 10127745.0, + "reward": 0.7703857421875, + "reward_std": 0.011379053816199303, + "rewards//mean": 0.7703857421875, + "rewards//std": 0.023576276376843452, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2786, + "grad_norm": 1.3403879404067993, + "kl": 0.3211323730647564, + "learning_rate": 8.293447300593402e-07, + "loss": 0.0128, + "num_tokens": 10135041.0, + "reward": 0.8438720703125, + "reward_std": 0.02032431587576866, + "rewards//mean": 0.8438720703125, + "rewards//std": 0.030337374657392502, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2788, + "grad_norm": 1.1712530851364136, + "kl": 0.30725202709436417, + "learning_rate": 8.291058979726091e-07, + "loss": 0.0123, + "num_tokens": 10142257.0, + "reward": 0.845947265625, + "reward_std": 0.014373554848134518, + "rewards//mean": 0.845947265625, + "rewards//std": 0.021596815437078476, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.279, + "grad_norm": 1.85001540184021, + "kl": 0.40600116923451424, + "learning_rate": 8.288669333220614e-07, + "loss": 0.0162, + "num_tokens": 10149481.0, + "reward": 0.7916259765625, + "reward_std": 0.012610180303454399, + "rewards//mean": 0.7916259765625, + "rewards//std": 0.020436828956007957, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2792, + "grad_norm": 1.3576034307479858, + "kl": 0.3561114352196455, + "learning_rate": 8.286278362039527e-07, + "loss": 0.0142, + "num_tokens": 10156809.0, + "reward": 0.871826171875, + "reward_std": 0.017442025244235992, + "rewards//mean": 0.871826171875, + "rewards//std": 0.026547132059931755, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.2794, + "grad_norm": 1.553392767906189, + "kl": 0.2973686009645462, + "learning_rate": 8.283886067145906e-07, + "loss": 0.0088, + "num_tokens": 10164055.0, + "reward": 0.8653564453125, + "reward_std": 0.015870681032538414, + "rewards//mean": 0.8653564453125, + "rewards//std": 0.02538198046386242, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2796, + "grad_norm": 1.5573686361312866, + "kl": 0.31458782963454723, + "learning_rate": 8.281492449503372e-07, + "loss": 0.0126, + "num_tokens": 10171343.0, + "reward": 0.84625244140625, + "reward_std": 0.01583908125758171, + "rewards//mean": 0.84625244140625, + "rewards//std": 0.02182542160153389, + "step": 1398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.2798, + "grad_norm": 1.5464180707931519, + "kl": 0.2721885498613119, + "learning_rate": 8.279097510076069e-07, + "loss": 0.0108, + "num_tokens": 10178670.0, + "reward": 0.851318359375, + "reward_std": 0.01894408091902733, + "rewards//mean": 0.851318359375, + "rewards//std": 0.025054534897208214, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.28, + "grad_norm": 1.3301175832748413, + "kl": 0.2740920279175043, + "learning_rate": 8.276701249828684e-07, + "loss": 0.011, + "num_tokens": 10185926.0, + "reward": 0.76751708984375, + "reward_std": 0.01567905955016613, + "rewards//mean": 0.76751708984375, + "rewards//std": 0.02776123583316803, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2802, + "grad_norm": 1.4856452941894531, + "kl": 0.2774148676544428, + "learning_rate": 8.274303669726426e-07, + "loss": 0.0111, + "num_tokens": 10193206.0, + "reward": 0.86602783203125, + "reward_std": 0.016889283433556557, + "rewards//mean": 0.86602783203125, + "rewards//std": 0.020829608663916588, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2804, + "grad_norm": 1.5494855642318726, + "kl": 0.30239273235201836, + "learning_rate": 8.271904770735041e-07, + "loss": 0.0121, + "num_tokens": 10200486.0, + "reward": 0.84002685546875, + "reward_std": 0.013544932007789612, + "rewards//mean": 0.84002685546875, + "rewards//std": 0.015553995035588741, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.2806, + "grad_norm": 1.593980073928833, + "kl": 0.30972765013575554, + "learning_rate": 8.269504553820805e-07, + "loss": 0.013, + "num_tokens": 10207764.0, + "reward": 0.82794189453125, + "reward_std": 0.01674109697341919, + "rewards//mean": 0.82794189453125, + "rewards//std": 0.031669046729803085, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.2808, + "grad_norm": 1.1172655820846558, + "kl": 0.30527516081929207, + "learning_rate": 8.267103019950528e-07, + "loss": -0.0174, + "num_tokens": 10214986.0, + "reward": 0.838134765625, + "reward_std": 0.013356514275074005, + "rewards//mean": 0.838134765625, + "rewards//std": 0.020313872024416924, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.281, + "grad_norm": 1.6722997426986694, + "kl": 0.3477785363793373, + "learning_rate": 8.264700170091543e-07, + "loss": 0.0139, + "num_tokens": 10222202.0, + "reward": 0.8045654296875, + "reward_std": 0.015183941461145878, + "rewards//mean": 0.8045654296875, + "rewards//std": 0.024458665400743484, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.2812, + "grad_norm": 1.2158697843551636, + "kl": 0.28977758064866066, + "learning_rate": 8.262296005211721e-07, + "loss": 0.0131, + "num_tokens": 10229423.0, + "reward": 0.8189697265625, + "reward_std": 0.010701088234782219, + "rewards//mean": 0.8189697265625, + "rewards//std": 0.019390849396586418, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.2814, + "grad_norm": 1.2036935091018677, + "kl": 0.4090277962386608, + "learning_rate": 8.259890526279459e-07, + "loss": 0.0153, + "num_tokens": 10236772.0, + "reward": 0.80517578125, + "reward_std": 0.013406028971076012, + "rewards//mean": 0.80517578125, + "rewards//std": 0.022374844178557396, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.2816, + "grad_norm": 1.4532430171966553, + "kl": 0.3875210378319025, + "learning_rate": 8.257483734263681e-07, + "loss": 0.0174, + "num_tokens": 10244067.0, + "reward": 0.86279296875, + "reward_std": 0.016561079770326614, + "rewards//mean": 0.86279296875, + "rewards//std": 0.02853146195411682, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.2818, + "grad_norm": 1.4959962368011475, + "kl": 0.2981047313660383, + "learning_rate": 8.255075630133845e-07, + "loss": 0.0094, + "num_tokens": 10251284.0, + "reward": 0.85028076171875, + "reward_std": 0.012935432605445385, + "rewards//mean": 0.85028076171875, + "rewards//std": 0.016293587163090706, + "step": 1409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.282, + "grad_norm": 1.3630428314208984, + "kl": 0.28023738600313663, + "learning_rate": 8.252666214859934e-07, + "loss": 0.0112, + "num_tokens": 10258612.0, + "reward": 0.81463623046875, + "reward_std": 0.0167583879083395, + "rewards//mean": 0.81463623046875, + "rewards//std": 0.03403916954994202, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2822, + "grad_norm": 1.866133451461792, + "kl": 0.3360593356192112, + "learning_rate": 8.250255489412462e-07, + "loss": 0.0134, + "num_tokens": 10266004.0, + "reward": 0.8544921875, + "reward_std": 0.02367309294641018, + "rewards//mean": 0.8544921875, + "rewards//std": 0.02914041467010975, + "step": 1411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2824, + "grad_norm": 1.4208579063415527, + "kl": 0.3061874508857727, + "learning_rate": 8.247843454762466e-07, + "loss": 0.0122, + "num_tokens": 10273228.0, + "reward": 0.87750244140625, + "reward_std": 0.016392098739743233, + "rewards//mean": 0.87750244140625, + "rewards//std": 0.026423605158925056, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2826, + "grad_norm": 1.2278823852539062, + "kl": 0.3591478727757931, + "learning_rate": 8.245430111881517e-07, + "loss": 0.0144, + "num_tokens": 10280564.0, + "reward": 0.867919921875, + "reward_std": 0.013442113064229488, + "rewards//mean": 0.867919921875, + "rewards//std": 0.024258870631456375, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2828, + "grad_norm": 1.3383231163024902, + "kl": 0.3074133563786745, + "learning_rate": 8.243015461741706e-07, + "loss": 0.0123, + "num_tokens": 10287908.0, + "reward": 0.85394287109375, + "reward_std": 0.016201213002204895, + "rewards//mean": 0.85394287109375, + "rewards//std": 0.033506233245134354, + "step": 1414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.283, + "grad_norm": 1.5757461786270142, + "kl": 0.279807997867465, + "learning_rate": 8.240599505315654e-07, + "loss": 0.0112, + "num_tokens": 10295244.0, + "reward": 0.862548828125, + "reward_std": 0.019760165363550186, + "rewards//mean": 0.862548828125, + "rewards//std": 0.026981506496667862, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2832, + "grad_norm": 6.109029769897461, + "kl": 0.8303087018430233, + "learning_rate": 8.238182243576511e-07, + "loss": 0.0332, + "num_tokens": 10302524.0, + "reward": 0.8568115234375, + "reward_std": 0.017343387007713318, + "rewards//mean": 0.8568115234375, + "rewards//std": 0.02349136956036091, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2834, + "grad_norm": 1.8838496208190918, + "kl": 0.42528393492102623, + "learning_rate": 8.235763677497945e-07, + "loss": 0.017, + "num_tokens": 10309740.0, + "reward": 0.86553955078125, + "reward_std": 0.01664792001247406, + "rewards//mean": 0.86553955078125, + "rewards//std": 0.026746459305286407, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.2836, + "grad_norm": 1.5974868535995483, + "kl": 0.29750731959939003, + "learning_rate": 8.233343808054157e-07, + "loss": 0.0088, + "num_tokens": 10317039.0, + "reward": 0.83770751953125, + "reward_std": 0.01631435751914978, + "rewards//mean": 0.83770751953125, + "rewards//std": 0.025419747456908226, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2838, + "grad_norm": 1.4290895462036133, + "kl": 0.3089058417826891, + "learning_rate": 8.23092263621987e-07, + "loss": 0.0124, + "num_tokens": 10324351.0, + "reward": 0.83349609375, + "reward_std": 0.015409504994750023, + "rewards//mean": 0.83349609375, + "rewards//std": 0.019016021862626076, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.284, + "grad_norm": 1.7389518022537231, + "kl": 0.36727126501500607, + "learning_rate": 8.228500162970332e-07, + "loss": 0.0147, + "num_tokens": 10331671.0, + "reward": 0.854248046875, + "reward_std": 0.018062327057123184, + "rewards//mean": 0.854248046875, + "rewards//std": 0.02643742226064205, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2842, + "grad_norm": 1.3801932334899902, + "kl": 0.2857962790876627, + "learning_rate": 8.226076389281314e-07, + "loss": 0.0114, + "num_tokens": 10339023.0, + "reward": 0.8485107421875, + "reward_std": 0.01978445053100586, + "rewards//mean": 0.8485107421875, + "rewards//std": 0.02517598308622837, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2844, + "grad_norm": 1.3517098426818848, + "kl": 0.35249757766723633, + "learning_rate": 8.223651316129114e-07, + "loss": 0.0141, + "num_tokens": 10346303.0, + "reward": 0.805419921875, + "reward_std": 0.016645517200231552, + "rewards//mean": 0.805419921875, + "rewards//std": 0.02692759409546852, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2846, + "grad_norm": 1.682958722114563, + "kl": 0.29881145991384983, + "learning_rate": 8.221224944490548e-07, + "loss": 0.012, + "num_tokens": 10353615.0, + "reward": 0.8641357421875, + "reward_std": 0.024907875806093216, + "rewards//mean": 0.8641357421875, + "rewards//std": 0.047011565417051315, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2848, + "grad_norm": 1.6322317123413086, + "kl": 0.30387632362544537, + "learning_rate": 8.21879727534296e-07, + "loss": 0.0122, + "num_tokens": 10360887.0, + "reward": 0.76092529296875, + "reward_std": 0.013196103274822235, + "rewards//mean": 0.76092529296875, + "rewards//std": 0.02048371732234955, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.285, + "grad_norm": 1.3377870321273804, + "kl": 0.2975314296782017, + "learning_rate": 8.216368309664213e-07, + "loss": 0.012, + "num_tokens": 10368062.0, + "reward": 0.865966796875, + "reward_std": 0.01496448740363121, + "rewards//mean": 0.865966796875, + "rewards//std": 0.025984639301896095, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2852, + "grad_norm": 1.596271276473999, + "kl": 0.33691645227372646, + "learning_rate": 8.213938048432696e-07, + "loss": 0.0135, + "num_tokens": 10375382.0, + "reward": 0.81524658203125, + "reward_std": 0.0137696648016572, + "rewards//mean": 0.81524658203125, + "rewards//std": 0.026481404900550842, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2854, + "grad_norm": 1.5868946313858032, + "kl": 0.3348490260541439, + "learning_rate": 8.211506492627318e-07, + "loss": 0.0134, + "num_tokens": 10382670.0, + "reward": 0.8544921875, + "reward_std": 0.019390640780329704, + "rewards//mean": 0.8544921875, + "rewards//std": 0.03310175985097885, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2856, + "grad_norm": 1.3025484085083008, + "kl": 0.2681413535028696, + "learning_rate": 8.209073643227509e-07, + "loss": 0.0107, + "num_tokens": 10389982.0, + "reward": 0.7940673828125, + "reward_std": 0.013146926648914814, + "rewards//mean": 0.7940673828125, + "rewards//std": 0.022525545209646225, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.2858, + "grad_norm": 1.3696526288986206, + "kl": 0.35803766921162605, + "learning_rate": 8.206639501213219e-07, + "loss": 0.0138, + "num_tokens": 10397351.0, + "reward": 0.83001708984375, + "reward_std": 0.013320768252015114, + "rewards//mean": 0.83001708984375, + "rewards//std": 0.014894770458340645, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.286, + "grad_norm": 1.6417268514633179, + "kl": 0.3545222021639347, + "learning_rate": 8.204204067564924e-07, + "loss": 0.0183, + "num_tokens": 10404618.0, + "reward": 0.863037109375, + "reward_std": 0.016295358538627625, + "rewards//mean": 0.863037109375, + "rewards//std": 0.02209571748971939, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2862, + "grad_norm": 1.5915260314941406, + "kl": 0.282078480347991, + "learning_rate": 8.201767343263611e-07, + "loss": 0.0113, + "num_tokens": 10411946.0, + "reward": 0.8387451171875, + "reward_std": 0.015421109274029732, + "rewards//mean": 0.8387451171875, + "rewards//std": 0.019568031653761864, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.2864, + "grad_norm": 1.2283846139907837, + "kl": 0.29994137212634087, + "learning_rate": 8.199329329290796e-07, + "loss": 0.0119, + "num_tokens": 10419316.0, + "reward": 0.84869384765625, + "reward_std": 0.01431953627616167, + "rewards//mean": 0.84869384765625, + "rewards//std": 0.02229614183306694, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2866, + "grad_norm": 1.691126823425293, + "kl": 0.3495689891278744, + "learning_rate": 8.19689002662851e-07, + "loss": 0.014, + "num_tokens": 10426524.0, + "reward": 0.85546875, + "reward_std": 0.015370241366326809, + "rewards//mean": 0.85546875, + "rewards//std": 0.029289640486240387, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2868, + "grad_norm": 1.2745459079742432, + "kl": 0.2886010427027941, + "learning_rate": 8.194449436259303e-07, + "loss": 0.0115, + "num_tokens": 10433748.0, + "reward": 0.8341064453125, + "reward_std": 0.012916624546051025, + "rewards//mean": 0.8341064453125, + "rewards//std": 0.018440116196870804, + "step": 1434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.287, + "grad_norm": 1.2374838590621948, + "kl": 0.29754670709371567, + "learning_rate": 8.192007559166247e-07, + "loss": 0.0119, + "num_tokens": 10441028.0, + "reward": 0.81317138671875, + "reward_std": 0.010684057138860226, + "rewards//mean": 0.81317138671875, + "rewards//std": 0.01808985322713852, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2872, + "grad_norm": 2.16088604927063, + "kl": 0.46372854709625244, + "learning_rate": 8.189564396332926e-07, + "loss": 0.0185, + "num_tokens": 10448316.0, + "reward": 0.86041259765625, + "reward_std": 0.01286916434764862, + "rewards//mean": 0.86041259765625, + "rewards//std": 0.029881058260798454, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.2874, + "grad_norm": 1.2702109813690186, + "kl": 0.30875580199062824, + "learning_rate": 8.187119948743449e-07, + "loss": 0.0144, + "num_tokens": 10455645.0, + "reward": 0.8505859375, + "reward_std": 0.01876179501414299, + "rewards//mean": 0.8505859375, + "rewards//std": 0.02689271606504917, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.2876, + "grad_norm": 1.3100404739379883, + "kl": 0.3243616744875908, + "learning_rate": 8.184674217382437e-07, + "loss": 0.0075, + "num_tokens": 10462969.0, + "reward": 0.859375, + "reward_std": 0.01083114929497242, + "rewards//mean": 0.859375, + "rewards//std": 0.02463657222688198, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2878, + "grad_norm": 1.337518334388733, + "kl": 0.30989624559879303, + "learning_rate": 8.182227203235031e-07, + "loss": 0.0124, + "num_tokens": 10470337.0, + "reward": 0.87152099609375, + "reward_std": 0.017137765884399414, + "rewards//mean": 0.87152099609375, + "rewards//std": 0.023241546005010605, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.288, + "grad_norm": 1.4019232988357544, + "kl": 0.35729533061385155, + "learning_rate": 8.179778907286887e-07, + "loss": 0.0143, + "num_tokens": 10477657.0, + "reward": 0.82647705078125, + "reward_std": 0.014675214886665344, + "rewards//mean": 0.82647705078125, + "rewards//std": 0.026243679225444794, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.2882, + "grad_norm": 1.4837628602981567, + "kl": 0.3519560694694519, + "learning_rate": 8.177329330524181e-07, + "loss": 0.0146, + "num_tokens": 10485277.0, + "reward": 0.84625244140625, + "reward_std": 0.017010845243930817, + "rewards//mean": 0.84625244140625, + "rewards//std": 0.02191401831805706, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2884, + "grad_norm": 1.7243119478225708, + "kl": 0.3166736587882042, + "learning_rate": 8.1748784739336e-07, + "loss": 0.0127, + "num_tokens": 10492557.0, + "reward": 0.84771728515625, + "reward_std": 0.0229438878595829, + "rewards//mean": 0.84771728515625, + "rewards//std": 0.03295136243104935, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2886, + "grad_norm": 1.380411148071289, + "kl": 0.35263198986649513, + "learning_rate": 8.17242633850235e-07, + "loss": 0.0141, + "num_tokens": 10499829.0, + "reward": 0.86151123046875, + "reward_std": 0.012386612594127655, + "rewards//mean": 0.86151123046875, + "rewards//std": 0.021853838115930557, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2888, + "grad_norm": 1.6929066181182861, + "kl": 0.26783732511103153, + "learning_rate": 8.16997292521815e-07, + "loss": 0.0107, + "num_tokens": 10507117.0, + "reward": 0.86761474609375, + "reward_std": 0.01892443746328354, + "rewards//mean": 0.86761474609375, + "rewards//std": 0.02969302609562874, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.289, + "grad_norm": 1.4331556558609009, + "kl": 0.29850901663303375, + "learning_rate": 8.167518235069234e-07, + "loss": 0.0119, + "num_tokens": 10514397.0, + "reward": 0.83984375, + "reward_std": 0.01796860620379448, + "rewards//mean": 0.83984375, + "rewards//std": 0.03142004832625389, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2892, + "grad_norm": 1.6900211572647095, + "kl": 0.3127448484301567, + "learning_rate": 8.165062269044352e-07, + "loss": 0.0125, + "num_tokens": 10521581.0, + "reward": 0.7818603515625, + "reward_std": 0.012998288497328758, + "rewards//mean": 0.7818603515625, + "rewards//std": 0.01763787493109703, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2894, + "grad_norm": 1.9734954833984375, + "kl": 0.3059917874634266, + "learning_rate": 8.162605028132768e-07, + "loss": 0.0122, + "num_tokens": 10528917.0, + "reward": 0.8424072265625, + "reward_std": 0.016506727784872055, + "rewards//mean": 0.8424072265625, + "rewards//std": 0.02323218248784542, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2896, + "grad_norm": 1.4387296438217163, + "kl": 0.3480677641928196, + "learning_rate": 8.160146513324254e-07, + "loss": 0.0139, + "num_tokens": 10536237.0, + "reward": 0.84857177734375, + "reward_std": 0.01565377414226532, + "rewards//mean": 0.84857177734375, + "rewards//std": 0.018092364072799683, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2898, + "grad_norm": 1.4658440351486206, + "kl": 0.3087512403726578, + "learning_rate": 8.157686725609105e-07, + "loss": 0.0124, + "num_tokens": 10543517.0, + "reward": 0.82012939453125, + "reward_std": 0.012093218974769115, + "rewards//mean": 0.82012939453125, + "rewards//std": 0.021535661071538925, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.29, + "grad_norm": 1.396183729171753, + "kl": 0.32163139432668686, + "learning_rate": 8.155225665978118e-07, + "loss": 0.0129, + "num_tokens": 10550869.0, + "reward": 0.836669921875, + "reward_std": 0.01620744913816452, + "rewards//mean": 0.836669921875, + "rewards//std": 0.025044864043593407, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.2902, + "grad_norm": 1.4805197715759277, + "kl": 0.27179672569036484, + "learning_rate": 8.152763335422612e-07, + "loss": -0.0195, + "num_tokens": 10558067.0, + "reward": 0.873291015625, + "reward_std": 0.020589567720890045, + "rewards//mean": 0.873291015625, + "rewards//std": 0.03087475523352623, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.2904, + "grad_norm": 1.4774982929229736, + "kl": 0.35989243909716606, + "learning_rate": 8.150299734934412e-07, + "loss": 0.0049, + "num_tokens": 10565351.0, + "reward": 0.72979736328125, + "reward_std": 0.011783169582486153, + "rewards//mean": 0.72979736328125, + "rewards//std": 0.018276335671544075, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2906, + "grad_norm": 1.7663031816482544, + "kl": 0.3293916881084442, + "learning_rate": 8.147834865505853e-07, + "loss": 0.0132, + "num_tokens": 10572583.0, + "reward": 0.86090087890625, + "reward_std": 0.016243917867541313, + "rewards//mean": 0.86090087890625, + "rewards//std": 0.04054489731788635, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2908, + "grad_norm": 1.618172526359558, + "kl": 0.3173008244484663, + "learning_rate": 8.145368728129789e-07, + "loss": 0.0127, + "num_tokens": 10579791.0, + "reward": 0.803466796875, + "reward_std": 0.014538172632455826, + "rewards//mean": 0.803466796875, + "rewards//std": 0.031434498727321625, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.291, + "grad_norm": 1.278242826461792, + "kl": 0.27393062971532345, + "learning_rate": 8.142901323799577e-07, + "loss": 0.011, + "num_tokens": 10587079.0, + "reward": 0.85711669921875, + "reward_std": 0.014552965760231018, + "rewards//mean": 0.85711669921875, + "rewards//std": 0.02675551362335682, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2912, + "grad_norm": 1.50359046459198, + "kl": 0.2944757491350174, + "learning_rate": 8.140432653509087e-07, + "loss": 0.0118, + "num_tokens": 10594383.0, + "reward": 0.8082275390625, + "reward_std": 0.014143723994493484, + "rewards//mean": 0.8082275390625, + "rewards//std": 0.022922571748495102, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2914, + "grad_norm": 1.3807387351989746, + "kl": 0.28976445086300373, + "learning_rate": 8.1379627182527e-07, + "loss": 0.0116, + "num_tokens": 10601655.0, + "reward": 0.87017822265625, + "reward_std": 0.016552114859223366, + "rewards//mean": 0.87017822265625, + "rewards//std": 0.02316456288099289, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.2916, + "grad_norm": 1.5082694292068481, + "kl": 0.3442747723311186, + "learning_rate": 8.135491519025306e-07, + "loss": 0.0166, + "num_tokens": 10609005.0, + "reward": 0.843994140625, + "reward_std": 0.013236332684755325, + "rewards//mean": 0.843994140625, + "rewards//std": 0.021291865035891533, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2918, + "grad_norm": 1.4740980863571167, + "kl": 0.2891473565250635, + "learning_rate": 8.133019056822302e-07, + "loss": 0.0116, + "num_tokens": 10616317.0, + "reward": 0.8504638671875, + "reward_std": 0.01705564744770527, + "rewards//mean": 0.8504638671875, + "rewards//std": 0.026268254965543747, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.292, + "grad_norm": 1.5733057260513306, + "kl": 0.2782926522195339, + "learning_rate": 8.130545332639597e-07, + "loss": 0.0111, + "num_tokens": 10623605.0, + "reward": 0.87109375, + "reward_std": 0.012899640947580338, + "rewards//mean": 0.87109375, + "rewards//std": 0.020810069516301155, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2922, + "grad_norm": 4.731704235076904, + "kl": 0.8432967402040958, + "learning_rate": 8.128070347473608e-07, + "loss": 0.0337, + "num_tokens": 10630981.0, + "reward": 0.8492431640625, + "reward_std": 0.015428568236529827, + "rewards//mean": 0.8492431640625, + "rewards//std": 0.020760180428624153, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2924, + "grad_norm": 1.2685590982437134, + "kl": 0.27815070003271103, + "learning_rate": 8.125594102321255e-07, + "loss": 0.0111, + "num_tokens": 10638253.0, + "reward": 0.8553466796875, + "reward_std": 0.011630173772573471, + "rewards//mean": 0.8553466796875, + "rewards//std": 0.01949983462691307, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2926, + "grad_norm": 1.3246593475341797, + "kl": 0.2952705230563879, + "learning_rate": 8.123116598179971e-07, + "loss": 0.0118, + "num_tokens": 10645629.0, + "reward": 0.89923095703125, + "reward_std": 0.015028344467282295, + "rewards//mean": 0.89923095703125, + "rewards//std": 0.02560250833630562, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.2928, + "grad_norm": 1.2848283052444458, + "kl": 0.3126201145350933, + "learning_rate": 8.120637836047697e-07, + "loss": -0.0205, + "num_tokens": 10652828.0, + "reward": 0.8603515625, + "reward_std": 0.016164904460310936, + "rewards//mean": 0.8603515625, + "rewards//std": 0.021202094852924347, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.293, + "grad_norm": 1.3071662187576294, + "kl": 0.3282759450376034, + "learning_rate": 8.118157816922874e-07, + "loss": 0.0131, + "num_tokens": 10660148.0, + "reward": 0.8326416015625, + "reward_std": 0.011772183701395988, + "rewards//mean": 0.8326416015625, + "rewards//std": 0.02313816361129284, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2932, + "grad_norm": 2.557805299758911, + "kl": 0.3711511045694351, + "learning_rate": 8.115676541804455e-07, + "loss": 0.0148, + "num_tokens": 10667404.0, + "reward": 0.84747314453125, + "reward_std": 0.018909133970737457, + "rewards//mean": 0.84747314453125, + "rewards//std": 0.019666319712996483, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.2934, + "grad_norm": 1.317111849784851, + "kl": 0.2640094291418791, + "learning_rate": 8.113194011691899e-07, + "loss": 0.011, + "num_tokens": 10674635.0, + "reward": 0.840087890625, + "reward_std": 0.01456998847424984, + "rewards//mean": 0.840087890625, + "rewards//std": 0.020586274564266205, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2936, + "grad_norm": 1.5302176475524902, + "kl": 0.38959237560629845, + "learning_rate": 8.110710227585167e-07, + "loss": 0.0156, + "num_tokens": 10681875.0, + "reward": 0.8433837890625, + "reward_std": 0.014915725216269493, + "rewards//mean": 0.8433837890625, + "rewards//std": 0.019384603947401047, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2938, + "grad_norm": 1.2340589761734009, + "kl": 0.30220554769039154, + "learning_rate": 8.108225190484726e-07, + "loss": 0.0121, + "num_tokens": 10689203.0, + "reward": 0.87640380859375, + "reward_std": 0.018568597733974457, + "rewards//mean": 0.87640380859375, + "rewards//std": 0.0248164851218462, + "step": 1469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.294, + "grad_norm": 1.3469769954681396, + "kl": 0.3370181489735842, + "learning_rate": 8.105738901391551e-07, + "loss": 0.0135, + "num_tokens": 10696603.0, + "reward": 0.84075927734375, + "reward_std": 0.01564771682024002, + "rewards//mean": 0.84075927734375, + "rewards//std": 0.021050097420811653, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2942, + "grad_norm": 1.610282301902771, + "kl": 0.3537074029445648, + "learning_rate": 8.103251361307118e-07, + "loss": 0.0141, + "num_tokens": 10703931.0, + "reward": 0.845947265625, + "reward_std": 0.016495388001203537, + "rewards//mean": 0.845947265625, + "rewards//std": 0.021675176918506622, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.2944, + "grad_norm": 1.2779359817504883, + "kl": 0.3685655780136585, + "learning_rate": 8.100762571233408e-07, + "loss": -0.0035, + "num_tokens": 10711168.0, + "reward": 0.848876953125, + "reward_std": 0.019098371267318726, + "rewards//mean": 0.848876953125, + "rewards//std": 0.026318056508898735, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.2946, + "grad_norm": 1.4246820211410522, + "kl": 0.3558243252336979, + "learning_rate": 8.098272532172905e-07, + "loss": 0.0161, + "num_tokens": 10718625.0, + "reward": 0.84735107421875, + "reward_std": 0.017014967277646065, + "rewards//mean": 0.84735107421875, + "rewards//std": 0.023201782256364822, + "step": 1473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2948, + "grad_norm": 1.2981644868850708, + "kl": 0.30674841813743114, + "learning_rate": 8.095781245128597e-07, + "loss": 0.0123, + "num_tokens": 10725993.0, + "reward": 0.85784912109375, + "reward_std": 0.01688157394528389, + "rewards//mean": 0.85784912109375, + "rewards//std": 0.031616903841495514, + "step": 1474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.295, + "grad_norm": 1.3512153625488281, + "kl": 0.27563803270459175, + "learning_rate": 8.093288711103971e-07, + "loss": 0.011, + "num_tokens": 10733201.0, + "reward": 0.87506103515625, + "reward_std": 0.019900936633348465, + "rewards//mean": 0.87506103515625, + "rewards//std": 0.03341122344136238, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2952, + "grad_norm": 1.4263083934783936, + "kl": 0.356829471886158, + "learning_rate": 8.090794931103026e-07, + "loss": 0.0143, + "num_tokens": 10740465.0, + "reward": 0.8621826171875, + "reward_std": 0.018121518194675446, + "rewards//mean": 0.8621826171875, + "rewards//std": 0.029474854469299316, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.2954, + "grad_norm": 1.546237826347351, + "kl": 0.2586859595030546, + "learning_rate": 8.08829990613025e-07, + "loss": 0.01, + "num_tokens": 10747706.0, + "reward": 0.84368896484375, + "reward_std": 0.01913711242377758, + "rewards//mean": 0.84368896484375, + "rewards//std": 0.034248873591423035, + "step": 1477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2956, + "grad_norm": 1.7362890243530273, + "kl": 0.36855185218155384, + "learning_rate": 8.085803637190643e-07, + "loss": 0.0147, + "num_tokens": 10754986.0, + "reward": 0.82611083984375, + "reward_std": 0.01457749493420124, + "rewards//mean": 0.82611083984375, + "rewards//std": 0.026593778282403946, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2958, + "grad_norm": 1.629625916481018, + "kl": 0.3187599293887615, + "learning_rate": 8.083306125289697e-07, + "loss": 0.0128, + "num_tokens": 10762162.0, + "reward": 0.87835693359375, + "reward_std": 0.016909673810005188, + "rewards//mean": 0.87835693359375, + "rewards//std": 0.02868695743381977, + "step": 1479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.296, + "grad_norm": 1.4497143030166626, + "kl": 0.357412364333868, + "learning_rate": 8.080807371433414e-07, + "loss": 0.0142, + "num_tokens": 10769421.0, + "reward": 0.82208251953125, + "reward_std": 0.01600024104118347, + "rewards//mean": 0.82208251953125, + "rewards//std": 0.026217134669423103, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2962, + "grad_norm": 1.894439935684204, + "kl": 0.4279318265616894, + "learning_rate": 8.07830737662829e-07, + "loss": 0.0171, + "num_tokens": 10776733.0, + "reward": 0.87713623046875, + "reward_std": 0.022195473313331604, + "rewards//mean": 0.87713623046875, + "rewards//std": 0.03134232386946678, + "step": 1481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2964, + "grad_norm": 1.193255066871643, + "kl": 0.242482241243124, + "learning_rate": 8.075806141881325e-07, + "loss": 0.0097, + "num_tokens": 10784117.0, + "reward": 0.86029052734375, + "reward_std": 0.013963716104626656, + "rewards//mean": 0.86029052734375, + "rewards//std": 0.02645452320575714, + "step": 1482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2966, + "grad_norm": 1.3492789268493652, + "kl": 0.26845999248325825, + "learning_rate": 8.073303668200011e-07, + "loss": 0.0107, + "num_tokens": 10791453.0, + "reward": 0.8519287109375, + "reward_std": 0.016935624182224274, + "rewards//mean": 0.8519287109375, + "rewards//std": 0.021762648597359657, + "step": 1483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2968, + "grad_norm": 1.2691549062728882, + "kl": 0.3261929787695408, + "learning_rate": 8.070799956592349e-07, + "loss": 0.013, + "num_tokens": 10798661.0, + "reward": 0.84942626953125, + "reward_std": 0.014307074248790741, + "rewards//mean": 0.84942626953125, + "rewards//std": 0.022584186866879463, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.297, + "grad_norm": 1.1463243961334229, + "kl": 0.2965557109564543, + "learning_rate": 8.06829500806683e-07, + "loss": 0.0119, + "num_tokens": 10805949.0, + "reward": 0.83465576171875, + "reward_std": 0.016123266890645027, + "rewards//mean": 0.83465576171875, + "rewards//std": 0.026125745847821236, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2972, + "grad_norm": 1.5197086334228516, + "kl": 0.33280492573976517, + "learning_rate": 8.06578882363245e-07, + "loss": 0.0133, + "num_tokens": 10813317.0, + "reward": 0.81634521484375, + "reward_std": 0.015044312924146652, + "rewards//mean": 0.81634521484375, + "rewards//std": 0.02325587160885334, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2974, + "grad_norm": 1.3014553785324097, + "kl": 0.27343977987766266, + "learning_rate": 8.063281404298699e-07, + "loss": 0.0109, + "num_tokens": 10820645.0, + "reward": 0.85809326171875, + "reward_std": 0.01650698482990265, + "rewards//mean": 0.85809326171875, + "rewards//std": 0.028764422982931137, + "step": 1487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2976, + "grad_norm": 1.4420580863952637, + "kl": 0.2728839609771967, + "learning_rate": 8.060772751075562e-07, + "loss": 0.0109, + "num_tokens": 10827941.0, + "reward": 0.83612060546875, + "reward_std": 0.016460038721561432, + "rewards//mean": 0.83612060546875, + "rewards//std": 0.019255615770816803, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2978, + "grad_norm": 1.5871968269348145, + "kl": 0.3016901910305023, + "learning_rate": 8.058262864973528e-07, + "loss": 0.0121, + "num_tokens": 10835253.0, + "reward": 0.857421875, + "reward_std": 0.01456526480615139, + "rewards//mean": 0.857421875, + "rewards//std": 0.025344081223011017, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.298, + "grad_norm": 1.1724040508270264, + "kl": 0.27518524415791035, + "learning_rate": 8.055751747003579e-07, + "loss": 0.011, + "num_tokens": 10842485.0, + "reward": 0.86651611328125, + "reward_std": 0.013870753347873688, + "rewards//mean": 0.86651611328125, + "rewards//std": 0.01814332976937294, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2982, + "grad_norm": 1.5847339630126953, + "kl": 0.34753765910863876, + "learning_rate": 8.053239398177191e-07, + "loss": 0.0139, + "num_tokens": 10849717.0, + "reward": 0.80645751953125, + "reward_std": 0.016520297154784203, + "rewards//mean": 0.80645751953125, + "rewards//std": 0.02850113809108734, + "step": 1491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2984, + "grad_norm": 1.4639928340911865, + "kl": 0.2945970129221678, + "learning_rate": 8.050725819506339e-07, + "loss": 0.0118, + "num_tokens": 10857005.0, + "reward": 0.77252197265625, + "reward_std": 0.011289472691714764, + "rewards//mean": 0.77252197265625, + "rewards//std": 0.014511778950691223, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.2986, + "grad_norm": 1.6170941591262817, + "kl": 0.31073845364153385, + "learning_rate": 8.048211012003489e-07, + "loss": 0.0104, + "num_tokens": 10864333.0, + "reward": 0.789794921875, + "reward_std": 0.016004003584384918, + "rewards//mean": 0.789794921875, + "rewards//std": 0.027942834421992302, + "step": 1493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2988, + "grad_norm": 1.4733760356903076, + "kl": 0.4095168113708496, + "learning_rate": 8.045694976681612e-07, + "loss": 0.0164, + "num_tokens": 10871677.0, + "reward": 0.80291748046875, + "reward_std": 0.012650880962610245, + "rewards//mean": 0.80291748046875, + "rewards//std": 0.021742727607488632, + "step": 1494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.299, + "grad_norm": 1.4045523405075073, + "kl": 0.30944090709090233, + "learning_rate": 8.043177714554159e-07, + "loss": 0.0124, + "num_tokens": 10878973.0, + "reward": 0.88861083984375, + "reward_std": 0.01587771065533161, + "rewards//mean": 0.88861083984375, + "rewards//std": 0.02303086593747139, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2992, + "grad_norm": 1.3711963891983032, + "kl": 0.2968934178352356, + "learning_rate": 8.04065922663509e-07, + "loss": 0.0119, + "num_tokens": 10886317.0, + "reward": 0.84503173828125, + "reward_std": 0.014715560711920261, + "rewards//mean": 0.84503173828125, + "rewards//std": 0.021373379975557327, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2994, + "grad_norm": 1.4290168285369873, + "kl": 0.2856170628219843, + "learning_rate": 8.038139513938845e-07, + "loss": 0.0114, + "num_tokens": 10893669.0, + "reward": 0.82080078125, + "reward_std": 0.011576957069337368, + "rewards//mean": 0.82080078125, + "rewards//std": 0.02110476978123188, + "step": 1497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2996, + "grad_norm": 1.8442049026489258, + "kl": 0.34291842952370644, + "learning_rate": 8.035618577480369e-07, + "loss": 0.0137, + "num_tokens": 10900949.0, + "reward": 0.85137939453125, + "reward_std": 0.016781043261289597, + "rewards//mean": 0.85137939453125, + "rewards//std": 0.02527162991464138, + "step": 1498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.2998, + "grad_norm": 1.4824087619781494, + "kl": 0.31204189360141754, + "learning_rate": 8.033096418275092e-07, + "loss": 0.0125, + "num_tokens": 10908285.0, + "reward": 0.80322265625, + "reward_std": 0.010821114294230938, + "rewards//mean": 0.80322265625, + "rewards//std": 0.021603824570775032, + "step": 1499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3, + "grad_norm": 1.2904417514801025, + "kl": 0.32626503705978394, + "learning_rate": 8.030573037338941e-07, + "loss": 0.0131, + "num_tokens": 10915597.0, + "reward": 0.7811279296875, + "reward_std": 0.01472054049372673, + "rewards//mean": 0.7811279296875, + "rewards//std": 0.022433962672948837, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3002, + "grad_norm": 1.4359349012374878, + "kl": 0.3633577823638916, + "learning_rate": 8.028048435688333e-07, + "loss": 0.0145, + "num_tokens": 10922829.0, + "reward": 0.816650390625, + "reward_std": 0.013700155541300774, + "rewards//mean": 0.816650390625, + "rewards//std": 0.01786513440310955, + "step": 1501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3004, + "grad_norm": 1.2929086685180664, + "kl": 0.34128935635089874, + "learning_rate": 8.025522614340177e-07, + "loss": 0.0137, + "num_tokens": 10930005.0, + "reward": 0.79644775390625, + "reward_std": 0.01208253763616085, + "rewards//mean": 0.79644775390625, + "rewards//std": 0.021711375564336777, + "step": 1502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3006, + "grad_norm": 1.304463505744934, + "kl": 0.31310432218015194, + "learning_rate": 8.022995574311875e-07, + "loss": 0.0125, + "num_tokens": 10937325.0, + "reward": 0.79656982421875, + "reward_std": 0.016265297308564186, + "rewards//mean": 0.79656982421875, + "rewards//std": 0.02330593764781952, + "step": 1503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.3008, + "grad_norm": 1.5412545204162598, + "kl": 0.3197664190083742, + "learning_rate": 8.020467316621316e-07, + "loss": 0.0108, + "num_tokens": 10944550.0, + "reward": 0.85369873046875, + "reward_std": 0.01599094271659851, + "rewards//mean": 0.85369873046875, + "rewards//std": 0.024408714845776558, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.301, + "grad_norm": 1.4726200103759766, + "kl": 0.34774226881563663, + "learning_rate": 8.017937842286882e-07, + "loss": 0.0139, + "num_tokens": 10951870.0, + "reward": 0.85968017578125, + "reward_std": 0.01636398956179619, + "rewards//mean": 0.85968017578125, + "rewards//std": 0.0235745906829834, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3012, + "grad_norm": 1.605487585067749, + "kl": 0.3731590937823057, + "learning_rate": 8.015407152327447e-07, + "loss": 0.0149, + "num_tokens": 10959118.0, + "reward": 0.8077392578125, + "reward_std": 0.01029239222407341, + "rewards//mean": 0.8077392578125, + "rewards//std": 0.017791688442230225, + "step": 1506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3014, + "grad_norm": 2.7941622734069824, + "kl": 0.5371859837323427, + "learning_rate": 8.012875247762372e-07, + "loss": 0.0215, + "num_tokens": 10966358.0, + "reward": 0.880126953125, + "reward_std": 0.01935962587594986, + "rewards//mean": 0.880126953125, + "rewards//std": 0.026981506496667862, + "step": 1507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3016, + "grad_norm": 1.322916865348816, + "kl": 0.3004636149853468, + "learning_rate": 8.010342129611507e-07, + "loss": 0.012, + "num_tokens": 10973630.0, + "reward": 0.8189697265625, + "reward_std": 0.017845112830400467, + "rewards//mean": 0.8189697265625, + "rewards//std": 0.025734975934028625, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.3018, + "grad_norm": 1.6455235481262207, + "kl": 0.30842231027781963, + "learning_rate": 8.007807798895193e-07, + "loss": 0.0009, + "num_tokens": 10981007.0, + "reward": 0.82342529296875, + "reward_std": 0.016311902552843094, + "rewards//mean": 0.82342529296875, + "rewards//std": 0.018178338184952736, + "step": 1509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.302, + "grad_norm": 1.9122346639633179, + "kl": 0.27128574065864086, + "learning_rate": 8.005272256634257e-07, + "loss": 0.0109, + "num_tokens": 10988295.0, + "reward": 0.81402587890625, + "reward_std": 0.014504838734865189, + "rewards//mean": 0.81402587890625, + "rewards//std": 0.025832654908299446, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3022, + "grad_norm": 1.3461147546768188, + "kl": 0.25536189042031765, + "learning_rate": 8.002735503850015e-07, + "loss": 0.0102, + "num_tokens": 10995631.0, + "reward": 0.8013916015625, + "reward_std": 0.01301396731287241, + "rewards//mean": 0.8013916015625, + "rewards//std": 0.02566901221871376, + "step": 1511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3024, + "grad_norm": 1.4530577659606934, + "kl": 0.3871631361544132, + "learning_rate": 8.000197541564271e-07, + "loss": 0.0155, + "num_tokens": 11002935.0, + "reward": 0.842041015625, + "reward_std": 0.02613624930381775, + "rewards//mean": 0.842041015625, + "rewards//std": 0.038712941110134125, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3026, + "grad_norm": 1.5088372230529785, + "kl": 0.37906224466860294, + "learning_rate": 7.997658370799316e-07, + "loss": 0.0152, + "num_tokens": 11010183.0, + "reward": 0.8345947265625, + "reward_std": 0.015880979597568512, + "rewards//mean": 0.8345947265625, + "rewards//std": 0.017624137923121452, + "step": 1513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3028, + "grad_norm": 1.5916668176651, + "kl": 0.28466516733169556, + "learning_rate": 7.995117992577928e-07, + "loss": 0.0114, + "num_tokens": 11017415.0, + "reward": 0.79608154296875, + "reward_std": 0.011368243023753166, + "rewards//mean": 0.79608154296875, + "rewards//std": 0.016310302540659904, + "step": 1514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.303, + "grad_norm": 1.5053290128707886, + "kl": 0.30097674019634724, + "learning_rate": 7.992576407923372e-07, + "loss": 0.012, + "num_tokens": 11024711.0, + "reward": 0.8370361328125, + "reward_std": 0.014247571118175983, + "rewards//mean": 0.8370361328125, + "rewards//std": 0.02334917150437832, + "step": 1515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3032, + "grad_norm": 1.2988007068634033, + "kl": 0.3414370324462652, + "learning_rate": 7.990033617859395e-07, + "loss": 0.0137, + "num_tokens": 11031903.0, + "reward": 0.8582763671875, + "reward_std": 0.014907744713127613, + "rewards//mean": 0.8582763671875, + "rewards//std": 0.02746720239520073, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.3034, + "grad_norm": 1.0765959024429321, + "kl": 0.2695039063692093, + "learning_rate": 7.987489623410235e-07, + "loss": 0.0106, + "num_tokens": 11039157.0, + "reward": 0.84429931640625, + "reward_std": 0.011769606731832027, + "rewards//mean": 0.84429931640625, + "rewards//std": 0.02421882376074791, + "step": 1517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3036, + "grad_norm": 1.4361984729766846, + "kl": 0.3050003759562969, + "learning_rate": 7.984944425600613e-07, + "loss": 0.0122, + "num_tokens": 11046397.0, + "reward": 0.85150146484375, + "reward_std": 0.01887320913374424, + "rewards//mean": 0.85150146484375, + "rewards//std": 0.026707379147410393, + "step": 1518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3038, + "grad_norm": 1.5134775638580322, + "kl": 0.2859603203833103, + "learning_rate": 7.982398025455732e-07, + "loss": 0.0114, + "num_tokens": 11053821.0, + "reward": 0.8433837890625, + "reward_std": 0.011725958436727524, + "rewards//mean": 0.8433837890625, + "rewards//std": 0.018580777570605278, + "step": 1519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.304, + "grad_norm": 1.7668704986572266, + "kl": 0.2887365594506264, + "learning_rate": 7.979850424001282e-07, + "loss": 0.0115, + "num_tokens": 11061101.0, + "reward": 0.84259033203125, + "reward_std": 0.018853208050131798, + "rewards//mean": 0.84259033203125, + "rewards//std": 0.02089926041662693, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3042, + "grad_norm": 1.4853137731552124, + "kl": 0.3278784900903702, + "learning_rate": 7.97730162226344e-07, + "loss": 0.0131, + "num_tokens": 11068469.0, + "reward": 0.8406982421875, + "reward_std": 0.01726514659821987, + "rewards//mean": 0.8406982421875, + "rewards//std": 0.025661934167146683, + "step": 1521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3044, + "grad_norm": 1.4116122722625732, + "kl": 0.3020116835832596, + "learning_rate": 7.974751621268858e-07, + "loss": 0.0121, + "num_tokens": 11075797.0, + "reward": 0.8843994140625, + "reward_std": 0.015084611251950264, + "rewards//mean": 0.8843994140625, + "rewards//std": 0.027601348236203194, + "step": 1522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3046, + "grad_norm": 1.6119470596313477, + "kl": 0.3691091537475586, + "learning_rate": 7.972200422044682e-07, + "loss": 0.0148, + "num_tokens": 11083069.0, + "reward": 0.8638916015625, + "reward_std": 0.012143673375248909, + "rewards//mean": 0.8638916015625, + "rewards//std": 0.01848275400698185, + "step": 1523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.3048, + "grad_norm": 1.4739766120910645, + "kl": 0.37241657450795174, + "learning_rate": 7.969648025618529e-07, + "loss": 0.0143, + "num_tokens": 11090345.0, + "reward": 0.84942626953125, + "reward_std": 0.013504072092473507, + "rewards//mean": 0.84942626953125, + "rewards//std": 0.021360628306865692, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.305, + "grad_norm": 1.5616451501846313, + "kl": 0.3186550848186016, + "learning_rate": 7.967094433018508e-07, + "loss": 0.0127, + "num_tokens": 11097577.0, + "reward": 0.8826904296875, + "reward_std": 0.02238243818283081, + "rewards//mean": 0.8826904296875, + "rewards//std": 0.030662957578897476, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3052, + "grad_norm": 1.5345213413238525, + "kl": 0.30035484209656715, + "learning_rate": 7.964539645273202e-07, + "loss": 0.012, + "num_tokens": 11104785.0, + "reward": 0.77734375, + "reward_std": 0.010961033403873444, + "rewards//mean": 0.77734375, + "rewards//std": 0.013076518662273884, + "step": 1526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3054, + "grad_norm": 1.300873041152954, + "kl": 0.31680581346154213, + "learning_rate": 7.961983663411684e-07, + "loss": 0.0127, + "num_tokens": 11112073.0, + "reward": 0.82196044921875, + "reward_std": 0.01205800473690033, + "rewards//mean": 0.82196044921875, + "rewards//std": 0.019146820530295372, + "step": 1527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3056, + "grad_norm": 1.2947779893875122, + "kl": 0.36263856291770935, + "learning_rate": 7.959426488463499e-07, + "loss": 0.0145, + "num_tokens": 11119361.0, + "reward": 0.82293701171875, + "reward_std": 0.019859064370393753, + "rewards//mean": 0.82293701171875, + "rewards//std": 0.02817743830382824, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3058, + "grad_norm": 1.4106121063232422, + "kl": 0.3214700911194086, + "learning_rate": 7.956868121458677e-07, + "loss": 0.0129, + "num_tokens": 11126577.0, + "reward": 0.80841064453125, + "reward_std": 0.010638820938766003, + "rewards//mean": 0.80841064453125, + "rewards//std": 0.014270903542637825, + "step": 1529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.306, + "grad_norm": 1.3563889265060425, + "kl": 0.34627447091042995, + "learning_rate": 7.954308563427732e-07, + "loss": 0.0139, + "num_tokens": 11133841.0, + "reward": 0.82611083984375, + "reward_std": 0.011916514486074448, + "rewards//mean": 0.82611083984375, + "rewards//std": 0.01711905002593994, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3062, + "grad_norm": 1.562270998954773, + "kl": 0.3032269589602947, + "learning_rate": 7.951747815401649e-07, + "loss": 0.0121, + "num_tokens": 11141193.0, + "reward": 0.86334228515625, + "reward_std": 0.019735772162675858, + "rewards//mean": 0.86334228515625, + "rewards//std": 0.028868958353996277, + "step": 1531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3064, + "grad_norm": 1.5342754125595093, + "kl": 0.3066043592989445, + "learning_rate": 7.949185878411899e-07, + "loss": 0.0123, + "num_tokens": 11148473.0, + "reward": 0.83587646484375, + "reward_std": 0.016549717634916306, + "rewards//mean": 0.83587646484375, + "rewards//std": 0.021093200892210007, + "step": 1532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.3066, + "grad_norm": 1.4996311664581299, + "kl": 0.3297623097896576, + "learning_rate": 7.946622753490432e-07, + "loss": 0.0048, + "num_tokens": 11155733.0, + "reward": 0.84747314453125, + "reward_std": 0.013064393773674965, + "rewards//mean": 0.84747314453125, + "rewards//std": 0.016681043431162834, + "step": 1533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3068, + "grad_norm": 1.8746000528335571, + "kl": 0.5180549696087837, + "learning_rate": 7.94405844166967e-07, + "loss": 0.0207, + "num_tokens": 11163013.0, + "reward": 0.85089111328125, + "reward_std": 0.014771486632525921, + "rewards//mean": 0.85089111328125, + "rewards//std": 0.021082432940602303, + "step": 1534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.307, + "grad_norm": 1.2732497453689575, + "kl": 0.359910786151886, + "learning_rate": 7.941492943982521e-07, + "loss": 0.0144, + "num_tokens": 11170389.0, + "reward": 0.86114501953125, + "reward_std": 0.018286842852830887, + "rewards//mean": 0.86114501953125, + "rewards//std": 0.024802451953291893, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3072, + "grad_norm": 1.3057727813720703, + "kl": 0.35700664296746254, + "learning_rate": 7.938926261462365e-07, + "loss": 0.0143, + "num_tokens": 11177661.0, + "reward": 0.85400390625, + "reward_std": 0.01457191351801157, + "rewards//mean": 0.85400390625, + "rewards//std": 0.015985123813152313, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3074, + "grad_norm": 1.4248286485671997, + "kl": 0.35258909687399864, + "learning_rate": 7.936358395143063e-07, + "loss": 0.0141, + "num_tokens": 11184949.0, + "reward": 0.84649658203125, + "reward_std": 0.015755251049995422, + "rewards//mean": 0.84649658203125, + "rewards//std": 0.01861933246254921, + "step": 1537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.515625, + "epoch": 0.3076, + "grad_norm": 1.4746887683868408, + "kl": 0.3358766492456198, + "learning_rate": 7.93378934605895e-07, + "loss": -0.0185, + "num_tokens": 11192254.0, + "reward": 0.80828857421875, + "reward_std": 0.014723297208547592, + "rewards//mean": 0.80828857421875, + "rewards//std": 0.02031899429857731, + "step": 1538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3078, + "grad_norm": 1.2939313650131226, + "kl": 0.276173347607255, + "learning_rate": 7.93121911524484e-07, + "loss": 0.011, + "num_tokens": 11199486.0, + "reward": 0.86138916015625, + "reward_std": 0.01181066408753395, + "rewards//mean": 0.86138916015625, + "rewards//std": 0.02303086593747139, + "step": 1539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.308, + "grad_norm": 1.3849245309829712, + "kl": 0.35869951359927654, + "learning_rate": 7.928647703736023e-07, + "loss": 0.0143, + "num_tokens": 11206870.0, + "reward": 0.8165283203125, + "reward_std": 0.016535885632038116, + "rewards//mean": 0.8165283203125, + "rewards//std": 0.02940492518246174, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.3082, + "grad_norm": 1.2777093648910522, + "kl": 0.29023104533553123, + "learning_rate": 7.926075112568258e-07, + "loss": 0.0185, + "num_tokens": 11214099.0, + "reward": 0.834228515625, + "reward_std": 0.014018209651112556, + "rewards//mean": 0.834228515625, + "rewards//std": 0.027240585535764694, + "step": 1541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.3084, + "grad_norm": 1.508439302444458, + "kl": 0.35291630774736404, + "learning_rate": 7.923501342777787e-07, + "loss": 0.0061, + "num_tokens": 11221306.0, + "reward": 0.88232421875, + "reward_std": 0.01758796162903309, + "rewards//mean": 0.88232421875, + "rewards//std": 0.02333912067115307, + "step": 1542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.3086, + "grad_norm": 1.6801505088806152, + "kl": 0.3370529729872942, + "learning_rate": 7.920926395401326e-07, + "loss": -0.0012, + "num_tokens": 11228506.0, + "reward": 0.8372802734375, + "reward_std": 0.014366175048053265, + "rewards//mean": 0.8372802734375, + "rewards//std": 0.019749755039811134, + "step": 1543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3088, + "grad_norm": 1.36531662940979, + "kl": 0.37579402327537537, + "learning_rate": 7.918350271476063e-07, + "loss": 0.015, + "num_tokens": 11235810.0, + "reward": 0.81689453125, + "reward_std": 0.02260320633649826, + "rewards//mean": 0.81689453125, + "rewards//std": 0.03471951186656952, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.309, + "grad_norm": 1.7454863786697388, + "kl": 0.35440210066735744, + "learning_rate": 7.915772972039659e-07, + "loss": 0.0142, + "num_tokens": 11243050.0, + "reward": 0.88531494140625, + "reward_std": 0.015229430980980396, + "rewards//mean": 0.88531494140625, + "rewards//std": 0.027102351188659668, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3092, + "grad_norm": 1.5567535161972046, + "kl": 0.33685025945305824, + "learning_rate": 7.913194498130251e-07, + "loss": 0.0135, + "num_tokens": 11250386.0, + "reward": 0.84600830078125, + "reward_std": 0.020101681351661682, + "rewards//mean": 0.84600830078125, + "rewards//std": 0.034088052809238434, + "step": 1546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3094, + "grad_norm": 1.7220451831817627, + "kl": 0.34259142354130745, + "learning_rate": 7.910614850786447e-07, + "loss": 0.0137, + "num_tokens": 11257650.0, + "reward": 0.83563232421875, + "reward_std": 0.016377581283450127, + "rewards//mean": 0.83563232421875, + "rewards//std": 0.03615349158644676, + "step": 1547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3096, + "grad_norm": 1.4022778272628784, + "kl": 0.32925427705049515, + "learning_rate": 7.90803403104733e-07, + "loss": 0.0132, + "num_tokens": 11264954.0, + "reward": 0.88421630859375, + "reward_std": 0.01340584084391594, + "rewards//mean": 0.88421630859375, + "rewards//std": 0.016223758459091187, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3098, + "grad_norm": 1.413926124572754, + "kl": 0.3285839110612869, + "learning_rate": 7.905452039952451e-07, + "loss": 0.0131, + "num_tokens": 11272218.0, + "reward": 0.83294677734375, + "reward_std": 0.017551623284816742, + "rewards//mean": 0.83294677734375, + "rewards//std": 0.03265046328306198, + "step": 1549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.31, + "grad_norm": 1.547845721244812, + "kl": 0.30811190605163574, + "learning_rate": 7.90286887854184e-07, + "loss": 0.0123, + "num_tokens": 11279482.0, + "reward": 0.84716796875, + "reward_std": 0.016538698226213455, + "rewards//mean": 0.84716796875, + "rewards//std": 0.019679497927427292, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3102, + "grad_norm": 1.3506731986999512, + "kl": 0.30751625820994377, + "learning_rate": 7.900284547855991e-07, + "loss": 0.0123, + "num_tokens": 11286818.0, + "reward": 0.84088134765625, + "reward_std": 0.014567839913070202, + "rewards//mean": 0.84088134765625, + "rewards//std": 0.023253267630934715, + "step": 1551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3104, + "grad_norm": 1.448291540145874, + "kl": 0.37098027020692825, + "learning_rate": 7.897699048935873e-07, + "loss": 0.0148, + "num_tokens": 11294122.0, + "reward": 0.853515625, + "reward_std": 0.016871996223926544, + "rewards//mean": 0.853515625, + "rewards//std": 0.020623009651899338, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.3106, + "grad_norm": 1.2662723064422607, + "kl": 0.2950649056583643, + "learning_rate": 7.895112382822924e-07, + "loss": 0.0052, + "num_tokens": 11301342.0, + "reward": 0.88494873046875, + "reward_std": 0.016436900943517685, + "rewards//mean": 0.88494873046875, + "rewards//std": 0.028797032311558723, + "step": 1553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.3108, + "grad_norm": 1.2657948732376099, + "kl": 0.33756398409605026, + "learning_rate": 7.892524550559055e-07, + "loss": -0.0224, + "num_tokens": 11308603.0, + "reward": 0.78631591796875, + "reward_std": 0.015408900566399097, + "rewards//mean": 0.78631591796875, + "rewards//std": 0.020251832902431488, + "step": 1554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.311, + "grad_norm": 1.5757801532745361, + "kl": 0.4080413840711117, + "learning_rate": 7.889935553186641e-07, + "loss": 0.0163, + "num_tokens": 11315931.0, + "reward": 0.88140869140625, + "reward_std": 0.014688185416162014, + "rewards//mean": 0.88140869140625, + "rewards//std": 0.02023613080382347, + "step": 1555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3112, + "grad_norm": 1.676984429359436, + "kl": 0.35173830948770046, + "learning_rate": 7.887345391748532e-07, + "loss": 0.0141, + "num_tokens": 11323283.0, + "reward": 0.806884765625, + "reward_std": 0.012270376086235046, + "rewards//mean": 0.806884765625, + "rewards//std": 0.017837999388575554, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3114, + "grad_norm": 1.3488930463790894, + "kl": 0.3343107271939516, + "learning_rate": 7.884754067288046e-07, + "loss": 0.0134, + "num_tokens": 11330435.0, + "reward": 0.8480224609375, + "reward_std": 0.016975991427898407, + "rewards//mean": 0.8480224609375, + "rewards//std": 0.019926784560084343, + "step": 1557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3116, + "grad_norm": 1.847101092338562, + "kl": 0.34380051866173744, + "learning_rate": 7.882161580848966e-07, + "loss": 0.0138, + "num_tokens": 11337723.0, + "reward": 0.8658447265625, + "reward_std": 0.014799877069890499, + "rewards//mean": 0.8658447265625, + "rewards//std": 0.02472703717648983, + "step": 1558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3118, + "grad_norm": 1.5020004510879517, + "kl": 0.3383469991385937, + "learning_rate": 7.879567933475546e-07, + "loss": 0.0135, + "num_tokens": 11344987.0, + "reward": 0.85693359375, + "reward_std": 0.016765952110290527, + "rewards//mean": 0.85693359375, + "rewards//std": 0.022298941388726234, + "step": 1559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.312, + "grad_norm": 1.225193977355957, + "kl": 0.327405234798789, + "learning_rate": 7.876973126212506e-07, + "loss": 0.0131, + "num_tokens": 11352219.0, + "reward": 0.85858154296875, + "reward_std": 0.01513639185577631, + "rewards//mean": 0.85858154296875, + "rewards//std": 0.020788872614502907, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3122, + "grad_norm": 1.4892756938934326, + "kl": 0.32218126580119133, + "learning_rate": 7.874377160105036e-07, + "loss": 0.0129, + "num_tokens": 11359555.0, + "reward": 0.77001953125, + "reward_std": 0.011042831465601921, + "rewards//mean": 0.77001953125, + "rewards//std": 0.01881112903356552, + "step": 1561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.3124, + "grad_norm": 1.271380066871643, + "kl": 0.32908363454043865, + "learning_rate": 7.871780036198788e-07, + "loss": 0.0155, + "num_tokens": 11366849.0, + "reward": 0.88140869140625, + "reward_std": 0.01544979214668274, + "rewards//mean": 0.88140869140625, + "rewards//std": 0.02469909004867077, + "step": 1562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3126, + "grad_norm": 1.5440170764923096, + "kl": 0.3200910873711109, + "learning_rate": 7.869181755539887e-07, + "loss": 0.0128, + "num_tokens": 11374073.0, + "reward": 0.84136962890625, + "reward_std": 0.016223900020122528, + "rewards//mean": 0.84136962890625, + "rewards//std": 0.020131129771471024, + "step": 1563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.3128, + "grad_norm": 1.3706849813461304, + "kl": 0.3330387808382511, + "learning_rate": 7.866582319174917e-07, + "loss": 0.018, + "num_tokens": 11381410.0, + "reward": 0.84088134765625, + "reward_std": 0.015541348606348038, + "rewards//mean": 0.84088134765625, + "rewards//std": 0.018272193148732185, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.313, + "grad_norm": 1.364280104637146, + "kl": 0.32195328548550606, + "learning_rate": 7.863981728150931e-07, + "loss": 0.0129, + "num_tokens": 11388754.0, + "reward": 0.88482666015625, + "reward_std": 0.017376258969306946, + "rewards//mean": 0.88482666015625, + "rewards//std": 0.0208680909126997, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.3132, + "grad_norm": 1.7892605066299438, + "kl": 0.41841744631528854, + "learning_rate": 7.861379983515448e-07, + "loss": 0.0039, + "num_tokens": 11396204.0, + "reward": 0.85638427734375, + "reward_std": 0.015307621099054813, + "rewards//mean": 0.85638427734375, + "rewards//std": 0.01929723657667637, + "step": 1566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3134, + "grad_norm": 1.453551173210144, + "kl": 0.33656006306409836, + "learning_rate": 7.858777086316451e-07, + "loss": 0.0135, + "num_tokens": 11403636.0, + "reward": 0.87890625, + "reward_std": 0.018505392596125603, + "rewards//mean": 0.87890625, + "rewards//std": 0.026566512882709503, + "step": 1567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3136, + "grad_norm": 1.164448618888855, + "kl": 0.26974714174866676, + "learning_rate": 7.856173037602382e-07, + "loss": 0.0108, + "num_tokens": 11410948.0, + "reward": 0.87896728515625, + "reward_std": 0.01203098613768816, + "rewards//mean": 0.87896728515625, + "rewards//std": 0.018155839294195175, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3138, + "grad_norm": 1.3344517946243286, + "kl": 0.34795097075402737, + "learning_rate": 7.853567838422159e-07, + "loss": 0.0139, + "num_tokens": 11418252.0, + "reward": 0.84625244140625, + "reward_std": 0.012521596625447273, + "rewards//mean": 0.84625244140625, + "rewards//std": 0.016088837757706642, + "step": 1569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.314, + "grad_norm": 1.39815092086792, + "kl": 0.3805798329412937, + "learning_rate": 7.850961489825149e-07, + "loss": 0.0152, + "num_tokens": 11425596.0, + "reward": 0.85137939453125, + "reward_std": 0.01699395664036274, + "rewards//mean": 0.85137939453125, + "rewards//std": 0.025821518152952194, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3142, + "grad_norm": 1.414891242980957, + "kl": 0.3306307978928089, + "learning_rate": 7.848353992861194e-07, + "loss": 0.0132, + "num_tokens": 11432940.0, + "reward": 0.81768798828125, + "reward_std": 0.01301993615925312, + "rewards//mean": 0.81768798828125, + "rewards//std": 0.018016062676906586, + "step": 1571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3144, + "grad_norm": 1.3804469108581543, + "kl": 0.3120267931371927, + "learning_rate": 7.84574534858059e-07, + "loss": 0.0125, + "num_tokens": 11440132.0, + "reward": 0.8902587890625, + "reward_std": 0.01763727329671383, + "rewards//mean": 0.8902587890625, + "rewards//std": 0.026399319991469383, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3146, + "grad_norm": 1.5417300462722778, + "kl": 0.31827268470078707, + "learning_rate": 7.8431355580341e-07, + "loss": 0.0127, + "num_tokens": 11447484.0, + "reward": 0.83477783203125, + "reward_std": 0.019711285829544067, + "rewards//mean": 0.83477783203125, + "rewards//std": 0.03202837333083153, + "step": 1573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3148, + "grad_norm": 1.3997340202331543, + "kl": 0.3056262470781803, + "learning_rate": 7.840524622272948e-07, + "loss": 0.0122, + "num_tokens": 11454788.0, + "reward": 0.84100341796875, + "reward_std": 0.016291622072458267, + "rewards//mean": 0.84100341796875, + "rewards//std": 0.02034728415310383, + "step": 1574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.315, + "grad_norm": 1.4513640403747559, + "kl": 0.3600594289600849, + "learning_rate": 7.837912542348817e-07, + "loss": 0.0144, + "num_tokens": 11462068.0, + "reward": 0.86572265625, + "reward_std": 0.015590596944093704, + "rewards//mean": 0.86572265625, + "rewards//std": 0.027317164465785027, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3152, + "grad_norm": 1.5303791761398315, + "kl": 0.30835063196718693, + "learning_rate": 7.835299319313853e-07, + "loss": 0.0123, + "num_tokens": 11469260.0, + "reward": 0.861328125, + "reward_std": 0.016042914241552353, + "rewards//mean": 0.861328125, + "rewards//std": 0.026171552017331123, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3154, + "grad_norm": 1.6250885725021362, + "kl": 0.3548947274684906, + "learning_rate": 7.832684954220663e-07, + "loss": 0.0142, + "num_tokens": 11476716.0, + "reward": 0.841064453125, + "reward_std": 0.020513657480478287, + "rewards//mean": 0.841064453125, + "rewards//std": 0.024625511839985847, + "step": 1577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3156, + "grad_norm": 1.4778770208358765, + "kl": 0.33477216958999634, + "learning_rate": 7.830069448122312e-07, + "loss": 0.0134, + "num_tokens": 11483964.0, + "reward": 0.8594970703125, + "reward_std": 0.019713925197720528, + "rewards//mean": 0.8594970703125, + "rewards//std": 0.030193326994776726, + "step": 1578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3158, + "grad_norm": 1.217482328414917, + "kl": 0.2694059554487467, + "learning_rate": 7.827452802072327e-07, + "loss": 0.0108, + "num_tokens": 11491340.0, + "reward": 0.83837890625, + "reward_std": 0.009791996330022812, + "rewards//mean": 0.83837890625, + "rewards//std": 0.011801132932305336, + "step": 1579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.316, + "grad_norm": 1.4441535472869873, + "kl": 0.3461875580251217, + "learning_rate": 7.82483501712469e-07, + "loss": 0.0116, + "num_tokens": 11498591.0, + "reward": 0.84576416015625, + "reward_std": 0.017470575869083405, + "rewards//mean": 0.84576416015625, + "rewards//std": 0.028662674129009247, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3162, + "grad_norm": 1.609462857246399, + "kl": 0.33203436620533466, + "learning_rate": 7.822216094333847e-07, + "loss": 0.0133, + "num_tokens": 11505831.0, + "reward": 0.851318359375, + "reward_std": 0.018439367413520813, + "rewards//mean": 0.851318359375, + "rewards//std": 0.022237760946154594, + "step": 1581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3164, + "grad_norm": 1.5849857330322266, + "kl": 0.33608773723244667, + "learning_rate": 7.819596034754696e-07, + "loss": 0.0134, + "num_tokens": 11513071.0, + "reward": 0.8612060546875, + "reward_std": 0.019021708518266678, + "rewards//mean": 0.8612060546875, + "rewards//std": 0.02076309733092785, + "step": 1582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3166, + "grad_norm": 1.2563574314117432, + "kl": 0.33111658319830894, + "learning_rate": 7.816974839442603e-07, + "loss": 0.0132, + "num_tokens": 11520383.0, + "reward": 0.84564208984375, + "reward_std": 0.014512693509459496, + "rewards//mean": 0.84564208984375, + "rewards//std": 0.02341674268245697, + "step": 1583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.3168, + "grad_norm": 1.4368191957473755, + "kl": 0.33049737103283405, + "learning_rate": 7.814352509453379e-07, + "loss": 0.019, + "num_tokens": 11527601.0, + "reward": 0.810791015625, + "reward_std": 0.012638639658689499, + "rewards//mean": 0.810791015625, + "rewards//std": 0.018147604539990425, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.317, + "grad_norm": 2.085989475250244, + "kl": 0.4226964861154556, + "learning_rate": 7.811729045843301e-07, + "loss": 0.0169, + "num_tokens": 11534849.0, + "reward": 0.791748046875, + "reward_std": 0.011744322255253792, + "rewards//mean": 0.791748046875, + "rewards//std": 0.018712695688009262, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3172, + "grad_norm": 2.1375393867492676, + "kl": 0.33367676474153996, + "learning_rate": 7.8091044496691e-07, + "loss": 0.0133, + "num_tokens": 11542129.0, + "reward": 0.86163330078125, + "reward_std": 0.014016760513186455, + "rewards//mean": 0.86163330078125, + "rewards//std": 0.017215164378285408, + "step": 1586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3174, + "grad_norm": 1.8265730142593384, + "kl": 0.26303882896900177, + "learning_rate": 7.806478721987963e-07, + "loss": 0.0105, + "num_tokens": 11549521.0, + "reward": 0.86749267578125, + "reward_std": 0.015359662473201752, + "rewards//mean": 0.86749267578125, + "rewards//std": 0.026814289391040802, + "step": 1587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3176, + "grad_norm": 1.532178521156311, + "kl": 0.380752544850111, + "learning_rate": 7.803851863857532e-07, + "loss": 0.0152, + "num_tokens": 11556905.0, + "reward": 0.83538818359375, + "reward_std": 0.016468051820993423, + "rewards//mean": 0.83538818359375, + "rewards//std": 0.020420804619789124, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3178, + "grad_norm": 1.2931063175201416, + "kl": 0.3116430304944515, + "learning_rate": 7.801223876335907e-07, + "loss": 0.0125, + "num_tokens": 11564249.0, + "reward": 0.84930419921875, + "reward_std": 0.015276344493031502, + "rewards//mean": 0.84930419921875, + "rewards//std": 0.0247719157487154, + "step": 1589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.318, + "grad_norm": 1.6417688131332397, + "kl": 0.3584008291363716, + "learning_rate": 7.798594760481637e-07, + "loss": 0.0143, + "num_tokens": 11571481.0, + "reward": 0.82513427734375, + "reward_std": 0.01652529463171959, + "rewards//mean": 0.82513427734375, + "rewards//std": 0.024077149108052254, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3182, + "grad_norm": 1.4458110332489014, + "kl": 0.3092430457472801, + "learning_rate": 7.795964517353733e-07, + "loss": 0.0124, + "num_tokens": 11578841.0, + "reward": 0.8355712890625, + "reward_std": 0.015428265556693077, + "rewards//mean": 0.8355712890625, + "rewards//std": 0.021673431620001793, + "step": 1591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3184, + "grad_norm": 1.25544273853302, + "kl": 0.33686067163944244, + "learning_rate": 7.793333148011657e-07, + "loss": 0.0135, + "num_tokens": 11586081.0, + "reward": 0.81964111328125, + "reward_std": 0.010494636371731758, + "rewards//mean": 0.81964111328125, + "rewards//std": 0.01640469580888748, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3186, + "grad_norm": 1.3988193273544312, + "kl": 0.35821497812867165, + "learning_rate": 7.790700653515323e-07, + "loss": 0.0143, + "num_tokens": 11593273.0, + "reward": 0.841552734375, + "reward_std": 0.018824685364961624, + "rewards//mean": 0.841552734375, + "rewards//std": 0.026963546872138977, + "step": 1593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3188, + "grad_norm": 1.3386166095733643, + "kl": 0.30554920621216297, + "learning_rate": 7.788067034925099e-07, + "loss": 0.0122, + "num_tokens": 11600497.0, + "reward": 0.883056640625, + "reward_std": 0.01950962282717228, + "rewards//mean": 0.883056640625, + "rewards//std": 0.022987527772784233, + "step": 1594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.319, + "grad_norm": 1.326844573020935, + "kl": 0.33497556298971176, + "learning_rate": 7.785432293301806e-07, + "loss": 0.0134, + "num_tokens": 11607753.0, + "reward": 0.85028076171875, + "reward_std": 0.014966240152716637, + "rewards//mean": 0.85028076171875, + "rewards//std": 0.031211648136377335, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3192, + "grad_norm": 1.4028054475784302, + "kl": 0.3034185338765383, + "learning_rate": 7.78279642970672e-07, + "loss": 0.0121, + "num_tokens": 11614969.0, + "reward": 0.84906005859375, + "reward_std": 0.011753956787288189, + "rewards//mean": 0.84906005859375, + "rewards//std": 0.02194715104997158, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.3194, + "grad_norm": 1.392458438873291, + "kl": 0.3160701673477888, + "learning_rate": 7.780159445201562e-07, + "loss": 0.009, + "num_tokens": 11622278.0, + "reward": 0.78955078125, + "reward_std": 0.010731037706136703, + "rewards//mean": 0.78955078125, + "rewards//std": 0.01581757515668869, + "step": 1597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3196, + "grad_norm": 1.6120555400848389, + "kl": 0.30513573437929153, + "learning_rate": 7.777521340848514e-07, + "loss": 0.0122, + "num_tokens": 11629574.0, + "reward": 0.8896484375, + "reward_std": 0.016332227736711502, + "rewards//mean": 0.8896484375, + "rewards//std": 0.025515519082546234, + "step": 1598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3198, + "grad_norm": 1.3329142332077026, + "kl": 0.2814145404845476, + "learning_rate": 7.774882117710202e-07, + "loss": 0.0113, + "num_tokens": 11636806.0, + "reward": 0.859375, + "reward_std": 0.01236917357891798, + "rewards//mean": 0.859375, + "rewards//std": 0.021202094852924347, + "step": 1599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.32, + "grad_norm": 1.5694488286972046, + "kl": 0.3383716493844986, + "learning_rate": 7.772241776849704e-07, + "loss": 0.0135, + "num_tokens": 11644006.0, + "reward": 0.83740234375, + "reward_std": 0.01750022917985916, + "rewards//mean": 0.83740234375, + "rewards//std": 0.02221187949180603, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3202, + "grad_norm": 1.9435886144638062, + "kl": 0.31080831214785576, + "learning_rate": 7.769600319330552e-07, + "loss": 0.0124, + "num_tokens": 11651318.0, + "reward": 0.7938232421875, + "reward_std": 0.014236312359571457, + "rewards//mean": 0.7938232421875, + "rewards//std": 0.018990132957696915, + "step": 1601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3204, + "grad_norm": 1.4323463439941406, + "kl": 0.37908921763300896, + "learning_rate": 7.76695774621672e-07, + "loss": 0.0152, + "num_tokens": 11658622.0, + "reward": 0.88336181640625, + "reward_std": 0.018829654902219772, + "rewards//mean": 0.88336181640625, + "rewards//std": 0.027124682441353798, + "step": 1602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3206, + "grad_norm": 1.3979514837265015, + "kl": 0.3380451127886772, + "learning_rate": 7.764314058572639e-07, + "loss": 0.0135, + "num_tokens": 11665910.0, + "reward": 0.83502197265625, + "reward_std": 0.013833221048116684, + "rewards//mean": 0.83502197265625, + "rewards//std": 0.017242401838302612, + "step": 1603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3208, + "grad_norm": 1.4940869808197021, + "kl": 0.41140924021601677, + "learning_rate": 7.761669257463187e-07, + "loss": 0.0165, + "num_tokens": 11673358.0, + "reward": 0.83160400390625, + "reward_std": 0.013190140947699547, + "rewards//mean": 0.83160400390625, + "rewards//std": 0.015657784417271614, + "step": 1604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.321, + "grad_norm": 1.6469711065292358, + "kl": 0.3353906013071537, + "learning_rate": 7.759023343953688e-07, + "loss": 0.0134, + "num_tokens": 11680534.0, + "reward": 0.84918212890625, + "reward_std": 0.013246983289718628, + "rewards//mean": 0.84918212890625, + "rewards//std": 0.019594606012105942, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3212, + "grad_norm": 1.3263895511627197, + "kl": 0.3695889674127102, + "learning_rate": 7.756376319109916e-07, + "loss": 0.0148, + "num_tokens": 11687846.0, + "reward": 0.8170166015625, + "reward_std": 0.014184250496327877, + "rewards//mean": 0.8170166015625, + "rewards//std": 0.018311606720089912, + "step": 1606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3214, + "grad_norm": 1.4943768978118896, + "kl": 0.31411219388246536, + "learning_rate": 7.753728183998092e-07, + "loss": 0.0126, + "num_tokens": 11695134.0, + "reward": 0.828125, + "reward_std": 0.01062968373298645, + "rewards//mean": 0.828125, + "rewards//std": 0.013333304785192013, + "step": 1607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3216, + "grad_norm": 1.3691518306732178, + "kl": 0.3139073848724365, + "learning_rate": 7.751078939684885e-07, + "loss": 0.0126, + "num_tokens": 11702358.0, + "reward": 0.837158203125, + "reward_std": 0.016839593648910522, + "rewards//mean": 0.837158203125, + "rewards//std": 0.02269059419631958, + "step": 1608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.3218, + "grad_norm": 1.5188887119293213, + "kl": 0.30921892635524273, + "learning_rate": 7.748428587237411e-07, + "loss": 0.0129, + "num_tokens": 11709593.0, + "reward": 0.872802734375, + "reward_std": 0.01543283462524414, + "rewards//mean": 0.872802734375, + "rewards//std": 0.02075033448636532, + "step": 1609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.322, + "grad_norm": 1.1548656225204468, + "kl": 0.2935113776475191, + "learning_rate": 7.74577712772323e-07, + "loss": 0.0117, + "num_tokens": 11716857.0, + "reward": 0.81097412109375, + "reward_std": 0.013166401535272598, + "rewards//mean": 0.81097412109375, + "rewards//std": 0.018037056550383568, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3222, + "grad_norm": 1.3869324922561646, + "kl": 0.34002725034952164, + "learning_rate": 7.743124562210351e-07, + "loss": 0.0136, + "num_tokens": 11724121.0, + "reward": 0.8365478515625, + "reward_std": 0.015340914018452168, + "rewards//mean": 0.8365478515625, + "rewards//std": 0.024282259866595268, + "step": 1611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3224, + "grad_norm": 3.3432071208953857, + "kl": 0.5111357662826777, + "learning_rate": 7.740470891767224e-07, + "loss": 0.0204, + "num_tokens": 11731361.0, + "reward": 0.8897705078125, + "reward_std": 0.02055625058710575, + "rewards//mean": 0.8897705078125, + "rewards//std": 0.03520422801375389, + "step": 1612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.3226, + "grad_norm": 1.5326749086380005, + "kl": 0.3224012181162834, + "learning_rate": 7.737816117462751e-07, + "loss": 0.0121, + "num_tokens": 11738536.0, + "reward": 0.81884765625, + "reward_std": 0.017077423632144928, + "rewards//mean": 0.81884765625, + "rewards//std": 0.0254347063601017, + "step": 1613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3228, + "grad_norm": 1.547853946685791, + "kl": 0.33038954995572567, + "learning_rate": 7.735160240366274e-07, + "loss": 0.0132, + "num_tokens": 11745928.0, + "reward": 0.8345947265625, + "reward_std": 0.017406653612852097, + "rewards//mean": 0.8345947265625, + "rewards//std": 0.027190247550606728, + "step": 1614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.323, + "grad_norm": 1.1813148260116577, + "kl": 0.31022995710372925, + "learning_rate": 7.732503261547578e-07, + "loss": 0.0124, + "num_tokens": 11753248.0, + "reward": 0.808349609375, + "reward_std": 0.013380684889853, + "rewards//mean": 0.808349609375, + "rewards//std": 0.01894424296915531, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3232, + "grad_norm": 1.391114354133606, + "kl": 0.3125507738441229, + "learning_rate": 7.729845182076895e-07, + "loss": 0.0125, + "num_tokens": 11760520.0, + "reward": 0.85162353515625, + "reward_std": 0.019541125744581223, + "rewards//mean": 0.85162353515625, + "rewards//std": 0.028378183022141457, + "step": 1616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3234, + "grad_norm": 1.6723219156265259, + "kl": 0.2620921563357115, + "learning_rate": 7.7271860030249e-07, + "loss": 0.0105, + "num_tokens": 11767800.0, + "reward": 0.8690185546875, + "reward_std": 0.014536735601723194, + "rewards//mean": 0.8690185546875, + "rewards//std": 0.023591680452227592, + "step": 1617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3236, + "grad_norm": 1.2632496356964111, + "kl": 0.32202494889497757, + "learning_rate": 7.72452572546271e-07, + "loss": 0.0129, + "num_tokens": 11775160.0, + "reward": 0.8590087890625, + "reward_std": 0.0111900819465518, + "rewards//mean": 0.8590087890625, + "rewards//std": 0.024327103048563004, + "step": 1618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3238, + "grad_norm": 1.349116563796997, + "kl": 0.33347225934267044, + "learning_rate": 7.721864350461882e-07, + "loss": 0.0133, + "num_tokens": 11782464.0, + "reward": 0.83697509765625, + "reward_std": 0.01701373979449272, + "rewards//mean": 0.83697509765625, + "rewards//std": 0.024051988497376442, + "step": 1619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.324, + "grad_norm": 1.2362542152404785, + "kl": 0.377959493547678, + "learning_rate": 7.71920187909442e-07, + "loss": 0.0151, + "num_tokens": 11789712.0, + "reward": 0.85455322265625, + "reward_std": 0.01698944717645645, + "rewards//mean": 0.85455322265625, + "rewards//std": 0.032613351941108704, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3242, + "grad_norm": 1.6130287647247314, + "kl": 0.3677781857550144, + "learning_rate": 7.716538312432765e-07, + "loss": 0.0147, + "num_tokens": 11796864.0, + "reward": 0.836669921875, + "reward_std": 0.019467461854219437, + "rewards//mean": 0.836669921875, + "rewards//std": 0.02261575125157833, + "step": 1621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3244, + "grad_norm": 1.6362024545669556, + "kl": 0.35362686589360237, + "learning_rate": 7.713873651549804e-07, + "loss": 0.0141, + "num_tokens": 11804104.0, + "reward": 0.7730712890625, + "reward_std": 0.017134791240096092, + "rewards//mean": 0.7730712890625, + "rewards//std": 0.025178389623761177, + "step": 1622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3246, + "grad_norm": 1.1918649673461914, + "kl": 0.3232458829879761, + "learning_rate": 7.71120789751886e-07, + "loss": 0.0129, + "num_tokens": 11811304.0, + "reward": 0.84686279296875, + "reward_std": 0.01338629424571991, + "rewards//mean": 0.84686279296875, + "rewards//std": 0.0167643241584301, + "step": 1623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.3248, + "grad_norm": 1.4776630401611328, + "kl": 0.31281563080847263, + "learning_rate": 7.7085410514137e-07, + "loss": -0.0003, + "num_tokens": 11818675.0, + "reward": 0.85638427734375, + "reward_std": 0.014902222901582718, + "rewards//mean": 0.85638427734375, + "rewards//std": 0.030199531465768814, + "step": 1624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.325, + "grad_norm": 1.6832634210586548, + "kl": 0.3628905601799488, + "learning_rate": 7.705873114308527e-07, + "loss": 0.0145, + "num_tokens": 11826003.0, + "reward": 0.77593994140625, + "reward_std": 0.015093226917088032, + "rewards//mean": 0.77593994140625, + "rewards//std": 0.027333714067935944, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3252, + "grad_norm": 1.3298511505126953, + "kl": 0.2962909825146198, + "learning_rate": 7.703204087277988e-07, + "loss": 0.0119, + "num_tokens": 11833331.0, + "reward": 0.8365478515625, + "reward_std": 0.0134806577116251, + "rewards//mean": 0.8365478515625, + "rewards//std": 0.018768833950161934, + "step": 1626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3254, + "grad_norm": 1.593498706817627, + "kl": 0.37222232669591904, + "learning_rate": 7.700533971397165e-07, + "loss": 0.0149, + "num_tokens": 11840619.0, + "reward": 0.8492431640625, + "reward_std": 0.009882592596113682, + "rewards//mean": 0.8492431640625, + "rewards//std": 0.018371030688285828, + "step": 1627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3256, + "grad_norm": 1.2624154090881348, + "kl": 0.3200800083577633, + "learning_rate": 7.697862767741583e-07, + "loss": 0.0128, + "num_tokens": 11847883.0, + "reward": 0.88861083984375, + "reward_std": 0.017584307119250298, + "rewards//mean": 0.88861083984375, + "rewards//std": 0.027072172611951828, + "step": 1628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.3258, + "grad_norm": 1.9642666578292847, + "kl": 0.2832256630063057, + "learning_rate": 7.695190477387199e-07, + "loss": 0.0109, + "num_tokens": 11855143.0, + "reward": 0.85394287109375, + "reward_std": 0.01733282208442688, + "rewards//mean": 0.85394287109375, + "rewards//std": 0.023005876690149307, + "step": 1629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.326, + "grad_norm": 1.5409107208251953, + "kl": 0.3311961740255356, + "learning_rate": 7.692517101410414e-07, + "loss": 0.0132, + "num_tokens": 11862519.0, + "reward": 0.857421875, + "reward_std": 0.01848490536212921, + "rewards//mean": 0.857421875, + "rewards//std": 0.027018509805202484, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3262, + "grad_norm": 1.5858988761901855, + "kl": 0.319524422287941, + "learning_rate": 7.689842640888063e-07, + "loss": 0.0128, + "num_tokens": 11869743.0, + "reward": 0.82061767578125, + "reward_std": 0.01451466977596283, + "rewards//mean": 0.82061767578125, + "rewards//std": 0.02257748320698738, + "step": 1631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3264, + "grad_norm": 1.5043359994888306, + "kl": 0.3640426695346832, + "learning_rate": 7.687167096897418e-07, + "loss": 0.0146, + "num_tokens": 11877127.0, + "reward": 0.765625, + "reward_std": 0.014646805822849274, + "rewards//mean": 0.765625, + "rewards//std": 0.019611690193414688, + "step": 1632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3266, + "grad_norm": 1.5628849267959595, + "kl": 0.2905687130987644, + "learning_rate": 7.684490470516185e-07, + "loss": 0.0116, + "num_tokens": 11884343.0, + "reward": 0.815185546875, + "reward_std": 0.014068251475691795, + "rewards//mean": 0.815185546875, + "rewards//std": 0.02055094949901104, + "step": 1633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.3268, + "grad_norm": 1.232248067855835, + "kl": 0.35837874561548233, + "learning_rate": 7.681812762822515e-07, + "loss": -0.0088, + "num_tokens": 11891621.0, + "reward": 0.85772705078125, + "reward_std": 0.015128266997635365, + "rewards//mean": 0.85772705078125, + "rewards//std": 0.028753895312547684, + "step": 1634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.327, + "grad_norm": 1.401201605796814, + "kl": 0.3298170380294323, + "learning_rate": 7.679133974894982e-07, + "loss": 0.0132, + "num_tokens": 11898861.0, + "reward": 0.87799072265625, + "reward_std": 0.012143636122345924, + "rewards//mean": 0.87799072265625, + "rewards//std": 0.02056116610765457, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3272, + "grad_norm": 1.334531307220459, + "kl": 0.2985645458102226, + "learning_rate": 7.676454107812607e-07, + "loss": 0.0119, + "num_tokens": 11906117.0, + "reward": 0.854736328125, + "reward_std": 0.013043894432485104, + "rewards//mean": 0.854736328125, + "rewards//std": 0.024694262072443962, + "step": 1636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3274, + "grad_norm": 1.3323217630386353, + "kl": 0.3254656679928303, + "learning_rate": 7.673773162654836e-07, + "loss": 0.013, + "num_tokens": 11913469.0, + "reward": 0.84039306640625, + "reward_std": 0.016873255372047424, + "rewards//mean": 0.84039306640625, + "rewards//std": 0.02245444990694523, + "step": 1637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3276, + "grad_norm": 1.6107046604156494, + "kl": 0.40503247641026974, + "learning_rate": 7.671091140501555e-07, + "loss": 0.0162, + "num_tokens": 11920725.0, + "reward": 0.8033447265625, + "reward_std": 0.013821935281157494, + "rewards//mean": 0.8033447265625, + "rewards//std": 0.018085353076457977, + "step": 1638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3278, + "grad_norm": 1.4607503414154053, + "kl": 0.35896341875195503, + "learning_rate": 7.668408042433081e-07, + "loss": 0.0144, + "num_tokens": 11927965.0, + "reward": 0.85430908203125, + "reward_std": 0.013269290328025818, + "rewards//mean": 0.85430908203125, + "rewards//std": 0.01918315514922142, + "step": 1639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.328, + "grad_norm": 1.2713176012039185, + "kl": 0.31054247356951237, + "learning_rate": 7.665723869530169e-07, + "loss": 0.0128, + "num_tokens": 11935164.0, + "reward": 0.86883544921875, + "reward_std": 0.015451299026608467, + "rewards//mean": 0.86883544921875, + "rewards//std": 0.03316611796617508, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3282, + "grad_norm": 1.359779715538025, + "kl": 0.2817864455282688, + "learning_rate": 7.663038622873999e-07, + "loss": 0.0113, + "num_tokens": 11942420.0, + "reward": 0.8001708984375, + "reward_std": 0.011332456022500992, + "rewards//mean": 0.8001708984375, + "rewards//std": 0.01688006892800331, + "step": 1641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3284, + "grad_norm": 1.4007446765899658, + "kl": 0.33225501514971256, + "learning_rate": 7.660352303546192e-07, + "loss": 0.0133, + "num_tokens": 11949716.0, + "reward": 0.8421630859375, + "reward_std": 0.0162956602871418, + "rewards//mean": 0.8421630859375, + "rewards//std": 0.03204379230737686, + "step": 1642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3286, + "grad_norm": 1.3927632570266724, + "kl": 0.32813186198472977, + "learning_rate": 7.657664912628794e-07, + "loss": 0.0131, + "num_tokens": 11957132.0, + "reward": 0.84765625, + "reward_std": 0.012039920315146446, + "rewards//mean": 0.84765625, + "rewards//std": 0.022282643243670464, + "step": 1643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3288, + "grad_norm": 1.2355237007141113, + "kl": 0.30804489739239216, + "learning_rate": 7.654976451204287e-07, + "loss": 0.0123, + "num_tokens": 11964348.0, + "reward": 0.859375, + "reward_std": 0.016398195177316666, + "rewards//mean": 0.859375, + "rewards//std": 0.027917902916669846, + "step": 1644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.329, + "grad_norm": 1.3010647296905518, + "kl": 0.2880748510360718, + "learning_rate": 7.652286920355583e-07, + "loss": 0.0115, + "num_tokens": 11971716.0, + "reward": 0.86572265625, + "reward_std": 0.01624777726829052, + "rewards//mean": 0.86572265625, + "rewards//std": 0.01945670321583748, + "step": 1645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3292, + "grad_norm": 1.4350637197494507, + "kl": 0.30987974628806114, + "learning_rate": 7.649596321166024e-07, + "loss": 0.0124, + "num_tokens": 11979124.0, + "reward": 0.8336181640625, + "reward_std": 0.01165890321135521, + "rewards//mean": 0.8336181640625, + "rewards//std": 0.012884682044386864, + "step": 1646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3294, + "grad_norm": 1.3099453449249268, + "kl": 0.28885284066200256, + "learning_rate": 7.646904654719385e-07, + "loss": 0.0116, + "num_tokens": 11986436.0, + "reward": 0.8253173828125, + "reward_std": 0.012183002196252346, + "rewards//mean": 0.8253173828125, + "rewards//std": 0.017280658707022667, + "step": 1647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.3296, + "grad_norm": 1.2793776988983154, + "kl": 0.3289286382496357, + "learning_rate": 7.644211922099867e-07, + "loss": 0.0132, + "num_tokens": 11993689.0, + "reward": 0.8494873046875, + "reward_std": 0.012361256405711174, + "rewards//mean": 0.8494873046875, + "rewards//std": 0.016645273193717003, + "step": 1648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3298, + "grad_norm": 1.481427788734436, + "kl": 0.3059687167406082, + "learning_rate": 7.641518124392103e-07, + "loss": 0.0122, + "num_tokens": 12000969.0, + "reward": 0.85089111328125, + "reward_std": 0.012678323313593864, + "rewards//mean": 0.85089111328125, + "rewards//std": 0.020886218175292015, + "step": 1649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.33, + "grad_norm": 1.3420878648757935, + "kl": 0.2927614487707615, + "learning_rate": 7.638823262681154e-07, + "loss": 0.0119, + "num_tokens": 12008199.0, + "reward": 0.85528564453125, + "reward_std": 0.01672820933163166, + "rewards//mean": 0.85528564453125, + "rewards//std": 0.03320717439055443, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.3302, + "grad_norm": 1.2022922039031982, + "kl": 0.28466419130563736, + "learning_rate": 7.636127338052511e-07, + "loss": 0.0035, + "num_tokens": 12015444.0, + "reward": 0.84515380859375, + "reward_std": 0.01169316004961729, + "rewards//mean": 0.84515380859375, + "rewards//std": 0.01930508017539978, + "step": 1651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3304, + "grad_norm": 1.1693214178085327, + "kl": 0.2826785333454609, + "learning_rate": 7.633430351592093e-07, + "loss": 0.0113, + "num_tokens": 12022700.0, + "reward": 0.8505859375, + "reward_std": 0.016573067754507065, + "rewards//mean": 0.8505859375, + "rewards//std": 0.02834836021065712, + "step": 1652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3306, + "grad_norm": 1.2696653604507446, + "kl": 0.24333197437226772, + "learning_rate": 7.630732304386243e-07, + "loss": 0.0097, + "num_tokens": 12029964.0, + "reward": 0.875, + "reward_std": 0.018585680052638054, + "rewards//mean": 0.875, + "rewards//std": 0.03181075677275658, + "step": 1653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.3308, + "grad_norm": 1.2064720392227173, + "kl": 0.3260943926870823, + "learning_rate": 7.628033197521735e-07, + "loss": 0.0044, + "num_tokens": 12037250.0, + "reward": 0.77142333984375, + "reward_std": 0.014374888502061367, + "rewards//mean": 0.77142333984375, + "rewards//std": 0.020291410386562347, + "step": 1654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.331, + "grad_norm": 1.4796620607376099, + "kl": 0.31209271028637886, + "learning_rate": 7.625333032085769e-07, + "loss": 0.0125, + "num_tokens": 12044530.0, + "reward": 0.86065673828125, + "reward_std": 0.01474627573043108, + "rewards//mean": 0.86065673828125, + "rewards//std": 0.016038892790675163, + "step": 1655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.3312, + "grad_norm": 1.2887307405471802, + "kl": 0.3532552234828472, + "learning_rate": 7.622631809165972e-07, + "loss": -0.0012, + "num_tokens": 12051800.0, + "reward": 0.84051513671875, + "reward_std": 0.0160948745906353, + "rewards//mean": 0.84051513671875, + "rewards//std": 0.025832070037722588, + "step": 1656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3314, + "grad_norm": 1.2900751829147339, + "kl": 0.3430363703519106, + "learning_rate": 7.619929529850396e-07, + "loss": 0.0137, + "num_tokens": 12059040.0, + "reward": 0.81640625, + "reward_std": 0.010397573001682758, + "rewards//mean": 0.81640625, + "rewards//std": 0.014314459636807442, + "step": 1657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3316, + "grad_norm": 1.2004592418670654, + "kl": 0.3012267891317606, + "learning_rate": 7.617226195227517e-07, + "loss": 0.012, + "num_tokens": 12066320.0, + "reward": 0.7899169921875, + "reward_std": 0.010304298251867294, + "rewards//mean": 0.7899169921875, + "rewards//std": 0.016754047945141792, + "step": 1658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3318, + "grad_norm": 1.6618386507034302, + "kl": 0.3156704902648926, + "learning_rate": 7.614521806386243e-07, + "loss": 0.0126, + "num_tokens": 12073560.0, + "reward": 0.8609619140625, + "reward_std": 0.01713225431740284, + "rewards//mean": 0.8609619140625, + "rewards//std": 0.020713461562991142, + "step": 1659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.332, + "grad_norm": 1.5970956087112427, + "kl": 0.3165662959218025, + "learning_rate": 7.611816364415895e-07, + "loss": 0.0127, + "num_tokens": 12080872.0, + "reward": 0.85565185546875, + "reward_std": 0.014915421605110168, + "rewards//mean": 0.85565185546875, + "rewards//std": 0.019053300842642784, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3322, + "grad_norm": 1.626603126525879, + "kl": 0.3309243004769087, + "learning_rate": 7.60910987040623e-07, + "loss": 0.0132, + "num_tokens": 12088112.0, + "reward": 0.81243896484375, + "reward_std": 0.014787489548325539, + "rewards//mean": 0.81243896484375, + "rewards//std": 0.024905383586883545, + "step": 1661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.3324, + "grad_norm": 1.4514198303222656, + "kl": 0.2880251966416836, + "learning_rate": 7.606402325447419e-07, + "loss": 0.0158, + "num_tokens": 12095491.0, + "reward": 0.8251953125, + "reward_std": 0.018788030371069908, + "rewards//mean": 0.8251953125, + "rewards//std": 0.027250586077570915, + "step": 1662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3326, + "grad_norm": 1.5187820196151733, + "kl": 0.3270924612879753, + "learning_rate": 7.603693730630066e-07, + "loss": 0.0131, + "num_tokens": 12102779.0, + "reward": 0.8311767578125, + "reward_std": 0.01313871145248413, + "rewards//mean": 0.8311767578125, + "rewards//std": 0.025214435532689095, + "step": 1663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3328, + "grad_norm": 1.9202423095703125, + "kl": 0.38409480825066566, + "learning_rate": 7.600984087045186e-07, + "loss": 0.0154, + "num_tokens": 12110091.0, + "reward": 0.8502197265625, + "reward_std": 0.011669322848320007, + "rewards//mean": 0.8502197265625, + "rewards//std": 0.016086602583527565, + "step": 1664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.333, + "grad_norm": 1.4015127420425415, + "kl": 0.3557385317981243, + "learning_rate": 7.598273395784229e-07, + "loss": 0.0142, + "num_tokens": 12117347.0, + "reward": 0.86639404296875, + "reward_std": 0.011879565194249153, + "rewards//mean": 0.86639404296875, + "rewards//std": 0.01844288595020771, + "step": 1665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3332, + "grad_norm": 1.385929822921753, + "kl": 0.32905086129903793, + "learning_rate": 7.59556165793906e-07, + "loss": 0.0132, + "num_tokens": 12124587.0, + "reward": 0.82330322265625, + "reward_std": 0.018583878874778748, + "rewards//mean": 0.82330322265625, + "rewards//std": 0.024594679474830627, + "step": 1666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3334, + "grad_norm": 1.3105688095092773, + "kl": 0.3095182552933693, + "learning_rate": 7.592848874601963e-07, + "loss": 0.0124, + "num_tokens": 12131867.0, + "reward": 0.8778076171875, + "reward_std": 0.017927763983607292, + "rewards//mean": 0.8778076171875, + "rewards//std": 0.03011702373623848, + "step": 1667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3336, + "grad_norm": 1.6880972385406494, + "kl": 0.3764103464782238, + "learning_rate": 7.590135046865651e-07, + "loss": 0.0151, + "num_tokens": 12139139.0, + "reward": 0.85821533203125, + "reward_std": 0.021671969443559647, + "rewards//mean": 0.85821533203125, + "rewards//std": 0.026925837621092796, + "step": 1668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.3338, + "grad_norm": 1.2012169361114502, + "kl": 0.32449532486498356, + "learning_rate": 7.587420175823252e-07, + "loss": 0.0028, + "num_tokens": 12146373.0, + "reward": 0.84344482421875, + "reward_std": 0.011466486379504204, + "rewards//mean": 0.84344482421875, + "rewards//std": 0.01954122632741928, + "step": 1669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.334, + "grad_norm": 1.601219892501831, + "kl": 0.36365261673927307, + "learning_rate": 7.584704262568314e-07, + "loss": 0.0145, + "num_tokens": 12153597.0, + "reward": 0.8541259765625, + "reward_std": 0.014721287414431572, + "rewards//mean": 0.8541259765625, + "rewards//std": 0.0197374876588583, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3342, + "grad_norm": 1.3751511573791504, + "kl": 0.3502420149743557, + "learning_rate": 7.581987308194809e-07, + "loss": 0.014, + "num_tokens": 12160893.0, + "reward": 0.8074951171875, + "reward_std": 0.019313689321279526, + "rewards//mean": 0.8074951171875, + "rewards//std": 0.0243644081056118, + "step": 1671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.3344, + "grad_norm": 1.5082298517227173, + "kl": 0.3220510706305504, + "learning_rate": 7.579269313797125e-07, + "loss": 0.0123, + "num_tokens": 12168075.0, + "reward": 0.84942626953125, + "reward_std": 0.01726769283413887, + "rewards//mean": 0.84942626953125, + "rewards//std": 0.029330378398299217, + "step": 1672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3346, + "grad_norm": 1.4687153100967407, + "kl": 0.34271077439188957, + "learning_rate": 7.576550280470071e-07, + "loss": 0.0137, + "num_tokens": 12175459.0, + "reward": 0.85015869140625, + "reward_std": 0.02081160433590412, + "rewards//mean": 0.85015869140625, + "rewards//std": 0.028033630922436714, + "step": 1673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3348, + "grad_norm": 1.311950445175171, + "kl": 0.3506612069904804, + "learning_rate": 7.573830209308872e-07, + "loss": 0.014, + "num_tokens": 12182739.0, + "reward": 0.88287353515625, + "reward_std": 0.011477503925561905, + "rewards//mean": 0.88287353515625, + "rewards//std": 0.01773747242987156, + "step": 1674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.335, + "grad_norm": 1.4280576705932617, + "kl": 0.28409228660166264, + "learning_rate": 7.57110910140917e-07, + "loss": 0.0114, + "num_tokens": 12190011.0, + "reward": 0.87738037109375, + "reward_std": 0.012195432558655739, + "rewards//mean": 0.87738037109375, + "rewards//std": 0.01684269867837429, + "step": 1675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3352, + "grad_norm": 1.7219434976577759, + "kl": 0.33833057433366776, + "learning_rate": 7.568386957867032e-07, + "loss": 0.0135, + "num_tokens": 12197307.0, + "reward": 0.87054443359375, + "reward_std": 0.016400113701820374, + "rewards//mean": 0.87054443359375, + "rewards//std": 0.026068320497870445, + "step": 1676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3354, + "grad_norm": 1.4729468822479248, + "kl": 0.33131776563823223, + "learning_rate": 7.565663779778933e-07, + "loss": 0.0133, + "num_tokens": 12204547.0, + "reward": 0.87030029296875, + "reward_std": 0.013809453696012497, + "rewards//mean": 0.87030029296875, + "rewards//std": 0.02292744070291519, + "step": 1677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3356, + "grad_norm": 1.488652229309082, + "kl": 0.30659862607717514, + "learning_rate": 7.562939568241771e-07, + "loss": 0.0123, + "num_tokens": 12211803.0, + "reward": 0.79510498046875, + "reward_std": 0.01290581189095974, + "rewards//mean": 0.79510498046875, + "rewards//std": 0.02820213884115219, + "step": 1678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3358, + "grad_norm": 1.4739843606948853, + "kl": 0.30338800325989723, + "learning_rate": 7.560214324352858e-07, + "loss": 0.0121, + "num_tokens": 12219083.0, + "reward": 0.8563232421875, + "reward_std": 0.016188189387321472, + "rewards//mean": 0.8563232421875, + "rewards//std": 0.023014839738607407, + "step": 1679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.336, + "grad_norm": 1.1764307022094727, + "kl": 0.2967243455350399, + "learning_rate": 7.55748804920992e-07, + "loss": 0.0119, + "num_tokens": 12226443.0, + "reward": 0.86273193359375, + "reward_std": 0.01206089649349451, + "rewards//mean": 0.86273193359375, + "rewards//std": 0.022860661149024963, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3362, + "grad_norm": 1.4594961404800415, + "kl": 0.35619720071554184, + "learning_rate": 7.554760743911103e-07, + "loss": 0.0142, + "num_tokens": 12233803.0, + "reward": 0.85272216796875, + "reward_std": 0.014805903658270836, + "rewards//mean": 0.85272216796875, + "rewards//std": 0.021728798747062683, + "step": 1681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3364, + "grad_norm": 1.506652593612671, + "kl": 0.34371329471468925, + "learning_rate": 7.552032409554962e-07, + "loss": 0.0137, + "num_tokens": 12241043.0, + "reward": 0.809814453125, + "reward_std": 0.012136869132518768, + "rewards//mean": 0.809814453125, + "rewards//std": 0.023363754153251648, + "step": 1682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3366, + "grad_norm": 1.901390552520752, + "kl": 0.3562243767082691, + "learning_rate": 7.549303047240474e-07, + "loss": 0.0142, + "num_tokens": 12248299.0, + "reward": 0.8255615234375, + "reward_std": 0.019720902666449547, + "rewards//mean": 0.8255615234375, + "rewards//std": 0.021945513784885406, + "step": 1683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.3368, + "grad_norm": 1.5149033069610596, + "kl": 0.3989630490541458, + "learning_rate": 7.54657265806702e-07, + "loss": 0.0164, + "num_tokens": 12255552.0, + "reward": 0.8587646484375, + "reward_std": 0.012811415828764439, + "rewards//mean": 0.8587646484375, + "rewards//std": 0.022640839219093323, + "step": 1684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.337, + "grad_norm": 1.20064377784729, + "kl": 0.3027157410979271, + "learning_rate": 7.543841243134408e-07, + "loss": 0.0121, + "num_tokens": 12262832.0, + "reward": 0.83331298828125, + "reward_std": 0.0100923553109169, + "rewards//mean": 0.83331298828125, + "rewards//std": 0.016675597056746483, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3372, + "grad_norm": 1.3473749160766602, + "kl": 0.2783954329788685, + "learning_rate": 7.541108803542845e-07, + "loss": 0.0111, + "num_tokens": 12270088.0, + "reward": 0.82415771484375, + "reward_std": 0.01772741600871086, + "rewards//mean": 0.82415771484375, + "rewards//std": 0.021837901324033737, + "step": 1686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3374, + "grad_norm": 1.2730780839920044, + "kl": 0.3465697653591633, + "learning_rate": 7.538375340392961e-07, + "loss": 0.0139, + "num_tokens": 12277400.0, + "reward": 0.85894775390625, + "reward_std": 0.01827005110681057, + "rewards//mean": 0.85894775390625, + "rewards//std": 0.02061263844370842, + "step": 1687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3376, + "grad_norm": 1.478083848953247, + "kl": 0.33720899000763893, + "learning_rate": 7.535640854785791e-07, + "loss": 0.0135, + "num_tokens": 12284656.0, + "reward": 0.85455322265625, + "reward_std": 0.017353415489196777, + "rewards//mean": 0.85455322265625, + "rewards//std": 0.02135850116610527, + "step": 1688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3378, + "grad_norm": 1.393418312072754, + "kl": 0.3283802419900894, + "learning_rate": 7.532905347822791e-07, + "loss": 0.0131, + "num_tokens": 12291920.0, + "reward": 0.8565673828125, + "reward_std": 0.01718803495168686, + "rewards//mean": 0.8565673828125, + "rewards//std": 0.031369663774967194, + "step": 1689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.338, + "grad_norm": 1.3961249589920044, + "kl": 0.36955014057457447, + "learning_rate": 7.530168820605818e-07, + "loss": 0.0148, + "num_tokens": 12299232.0, + "reward": 0.85809326171875, + "reward_std": 0.017852384597063065, + "rewards//mean": 0.85809326171875, + "rewards//std": 0.02624598704278469, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3382, + "grad_norm": 1.3923335075378418, + "kl": 0.332891758531332, + "learning_rate": 7.527431274237149e-07, + "loss": 0.0133, + "num_tokens": 12306608.0, + "reward": 0.85888671875, + "reward_std": 0.01596328802406788, + "rewards//mean": 0.85888671875, + "rewards//std": 0.018169280141592026, + "step": 1691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3384, + "grad_norm": 1.3649203777313232, + "kl": 0.3117251116782427, + "learning_rate": 7.524692709819463e-07, + "loss": 0.0125, + "num_tokens": 12313872.0, + "reward": 0.84771728515625, + "reward_std": 0.021815240383148193, + "rewards//mean": 0.84771728515625, + "rewards//std": 0.024798179045319557, + "step": 1692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3386, + "grad_norm": 1.4519718885421753, + "kl": 0.29800859838724136, + "learning_rate": 7.521953128455855e-07, + "loss": 0.0119, + "num_tokens": 12321128.0, + "reward": 0.81365966796875, + "reward_std": 0.013326648622751236, + "rewards//mean": 0.81365966796875, + "rewards//std": 0.025062311440706253, + "step": 1693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3388, + "grad_norm": 1.4809486865997314, + "kl": 0.33240076899528503, + "learning_rate": 7.519212531249829e-07, + "loss": 0.0133, + "num_tokens": 12328376.0, + "reward": 0.802978515625, + "reward_std": 0.02406466193497181, + "rewards//mean": 0.802978515625, + "rewards//std": 0.02629043348133564, + "step": 1694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.339, + "grad_norm": 1.628666639328003, + "kl": 0.34262726455926895, + "learning_rate": 7.516470919305298e-07, + "loss": 0.0137, + "num_tokens": 12335648.0, + "reward": 0.836669921875, + "reward_std": 0.019438818097114563, + "rewards//mean": 0.836669921875, + "rewards//std": 0.03271814063191414, + "step": 1695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.3392, + "grad_norm": 1.4738249778747559, + "kl": 0.3298093881458044, + "learning_rate": 7.513728293726579e-07, + "loss": 0.0063, + "num_tokens": 12342850.0, + "reward": 0.87347412109375, + "reward_std": 0.010674663819372654, + "rewards//mean": 0.87347412109375, + "rewards//std": 0.02218042127788067, + "step": 1696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3394, + "grad_norm": 1.2919684648513794, + "kl": 0.3135477639734745, + "learning_rate": 7.510984655618406e-07, + "loss": 0.0125, + "num_tokens": 12350138.0, + "reward": 0.86199951171875, + "reward_std": 0.015133596956729889, + "rewards//mean": 0.86199951171875, + "rewards//std": 0.02511420100927353, + "step": 1697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3396, + "grad_norm": 1.393872857093811, + "kl": 0.30440257117152214, + "learning_rate": 7.508240006085913e-07, + "loss": 0.0122, + "num_tokens": 12357482.0, + "reward": 0.8250732421875, + "reward_std": 0.01183552946895361, + "rewards//mean": 0.8250732421875, + "rewards//std": 0.018551424145698547, + "step": 1698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3398, + "grad_norm": 1.764649510383606, + "kl": 0.4542177151888609, + "learning_rate": 7.505494346234647e-07, + "loss": 0.0182, + "num_tokens": 12364698.0, + "reward": 0.865234375, + "reward_std": 0.012295342981815338, + "rewards//mean": 0.865234375, + "rewards//std": 0.02031536214053631, + "step": 1699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.34, + "grad_norm": 1.6568615436553955, + "kl": 0.29921073839068413, + "learning_rate": 7.502747677170555e-07, + "loss": 0.012, + "num_tokens": 12371994.0, + "reward": 0.77899169921875, + "reward_std": 0.01846805028617382, + "rewards//mean": 0.77899169921875, + "rewards//std": 0.02567453868687153, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3402, + "grad_norm": 1.5581049919128418, + "kl": 0.4229926373809576, + "learning_rate": 7.5e-07, + "loss": 0.0169, + "num_tokens": 12379274.0, + "reward": 0.87579345703125, + "reward_std": 0.0113875987008214, + "rewards//mean": 0.87579345703125, + "rewards//std": 0.015995418652892113, + "step": 1701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.3404, + "grad_norm": 1.4790221452713013, + "kl": 0.32665684446692467, + "learning_rate": 7.497251315829743e-07, + "loss": 0.0133, + "num_tokens": 12386576.0, + "reward": 0.85888671875, + "reward_std": 0.01921910047531128, + "rewards//mean": 0.85888671875, + "rewards//std": 0.028000198304653168, + "step": 1702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3406, + "grad_norm": 1.4879364967346191, + "kl": 0.34781783632934093, + "learning_rate": 7.494501625766955e-07, + "loss": 0.0139, + "num_tokens": 12393936.0, + "reward": 0.8399658203125, + "reward_std": 0.017650607973337173, + "rewards//mean": 0.8399658203125, + "rewards//std": 0.02537243627011776, + "step": 1703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3408, + "grad_norm": 1.444380283355713, + "kl": 0.32775698602199554, + "learning_rate": 7.491750930919212e-07, + "loss": 0.0131, + "num_tokens": 12401208.0, + "reward": 0.845458984375, + "reward_std": 0.012372934259474277, + "rewards//mean": 0.845458984375, + "rewards//std": 0.0198312159627676, + "step": 1704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.341, + "grad_norm": 1.304953694343567, + "kl": 0.29817502386868, + "learning_rate": 7.488999232394491e-07, + "loss": -0.0061, + "num_tokens": 12408535.0, + "reward": 0.80584716796875, + "reward_std": 0.01672002673149109, + "rewards//mean": 0.80584716796875, + "rewards//std": 0.02061924710869789, + "step": 1705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3412, + "grad_norm": 1.3487471342086792, + "kl": 0.32399921491742134, + "learning_rate": 7.486246531301177e-07, + "loss": 0.013, + "num_tokens": 12415839.0, + "reward": 0.83099365234375, + "reward_std": 0.017497895285487175, + "rewards//mean": 0.83099365234375, + "rewards//std": 0.02334682270884514, + "step": 1706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3414, + "grad_norm": 1.6821225881576538, + "kl": 0.2795609515160322, + "learning_rate": 7.483492828748056e-07, + "loss": 0.0112, + "num_tokens": 12423199.0, + "reward": 0.84002685546875, + "reward_std": 0.016150876879692078, + "rewards//mean": 0.84002685546875, + "rewards//std": 0.020803431048989296, + "step": 1707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3416, + "grad_norm": 1.558677077293396, + "kl": 0.33866774663329124, + "learning_rate": 7.480738125844322e-07, + "loss": 0.0135, + "num_tokens": 12430471.0, + "reward": 0.8538818359375, + "reward_std": 0.01989196613430977, + "rewards//mean": 0.8538818359375, + "rewards//std": 0.032382138073444366, + "step": 1708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3418, + "grad_norm": 1.357872724533081, + "kl": 0.30263821594417095, + "learning_rate": 7.477982423699567e-07, + "loss": 0.0121, + "num_tokens": 12437775.0, + "reward": 0.85614013671875, + "reward_std": 0.011449817568063736, + "rewards//mean": 0.85614013671875, + "rewards//std": 0.017587488517165184, + "step": 1709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.342, + "grad_norm": 1.4704139232635498, + "kl": 0.3233280275017023, + "learning_rate": 7.475225723423788e-07, + "loss": 0.0129, + "num_tokens": 12445111.0, + "reward": 0.81671142578125, + "reward_std": 0.01730852760374546, + "rewards//mean": 0.81671142578125, + "rewards//std": 0.02021143026649952, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3422, + "grad_norm": 1.2921147346496582, + "kl": 0.3425549566745758, + "learning_rate": 7.472468026127384e-07, + "loss": 0.0137, + "num_tokens": 12452527.0, + "reward": 0.8060302734375, + "reward_std": 0.013604572042822838, + "rewards//mean": 0.8060302734375, + "rewards//std": 0.018613336607813835, + "step": 1711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3424, + "grad_norm": 1.3210055828094482, + "kl": 0.3123988378793001, + "learning_rate": 7.469709332921154e-07, + "loss": 0.0125, + "num_tokens": 12459735.0, + "reward": 0.8399658203125, + "reward_std": 0.016680259257555008, + "rewards//mean": 0.8399658203125, + "rewards//std": 0.03876979276537895, + "step": 1712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.3426, + "grad_norm": 1.5978310108184814, + "kl": 0.36547116935253143, + "learning_rate": 7.4669496449163e-07, + "loss": 0.0148, + "num_tokens": 12467022.0, + "reward": 0.84613037109375, + "reward_std": 0.013162808492779732, + "rewards//mean": 0.84613037109375, + "rewards//std": 0.015431860461831093, + "step": 1713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3428, + "grad_norm": 1.3643471002578735, + "kl": 0.3876275382936001, + "learning_rate": 7.464188963224427e-07, + "loss": 0.0155, + "num_tokens": 12474278.0, + "reward": 0.80853271484375, + "reward_std": 0.013413889333605766, + "rewards//mean": 0.80853271484375, + "rewards//std": 0.022233590483665466, + "step": 1714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.343, + "grad_norm": 1.1475675106048584, + "kl": 0.2709645237773657, + "learning_rate": 7.461427288957531e-07, + "loss": 0.0108, + "num_tokens": 12481534.0, + "reward": 0.85479736328125, + "reward_std": 0.011422235518693924, + "rewards//mean": 0.85479736328125, + "rewards//std": 0.018276335671544075, + "step": 1715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3432, + "grad_norm": 1.5646576881408691, + "kl": 0.34728871658444405, + "learning_rate": 7.45866462322802e-07, + "loss": 0.0139, + "num_tokens": 12488750.0, + "reward": 0.84515380859375, + "reward_std": 0.011298373341560364, + "rewards//mean": 0.84515380859375, + "rewards//std": 0.023076830431818962, + "step": 1716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.3434, + "grad_norm": 1.1277873516082764, + "kl": 0.33202885277569294, + "learning_rate": 7.45590096714869e-07, + "loss": 0.0135, + "num_tokens": 12495972.0, + "reward": 0.8046875, + "reward_std": 0.012957911938428879, + "rewards//mean": 0.8046875, + "rewards//std": 0.020231734961271286, + "step": 1717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3436, + "grad_norm": 1.349047303199768, + "kl": 0.3717232868075371, + "learning_rate": 7.453136321832745e-07, + "loss": 0.0149, + "num_tokens": 12503220.0, + "reward": 0.874267578125, + "reward_std": 0.014715325087308884, + "rewards//mean": 0.874267578125, + "rewards//std": 0.027986139059066772, + "step": 1718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3438, + "grad_norm": 1.5136128664016724, + "kl": 0.2890872173011303, + "learning_rate": 7.450370688393784e-07, + "loss": 0.0116, + "num_tokens": 12510476.0, + "reward": 0.79473876953125, + "reward_std": 0.013619303703308105, + "rewards//mean": 0.79473876953125, + "rewards//std": 0.024743791669607162, + "step": 1719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.344, + "grad_norm": 1.2929282188415527, + "kl": 0.37381679378449917, + "learning_rate": 7.447604067945802e-07, + "loss": 0.015, + "num_tokens": 12517820.0, + "reward": 0.84344482421875, + "reward_std": 0.01802837662398815, + "rewards//mean": 0.84344482421875, + "rewards//std": 0.026081673800945282, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3442, + "grad_norm": 1.3360874652862549, + "kl": 0.29150519892573357, + "learning_rate": 7.444836461603194e-07, + "loss": 0.0117, + "num_tokens": 12525300.0, + "reward": 0.851806640625, + "reward_std": 0.012079497799277306, + "rewards//mean": 0.851806640625, + "rewards//std": 0.013618625700473785, + "step": 1721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3444, + "grad_norm": 1.4161980152130127, + "kl": 0.3049562834203243, + "learning_rate": 7.442067870480751e-07, + "loss": 0.0122, + "num_tokens": 12532524.0, + "reward": 0.85992431640625, + "reward_std": 0.017522267997264862, + "rewards//mean": 0.85992431640625, + "rewards//std": 0.02519904635846615, + "step": 1722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3446, + "grad_norm": 1.5631870031356812, + "kl": 0.28968435153365135, + "learning_rate": 7.439298295693663e-07, + "loss": 0.0116, + "num_tokens": 12539852.0, + "reward": 0.858642578125, + "reward_std": 0.019475558772683144, + "rewards//mean": 0.858642578125, + "rewards//std": 0.03012833185493946, + "step": 1723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.3448, + "grad_norm": 1.4511770009994507, + "kl": 0.33749881759285927, + "learning_rate": 7.436527738357513e-07, + "loss": 0.0152, + "num_tokens": 12547106.0, + "reward": 0.84954833984375, + "reward_std": 0.01529824547469616, + "rewards//mean": 0.84954833984375, + "rewards//std": 0.019648607820272446, + "step": 1724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.345, + "grad_norm": 1.3881431818008423, + "kl": 0.34504370391368866, + "learning_rate": 7.433756199588282e-07, + "loss": 0.0138, + "num_tokens": 12554394.0, + "reward": 0.79779052734375, + "reward_std": 0.017813751474022865, + "rewards//mean": 0.79779052734375, + "rewards//std": 0.023465821519494057, + "step": 1725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3452, + "grad_norm": 1.4086196422576904, + "kl": 0.3130856156349182, + "learning_rate": 7.430983680502343e-07, + "loss": 0.0125, + "num_tokens": 12561714.0, + "reward": 0.8585205078125, + "reward_std": 0.016843033954501152, + "rewards//mean": 0.8585205078125, + "rewards//std": 0.029695892706513405, + "step": 1726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3454, + "grad_norm": 1.5077602863311768, + "kl": 0.31319775246083736, + "learning_rate": 7.42821018221647e-07, + "loss": 0.0125, + "num_tokens": 12569114.0, + "reward": 0.8538818359375, + "reward_std": 0.016233544796705246, + "rewards//mean": 0.8538818359375, + "rewards//std": 0.025474848225712776, + "step": 1727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3456, + "grad_norm": 1.2062835693359375, + "kl": 0.28688253834843636, + "learning_rate": 7.425435705847825e-07, + "loss": 0.0115, + "num_tokens": 12576458.0, + "reward": 0.8310546875, + "reward_std": 0.01348064187914133, + "rewards//mean": 0.8310546875, + "rewards//std": 0.020099617540836334, + "step": 1728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.390625, + "epoch": 0.3458, + "grad_norm": 1.3733114004135132, + "kl": 0.32700863666832447, + "learning_rate": 7.422660252513968e-07, + "loss": -0.0274, + "num_tokens": 12583715.0, + "reward": 0.79351806640625, + "reward_std": 0.014218026772141457, + "rewards//mean": 0.79351806640625, + "rewards//std": 0.02260495536029339, + "step": 1729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.346, + "grad_norm": 1.4783236980438232, + "kl": 0.2776276245713234, + "learning_rate": 7.41988382333285e-07, + "loss": 0.0111, + "num_tokens": 12591051.0, + "reward": 0.8533935546875, + "reward_std": 0.016241278499364853, + "rewards//mean": 0.8533935546875, + "rewards//std": 0.02748923934996128, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.640625, + "epoch": 0.3462, + "grad_norm": 1.8211215734481812, + "kl": 0.3350187074393034, + "learning_rate": 7.417106419422818e-07, + "loss": 0.0076, + "num_tokens": 12598236.0, + "reward": 0.8006591796875, + "reward_std": 0.014758465811610222, + "rewards//mean": 0.8006591796875, + "rewards//std": 0.02600414678454399, + "step": 1731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3464, + "grad_norm": 1.3072750568389893, + "kl": 0.3856653869152069, + "learning_rate": 7.41432804190261e-07, + "loss": 0.0154, + "num_tokens": 12605476.0, + "reward": 0.8603515625, + "reward_std": 0.014853213913738728, + "rewards//mean": 0.8603515625, + "rewards//std": 0.024119943380355835, + "step": 1732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3466, + "grad_norm": 1.4974030256271362, + "kl": 0.2894123448058963, + "learning_rate": 7.411548691891357e-07, + "loss": 0.0116, + "num_tokens": 12612772.0, + "reward": 0.8702392578125, + "reward_std": 0.015089056454598904, + "rewards//mean": 0.8702392578125, + "rewards//std": 0.028885535895824432, + "step": 1733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.171875, + "epoch": 0.3468, + "grad_norm": 1.6282540559768677, + "kl": 0.3531618397682905, + "learning_rate": 7.408768370508576e-07, + "loss": -0.031, + "num_tokens": 12620079.0, + "reward": 0.79412841796875, + "reward_std": 0.013216488063335419, + "rewards//mean": 0.79412841796875, + "rewards//std": 0.021232683211565018, + "step": 1734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.347, + "grad_norm": 1.876924991607666, + "kl": 0.373610807582736, + "learning_rate": 7.405987078874185e-07, + "loss": 0.0155, + "num_tokens": 12627322.0, + "reward": 0.86761474609375, + "reward_std": 0.024198582395911217, + "rewards//mean": 0.86761474609375, + "rewards//std": 0.029709333553910255, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.3472, + "grad_norm": 1.6189697980880737, + "kl": 0.35806167870759964, + "learning_rate": 7.403204818108487e-07, + "loss": 0.0149, + "num_tokens": 12634624.0, + "reward": 0.871337890625, + "reward_std": 0.016099179163575172, + "rewards//mean": 0.871337890625, + "rewards//std": 0.026216628029942513, + "step": 1736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3474, + "grad_norm": 1.40178382396698, + "kl": 0.33931702747941017, + "learning_rate": 7.400421589332174e-07, + "loss": 0.0136, + "num_tokens": 12641856.0, + "reward": 0.89398193359375, + "reward_std": 0.017595216631889343, + "rewards//mean": 0.89398193359375, + "rewards//std": 0.022934703156352043, + "step": 1737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3476, + "grad_norm": 1.6164922714233398, + "kl": 0.39911684207618237, + "learning_rate": 7.397637393666333e-07, + "loss": 0.016, + "num_tokens": 12649280.0, + "reward": 0.8321533203125, + "reward_std": 0.01010876428335905, + "rewards//mean": 0.8321533203125, + "rewards//std": 0.0157480388879776, + "step": 1738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.484375, + "epoch": 0.3478, + "grad_norm": 1.6372629404067993, + "kl": 0.3976320493966341, + "learning_rate": 7.394852232232436e-07, + "loss": 0.0075, + "num_tokens": 12656575.0, + "reward": 0.821533203125, + "reward_std": 0.016409527510404587, + "rewards//mean": 0.821533203125, + "rewards//std": 0.02408854104578495, + "step": 1739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.348, + "grad_norm": 1.2511436939239502, + "kl": 0.302096763625741, + "learning_rate": 7.392066106152345e-07, + "loss": 0.0121, + "num_tokens": 12663879.0, + "reward": 0.8568115234375, + "reward_std": 0.014607805758714676, + "rewards//mean": 0.8568115234375, + "rewards//std": 0.028891824185848236, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3482, + "grad_norm": 1.3358983993530273, + "kl": 0.29680105298757553, + "learning_rate": 7.389279016548316e-07, + "loss": 0.0119, + "num_tokens": 12671127.0, + "reward": 0.83111572265625, + "reward_std": 0.013027558103203773, + "rewards//mean": 0.83111572265625, + "rewards//std": 0.019817376509308815, + "step": 1741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3484, + "grad_norm": 1.3151443004608154, + "kl": 0.29233986325562, + "learning_rate": 7.386490964542982e-07, + "loss": 0.0117, + "num_tokens": 12678407.0, + "reward": 0.859619140625, + "reward_std": 0.01752888411283493, + "rewards//mean": 0.859619140625, + "rewards//std": 0.027514833956956863, + "step": 1742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3486, + "grad_norm": 1.2652745246887207, + "kl": 0.2893199324607849, + "learning_rate": 7.383701951259375e-07, + "loss": 0.0116, + "num_tokens": 12685615.0, + "reward": 0.80865478515625, + "reward_std": 0.013392496854066849, + "rewards//mean": 0.80865478515625, + "rewards//std": 0.017710141837596893, + "step": 1743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3488, + "grad_norm": 1.5559178590774536, + "kl": 0.35842921771109104, + "learning_rate": 7.380911977820906e-07, + "loss": 0.0143, + "num_tokens": 12692951.0, + "reward": 0.86968994140625, + "reward_std": 0.02142229862511158, + "rewards//mean": 0.86968994140625, + "rewards//std": 0.028615102171897888, + "step": 1744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.349, + "grad_norm": 2.6244680881500244, + "kl": 0.6026144362986088, + "learning_rate": 7.378121045351377e-07, + "loss": 0.0241, + "num_tokens": 12700239.0, + "reward": 0.86322021484375, + "reward_std": 0.021910572424530983, + "rewards//mean": 0.86322021484375, + "rewards//std": 0.025128060951828957, + "step": 1745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3492, + "grad_norm": 1.3641687631607056, + "kl": 0.3991056717932224, + "learning_rate": 7.375329154974975e-07, + "loss": 0.016, + "num_tokens": 12707511.0, + "reward": 0.883544921875, + "reward_std": 0.014505678787827492, + "rewards//mean": 0.883544921875, + "rewards//std": 0.016687508672475815, + "step": 1746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3494, + "grad_norm": 1.4574376344680786, + "kl": 0.32365928404033184, + "learning_rate": 7.372536307816272e-07, + "loss": 0.0129, + "num_tokens": 12714751.0, + "reward": 0.85467529296875, + "reward_std": 0.021790437400341034, + "rewards//mean": 0.85467529296875, + "rewards//std": 0.023511577397584915, + "step": 1747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3496, + "grad_norm": 1.4317995309829712, + "kl": 0.33188681676983833, + "learning_rate": 7.369742505000231e-07, + "loss": 0.0133, + "num_tokens": 12722023.0, + "reward": 0.87860107421875, + "reward_std": 0.01495516300201416, + "rewards//mean": 0.87860107421875, + "rewards//std": 0.021713467314839363, + "step": 1748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3498, + "grad_norm": 1.4537633657455444, + "kl": 0.28825753182172775, + "learning_rate": 7.366947747652191e-07, + "loss": 0.0115, + "num_tokens": 12729239.0, + "reward": 0.79803466796875, + "reward_std": 0.017469599843025208, + "rewards//mean": 0.79803466796875, + "rewards//std": 0.02947658859193325, + "step": 1749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.35, + "grad_norm": 1.221020221710205, + "kl": 0.25592808611691, + "learning_rate": 7.364152036897882e-07, + "loss": 0.0102, + "num_tokens": 12736503.0, + "reward": 0.8907470703125, + "reward_std": 0.011600921861827374, + "rewards//mean": 0.8907470703125, + "rewards//std": 0.022379916161298752, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3502, + "grad_norm": 1.5791425704956055, + "kl": 0.3624233864247799, + "learning_rate": 7.361355373863413e-07, + "loss": 0.0145, + "num_tokens": 12743727.0, + "reward": 0.84918212890625, + "reward_std": 0.01243605837225914, + "rewards//mean": 0.84918212890625, + "rewards//std": 0.01864451915025711, + "step": 1751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.3504, + "grad_norm": 3.2211902141571045, + "kl": 0.5690651014447212, + "learning_rate": 7.358557759675284e-07, + "loss": 0.0234, + "num_tokens": 12751097.0, + "reward": 0.8135986328125, + "reward_std": 0.016799088567495346, + "rewards//mean": 0.8135986328125, + "rewards//std": 0.026083195582032204, + "step": 1752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3506, + "grad_norm": 1.4635838270187378, + "kl": 0.3797619305551052, + "learning_rate": 7.35575919546037e-07, + "loss": 0.0152, + "num_tokens": 12758561.0, + "reward": 0.7491455078125, + "reward_std": 0.0109710693359375, + "rewards//mean": 0.7491455078125, + "rewards//std": 0.02225511334836483, + "step": 1753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3508, + "grad_norm": 1.7242794036865234, + "kl": 0.4980655275285244, + "learning_rate": 7.352959682345935e-07, + "loss": 0.0199, + "num_tokens": 12765841.0, + "reward": 0.87213134765625, + "reward_std": 0.019575703889131546, + "rewards//mean": 0.87213134765625, + "rewards//std": 0.03175926208496094, + "step": 1754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.351, + "grad_norm": 1.5026030540466309, + "kl": 0.38561695255339146, + "learning_rate": 7.350159221459621e-07, + "loss": 0.0154, + "num_tokens": 12773129.0, + "reward": 0.85662841796875, + "reward_std": 0.01589960791170597, + "rewards//mean": 0.85662841796875, + "rewards//std": 0.02595483884215355, + "step": 1755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.3512, + "grad_norm": 1.351977825164795, + "kl": 0.3360167220234871, + "learning_rate": 7.347357813929454e-07, + "loss": 0.0154, + "num_tokens": 12780294.0, + "reward": 0.7584228515625, + "reward_std": 0.012051406316459179, + "rewards//mean": 0.7584228515625, + "rewards//std": 0.020963354036211967, + "step": 1756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.3514, + "grad_norm": 1.5714515447616577, + "kl": 0.32332864589989185, + "learning_rate": 7.344555460883839e-07, + "loss": -0.0007, + "num_tokens": 12787502.0, + "reward": 0.86724853515625, + "reward_std": 0.011684889905154705, + "rewards//mean": 0.86724853515625, + "rewards//std": 0.014805064536631107, + "step": 1757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3516, + "grad_norm": 1.1269747018814087, + "kl": 0.2962317243218422, + "learning_rate": 7.341752163451567e-07, + "loss": 0.0118, + "num_tokens": 12794798.0, + "reward": 0.85882568359375, + "reward_std": 0.013280782848596573, + "rewards//mean": 0.85882568359375, + "rewards//std": 0.02321808785200119, + "step": 1758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3518, + "grad_norm": 1.4490885734558105, + "kl": 0.3551259506493807, + "learning_rate": 7.338947922761802e-07, + "loss": 0.0142, + "num_tokens": 12802086.0, + "reward": 0.82647705078125, + "reward_std": 0.010622123256325722, + "rewards//mean": 0.82647705078125, + "rewards//std": 0.020217420533299446, + "step": 1759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.352, + "grad_norm": 1.4431562423706055, + "kl": 0.33968091383576393, + "learning_rate": 7.336142739944093e-07, + "loss": 0.0123, + "num_tokens": 12809447.0, + "reward": 0.86138916015625, + "reward_std": 0.014373487792909145, + "rewards//mean": 0.86138916015625, + "rewards//std": 0.021530739963054657, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.3522, + "grad_norm": 1.3209984302520752, + "kl": 0.34739651903510094, + "learning_rate": 7.333336616128369e-07, + "loss": 0.012, + "num_tokens": 12816708.0, + "reward": 0.8634033203125, + "reward_std": 0.014693229459226131, + "rewards//mean": 0.8634033203125, + "rewards//std": 0.018557950854301453, + "step": 1761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3524, + "grad_norm": 1.3459833860397339, + "kl": 0.29588280990719795, + "learning_rate": 7.330529552444932e-07, + "loss": 0.0118, + "num_tokens": 12824036.0, + "reward": 0.835693359375, + "reward_std": 0.010448121465742588, + "rewards//mean": 0.835693359375, + "rewards//std": 0.017783604562282562, + "step": 1762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3526, + "grad_norm": 1.3201289176940918, + "kl": 0.3345465287566185, + "learning_rate": 7.327721550024475e-07, + "loss": 0.0134, + "num_tokens": 12831380.0, + "reward": 0.83917236328125, + "reward_std": 0.012283523567020893, + "rewards//mean": 0.83917236328125, + "rewards//std": 0.014273024164140224, + "step": 1763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3528, + "grad_norm": 1.4913758039474487, + "kl": 0.31420661322772503, + "learning_rate": 7.324912609998053e-07, + "loss": 0.0126, + "num_tokens": 12838708.0, + "reward": 0.84747314453125, + "reward_std": 0.016367319971323013, + "rewards//mean": 0.84747314453125, + "rewards//std": 0.02386179380118847, + "step": 1764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.353, + "grad_norm": 1.6446822881698608, + "kl": 0.3366082590073347, + "learning_rate": 7.322102733497109e-07, + "loss": 0.0135, + "num_tokens": 12846012.0, + "reward": 0.83148193359375, + "reward_std": 0.016670847311615944, + "rewards//mean": 0.83148193359375, + "rewards//std": 0.03136501833796501, + "step": 1765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3532, + "grad_norm": 1.4761569499969482, + "kl": 0.34729661606252193, + "learning_rate": 7.319291921653463e-07, + "loss": 0.0139, + "num_tokens": 12853292.0, + "reward": 0.8070068359375, + "reward_std": 0.015038937330245972, + "rewards//mean": 0.8070068359375, + "rewards//std": 0.021862581372261047, + "step": 1766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3534, + "grad_norm": 1.601037859916687, + "kl": 0.33775024488568306, + "learning_rate": 7.316480175599308e-07, + "loss": 0.0135, + "num_tokens": 12860588.0, + "reward": 0.82958984375, + "reward_std": 0.011500689201056957, + "rewards//mean": 0.82958984375, + "rewards//std": 0.016195859760046005, + "step": 1767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.3536, + "grad_norm": 1.4369398355484009, + "kl": 0.38819198682904243, + "learning_rate": 7.313667496467215e-07, + "loss": 0.0058, + "num_tokens": 12867813.0, + "reward": 0.84698486328125, + "reward_std": 0.01404052134603262, + "rewards//mean": 0.84698486328125, + "rewards//std": 0.023852910846471786, + "step": 1768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.3538, + "grad_norm": 1.3208333253860474, + "kl": 0.34907032921910286, + "learning_rate": 7.310853885390132e-07, + "loss": 0.0145, + "num_tokens": 12875066.0, + "reward": 0.84710693359375, + "reward_std": 0.014842121861875057, + "rewards//mean": 0.84710693359375, + "rewards//std": 0.016364969313144684, + "step": 1769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.354, + "grad_norm": 1.45050847530365, + "kl": 0.3179214224219322, + "learning_rate": 7.308039343501379e-07, + "loss": 0.0136, + "num_tokens": 12882413.0, + "reward": 0.825439453125, + "reward_std": 0.01593451388180256, + "rewards//mean": 0.825439453125, + "rewards//std": 0.03099220246076584, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.3542, + "grad_norm": 1.4283778667449951, + "kl": 0.3423707149922848, + "learning_rate": 7.305223871934656e-07, + "loss": -0.0112, + "num_tokens": 12889706.0, + "reward": 0.86016845703125, + "reward_std": 0.01879536733031273, + "rewards//mean": 0.86016845703125, + "rewards//std": 0.025163577869534492, + "step": 1771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.3544, + "grad_norm": 1.3387900590896606, + "kl": 0.34775509871542454, + "learning_rate": 7.302407471824033e-07, + "loss": 0.0131, + "num_tokens": 12897040.0, + "reward": 0.8568115234375, + "reward_std": 0.01751323975622654, + "rewards//mean": 0.8568115234375, + "rewards//std": 0.02746499888598919, + "step": 1772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3546, + "grad_norm": 1.5804762840270996, + "kl": 0.26966886781156063, + "learning_rate": 7.299590144303954e-07, + "loss": 0.0108, + "num_tokens": 12904296.0, + "reward": 0.854736328125, + "reward_std": 0.017412738874554634, + "rewards//mean": 0.854736328125, + "rewards//std": 0.021563144400715828, + "step": 1773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.3548, + "grad_norm": 1.4419353008270264, + "kl": 0.2921508885920048, + "learning_rate": 7.296771890509242e-07, + "loss": 0.009, + "num_tokens": 12911519.0, + "reward": 0.8182373046875, + "reward_std": 0.020284362137317657, + "rewards//mean": 0.8182373046875, + "rewards//std": 0.025910839438438416, + "step": 1774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.355, + "grad_norm": 1.4275953769683838, + "kl": 0.32839203998446465, + "learning_rate": 7.293952711575086e-07, + "loss": 0.0131, + "num_tokens": 12918759.0, + "reward": 0.83245849609375, + "reward_std": 0.014571838080883026, + "rewards//mean": 0.83245849609375, + "rewards//std": 0.02627423405647278, + "step": 1775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3552, + "grad_norm": 1.715017318725586, + "kl": 0.32557161897420883, + "learning_rate": 7.291132608637052e-07, + "loss": 0.013, + "num_tokens": 12926031.0, + "reward": 0.87689208984375, + "reward_std": 0.017572754994034767, + "rewards//mean": 0.87689208984375, + "rewards//std": 0.025761064141988754, + "step": 1776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3554, + "grad_norm": 1.471991777420044, + "kl": 0.34522269666194916, + "learning_rate": 7.288311582831077e-07, + "loss": 0.0138, + "num_tokens": 12933383.0, + "reward": 0.81842041015625, + "reward_std": 0.013254689984023571, + "rewards//mean": 0.81842041015625, + "rewards//std": 0.018714213743805885, + "step": 1777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.3556, + "grad_norm": 1.2961159944534302, + "kl": 0.3398747742176056, + "learning_rate": 7.285489635293471e-07, + "loss": 0.0124, + "num_tokens": 12940628.0, + "reward": 0.82568359375, + "reward_std": 0.01858273521065712, + "rewards//mean": 0.82568359375, + "rewards//std": 0.023699551820755005, + "step": 1778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.3558, + "grad_norm": 1.5779229402542114, + "kl": 0.3915677033364773, + "learning_rate": 7.282666767160912e-07, + "loss": 0.0159, + "num_tokens": 12947947.0, + "reward": 0.7752685546875, + "reward_std": 0.01956242322921753, + "rewards//mean": 0.7752685546875, + "rewards//std": 0.033953957259655, + "step": 1779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.356, + "grad_norm": 1.2544364929199219, + "kl": 0.30964076705276966, + "learning_rate": 7.279842979570453e-07, + "loss": 0.0101, + "num_tokens": 12955288.0, + "reward": 0.83538818359375, + "reward_std": 0.012154212221503258, + "rewards//mean": 0.83538818359375, + "rewards//std": 0.01728624291718006, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3562, + "grad_norm": 2.5896730422973633, + "kl": 0.4678031001240015, + "learning_rate": 7.277018273659516e-07, + "loss": 0.0187, + "num_tokens": 12962544.0, + "reward": 0.83453369140625, + "reward_std": 0.012585675343871117, + "rewards//mean": 0.83453369140625, + "rewards//std": 0.02223222889006138, + "step": 1781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3564, + "grad_norm": 1.637034296989441, + "kl": 0.29470736533403397, + "learning_rate": 7.274192650565889e-07, + "loss": 0.0118, + "num_tokens": 12969808.0, + "reward": 0.87091064453125, + "reward_std": 0.014191143214702606, + "rewards//mean": 0.87091064453125, + "rewards//std": 0.018797343596816063, + "step": 1782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3566, + "grad_norm": 1.091575026512146, + "kl": 0.29269224777817726, + "learning_rate": 7.271366111427734e-07, + "loss": 0.0117, + "num_tokens": 12977144.0, + "reward": 0.9013671875, + "reward_std": 0.011930938810110092, + "rewards//mean": 0.9013671875, + "rewards//std": 0.01922503113746643, + "step": 1783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3568, + "grad_norm": 1.373505711555481, + "kl": 0.3065312933176756, + "learning_rate": 7.26853865738358e-07, + "loss": 0.0123, + "num_tokens": 12984448.0, + "reward": 0.81427001953125, + "reward_std": 0.012016938999295235, + "rewards//mean": 0.81427001953125, + "rewards//std": 0.02378554455935955, + "step": 1784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.357, + "grad_norm": 1.3361104726791382, + "kl": 0.3464656323194504, + "learning_rate": 7.265710289572328e-07, + "loss": 0.0139, + "num_tokens": 12991672.0, + "reward": 0.8232421875, + "reward_std": 0.014221981167793274, + "rewards//mean": 0.8232421875, + "rewards//std": 0.018506081774830818, + "step": 1785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3572, + "grad_norm": 1.4885035753250122, + "kl": 0.33833348006010056, + "learning_rate": 7.262881009133241e-07, + "loss": 0.0135, + "num_tokens": 12998960.0, + "reward": 0.88140869140625, + "reward_std": 0.014115924946963787, + "rewards//mean": 0.88140869140625, + "rewards//std": 0.022610312327742577, + "step": 1786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3574, + "grad_norm": 1.669877052307129, + "kl": 0.3004380874335766, + "learning_rate": 7.260050817205955e-07, + "loss": 0.012, + "num_tokens": 13006216.0, + "reward": 0.8836669921875, + "reward_std": 0.020358888432383537, + "rewards//mean": 0.8836669921875, + "rewards//std": 0.030484715476632118, + "step": 1787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3576, + "grad_norm": 1.5044134855270386, + "kl": 0.3339064195752144, + "learning_rate": 7.25721971493047e-07, + "loss": 0.0134, + "num_tokens": 13013416.0, + "reward": 0.8575439453125, + "reward_std": 0.02454851195216179, + "rewards//mean": 0.8575439453125, + "rewards//std": 0.03050457127392292, + "step": 1788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3578, + "grad_norm": 1.5886855125427246, + "kl": 0.3419422507286072, + "learning_rate": 7.254387703447153e-07, + "loss": 0.0137, + "num_tokens": 13020656.0, + "reward": 0.82269287109375, + "reward_std": 0.015725690871477127, + "rewards//mean": 0.82269287109375, + "rewards//std": 0.024945467710494995, + "step": 1789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.358, + "grad_norm": 1.3306888341903687, + "kl": 0.311840295791626, + "learning_rate": 7.25155478389674e-07, + "loss": 0.0139, + "num_tokens": 13027940.0, + "reward": 0.82818603515625, + "reward_std": 0.013569097965955734, + "rewards//mean": 0.82818603515625, + "rewards//std": 0.020297378301620483, + "step": 1790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3582, + "grad_norm": 1.9873504638671875, + "kl": 0.46894069015979767, + "learning_rate": 7.248720957420329e-07, + "loss": 0.0188, + "num_tokens": 13035268.0, + "reward": 0.7864990234375, + "reward_std": 0.013596100732684135, + "rewards//mean": 0.7864990234375, + "rewards//std": 0.020997988060116768, + "step": 1791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.3584, + "grad_norm": 1.4985787868499756, + "kl": 0.33086369931697845, + "learning_rate": 7.245886225159386e-07, + "loss": 0.0151, + "num_tokens": 13042530.0, + "reward": 0.8350830078125, + "reward_std": 0.017819885164499283, + "rewards//mean": 0.8350830078125, + "rewards//std": 0.03395752236247063, + "step": 1792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.3586, + "grad_norm": 1.3326328992843628, + "kl": 0.3290709163993597, + "learning_rate": 7.243050588255737e-07, + "loss": 0.0081, + "num_tokens": 13049792.0, + "reward": 0.85321044921875, + "reward_std": 0.01724749431014061, + "rewards//mean": 0.85321044921875, + "rewards//std": 0.020085208117961884, + "step": 1793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3588, + "grad_norm": 1.6536238193511963, + "kl": 0.2911773081868887, + "learning_rate": 7.240214047851581e-07, + "loss": 0.0116, + "num_tokens": 13057080.0, + "reward": 0.8729248046875, + "reward_std": 0.01498456858098507, + "rewards//mean": 0.8729248046875, + "rewards//std": 0.021609079092741013, + "step": 1794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.359, + "grad_norm": 1.7938519716262817, + "kl": 0.3152005337178707, + "learning_rate": 7.237376605089476e-07, + "loss": 0.0126, + "num_tokens": 13064400.0, + "reward": 0.8841552734375, + "reward_std": 0.014753509312868118, + "rewards//mean": 0.8841552734375, + "rewards//std": 0.031072933226823807, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.3592, + "grad_norm": 1.6713520288467407, + "kl": 0.30559369176626205, + "learning_rate": 7.234538261112341e-07, + "loss": 0.0127, + "num_tokens": 13071637.0, + "reward": 0.84185791015625, + "reward_std": 0.018749283626675606, + "rewards//mean": 0.84185791015625, + "rewards//std": 0.024685604497790337, + "step": 1796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3594, + "grad_norm": 1.426536202430725, + "kl": 0.3477012626826763, + "learning_rate": 7.23169901706346e-07, + "loss": 0.0139, + "num_tokens": 13078885.0, + "reward": 0.8333740234375, + "reward_std": 0.012916945852339268, + "rewards//mean": 0.8333740234375, + "rewards//std": 0.019390849396586418, + "step": 1797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3596, + "grad_norm": 1.2148747444152832, + "kl": 0.31141859106719494, + "learning_rate": 7.228858874086484e-07, + "loss": 0.0125, + "num_tokens": 13086109.0, + "reward": 0.82666015625, + "reward_std": 0.008876778185367584, + "rewards//mean": 0.82666015625, + "rewards//std": 0.015398595482110977, + "step": 1798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.3598, + "grad_norm": 1.6082701683044434, + "kl": 0.33381498232483864, + "learning_rate": 7.226017833325419e-07, + "loss": 0.0102, + "num_tokens": 13093319.0, + "reward": 0.85211181640625, + "reward_std": 0.017070725560188293, + "rewards//mean": 0.85211181640625, + "rewards//std": 0.020621448755264282, + "step": 1799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.36, + "grad_norm": 4.6703057289123535, + "kl": 0.7459075208753347, + "learning_rate": 7.223175895924637e-07, + "loss": 0.0298, + "num_tokens": 13100543.0, + "reward": 0.8251953125, + "reward_std": 0.01260833628475666, + "rewards//mean": 0.8251953125, + "rewards//std": 0.019869346171617508, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.3602, + "grad_norm": 1.4567111730575562, + "kl": 0.3541328087449074, + "learning_rate": 7.220333063028871e-07, + "loss": 0.0146, + "num_tokens": 13107710.0, + "reward": 0.77447509765625, + "reward_std": 0.0188608355820179, + "rewards//mean": 0.77447509765625, + "rewards//std": 0.02288580872118473, + "step": 1801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3604, + "grad_norm": 1.783666729927063, + "kl": 0.46103810146450996, + "learning_rate": 7.217489335783211e-07, + "loss": 0.0184, + "num_tokens": 13115150.0, + "reward": 0.80426025390625, + "reward_std": 0.025779712945222855, + "rewards//mean": 0.80426025390625, + "rewards//std": 0.04039827734231949, + "step": 1802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3606, + "grad_norm": 1.4960055351257324, + "kl": 0.2899995185434818, + "learning_rate": 7.214644715333114e-07, + "loss": 0.0116, + "num_tokens": 13122398.0, + "reward": 0.8287353515625, + "reward_std": 0.02116817608475685, + "rewards//mean": 0.8287353515625, + "rewards//std": 0.02483454905450344, + "step": 1803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3608, + "grad_norm": 1.5012084245681763, + "kl": 0.3818160016089678, + "learning_rate": 7.211799202824388e-07, + "loss": 0.0153, + "num_tokens": 13129678.0, + "reward": 0.84197998046875, + "reward_std": 0.013134417124092579, + "rewards//mean": 0.84197998046875, + "rewards//std": 0.015747439116239548, + "step": 1804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.361, + "grad_norm": 1.4456658363342285, + "kl": 0.42006150260567665, + "learning_rate": 7.20895279940321e-07, + "loss": 0.017, + "num_tokens": 13136917.0, + "reward": 0.8638916015625, + "reward_std": 0.01782601699233055, + "rewards//mean": 0.8638916015625, + "rewards//std": 0.02189026214182377, + "step": 1805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3612, + "grad_norm": 1.3601667881011963, + "kl": 0.3295281417667866, + "learning_rate": 7.206105506216106e-07, + "loss": 0.0132, + "num_tokens": 13144181.0, + "reward": 0.8853759765625, + "reward_std": 0.02165200561285019, + "rewards//mean": 0.8853759765625, + "rewards//std": 0.025987843051552773, + "step": 1806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3614, + "grad_norm": 1.4922152757644653, + "kl": 0.3270743004977703, + "learning_rate": 7.203257324409971e-07, + "loss": 0.0131, + "num_tokens": 13151429.0, + "reward": 0.86419677734375, + "reward_std": 0.019750911742448807, + "rewards//mean": 0.86419677734375, + "rewards//std": 0.03644997626543045, + "step": 1807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3616, + "grad_norm": 1.4118459224700928, + "kl": 0.33841897919774055, + "learning_rate": 7.200408255132045e-07, + "loss": 0.0135, + "num_tokens": 13158621.0, + "reward": 0.8343505859375, + "reward_std": 0.012906162068247795, + "rewards//mean": 0.8343505859375, + "rewards//std": 0.017423728480935097, + "step": 1808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.3618, + "grad_norm": 2.0021615028381348, + "kl": 0.26085574366152287, + "learning_rate": 7.19755829952994e-07, + "loss": -0.0248, + "num_tokens": 13165946.0, + "reward": 0.85845947265625, + "reward_std": 0.02079731598496437, + "rewards//mean": 0.85845947265625, + "rewards//std": 0.031663790345191956, + "step": 1809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.362, + "grad_norm": 1.491319179534912, + "kl": 0.31329959258437157, + "learning_rate": 7.194707458751615e-07, + "loss": 0.0091, + "num_tokens": 13173294.0, + "reward": 0.8289794921875, + "reward_std": 0.022315306589007378, + "rewards//mean": 0.8289794921875, + "rewards//std": 0.03351961448788643, + "step": 1810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3622, + "grad_norm": 1.5840795040130615, + "kl": 0.38937829062342644, + "learning_rate": 7.191855733945386e-07, + "loss": 0.0156, + "num_tokens": 13180814.0, + "reward": 0.87115478515625, + "reward_std": 0.01737811788916588, + "rewards//mean": 0.87115478515625, + "rewards//std": 0.02141229808330536, + "step": 1811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3624, + "grad_norm": 1.5284273624420166, + "kl": 0.33072328567504883, + "learning_rate": 7.189003126259931e-07, + "loss": 0.0132, + "num_tokens": 13188182.0, + "reward": 0.7891845703125, + "reward_std": 0.01880543678998947, + "rewards//mean": 0.7891845703125, + "rewards//std": 0.026069262996315956, + "step": 1812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.625, + "epoch": 0.3626, + "grad_norm": 1.6649948358535767, + "kl": 0.37574559450149536, + "learning_rate": 7.186149636844279e-07, + "loss": -0.016, + "num_tokens": 13195534.0, + "reward": 0.84539794921875, + "reward_std": 0.017725780606269836, + "rewards//mean": 0.84539794921875, + "rewards//std": 0.025073179975152016, + "step": 1813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3628, + "grad_norm": 1.4898111820220947, + "kl": 0.3601376749575138, + "learning_rate": 7.183295266847814e-07, + "loss": 0.0144, + "num_tokens": 13202806.0, + "reward": 0.83392333984375, + "reward_std": 0.019132371991872787, + "rewards//mean": 0.83392333984375, + "rewards//std": 0.028501668944954872, + "step": 1814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.363, + "grad_norm": 1.3065546751022339, + "kl": 0.31438931450247765, + "learning_rate": 7.180440017420276e-07, + "loss": 0.0126, + "num_tokens": 13210030.0, + "reward": 0.87774658203125, + "reward_std": 0.015463817864656448, + "rewards//mean": 0.87774658203125, + "rewards//std": 0.020945565775036812, + "step": 1815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3632, + "grad_norm": 1.2498735189437866, + "kl": 0.3238566219806671, + "learning_rate": 7.177583889711762e-07, + "loss": 0.013, + "num_tokens": 13217334.0, + "reward": 0.9019775390625, + "reward_std": 0.016827505081892014, + "rewards//mean": 0.9019775390625, + "rewards//std": 0.02555079385638237, + "step": 1816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3634, + "grad_norm": 1.7888028621673584, + "kl": 0.33021591044962406, + "learning_rate": 7.174726884872715e-07, + "loss": 0.0132, + "num_tokens": 13224558.0, + "reward": 0.87689208984375, + "reward_std": 0.02265920490026474, + "rewards//mean": 0.87689208984375, + "rewards//std": 0.031156795099377632, + "step": 1817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.3636, + "grad_norm": 2.037165403366089, + "kl": 0.3881691098213196, + "learning_rate": 7.17186900405394e-07, + "loss": 0.0188, + "num_tokens": 13231785.0, + "reward": 0.82373046875, + "reward_std": 0.028452929109334946, + "rewards//mean": 0.82373046875, + "rewards//std": 0.03166958689689636, + "step": 1818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3638, + "grad_norm": 1.4272879362106323, + "kl": 0.3009617105126381, + "learning_rate": 7.169010248406588e-07, + "loss": 0.012, + "num_tokens": 13239049.0, + "reward": 0.85162353515625, + "reward_std": 0.01579582691192627, + "rewards//mean": 0.85162353515625, + "rewards//std": 0.025999708101153374, + "step": 1819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.364, + "grad_norm": 1.6092252731323242, + "kl": 0.3249921053647995, + "learning_rate": 7.16615061908217e-07, + "loss": 0.0184, + "num_tokens": 13246347.0, + "reward": 0.8583984375, + "reward_std": 0.0120505066588521, + "rewards//mean": 0.8583984375, + "rewards//std": 0.02916533872485161, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3642, + "grad_norm": 1.512402057647705, + "kl": 0.3106341287493706, + "learning_rate": 7.163290117232541e-07, + "loss": 0.0124, + "num_tokens": 13253563.0, + "reward": 0.862548828125, + "reward_std": 0.01890312321484089, + "rewards//mean": 0.862548828125, + "rewards//std": 0.02844112366437912, + "step": 1821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3644, + "grad_norm": 1.3272719383239746, + "kl": 0.29179687425494194, + "learning_rate": 7.160428744009912e-07, + "loss": 0.0117, + "num_tokens": 13260827.0, + "reward": 0.80291748046875, + "reward_std": 0.014773186296224594, + "rewards//mean": 0.80291748046875, + "rewards//std": 0.019831884652376175, + "step": 1822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.3646, + "grad_norm": 1.6066099405288696, + "kl": 0.3240814320743084, + "learning_rate": 7.157566500566842e-07, + "loss": 0.0166, + "num_tokens": 13268107.0, + "reward": 0.83447265625, + "reward_std": 0.02432638593018055, + "rewards//mean": 0.83447265625, + "rewards//std": 0.0278614554554224, + "step": 1823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3648, + "grad_norm": 1.6368844509124756, + "kl": 0.3949446380138397, + "learning_rate": 7.154703388056244e-07, + "loss": 0.0158, + "num_tokens": 13275339.0, + "reward": 0.8514404296875, + "reward_std": 0.018605750054121017, + "rewards//mean": 0.8514404296875, + "rewards//std": 0.029984036460518837, + "step": 1824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.365, + "grad_norm": 1.6511799097061157, + "kl": 0.31257528625428677, + "learning_rate": 7.15183940763138e-07, + "loss": 0.0125, + "num_tokens": 13282611.0, + "reward": 0.82440185546875, + "reward_std": 0.014579997397959232, + "rewards//mean": 0.82440185546875, + "rewards//std": 0.020545700564980507, + "step": 1825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3652, + "grad_norm": 1.7197332382202148, + "kl": 0.26758963987231255, + "learning_rate": 7.148974560445858e-07, + "loss": 0.0107, + "num_tokens": 13289931.0, + "reward": 0.830322265625, + "reward_std": 0.01709955371916294, + "rewards//mean": 0.830322265625, + "rewards//std": 0.028955938294529915, + "step": 1826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3654, + "grad_norm": 2.0907888412475586, + "kl": 0.4327459204941988, + "learning_rate": 7.146108847653641e-07, + "loss": 0.0173, + "num_tokens": 13297171.0, + "reward": 0.8543701171875, + "reward_std": 0.013765614479780197, + "rewards//mean": 0.8543701171875, + "rewards//std": 0.01875915378332138, + "step": 1827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3656, + "grad_norm": 1.5722298622131348, + "kl": 0.30997989885509014, + "learning_rate": 7.143242270409037e-07, + "loss": 0.0124, + "num_tokens": 13304563.0, + "reward": 0.845458984375, + "reward_std": 0.017091190442442894, + "rewards//mean": 0.845458984375, + "rewards//std": 0.024977076798677444, + "step": 1828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3658, + "grad_norm": 1.5342881679534912, + "kl": 0.3095131888985634, + "learning_rate": 7.140374829866702e-07, + "loss": 0.0124, + "num_tokens": 13311795.0, + "reward": 0.88897705078125, + "reward_std": 0.015041787177324295, + "rewards//mean": 0.88897705078125, + "rewards//std": 0.020691068843007088, + "step": 1829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.366, + "grad_norm": 1.5070405006408691, + "kl": 0.3404368497431278, + "learning_rate": 7.137506527181643e-07, + "loss": 0.0136, + "num_tokens": 13319091.0, + "reward": 0.760498046875, + "reward_std": 0.016157478094100952, + "rewards//mean": 0.760498046875, + "rewards//std": 0.025179890915751457, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.3662, + "grad_norm": 1.9608359336853027, + "kl": 0.3365403264760971, + "learning_rate": 7.134637363509209e-07, + "loss": -0.0052, + "num_tokens": 13326329.0, + "reward": 0.78387451171875, + "reward_std": 0.016087880358099937, + "rewards//mean": 0.78387451171875, + "rewards//std": 0.026097917929291725, + "step": 1831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3664, + "grad_norm": 1.6780877113342285, + "kl": 0.35304195806384087, + "learning_rate": 7.131767340005101e-07, + "loss": 0.0141, + "num_tokens": 13333609.0, + "reward": 0.77679443359375, + "reward_std": 0.012029063887894154, + "rewards//mean": 0.77679443359375, + "rewards//std": 0.02196369878947735, + "step": 1832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3666, + "grad_norm": 1.315347671508789, + "kl": 0.2953651510179043, + "learning_rate": 7.128896457825363e-07, + "loss": 0.0118, + "num_tokens": 13340961.0, + "reward": 0.85791015625, + "reward_std": 0.012490374967455864, + "rewards//mean": 0.85791015625, + "rewards//std": 0.021230634301900864, + "step": 1833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3668, + "grad_norm": 1.2285568714141846, + "kl": 0.27632822282612324, + "learning_rate": 7.126024718126387e-07, + "loss": 0.0111, + "num_tokens": 13348321.0, + "reward": 0.849609375, + "reward_std": 0.014120740815997124, + "rewards//mean": 0.849609375, + "rewards//std": 0.020446086302399635, + "step": 1834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.367, + "grad_norm": 1.5910043716430664, + "kl": 0.3328109811991453, + "learning_rate": 7.123152122064908e-07, + "loss": 0.0133, + "num_tokens": 13355537.0, + "reward": 0.84576416015625, + "reward_std": 0.02322617545723915, + "rewards//mean": 0.84576416015625, + "rewards//std": 0.03142769634723663, + "step": 1835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.3672, + "grad_norm": 1.453460693359375, + "kl": 0.41217342019081116, + "learning_rate": 7.120278670798009e-07, + "loss": 0.0134, + "num_tokens": 13362795.0, + "reward": 0.83563232421875, + "reward_std": 0.013536947779357433, + "rewards//mean": 0.83563232421875, + "rewards//std": 0.024092862382531166, + "step": 1836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3674, + "grad_norm": 1.391517996788025, + "kl": 0.33991361409425735, + "learning_rate": 7.117404365483115e-07, + "loss": 0.0136, + "num_tokens": 13370187.0, + "reward": 0.86932373046875, + "reward_std": 0.016096044331789017, + "rewards//mean": 0.86932373046875, + "rewards//std": 0.021986430510878563, + "step": 1837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3676, + "grad_norm": 1.520994782447815, + "kl": 0.31763334944844246, + "learning_rate": 7.114529207277995e-07, + "loss": 0.0127, + "num_tokens": 13377499.0, + "reward": 0.86151123046875, + "reward_std": 0.012335943058133125, + "rewards//mean": 0.86151123046875, + "rewards//std": 0.02426939830183983, + "step": 1838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.3678, + "grad_norm": 1.2823296785354614, + "kl": 0.26826061867177486, + "learning_rate": 7.111653197340764e-07, + "loss": 0.0092, + "num_tokens": 13384758.0, + "reward": 0.79376220703125, + "reward_std": 0.014360502362251282, + "rewards//mean": 0.79376220703125, + "rewards//std": 0.02718989923596382, + "step": 1839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.368, + "grad_norm": 1.5031300783157349, + "kl": 0.3300153370946646, + "learning_rate": 7.108776336829876e-07, + "loss": 0.0132, + "num_tokens": 13391958.0, + "reward": 0.84881591796875, + "reward_std": 0.015552978031337261, + "rewards//mean": 0.84881591796875, + "rewards//std": 0.019828831776976585, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3682, + "grad_norm": 1.3874731063842773, + "kl": 0.3194307256489992, + "learning_rate": 7.105898626904134e-07, + "loss": 0.0128, + "num_tokens": 13399246.0, + "reward": 0.83953857421875, + "reward_std": 0.013732979074120522, + "rewards//mean": 0.83953857421875, + "rewards//std": 0.02391122654080391, + "step": 1841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3684, + "grad_norm": 1.329150676727295, + "kl": 0.3054366894066334, + "learning_rate": 7.103020068722674e-07, + "loss": 0.0122, + "num_tokens": 13406598.0, + "reward": 0.76324462890625, + "reward_std": 0.014096824452280998, + "rewards//mean": 0.76324462890625, + "rewards//std": 0.018748154863715172, + "step": 1842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3686, + "grad_norm": 1.6792678833007812, + "kl": 0.3369921166449785, + "learning_rate": 7.100140663444984e-07, + "loss": 0.0135, + "num_tokens": 13413926.0, + "reward": 0.84808349609375, + "reward_std": 0.016827670857310295, + "rewards//mean": 0.84808349609375, + "rewards//std": 0.03600454702973366, + "step": 1843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3688, + "grad_norm": 1.7198008298873901, + "kl": 0.28918019123375416, + "learning_rate": 7.097260412230885e-07, + "loss": 0.0116, + "num_tokens": 13421214.0, + "reward": 0.86834716796875, + "reward_std": 0.014429625123739243, + "rewards//mean": 0.86834716796875, + "rewards//std": 0.02061924710869789, + "step": 1844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.369, + "grad_norm": 1.4114543199539185, + "kl": 0.3666899912059307, + "learning_rate": 7.094379316240544e-07, + "loss": 0.0147, + "num_tokens": 13428486.0, + "reward": 0.82855224609375, + "reward_std": 0.010454796254634857, + "rewards//mean": 0.82855224609375, + "rewards//std": 0.016950208693742752, + "step": 1845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3692, + "grad_norm": 1.4615943431854248, + "kl": 0.2768028825521469, + "learning_rate": 7.091497376634463e-07, + "loss": 0.0111, + "num_tokens": 13435806.0, + "reward": 0.83868408203125, + "reward_std": 0.010924738831818104, + "rewards//mean": 0.83868408203125, + "rewards//std": 0.01671368069946766, + "step": 1846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3694, + "grad_norm": 1.30558443069458, + "kl": 0.32445787638425827, + "learning_rate": 7.088614594573491e-07, + "loss": 0.013, + "num_tokens": 13443070.0, + "reward": 0.8441162109375, + "reward_std": 0.016356993466615677, + "rewards//mean": 0.8441162109375, + "rewards//std": 0.02170693129301071, + "step": 1847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3696, + "grad_norm": 1.8577969074249268, + "kl": 0.3331902213394642, + "learning_rate": 7.085730971218809e-07, + "loss": 0.0133, + "num_tokens": 13450342.0, + "reward": 0.836181640625, + "reward_std": 0.01583714410662651, + "rewards//mean": 0.836181640625, + "rewards//std": 0.023927126079797745, + "step": 1848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3698, + "grad_norm": 1.4292131662368774, + "kl": 0.26339045353233814, + "learning_rate": 7.082846507731941e-07, + "loss": 0.0105, + "num_tokens": 13457630.0, + "reward": 0.84161376953125, + "reward_std": 0.015434913337230682, + "rewards//mean": 0.84161376953125, + "rewards//std": 0.023652799427509308, + "step": 1849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.37, + "grad_norm": 1.5262118577957153, + "kl": 0.3594434279948473, + "learning_rate": 7.079961205274748e-07, + "loss": 0.0144, + "num_tokens": 13464958.0, + "reward": 0.8619384765625, + "reward_std": 0.021309498697519302, + "rewards//mean": 0.8619384765625, + "rewards//std": 0.028230370953679085, + "step": 1850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3702, + "grad_norm": 1.7127439975738525, + "kl": 0.34760988503694534, + "learning_rate": 7.077075065009433e-07, + "loss": 0.0139, + "num_tokens": 13472230.0, + "reward": 0.8590087890625, + "reward_std": 0.018272168934345245, + "rewards//mean": 0.8590087890625, + "rewards//std": 0.027575010433793068, + "step": 1851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.3704, + "grad_norm": 1.5232025384902954, + "kl": 0.3414961975067854, + "learning_rate": 7.074188088098527e-07, + "loss": -0.0139, + "num_tokens": 13479437.0, + "reward": 0.81573486328125, + "reward_std": 0.01961115375161171, + "rewards//mean": 0.81573486328125, + "rewards//std": 0.02522486448287964, + "step": 1852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3706, + "grad_norm": 2.018962860107422, + "kl": 0.38079632818698883, + "learning_rate": 7.071300275704909e-07, + "loss": 0.0152, + "num_tokens": 13486829.0, + "reward": 0.88055419921875, + "reward_std": 0.0221368670463562, + "rewards//mean": 0.88055419921875, + "rewards//std": 0.028779154643416405, + "step": 1853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3708, + "grad_norm": 1.6073048114776611, + "kl": 0.31726269237697124, + "learning_rate": 7.068411628991787e-07, + "loss": 0.0127, + "num_tokens": 13494213.0, + "reward": 0.8240966796875, + "reward_std": 0.013257456943392754, + "rewards//mean": 0.8240966796875, + "rewards//std": 0.021462874487042427, + "step": 1854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.371, + "grad_norm": 1.4907405376434326, + "kl": 0.3609944600611925, + "learning_rate": 7.065522149122709e-07, + "loss": 0.0144, + "num_tokens": 13501469.0, + "reward": 0.83660888671875, + "reward_std": 0.01886327937245369, + "rewards//mean": 0.83660888671875, + "rewards//std": 0.023771541193127632, + "step": 1855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.3712, + "grad_norm": 1.482851505279541, + "kl": 0.34037135913968086, + "learning_rate": 7.062631837261556e-07, + "loss": 0.0114, + "num_tokens": 13508610.0, + "reward": 0.83575439453125, + "reward_std": 0.014541294425725937, + "rewards//mean": 0.83575439453125, + "rewards//std": 0.025708714500069618, + "step": 1856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3714, + "grad_norm": 2.0688676834106445, + "kl": 0.4126032181084156, + "learning_rate": 7.059740694572545e-07, + "loss": 0.0165, + "num_tokens": 13515882.0, + "reward": 0.813720703125, + "reward_std": 0.013712752610445023, + "rewards//mean": 0.813720703125, + "rewards//std": 0.015262330882251263, + "step": 1857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3716, + "grad_norm": 1.2845219373703003, + "kl": 0.3035856131464243, + "learning_rate": 7.056848722220228e-07, + "loss": 0.0121, + "num_tokens": 13523186.0, + "reward": 0.83795166015625, + "reward_std": 0.013953818008303642, + "rewards//mean": 0.83795166015625, + "rewards//std": 0.020634658634662628, + "step": 1858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3718, + "grad_norm": 1.3338671922683716, + "kl": 0.3323751762509346, + "learning_rate": 7.053955921369493e-07, + "loss": 0.0133, + "num_tokens": 13530418.0, + "reward": 0.8525390625, + "reward_std": 0.01437668688595295, + "rewards//mean": 0.8525390625, + "rewards//std": 0.019869346171617508, + "step": 1859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.372, + "grad_norm": 1.749800205230713, + "kl": 0.29190447740256786, + "learning_rate": 7.051062293185559e-07, + "loss": 0.0117, + "num_tokens": 13537706.0, + "reward": 0.84600830078125, + "reward_std": 0.011378924362361431, + "rewards//mean": 0.84600830078125, + "rewards//std": 0.019335636869072914, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3722, + "grad_norm": 1.2933152914047241, + "kl": 0.30271927267313004, + "learning_rate": 7.048167838833976e-07, + "loss": 0.0121, + "num_tokens": 13544922.0, + "reward": 0.8641357421875, + "reward_std": 0.01715749502182007, + "rewards//mean": 0.8641357421875, + "rewards//std": 0.02293049544095993, + "step": 1861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3724, + "grad_norm": 1.7847217321395874, + "kl": 0.3528548777103424, + "learning_rate": 7.045272559480635e-07, + "loss": 0.0141, + "num_tokens": 13552162.0, + "reward": 0.8447265625, + "reward_std": 0.013560364954173565, + "rewards//mean": 0.8447265625, + "rewards//std": 0.017839696258306503, + "step": 1862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3726, + "grad_norm": 1.086585521697998, + "kl": 0.2729060146957636, + "learning_rate": 7.042376456291751e-07, + "loss": 0.0109, + "num_tokens": 13559458.0, + "reward": 0.81524658203125, + "reward_std": 0.010116470977663994, + "rewards//mean": 0.81524658203125, + "rewards//std": 0.021862149238586426, + "step": 1863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3728, + "grad_norm": 1.5963716506958008, + "kl": 0.3188556171953678, + "learning_rate": 7.039479530433874e-07, + "loss": 0.0128, + "num_tokens": 13566714.0, + "reward": 0.81585693359375, + "reward_std": 0.012813501991331577, + "rewards//mean": 0.81585693359375, + "rewards//std": 0.016468243673443794, + "step": 1864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.373, + "grad_norm": 1.682019591331482, + "kl": 0.31830945052206516, + "learning_rate": 7.036581783073887e-07, + "loss": 0.0042, + "num_tokens": 13573962.0, + "reward": 0.828369140625, + "reward_std": 0.013317353092133999, + "rewards//mean": 0.828369140625, + "rewards//std": 0.020994020625948906, + "step": 1865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3732, + "grad_norm": 1.4871675968170166, + "kl": 0.30953940376639366, + "learning_rate": 7.033683215379002e-07, + "loss": 0.0124, + "num_tokens": 13581362.0, + "reward": 0.87371826171875, + "reward_std": 0.014798401854932308, + "rewards//mean": 0.87371826171875, + "rewards//std": 0.021076688542962074, + "step": 1866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3734, + "grad_norm": 1.2947285175323486, + "kl": 0.2779895793646574, + "learning_rate": 7.030783828516759e-07, + "loss": 0.0111, + "num_tokens": 13588714.0, + "reward": 0.862548828125, + "reward_std": 0.016146374866366386, + "rewards//mean": 0.862548828125, + "rewards//std": 0.019109725952148438, + "step": 1867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3736, + "grad_norm": 1.9640898704528809, + "kl": 0.3223051093518734, + "learning_rate": 7.027883623655034e-07, + "loss": 0.0129, + "num_tokens": 13595922.0, + "reward": 0.86651611328125, + "reward_std": 0.015001020394265652, + "rewards//mean": 0.86651611328125, + "rewards//std": 0.02205929160118103, + "step": 1868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.3738, + "grad_norm": 1.8531932830810547, + "kl": 0.29975447803735733, + "learning_rate": 7.024982601962026e-07, + "loss": 0.0164, + "num_tokens": 13603254.0, + "reward": 0.8209228515625, + "reward_std": 0.014232181012630463, + "rewards//mean": 0.8209228515625, + "rewards//std": 0.018626343458890915, + "step": 1869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.374, + "grad_norm": 1.2588213682174683, + "kl": 0.30350483767688274, + "learning_rate": 7.022080764606271e-07, + "loss": 0.0121, + "num_tokens": 13610470.0, + "reward": 0.82415771484375, + "reward_std": 0.012441646307706833, + "rewards//mean": 0.82415771484375, + "rewards//std": 0.02779175341129303, + "step": 1870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3742, + "grad_norm": 1.545120358467102, + "kl": 0.3731802050024271, + "learning_rate": 7.019178112756625e-07, + "loss": 0.0149, + "num_tokens": 13617790.0, + "reward": 0.8157958984375, + "reward_std": 0.017868446186184883, + "rewards//mean": 0.8157958984375, + "rewards//std": 0.019663721323013306, + "step": 1871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3744, + "grad_norm": 1.5428271293640137, + "kl": 0.344201048836112, + "learning_rate": 7.016274647582276e-07, + "loss": 0.0138, + "num_tokens": 13625110.0, + "reward": 0.8353271484375, + "reward_std": 0.01976671814918518, + "rewards//mean": 0.8353271484375, + "rewards//std": 0.020660776644945145, + "step": 1872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3746, + "grad_norm": 1.4896584749221802, + "kl": 0.34005532413721085, + "learning_rate": 7.013370370252739e-07, + "loss": 0.0136, + "num_tokens": 13632286.0, + "reward": 0.80816650390625, + "reward_std": 0.014416845515370369, + "rewards//mean": 0.80816650390625, + "rewards//std": 0.023314379155635834, + "step": 1873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3748, + "grad_norm": 1.1723021268844604, + "kl": 0.3813207522034645, + "learning_rate": 7.010465281937858e-07, + "loss": 0.0153, + "num_tokens": 13639510.0, + "reward": 0.86639404296875, + "reward_std": 0.009659882634878159, + "rewards//mean": 0.86639404296875, + "rewards//std": 0.012453793548047543, + "step": 1874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.375, + "grad_norm": 1.4214482307434082, + "kl": 0.32091911137104034, + "learning_rate": 7.007559383807802e-07, + "loss": 0.0104, + "num_tokens": 13646818.0, + "reward": 0.83367919921875, + "reward_std": 0.013388672843575478, + "rewards//mean": 0.83367919921875, + "rewards//std": 0.022923480719327927, + "step": 1875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.3752, + "grad_norm": 1.4295099973678589, + "kl": 0.3099609315395355, + "learning_rate": 7.004652677033068e-07, + "loss": -0.021, + "num_tokens": 13654056.0, + "reward": 0.84405517578125, + "reward_std": 0.016676519066095352, + "rewards//mean": 0.84405517578125, + "rewards//std": 0.021824726834893227, + "step": 1876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3754, + "grad_norm": 1.524109125137329, + "kl": 0.25556511618196964, + "learning_rate": 7.001745162784475e-07, + "loss": 0.0102, + "num_tokens": 13661288.0, + "reward": 0.85736083984375, + "reward_std": 0.014558538794517517, + "rewards//mean": 0.85736083984375, + "rewards//std": 0.024419255554676056, + "step": 1877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3756, + "grad_norm": 1.3370939493179321, + "kl": 0.27722025848925114, + "learning_rate": 6.998836842233169e-07, + "loss": 0.0111, + "num_tokens": 13668592.0, + "reward": 0.841552734375, + "reward_std": 0.013199593871831894, + "rewards//mean": 0.841552734375, + "rewards//std": 0.0216975137591362, + "step": 1878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3758, + "grad_norm": 1.5486077070236206, + "kl": 0.32793302834033966, + "learning_rate": 6.995927716550622e-07, + "loss": 0.0131, + "num_tokens": 13675880.0, + "reward": 0.86431884765625, + "reward_std": 0.01618846133351326, + "rewards//mean": 0.86431884765625, + "rewards//std": 0.019503232091665268, + "step": 1879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.376, + "grad_norm": 1.4318468570709229, + "kl": 0.3789338208734989, + "learning_rate": 6.99301778690863e-07, + "loss": 0.0152, + "num_tokens": 13683168.0, + "reward": 0.8763427734375, + "reward_std": 0.014453928917646408, + "rewards//mean": 0.8763427734375, + "rewards//std": 0.02277948334813118, + "step": 1880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3762, + "grad_norm": 1.4492031335830688, + "kl": 0.31087539717555046, + "learning_rate": 6.990107054479312e-07, + "loss": 0.0124, + "num_tokens": 13690472.0, + "reward": 0.8365478515625, + "reward_std": 0.016713179647922516, + "rewards//mean": 0.8365478515625, + "rewards//std": 0.024668196216225624, + "step": 1881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3764, + "grad_norm": 1.3783320188522339, + "kl": 0.322294719517231, + "learning_rate": 6.987195520435109e-07, + "loss": 0.0129, + "num_tokens": 13697792.0, + "reward": 0.858154296875, + "reward_std": 0.012469882145524025, + "rewards//mean": 0.858154296875, + "rewards//std": 0.02180885523557663, + "step": 1882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.3766, + "grad_norm": 1.3950878381729126, + "kl": 0.3857176937162876, + "learning_rate": 6.984283185948789e-07, + "loss": 0.008, + "num_tokens": 13705053.0, + "reward": 0.8607177734375, + "reward_std": 0.021254850551486015, + "rewards//mean": 0.8607177734375, + "rewards//std": 0.02887505479156971, + "step": 1883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3768, + "grad_norm": 1.4226009845733643, + "kl": 0.288821030408144, + "learning_rate": 6.981370052193439e-07, + "loss": 0.0116, + "num_tokens": 13712541.0, + "reward": 0.8089599609375, + "reward_std": 0.011511318385601044, + "rewards//mean": 0.8089599609375, + "rewards//std": 0.01939709298312664, + "step": 1884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.377, + "grad_norm": 1.157956838607788, + "kl": 0.298569492995739, + "learning_rate": 6.978456120342469e-07, + "loss": 0.0119, + "num_tokens": 13719901.0, + "reward": 0.84619140625, + "reward_std": 0.016123345121741295, + "rewards//mean": 0.84619140625, + "rewards//std": 0.01777849718928337, + "step": 1885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3772, + "grad_norm": 1.6478760242462158, + "kl": 0.34547257982194424, + "learning_rate": 6.975541391569609e-07, + "loss": 0.0138, + "num_tokens": 13727133.0, + "reward": 0.85394287109375, + "reward_std": 0.021744975820183754, + "rewards//mean": 0.85394287109375, + "rewards//std": 0.030549386516213417, + "step": 1886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.3774, + "grad_norm": 1.387528419494629, + "kl": 0.34124702401459217, + "learning_rate": 6.972625867048914e-07, + "loss": 0.0143, + "num_tokens": 13734353.0, + "reward": 0.8485107421875, + "reward_std": 0.01281137578189373, + "rewards//mean": 0.8485107421875, + "rewards//std": 0.019703712314367294, + "step": 1887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3776, + "grad_norm": 1.4507193565368652, + "kl": 0.3100567925721407, + "learning_rate": 6.969709547954755e-07, + "loss": 0.0124, + "num_tokens": 13741609.0, + "reward": 0.84112548828125, + "reward_std": 0.013624574057757854, + "rewards//mean": 0.84112548828125, + "rewards//std": 0.023230470716953278, + "step": 1888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3778, + "grad_norm": 1.5388301610946655, + "kl": 0.32222601398825645, + "learning_rate": 6.966792435461826e-07, + "loss": 0.0129, + "num_tokens": 13748897.0, + "reward": 0.7535400390625, + "reward_std": 0.018139628693461418, + "rewards//mean": 0.7535400390625, + "rewards//std": 0.028226081281900406, + "step": 1889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.378, + "grad_norm": 1.4758803844451904, + "kl": 0.316789785400033, + "learning_rate": 6.963874530745139e-07, + "loss": 0.0127, + "num_tokens": 13756113.0, + "reward": 0.83880615234375, + "reward_std": 0.019504830241203308, + "rewards//mean": 0.83880615234375, + "rewards//std": 0.023460660129785538, + "step": 1890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3782, + "grad_norm": 1.6516839265823364, + "kl": 0.32713914290070534, + "learning_rate": 6.960955834980027e-07, + "loss": 0.0131, + "num_tokens": 13763449.0, + "reward": 0.8404541015625, + "reward_std": 0.01633816584944725, + "rewards//mean": 0.8404541015625, + "rewards//std": 0.024815035983920097, + "step": 1891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3784, + "grad_norm": 1.840234398841858, + "kl": 0.33908705599606037, + "learning_rate": 6.958036349342139e-07, + "loss": 0.0136, + "num_tokens": 13770657.0, + "reward": 0.83843994140625, + "reward_std": 0.021240049973130226, + "rewards//mean": 0.83843994140625, + "rewards//std": 0.030705569311976433, + "step": 1892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3786, + "grad_norm": 1.8907175064086914, + "kl": 0.31406490318477154, + "learning_rate": 6.955116075007442e-07, + "loss": 0.0126, + "num_tokens": 13777937.0, + "reward": 0.8577880859375, + "reward_std": 0.020632244646549225, + "rewards//mean": 0.8577880859375, + "rewards//std": 0.027958810329437256, + "step": 1893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.3788, + "grad_norm": 1.3769023418426514, + "kl": 0.31641292758286, + "learning_rate": 6.952195013152225e-07, + "loss": -0.0002, + "num_tokens": 13785150.0, + "reward": 0.87030029296875, + "reward_std": 0.014070052653551102, + "rewards//mean": 0.87030029296875, + "rewards//std": 0.02252243645489216, + "step": 1894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.379, + "grad_norm": 1.781870722770691, + "kl": 0.38371102325618267, + "learning_rate": 6.94927316495309e-07, + "loss": 0.0153, + "num_tokens": 13792430.0, + "reward": 0.8145751953125, + "reward_std": 0.013288578949868679, + "rewards//mean": 0.8145751953125, + "rewards//std": 0.017812097445130348, + "step": 1895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5625, + "epoch": 0.3792, + "grad_norm": 1.4230388402938843, + "kl": 0.32850990258157253, + "learning_rate": 6.946350531586957e-07, + "loss": -0.0201, + "num_tokens": 13799690.0, + "reward": 0.81280517578125, + "reward_std": 0.014947950839996338, + "rewards//mean": 0.81280517578125, + "rewards//std": 0.022801650688052177, + "step": 1896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3794, + "grad_norm": 1.530210256576538, + "kl": 0.3557496629655361, + "learning_rate": 6.943427114231063e-07, + "loss": 0.0142, + "num_tokens": 13806954.0, + "reward": 0.81121826171875, + "reward_std": 0.01454958226531744, + "rewards//mean": 0.81121826171875, + "rewards//std": 0.020541278645396233, + "step": 1897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3796, + "grad_norm": 1.2277356386184692, + "kl": 0.3308297283947468, + "learning_rate": 6.94050291406296e-07, + "loss": 0.0132, + "num_tokens": 13814250.0, + "reward": 0.82989501953125, + "reward_std": 0.017353635281324387, + "rewards//mean": 0.82989501953125, + "rewards//std": 0.01888091117143631, + "step": 1898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.546875, + "epoch": 0.3798, + "grad_norm": 1.9680103063583374, + "kl": 0.33097589015960693, + "learning_rate": 6.937577932260514e-07, + "loss": -0.0022, + "num_tokens": 13821605.0, + "reward": 0.82781982421875, + "reward_std": 0.015430444851517677, + "rewards//mean": 0.82781982421875, + "rewards//std": 0.01916578598320484, + "step": 1899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.38, + "grad_norm": 1.7219464778900146, + "kl": 0.3280312716960907, + "learning_rate": 6.93465217000191e-07, + "loss": 0.0131, + "num_tokens": 13828837.0, + "reward": 0.829833984375, + "reward_std": 0.014782963320612907, + "rewards//mean": 0.829833984375, + "rewards//std": 0.029247231781482697, + "step": 1900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3802, + "grad_norm": 1.7107527256011963, + "kl": 0.3423987105488777, + "learning_rate": 6.931725628465642e-07, + "loss": 0.0137, + "num_tokens": 13836117.0, + "reward": 0.84320068359375, + "reward_std": 0.013791464269161224, + "rewards//mean": 0.84320068359375, + "rewards//std": 0.01615643873810768, + "step": 1901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3804, + "grad_norm": 1.599567174911499, + "kl": 0.35815320163965225, + "learning_rate": 6.928798308830523e-07, + "loss": 0.0143, + "num_tokens": 13843389.0, + "reward": 0.84228515625, + "reward_std": 0.012867866083979607, + "rewards//mean": 0.84228515625, + "rewards//std": 0.016841012984514236, + "step": 1902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.3806, + "grad_norm": 1.4562785625457764, + "kl": 0.34185181744396687, + "learning_rate": 6.925870212275676e-07, + "loss": 0.0039, + "num_tokens": 13850640.0, + "reward": 0.80072021484375, + "reward_std": 0.016474805772304535, + "rewards//mean": 0.80072021484375, + "rewards//std": 0.023953603580594063, + "step": 1903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3808, + "grad_norm": 1.3769649267196655, + "kl": 0.2796116564422846, + "learning_rate": 6.922941339980537e-07, + "loss": 0.0112, + "num_tokens": 13857912.0, + "reward": 0.87164306640625, + "reward_std": 0.015356115996837616, + "rewards//mean": 0.87164306640625, + "rewards//std": 0.027611423283815384, + "step": 1904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.381, + "grad_norm": 1.5024359226226807, + "kl": 0.34545596688985825, + "learning_rate": 6.920011693124856e-07, + "loss": 0.0097, + "num_tokens": 13865236.0, + "reward": 0.81903076171875, + "reward_std": 0.018077854067087173, + "rewards//mean": 0.81903076171875, + "rewards//std": 0.029074301943182945, + "step": 1905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.3812, + "grad_norm": 1.5341808795928955, + "kl": 0.34248769097030163, + "learning_rate": 6.917081272888696e-07, + "loss": 0.0138, + "num_tokens": 13872517.0, + "reward": 0.8463134765625, + "reward_std": 0.018374862149357796, + "rewards//mean": 0.8463134765625, + "rewards//std": 0.03124394454061985, + "step": 1906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3814, + "grad_norm": 1.717010259628296, + "kl": 0.3589784409850836, + "learning_rate": 6.914150080452428e-07, + "loss": 0.0144, + "num_tokens": 13879805.0, + "reward": 0.8399658203125, + "reward_std": 0.01630811020731926, + "rewards//mean": 0.8399658203125, + "rewards//std": 0.020716384053230286, + "step": 1907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3816, + "grad_norm": 1.5178656578063965, + "kl": 0.34410784766077995, + "learning_rate": 6.911218116996736e-07, + "loss": 0.0138, + "num_tokens": 13887077.0, + "reward": 0.86688232421875, + "reward_std": 0.014393717050552368, + "rewards//mean": 0.86688232421875, + "rewards//std": 0.019701696932315826, + "step": 1908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3818, + "grad_norm": 1.3212772607803345, + "kl": 0.2852855585515499, + "learning_rate": 6.908285383702616e-07, + "loss": 0.0114, + "num_tokens": 13894357.0, + "reward": 0.85638427734375, + "reward_std": 0.016349609941244125, + "rewards//mean": 0.85638427734375, + "rewards//std": 0.025207456201314926, + "step": 1909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.382, + "grad_norm": 1.6455856561660767, + "kl": 0.31258721090853214, + "learning_rate": 6.905351881751371e-07, + "loss": 0.0125, + "num_tokens": 13901589.0, + "reward": 0.8614501953125, + "reward_std": 0.012256243266165257, + "rewards//mean": 0.8614501953125, + "rewards//std": 0.02325041964650154, + "step": 1910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3822, + "grad_norm": 1.6836494207382202, + "kl": 0.33753401786088943, + "learning_rate": 6.902417612324615e-07, + "loss": 0.0135, + "num_tokens": 13908861.0, + "reward": 0.8489990234375, + "reward_std": 0.0184866301715374, + "rewards//mean": 0.8489990234375, + "rewards//std": 0.028264669701457024, + "step": 1911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3824, + "grad_norm": 1.3575350046157837, + "kl": 0.31602487340569496, + "learning_rate": 6.899482576604274e-07, + "loss": 0.0126, + "num_tokens": 13916189.0, + "reward": 0.8642578125, + "reward_std": 0.014029249548912048, + "rewards//mean": 0.8642578125, + "rewards//std": 0.020856572315096855, + "step": 1912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3826, + "grad_norm": 1.7310616970062256, + "kl": 0.2857834082096815, + "learning_rate": 6.896546775772576e-07, + "loss": 0.0114, + "num_tokens": 13923429.0, + "reward": 0.775634765625, + "reward_std": 0.01335756853222847, + "rewards//mean": 0.775634765625, + "rewards//std": 0.022226866334676743, + "step": 1913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3828, + "grad_norm": 1.5041061639785767, + "kl": 0.31629333086311817, + "learning_rate": 6.893610211012066e-07, + "loss": 0.0127, + "num_tokens": 13930725.0, + "reward": 0.77777099609375, + "reward_std": 0.012581879273056984, + "rewards//mean": 0.77777099609375, + "rewards//std": 0.017303748056292534, + "step": 1914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.383, + "grad_norm": 1.6723722219467163, + "kl": 0.33772503212094307, + "learning_rate": 6.890672883505588e-07, + "loss": 0.0128, + "num_tokens": 13938060.0, + "reward": 0.8314208984375, + "reward_std": 0.011762122623622417, + "rewards//mean": 0.8314208984375, + "rewards//std": 0.017856234684586525, + "step": 1915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3832, + "grad_norm": 1.3085989952087402, + "kl": 0.2962023764848709, + "learning_rate": 6.887734794436299e-07, + "loss": 0.0118, + "num_tokens": 13945332.0, + "reward": 0.7933349609375, + "reward_std": 0.008977817371487617, + "rewards//mean": 0.7933349609375, + "rewards//std": 0.010685672052204609, + "step": 1916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3834, + "grad_norm": 1.3241102695465088, + "kl": 0.34581612050533295, + "learning_rate": 6.884795944987661e-07, + "loss": 0.0138, + "num_tokens": 13952660.0, + "reward": 0.7943115234375, + "reward_std": 0.015624706633388996, + "rewards//mean": 0.7943115234375, + "rewards//std": 0.031865671277046204, + "step": 1917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3836, + "grad_norm": 1.5666850805282593, + "kl": 0.33963238820433617, + "learning_rate": 6.881856336343441e-07, + "loss": 0.0136, + "num_tokens": 13959876.0, + "reward": 0.8367919921875, + "reward_std": 0.014448682777583599, + "rewards//mean": 0.8367919921875, + "rewards//std": 0.017224503681063652, + "step": 1918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.3838, + "grad_norm": 1.3896430730819702, + "kl": 0.27323178946971893, + "learning_rate": 6.878915969687714e-07, + "loss": 0.0076, + "num_tokens": 13967209.0, + "reward": 0.851318359375, + "reward_std": 0.013405363075435162, + "rewards//mean": 0.851318359375, + "rewards//std": 0.033681128174066544, + "step": 1919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.384, + "grad_norm": 1.869046688079834, + "kl": 0.2609628662467003, + "learning_rate": 6.875974846204858e-07, + "loss": 0.0104, + "num_tokens": 13974617.0, + "reward": 0.86175537109375, + "reward_std": 0.021392308175563812, + "rewards//mean": 0.86175537109375, + "rewards//std": 0.02764429897069931, + "step": 1920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3842, + "grad_norm": 1.3021212816238403, + "kl": 0.31681668013334274, + "learning_rate": 6.87303296707956e-07, + "loss": 0.0127, + "num_tokens": 13981841.0, + "reward": 0.8604736328125, + "reward_std": 0.016321051865816116, + "rewards//mean": 0.8604736328125, + "rewards//std": 0.02310935966670513, + "step": 1921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.3844, + "grad_norm": 1.3195878267288208, + "kl": 0.2976117916405201, + "learning_rate": 6.870090333496806e-07, + "loss": 0.0128, + "num_tokens": 13989199.0, + "reward": 0.84271240234375, + "reward_std": 0.011486915871500969, + "rewards//mean": 0.84271240234375, + "rewards//std": 0.014915082603693008, + "step": 1922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3846, + "grad_norm": 1.4558216333389282, + "kl": 0.3853066209703684, + "learning_rate": 6.867146946641891e-07, + "loss": 0.0154, + "num_tokens": 13996551.0, + "reward": 0.8033447265625, + "reward_std": 0.013967134989798069, + "rewards//mean": 0.8033447265625, + "rewards//std": 0.017856234684586525, + "step": 1923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3848, + "grad_norm": 1.3866549730300903, + "kl": 0.3453008346259594, + "learning_rate": 6.864202807700407e-07, + "loss": 0.0138, + "num_tokens": 14003823.0, + "reward": 0.85833740234375, + "reward_std": 0.017112383618950844, + "rewards//mean": 0.85833740234375, + "rewards//std": 0.0240419153124094, + "step": 1924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.385, + "grad_norm": 1.4757179021835327, + "kl": 0.3335813954472542, + "learning_rate": 6.861257917858257e-07, + "loss": 0.0133, + "num_tokens": 14011007.0, + "reward": 0.8154296875, + "reward_std": 0.014672953635454178, + "rewards//mean": 0.8154296875, + "rewards//std": 0.02214088849723339, + "step": 1925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3852, + "grad_norm": 1.3801058530807495, + "kl": 0.34026747941970825, + "learning_rate": 6.858312278301637e-07, + "loss": 0.0136, + "num_tokens": 14018279.0, + "reward": 0.8541259765625, + "reward_std": 0.01618356816470623, + "rewards//mean": 0.8541259765625, + "rewards//std": 0.019564935937523842, + "step": 1926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3854, + "grad_norm": 1.5249838829040527, + "kl": 0.38812698796391487, + "learning_rate": 6.855365890217056e-07, + "loss": 0.0155, + "num_tokens": 14025551.0, + "reward": 0.8240966796875, + "reward_std": 0.010647229850292206, + "rewards//mean": 0.8240966796875, + "rewards//std": 0.012371627613902092, + "step": 1927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3856, + "grad_norm": 1.6717764139175415, + "kl": 0.3440629877150059, + "learning_rate": 6.852418754791316e-07, + "loss": 0.0138, + "num_tokens": 14032895.0, + "reward": 0.7689208984375, + "reward_std": 0.015564357861876488, + "rewards//mean": 0.7689208984375, + "rewards//std": 0.02416226826608181, + "step": 1928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.3858, + "grad_norm": 1.633506417274475, + "kl": 0.3279437981545925, + "learning_rate": 6.849470873211522e-07, + "loss": -0.0053, + "num_tokens": 14040199.0, + "reward": 0.861328125, + "reward_std": 0.020222803577780724, + "rewards//mean": 0.861328125, + "rewards//std": 0.02409985102713108, + "step": 1929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.386, + "grad_norm": 1.572308897972107, + "kl": 0.3302511163055897, + "learning_rate": 6.846522246665083e-07, + "loss": 0.0132, + "num_tokens": 14047447.0, + "reward": 0.8424072265625, + "reward_std": 0.017064325511455536, + "rewards//mean": 0.8424072265625, + "rewards//std": 0.02603440172970295, + "step": 1930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3862, + "grad_norm": 1.3810186386108398, + "kl": 0.2933749109506607, + "learning_rate": 6.843572876339704e-07, + "loss": 0.0117, + "num_tokens": 14054703.0, + "reward": 0.8426513671875, + "reward_std": 0.020107710734009743, + "rewards//mean": 0.8426513671875, + "rewards//std": 0.02423483319580555, + "step": 1931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3864, + "grad_norm": 1.7339723110198975, + "kl": 0.34968753159046173, + "learning_rate": 6.840622763423391e-07, + "loss": 0.014, + "num_tokens": 14061951.0, + "reward": 0.82666015625, + "reward_std": 0.01357671245932579, + "rewards//mean": 0.82666015625, + "rewards//std": 0.018182605504989624, + "step": 1932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3866, + "grad_norm": 1.4843508005142212, + "kl": 0.35183484479784966, + "learning_rate": 6.837671909104447e-07, + "loss": 0.0141, + "num_tokens": 14069271.0, + "reward": 0.86932373046875, + "reward_std": 0.01539338193833828, + "rewards//mean": 0.86932373046875, + "rewards//std": 0.03169580549001694, + "step": 1933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3868, + "grad_norm": 1.8349225521087646, + "kl": 0.3560308925807476, + "learning_rate": 6.834720314571479e-07, + "loss": 0.0142, + "num_tokens": 14076631.0, + "reward": 0.86334228515625, + "reward_std": 0.011480688117444515, + "rewards//mean": 0.86334228515625, + "rewards//std": 0.017448363825678825, + "step": 1934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.387, + "grad_norm": 1.640338659286499, + "kl": 0.31692124530673027, + "learning_rate": 6.831767981013388e-07, + "loss": 0.0127, + "num_tokens": 14083983.0, + "reward": 0.800048828125, + "reward_std": 0.015241893008351326, + "rewards//mean": 0.800048828125, + "rewards//std": 0.033939018845558167, + "step": 1935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3872, + "grad_norm": 1.6410245895385742, + "kl": 0.2766203097999096, + "learning_rate": 6.828814909619372e-07, + "loss": 0.0111, + "num_tokens": 14091295.0, + "reward": 0.874755859375, + "reward_std": 0.01385432481765747, + "rewards//mean": 0.874755859375, + "rewards//std": 0.020994020625948906, + "step": 1936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3874, + "grad_norm": 1.2616686820983887, + "kl": 0.34551832638680935, + "learning_rate": 6.82586110157893e-07, + "loss": 0.0138, + "num_tokens": 14098607.0, + "reward": 0.8758544921875, + "reward_std": 0.012959754094481468, + "rewards//mean": 0.8758544921875, + "rewards//std": 0.022792767733335495, + "step": 1937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3876, + "grad_norm": 1.3821359872817993, + "kl": 0.3916392717510462, + "learning_rate": 6.822906558081856e-07, + "loss": 0.0157, + "num_tokens": 14105943.0, + "reward": 0.8233642578125, + "reward_std": 0.016036296263337135, + "rewards//mean": 0.8233642578125, + "rewards//std": 0.020977791398763657, + "step": 1938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.3878, + "grad_norm": 1.7445034980773926, + "kl": 0.36216312646865845, + "learning_rate": 6.819951280318236e-07, + "loss": 0.014, + "num_tokens": 14113207.0, + "reward": 0.825927734375, + "reward_std": 0.01592608168721199, + "rewards//mean": 0.825927734375, + "rewards//std": 0.02358044870197773, + "step": 1939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.388, + "grad_norm": 1.9586985111236572, + "kl": 0.30492144636809826, + "learning_rate": 6.816995269478459e-07, + "loss": 0.0127, + "num_tokens": 14120531.0, + "reward": 0.86669921875, + "reward_std": 0.020942706614732742, + "rewards//mean": 0.86669921875, + "rewards//std": 0.031408485025167465, + "step": 1940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.3882, + "grad_norm": 1.984647512435913, + "kl": 0.3135528638958931, + "learning_rate": 6.814038526753204e-07, + "loss": 0.0136, + "num_tokens": 14127855.0, + "reward": 0.8702392578125, + "reward_std": 0.014657477848231792, + "rewards//mean": 0.8702392578125, + "rewards//std": 0.020344773307442665, + "step": 1941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3884, + "grad_norm": 1.6234065294265747, + "kl": 0.34107372537255287, + "learning_rate": 6.811081053333449e-07, + "loss": 0.0136, + "num_tokens": 14135127.0, + "reward": 0.795166015625, + "reward_std": 0.010710081085562706, + "rewards//mean": 0.795166015625, + "rewards//std": 0.017973264679312706, + "step": 1942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3886, + "grad_norm": 1.3711265325546265, + "kl": 0.35838334262371063, + "learning_rate": 6.80812285041046e-07, + "loss": 0.0143, + "num_tokens": 14142391.0, + "reward": 0.78656005859375, + "reward_std": 0.011383282020688057, + "rewards//mean": 0.78656005859375, + "rewards//std": 0.017474371939897537, + "step": 1943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3888, + "grad_norm": 1.4299075603485107, + "kl": 0.32366296276450157, + "learning_rate": 6.805163919175806e-07, + "loss": 0.0129, + "num_tokens": 14149655.0, + "reward": 0.78253173828125, + "reward_std": 0.01370613370090723, + "rewards//mean": 0.78253173828125, + "rewards//std": 0.021932661533355713, + "step": 1944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.389, + "grad_norm": 1.7289263010025024, + "kl": 0.3525472469627857, + "learning_rate": 6.80220426082134e-07, + "loss": 0.0141, + "num_tokens": 14157007.0, + "reward": 0.82366943359375, + "reward_std": 0.017799315974116325, + "rewards//mean": 0.82366943359375, + "rewards//std": 0.023713519796729088, + "step": 1945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3892, + "grad_norm": 1.4701634645462036, + "kl": 0.38530447520315647, + "learning_rate": 6.799243876539213e-07, + "loss": 0.0154, + "num_tokens": 14164279.0, + "reward": 0.85040283203125, + "reward_std": 0.013337982818484306, + "rewards//mean": 0.85040283203125, + "rewards//std": 0.02423694171011448, + "step": 1946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3894, + "grad_norm": 1.2676440477371216, + "kl": 0.31108430586755276, + "learning_rate": 6.796282767521869e-07, + "loss": 0.0124, + "num_tokens": 14171551.0, + "reward": 0.8375244140625, + "reward_std": 0.011155656538903713, + "rewards//mean": 0.8375244140625, + "rewards//std": 0.018916653469204903, + "step": 1947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3896, + "grad_norm": 1.246211051940918, + "kl": 0.2954984474927187, + "learning_rate": 6.793320934962038e-07, + "loss": 0.0118, + "num_tokens": 14178839.0, + "reward": 0.87841796875, + "reward_std": 0.013276543468236923, + "rewards//mean": 0.87841796875, + "rewards//std": 0.018354956060647964, + "step": 1948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3898, + "grad_norm": 1.310772180557251, + "kl": 0.3302397634834051, + "learning_rate": 6.790358380052751e-07, + "loss": 0.0132, + "num_tokens": 14186119.0, + "reward": 0.85382080078125, + "reward_std": 0.018229465931653976, + "rewards//mean": 0.85382080078125, + "rewards//std": 0.02078450284898281, + "step": 1949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.39, + "grad_norm": 1.7882150411605835, + "kl": 0.36315466091036797, + "learning_rate": 6.787395103987322e-07, + "loss": 0.0145, + "num_tokens": 14193399.0, + "reward": 0.8446044921875, + "reward_std": 0.017835047096014023, + "rewards//mean": 0.8446044921875, + "rewards//std": 0.022803394123911858, + "step": 1950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3902, + "grad_norm": 1.6594762802124023, + "kl": 0.3503291457891464, + "learning_rate": 6.784431107959358e-07, + "loss": 0.014, + "num_tokens": 14200719.0, + "reward": 0.86004638671875, + "reward_std": 0.013658428564667702, + "rewards//mean": 0.86004638671875, + "rewards//std": 0.01653704047203064, + "step": 1951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3904, + "grad_norm": 1.9548310041427612, + "kl": 0.3074972666800022, + "learning_rate": 6.781466393162761e-07, + "loss": 0.0123, + "num_tokens": 14207919.0, + "reward": 0.8304443359375, + "reward_std": 0.01831691712141037, + "rewards//mean": 0.8304443359375, + "rewards//std": 0.03568427264690399, + "step": 1952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3906, + "grad_norm": 1.40070641040802, + "kl": 0.3896813727915287, + "learning_rate": 6.778500960791708e-07, + "loss": 0.0156, + "num_tokens": 14215215.0, + "reward": 0.8802490234375, + "reward_std": 0.024917688220739365, + "rewards//mean": 0.8802490234375, + "rewards//std": 0.03304477408528328, + "step": 1953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3908, + "grad_norm": 1.3172026872634888, + "kl": 0.31078490801155567, + "learning_rate": 6.775534812040686e-07, + "loss": 0.0124, + "num_tokens": 14222543.0, + "reward": 0.7728271484375, + "reward_std": 0.011450774036347866, + "rewards//mean": 0.7728271484375, + "rewards//std": 0.018085353076457977, + "step": 1954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.391, + "grad_norm": 1.6145573854446411, + "kl": 0.3534512519836426, + "learning_rate": 6.772567948104452e-07, + "loss": 0.0141, + "num_tokens": 14229767.0, + "reward": 0.865966796875, + "reward_std": 0.016951996833086014, + "rewards//mean": 0.865966796875, + "rewards//std": 0.030764734372496605, + "step": 1955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3912, + "grad_norm": 1.4537168741226196, + "kl": 0.2875959351658821, + "learning_rate": 6.769600370178059e-07, + "loss": 0.0115, + "num_tokens": 14237087.0, + "reward": 0.8607177734375, + "reward_std": 0.017891012132167816, + "rewards//mean": 0.8607177734375, + "rewards//std": 0.02674115262925625, + "step": 1956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3914, + "grad_norm": 1.2240192890167236, + "kl": 0.3122484125196934, + "learning_rate": 6.766632079456851e-07, + "loss": 0.0125, + "num_tokens": 14244399.0, + "reward": 0.83868408203125, + "reward_std": 0.013588715344667435, + "rewards//mean": 0.83868408203125, + "rewards//std": 0.020991768687963486, + "step": 1957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3916, + "grad_norm": 1.6275248527526855, + "kl": 0.31780864857137203, + "learning_rate": 6.76366307713645e-07, + "loss": 0.0127, + "num_tokens": 14251631.0, + "reward": 0.83984375, + "reward_std": 0.014851566404104233, + "rewards//mean": 0.83984375, + "rewards//std": 0.030433276668190956, + "step": 1958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3918, + "grad_norm": 1.48079252243042, + "kl": 0.28648137487471104, + "learning_rate": 6.760693364412775e-07, + "loss": 0.0115, + "num_tokens": 14258935.0, + "reward": 0.87567138671875, + "reward_std": 0.018128395080566406, + "rewards//mean": 0.87567138671875, + "rewards//std": 0.025568192824721336, + "step": 1959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.392, + "grad_norm": 1.6035444736480713, + "kl": 0.34209033101797104, + "learning_rate": 6.757722942482022e-07, + "loss": 0.0137, + "num_tokens": 14266287.0, + "reward": 0.81683349609375, + "reward_std": 0.013029053807258606, + "rewards//mean": 0.81683349609375, + "rewards//std": 0.022185880690813065, + "step": 1960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3922, + "grad_norm": 1.4542628526687622, + "kl": 0.3307732939720154, + "learning_rate": 6.754751812540679e-07, + "loss": 0.0132, + "num_tokens": 14273599.0, + "reward": 0.83709716796875, + "reward_std": 0.017143981531262398, + "rewards//mean": 0.83709716796875, + "rewards//std": 0.020311543717980385, + "step": 1961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3924, + "grad_norm": 1.481806755065918, + "kl": 0.30817965418100357, + "learning_rate": 6.751779975785514e-07, + "loss": 0.0123, + "num_tokens": 14280887.0, + "reward": 0.87005615234375, + "reward_std": 0.01798739656805992, + "rewards//mean": 0.87005615234375, + "rewards//std": 0.02759716659784317, + "step": 1962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3926, + "grad_norm": 1.5379908084869385, + "kl": 0.34696025401353836, + "learning_rate": 6.748807433413586e-07, + "loss": 0.0139, + "num_tokens": 14288151.0, + "reward": 0.866943359375, + "reward_std": 0.016516465693712234, + "rewards//mean": 0.866943359375, + "rewards//std": 0.020796971395611763, + "step": 1963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3928, + "grad_norm": 1.4380850791931152, + "kl": 0.32016896829009056, + "learning_rate": 6.745834186622231e-07, + "loss": 0.0128, + "num_tokens": 14295607.0, + "reward": 0.8314208984375, + "reward_std": 0.021506858989596367, + "rewards//mean": 0.8314208984375, + "rewards//std": 0.034254010766744614, + "step": 1964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.393, + "grad_norm": 1.5105657577514648, + "kl": 0.3330346867442131, + "learning_rate": 6.742860236609076e-07, + "loss": 0.0133, + "num_tokens": 14302919.0, + "reward": 0.8646240234375, + "reward_std": 0.013938810676336288, + "rewards//mean": 0.8646240234375, + "rewards//std": 0.023981157690286636, + "step": 1965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3932, + "grad_norm": 1.3987486362457275, + "kl": 0.2646183017641306, + "learning_rate": 6.739885584572025e-07, + "loss": 0.0106, + "num_tokens": 14310231.0, + "reward": 0.8673095703125, + "reward_std": 0.016712384298443794, + "rewards//mean": 0.8673095703125, + "rewards//std": 0.02316693216562271, + "step": 1966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3934, + "grad_norm": 1.3611518144607544, + "kl": 0.3304434772580862, + "learning_rate": 6.73691023170927e-07, + "loss": 0.0132, + "num_tokens": 14317495.0, + "reward": 0.8709716796875, + "reward_std": 0.014469113200902939, + "rewards//mean": 0.8709716796875, + "rewards//std": 0.016455024480819702, + "step": 1967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3936, + "grad_norm": 1.339815378189087, + "kl": 0.3203568961471319, + "learning_rate": 6.733934179219281e-07, + "loss": 0.0128, + "num_tokens": 14324775.0, + "reward": 0.8214111328125, + "reward_std": 0.01575065776705742, + "rewards//mean": 0.8214111328125, + "rewards//std": 0.017682448029518127, + "step": 1968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3938, + "grad_norm": 2.579857349395752, + "kl": 0.3509376756846905, + "learning_rate": 6.730957428300811e-07, + "loss": 0.014, + "num_tokens": 14331943.0, + "reward": 0.84552001953125, + "reward_std": 0.022740349173545837, + "rewards//mean": 0.84552001953125, + "rewards//std": 0.035186756402254105, + "step": 1969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.394, + "grad_norm": 1.3629324436187744, + "kl": 0.40793038345873356, + "learning_rate": 6.727979980152898e-07, + "loss": 0.0163, + "num_tokens": 14339183.0, + "reward": 0.84033203125, + "reward_std": 0.014494160190224648, + "rewards//mean": 0.84033203125, + "rewards//std": 0.022428901866078377, + "step": 1970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3942, + "grad_norm": 1.3900387287139893, + "kl": 0.33269215375185013, + "learning_rate": 6.725001835974852e-07, + "loss": 0.0133, + "num_tokens": 14346431.0, + "reward": 0.83514404296875, + "reward_std": 0.014177599921822548, + "rewards//mean": 0.83514404296875, + "rewards//std": 0.01927761547267437, + "step": 1971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3944, + "grad_norm": 1.488873839378357, + "kl": 0.44072479754686356, + "learning_rate": 6.722022996966277e-07, + "loss": 0.0176, + "num_tokens": 14353679.0, + "reward": 0.83270263671875, + "reward_std": 0.014401951804757118, + "rewards//mean": 0.83270263671875, + "rewards//std": 0.025794537737965584, + "step": 1972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3946, + "grad_norm": 1.5389386415481567, + "kl": 0.30883027240633965, + "learning_rate": 6.719043464327042e-07, + "loss": 0.0124, + "num_tokens": 14360895.0, + "reward": 0.79327392578125, + "reward_std": 0.015035952441394329, + "rewards//mean": 0.79327392578125, + "rewards//std": 0.02335784211754799, + "step": 1973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.625, + "epoch": 0.3948, + "grad_norm": 1.5373263359069824, + "kl": 0.33902071230113506, + "learning_rate": 6.716063239257306e-07, + "loss": 0.001, + "num_tokens": 14368167.0, + "reward": 0.8670654296875, + "reward_std": 0.024029210209846497, + "rewards//mean": 0.8670654296875, + "rewards//std": 0.029165079817175865, + "step": 1974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.395, + "grad_norm": 1.4652858972549438, + "kl": 0.3545099552720785, + "learning_rate": 6.713082322957502e-07, + "loss": 0.0142, + "num_tokens": 14375415.0, + "reward": 0.89154052734375, + "reward_std": 0.012493504211306572, + "rewards//mean": 0.89154052734375, + "rewards//std": 0.017590070143342018, + "step": 1975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.3952, + "grad_norm": 1.2013700008392334, + "kl": 0.33475834876298904, + "learning_rate": 6.710100716628344e-07, + "loss": 0.0128, + "num_tokens": 14382709.0, + "reward": 0.86029052734375, + "reward_std": 0.01557212695479393, + "rewards//mean": 0.86029052734375, + "rewards//std": 0.029763294383883476, + "step": 1976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3954, + "grad_norm": 1.881140112876892, + "kl": 0.36746065132319927, + "learning_rate": 6.70711842147082e-07, + "loss": 0.0147, + "num_tokens": 14389917.0, + "reward": 0.8671875, + "reward_std": 0.015939736738801003, + "rewards//mean": 0.8671875, + "rewards//std": 0.026392724364995956, + "step": 1977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3956, + "grad_norm": 1.2912240028381348, + "kl": 0.33367819897830486, + "learning_rate": 6.704135438686203e-07, + "loss": 0.0133, + "num_tokens": 14397197.0, + "reward": 0.816650390625, + "reward_std": 0.010166341438889503, + "rewards//mean": 0.816650390625, + "rewards//std": 0.015404492616653442, + "step": 1978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3958, + "grad_norm": 1.4663528203964233, + "kl": 0.3351971246302128, + "learning_rate": 6.701151769476032e-07, + "loss": 0.0134, + "num_tokens": 14404437.0, + "reward": 0.82183837890625, + "reward_std": 0.013813100755214691, + "rewards//mean": 0.82183837890625, + "rewards//std": 0.022645099088549614, + "step": 1979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.396, + "grad_norm": 1.2447073459625244, + "kl": 0.31411379389464855, + "learning_rate": 6.698167415042134e-07, + "loss": 0.014, + "num_tokens": 14411794.0, + "reward": 0.8458251953125, + "reward_std": 0.010858561843633652, + "rewards//mean": 0.8458251953125, + "rewards//std": 0.03261503577232361, + "step": 1980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3962, + "grad_norm": 1.6235089302062988, + "kl": 0.31874109990894794, + "learning_rate": 6.695182376586602e-07, + "loss": 0.0127, + "num_tokens": 14419218.0, + "reward": 0.86737060546875, + "reward_std": 0.023143917322158813, + "rewards//mean": 0.86737060546875, + "rewards//std": 0.037060752511024475, + "step": 1981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3964, + "grad_norm": 1.449535846710205, + "kl": 0.36872923374176025, + "learning_rate": 6.692196655311814e-07, + "loss": 0.0147, + "num_tokens": 14426426.0, + "reward": 0.8489990234375, + "reward_std": 0.020224155858159065, + "rewards//mean": 0.8489990234375, + "rewards//std": 0.02479550801217556, + "step": 1982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.3966, + "grad_norm": 1.4069538116455078, + "kl": 0.39873639307916164, + "learning_rate": 6.689210252420415e-07, + "loss": 0.015, + "num_tokens": 14433750.0, + "reward": 0.83599853515625, + "reward_std": 0.014522689394652843, + "rewards//mean": 0.83599853515625, + "rewards//std": 0.02278239093720913, + "step": 1983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3968, + "grad_norm": 1.4495997428894043, + "kl": 0.2798336446285248, + "learning_rate": 6.686223169115327e-07, + "loss": 0.0112, + "num_tokens": 14441182.0, + "reward": 0.80615234375, + "reward_std": 0.010711676441133022, + "rewards//mean": 0.80615234375, + "rewards//std": 0.013486824929714203, + "step": 1984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.397, + "grad_norm": 1.5452702045440674, + "kl": 0.3480636514723301, + "learning_rate": 6.683235406599749e-07, + "loss": 0.0139, + "num_tokens": 14448494.0, + "reward": 0.851318359375, + "reward_std": 0.015892580151557922, + "rewards//mean": 0.851318359375, + "rewards//std": 0.018053939566016197, + "step": 1985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3972, + "grad_norm": 1.1943323612213135, + "kl": 0.31233773566782475, + "learning_rate": 6.68024696607715e-07, + "loss": 0.0125, + "num_tokens": 14455862.0, + "reward": 0.85400390625, + "reward_std": 0.011820230633020401, + "rewards//mean": 0.85400390625, + "rewards//std": 0.013858823105692863, + "step": 1986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.3974, + "grad_norm": 1.2808865308761597, + "kl": 0.33272584713995457, + "learning_rate": 6.677257848751276e-07, + "loss": 0.0088, + "num_tokens": 14463128.0, + "reward": 0.851318359375, + "reward_std": 0.01467146910727024, + "rewards//mean": 0.851318359375, + "rewards//std": 0.020444603636860847, + "step": 1987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3976, + "grad_norm": 1.2583991289138794, + "kl": 0.2942848224192858, + "learning_rate": 6.674268055826138e-07, + "loss": 0.0118, + "num_tokens": 14470496.0, + "reward": 0.8798828125, + "reward_std": 0.01500263623893261, + "rewards//mean": 0.8798828125, + "rewards//std": 0.02569524198770523, + "step": 1988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3978, + "grad_norm": 1.3403276205062866, + "kl": 0.32506612315773964, + "learning_rate": 6.671277588506029e-07, + "loss": 0.013, + "num_tokens": 14477824.0, + "reward": 0.80816650390625, + "reward_std": 0.01691604033112526, + "rewards//mean": 0.80816650390625, + "rewards//std": 0.029790746048092842, + "step": 1989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.398, + "grad_norm": 1.660504937171936, + "kl": 0.42166148126125336, + "learning_rate": 6.668286447995507e-07, + "loss": 0.0169, + "num_tokens": 14485096.0, + "reward": 0.8487548828125, + "reward_std": 0.012023660354316235, + "rewards//mean": 0.8487548828125, + "rewards//std": 0.01961747743189335, + "step": 1990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3982, + "grad_norm": 1.380285382270813, + "kl": 0.34242941439151764, + "learning_rate": 6.665294635499403e-07, + "loss": 0.0137, + "num_tokens": 14492392.0, + "reward": 0.869873046875, + "reward_std": 0.020719509571790695, + "rewards//mean": 0.869873046875, + "rewards//std": 0.024128727614879608, + "step": 1991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.3984, + "grad_norm": 1.396243929862976, + "kl": 0.30579925887286663, + "learning_rate": 6.66230215222282e-07, + "loss": 0.0107, + "num_tokens": 14499638.0, + "reward": 0.89453125, + "reward_std": 0.016672927886247635, + "rewards//mean": 0.89453125, + "rewards//std": 0.022649193182587624, + "step": 1992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3986, + "grad_norm": 1.7602136135101318, + "kl": 0.3282650541514158, + "learning_rate": 6.659308999371129e-07, + "loss": 0.0131, + "num_tokens": 14506950.0, + "reward": 0.81982421875, + "reward_std": 0.0155414380133152, + "rewards//mean": 0.81982421875, + "rewards//std": 0.025272304192185402, + "step": 1993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3988, + "grad_norm": 1.5547682046890259, + "kl": 0.3275126516819, + "learning_rate": 6.65631517814997e-07, + "loss": 0.0131, + "num_tokens": 14514230.0, + "reward": 0.86773681640625, + "reward_std": 0.01401968952268362, + "rewards//mean": 0.86773681640625, + "rewards//std": 0.021853147074580193, + "step": 1994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.399, + "grad_norm": 1.3627418279647827, + "kl": 0.29931108839809895, + "learning_rate": 6.653320689765256e-07, + "loss": 0.012, + "num_tokens": 14521502.0, + "reward": 0.8564453125, + "reward_std": 0.017129402607679367, + "rewards//mean": 0.8564453125, + "rewards//std": 0.022936105728149414, + "step": 1995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3992, + "grad_norm": 1.384017825126648, + "kl": 0.33333343639969826, + "learning_rate": 6.650325535423166e-07, + "loss": 0.0133, + "num_tokens": 14528766.0, + "reward": 0.8616943359375, + "reward_std": 0.01928081549704075, + "rewards//mean": 0.8616943359375, + "rewards//std": 0.026554828509688377, + "step": 1996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.3994, + "grad_norm": 1.7968597412109375, + "kl": 0.3758692089468241, + "learning_rate": 6.647329716330147e-07, + "loss": 0.0138, + "num_tokens": 14536023.0, + "reward": 0.79754638671875, + "reward_std": 0.013084445148706436, + "rewards//mean": 0.79754638671875, + "rewards//std": 0.014119474217295647, + "step": 1997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.3996, + "grad_norm": 1.4872997999191284, + "kl": 0.33283051662147045, + "learning_rate": 6.644333233692916e-07, + "loss": 0.0121, + "num_tokens": 14543475.0, + "reward": 0.82342529296875, + "reward_std": 0.009792404249310493, + "rewards//mean": 0.82342529296875, + "rewards//std": 0.01901194266974926, + "step": 1998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.3998, + "grad_norm": 1.4944043159484863, + "kl": 0.26910197734832764, + "learning_rate": 6.641336088718456e-07, + "loss": 0.0108, + "num_tokens": 14550763.0, + "reward": 0.8328857421875, + "reward_std": 0.013353399001061916, + "rewards//mean": 0.8328857421875, + "rewards//std": 0.02552945725619793, + "step": 1999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4, + "grad_norm": 1.4210307598114014, + "kl": 0.24051328748464584, + "learning_rate": 6.638338282614014e-07, + "loss": 0.0096, + "num_tokens": 14558051.0, + "reward": 0.83074951171875, + "reward_std": 0.0126325199380517, + "rewards//mean": 0.83074951171875, + "rewards//std": 0.018069759011268616, + "step": 2000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4002, + "grad_norm": 1.2387146949768066, + "kl": 0.3400968350470066, + "learning_rate": 6.635339816587108e-07, + "loss": 0.0136, + "num_tokens": 14565259.0, + "reward": 0.815185546875, + "reward_std": 0.009495868347585201, + "rewards//mean": 0.815185546875, + "rewards//std": 0.014597195200622082, + "step": 2001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4004, + "grad_norm": 1.2202662229537964, + "kl": 0.34789387322962284, + "learning_rate": 6.632340691845519e-07, + "loss": 0.0139, + "num_tokens": 14572515.0, + "reward": 0.87249755859375, + "reward_std": 0.011908762156963348, + "rewards//mean": 0.87249755859375, + "rewards//std": 0.020721042528748512, + "step": 2002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4006, + "grad_norm": 1.5498034954071045, + "kl": 0.3578600510954857, + "learning_rate": 6.629340909597297e-07, + "loss": 0.0143, + "num_tokens": 14579835.0, + "reward": 0.84765625, + "reward_std": 0.015052912756800652, + "rewards//mean": 0.84765625, + "rewards//std": 0.020611261948943138, + "step": 2003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4008, + "grad_norm": 1.9139083623886108, + "kl": 0.4116261191666126, + "learning_rate": 6.626340471050748e-07, + "loss": 0.0165, + "num_tokens": 14587139.0, + "reward": 0.852783203125, + "reward_std": 0.014511451125144958, + "rewards//mean": 0.852783203125, + "rewards//std": 0.019782302901148796, + "step": 2004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.401, + "grad_norm": 1.5045256614685059, + "kl": 0.35196092166006565, + "learning_rate": 6.623339377414455e-07, + "loss": 0.0141, + "num_tokens": 14594355.0, + "reward": 0.87939453125, + "reward_std": 0.023932289332151413, + "rewards//mean": 0.87939453125, + "rewards//std": 0.03273016959428787, + "step": 2005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.4012, + "grad_norm": 1.8364931344985962, + "kl": 0.35324806347489357, + "learning_rate": 6.620337629897252e-07, + "loss": 0.0129, + "num_tokens": 14601601.0, + "reward": 0.8031005859375, + "reward_std": 0.01143915019929409, + "rewards//mean": 0.8031005859375, + "rewards//std": 0.01756218634545803, + "step": 2006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4014, + "grad_norm": 1.4734095335006714, + "kl": 0.3083182983100414, + "learning_rate": 6.617335229708248e-07, + "loss": 0.0123, + "num_tokens": 14608945.0, + "reward": 0.82257080078125, + "reward_std": 0.011948507279157639, + "rewards//mean": 0.82257080078125, + "rewards//std": 0.024067716673016548, + "step": 2007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4016, + "grad_norm": 1.6148561239242554, + "kl": 0.3314418215304613, + "learning_rate": 6.614332178056805e-07, + "loss": 0.0133, + "num_tokens": 14616169.0, + "reward": 0.85675048828125, + "reward_std": 0.022321229800581932, + "rewards//mean": 0.85675048828125, + "rewards//std": 0.03414662182331085, + "step": 2008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.4018, + "grad_norm": 1.543725609779358, + "kl": 0.3316193763166666, + "learning_rate": 6.611328476152556e-07, + "loss": 0.0141, + "num_tokens": 14623478.0, + "reward": 0.78790283203125, + "reward_std": 0.015699343755841255, + "rewards//mean": 0.78790283203125, + "rewards//std": 0.02552196942269802, + "step": 2009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.402, + "grad_norm": 1.6386140584945679, + "kl": 0.3511341027915478, + "learning_rate": 6.608324125205387e-07, + "loss": 0.014, + "num_tokens": 14630782.0, + "reward": 0.87799072265625, + "reward_std": 0.012336976826190948, + "rewards//mean": 0.87799072265625, + "rewards//std": 0.019146030768752098, + "step": 2010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4022, + "grad_norm": 1.640411615371704, + "kl": 0.34199193492531776, + "learning_rate": 6.605319126425453e-07, + "loss": 0.0137, + "num_tokens": 14638086.0, + "reward": 0.82867431640625, + "reward_std": 0.011615173891186714, + "rewards//mean": 0.82867431640625, + "rewards//std": 0.0240482110530138, + "step": 2011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4024, + "grad_norm": 1.7172142267227173, + "kl": 0.39923265017569065, + "learning_rate": 6.60231348102317e-07, + "loss": 0.016, + "num_tokens": 14645294.0, + "reward": 0.846923828125, + "reward_std": 0.015358650125563145, + "rewards//mean": 0.846923828125, + "rewards//std": 0.01772904209792614, + "step": 2012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4026, + "grad_norm": 1.7122750282287598, + "kl": 0.3554496169090271, + "learning_rate": 6.599307190209204e-07, + "loss": 0.0142, + "num_tokens": 14652550.0, + "reward": 0.809814453125, + "reward_std": 0.013866900466382504, + "rewards//mean": 0.809814453125, + "rewards//std": 0.016066357493400574, + "step": 2013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.4028, + "grad_norm": 1.5821326971054077, + "kl": 0.34379817917943, + "learning_rate": 6.596300255194496e-07, + "loss": 0.0168, + "num_tokens": 14659712.0, + "reward": 0.8291015625, + "reward_std": 0.01779792830348015, + "rewards//mean": 0.8291015625, + "rewards//std": 0.02308347076177597, + "step": 2014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.403, + "grad_norm": 1.721096396446228, + "kl": 0.3802051432430744, + "learning_rate": 6.593292677190235e-07, + "loss": 0.0152, + "num_tokens": 14667000.0, + "reward": 0.7786865234375, + "reward_std": 0.017941243946552277, + "rewards//mean": 0.7786865234375, + "rewards//std": 0.022906716912984848, + "step": 2015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4032, + "grad_norm": 1.432665467262268, + "kl": 0.25868678465485573, + "learning_rate": 6.590284457407875e-07, + "loss": 0.0103, + "num_tokens": 14674320.0, + "reward": 0.82861328125, + "reward_std": 0.015801355242729187, + "rewards//mean": 0.82861328125, + "rewards//std": 0.02112771011888981, + "step": 2016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.4034, + "grad_norm": 1.5745834112167358, + "kl": 0.36893025040626526, + "learning_rate": 6.587275597059124e-07, + "loss": 0.0015, + "num_tokens": 14681569.0, + "reward": 0.81793212890625, + "reward_std": 0.01283422950655222, + "rewards//mean": 0.81793212890625, + "rewards//std": 0.01937083527445793, + "step": 2017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4036, + "grad_norm": 1.3745322227478027, + "kl": 0.2977375378832221, + "learning_rate": 6.584266097355954e-07, + "loss": 0.0119, + "num_tokens": 14688929.0, + "reward": 0.83172607421875, + "reward_std": 0.01340736448764801, + "rewards//mean": 0.83172607421875, + "rewards//std": 0.024531202390789986, + "step": 2018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4038, + "grad_norm": 1.663325548171997, + "kl": 0.29675300512462854, + "learning_rate": 6.581255959510588e-07, + "loss": 0.0119, + "num_tokens": 14696241.0, + "reward": 0.86944580078125, + "reward_std": 0.015929419547319412, + "rewards//mean": 0.86944580078125, + "rewards//std": 0.027275502681732178, + "step": 2019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.404, + "grad_norm": 1.4277936220169067, + "kl": 0.35693976283073425, + "learning_rate": 6.578245184735512e-07, + "loss": 0.0143, + "num_tokens": 14703601.0, + "reward": 0.8663330078125, + "reward_std": 0.020692449063062668, + "rewards//mean": 0.8663330078125, + "rewards//std": 0.027378883212804794, + "step": 2020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4042, + "grad_norm": 1.537709355354309, + "kl": 0.38880813494324684, + "learning_rate": 6.575233774243464e-07, + "loss": 0.0156, + "num_tokens": 14710873.0, + "reward": 0.84466552734375, + "reward_std": 0.018659329041838646, + "rewards//mean": 0.84466552734375, + "rewards//std": 0.024584829807281494, + "step": 2021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4044, + "grad_norm": 2.607950448989868, + "kl": 0.5204866752028465, + "learning_rate": 6.57222172924744e-07, + "loss": 0.0208, + "num_tokens": 14718177.0, + "reward": 0.83123779296875, + "reward_std": 0.013190241530537605, + "rewards//mean": 0.83123779296875, + "rewards//std": 0.016502218320965767, + "step": 2022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4046, + "grad_norm": 1.5746989250183105, + "kl": 0.32240620627999306, + "learning_rate": 6.569209050960691e-07, + "loss": 0.0129, + "num_tokens": 14725465.0, + "reward": 0.8663330078125, + "reward_std": 0.01839650608599186, + "rewards//mean": 0.8663330078125, + "rewards//std": 0.03472626954317093, + "step": 2023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.4048, + "grad_norm": 1.6427708864212036, + "kl": 0.31727616116404533, + "learning_rate": 6.566195740596725e-07, + "loss": 0.0105, + "num_tokens": 14732727.0, + "reward": 0.82171630859375, + "reward_std": 0.014893535524606705, + "rewards//mean": 0.82171630859375, + "rewards//std": 0.021546201780438423, + "step": 2024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.405, + "grad_norm": 1.4570603370666504, + "kl": 0.356360187754035, + "learning_rate": 6.563181799369301e-07, + "loss": 0.0143, + "num_tokens": 14740023.0, + "reward": 0.88507080078125, + "reward_std": 0.014639312401413918, + "rewards//mean": 0.88507080078125, + "rewards//std": 0.024624818935990334, + "step": 2025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4052, + "grad_norm": 1.4406250715255737, + "kl": 0.38378549739718437, + "learning_rate": 6.560167228492434e-07, + "loss": 0.0154, + "num_tokens": 14747319.0, + "reward": 0.850830078125, + "reward_std": 0.01353495940566063, + "rewards//mean": 0.850830078125, + "rewards//std": 0.02727612666785717, + "step": 2026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4054, + "grad_norm": 1.3237229585647583, + "kl": 0.3184395506978035, + "learning_rate": 6.557152029180397e-07, + "loss": 0.0127, + "num_tokens": 14754687.0, + "reward": 0.8411865234375, + "reward_std": 0.010845817625522614, + "rewards//mean": 0.8411865234375, + "rewards//std": 0.015736499801278114, + "step": 2027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.4056, + "grad_norm": 1.3461883068084717, + "kl": 0.35645036213099957, + "learning_rate": 6.554136202647706e-07, + "loss": 0.0156, + "num_tokens": 14762057.0, + "reward": 0.853271484375, + "reward_std": 0.016895290464162827, + "rewards//mean": 0.853271484375, + "rewards//std": 0.023113617673516273, + "step": 2028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4058, + "grad_norm": 1.3433946371078491, + "kl": 0.33178192004561424, + "learning_rate": 6.551119750109141e-07, + "loss": 0.0133, + "num_tokens": 14769313.0, + "reward": 0.848388671875, + "reward_std": 0.014085076749324799, + "rewards//mean": 0.848388671875, + "rewards//std": 0.023363754153251648, + "step": 2029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.406, + "grad_norm": 1.4891730546951294, + "kl": 0.2744369301944971, + "learning_rate": 6.548102672779724e-07, + "loss": 0.011, + "num_tokens": 14776569.0, + "reward": 0.8692626953125, + "reward_std": 0.01563076302409172, + "rewards//mean": 0.8692626953125, + "rewards//std": 0.01992982253432274, + "step": 2030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.546875, + "epoch": 0.4062, + "grad_norm": 1.4808708429336548, + "kl": 0.3103486206382513, + "learning_rate": 6.545084971874736e-07, + "loss": -0.0267, + "num_tokens": 14783876.0, + "reward": 0.863525390625, + "reward_std": 0.013890442438423634, + "rewards//mean": 0.863525390625, + "rewards//std": 0.026235099881887436, + "step": 2031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4064, + "grad_norm": 1.374067783355713, + "kl": 0.28801845759153366, + "learning_rate": 6.542066648609707e-07, + "loss": 0.0115, + "num_tokens": 14791244.0, + "reward": 0.8814697265625, + "reward_std": 0.011529987677931786, + "rewards//mean": 0.8814697265625, + "rewards//std": 0.02416226826608181, + "step": 2032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4066, + "grad_norm": 1.4853092432022095, + "kl": 0.32708191126585007, + "learning_rate": 6.539047704200417e-07, + "loss": 0.0131, + "num_tokens": 14798460.0, + "reward": 0.79815673828125, + "reward_std": 0.011902762576937675, + "rewards//mean": 0.79815673828125, + "rewards//std": 0.017274854704737663, + "step": 2033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4068, + "grad_norm": 1.5366487503051758, + "kl": 0.2874423936009407, + "learning_rate": 6.536028139862893e-07, + "loss": 0.0115, + "num_tokens": 14805748.0, + "reward": 0.83880615234375, + "reward_std": 0.015167555771768093, + "rewards//mean": 0.83880615234375, + "rewards//std": 0.02229614183306694, + "step": 2034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.407, + "grad_norm": 1.2154380083084106, + "kl": 0.3227575942873955, + "learning_rate": 6.53300795681342e-07, + "loss": 0.0129, + "num_tokens": 14813036.0, + "reward": 0.851318359375, + "reward_std": 0.010215602815151215, + "rewards//mean": 0.851318359375, + "rewards//std": 0.024308739230036736, + "step": 2035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4072, + "grad_norm": 1.5336205959320068, + "kl": 0.27492182329297066, + "learning_rate": 6.529987156268526e-07, + "loss": 0.011, + "num_tokens": 14820332.0, + "reward": 0.82611083984375, + "reward_std": 0.01279536634683609, + "rewards//mean": 0.82611083984375, + "rewards//std": 0.02198711968958378, + "step": 2036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.4074, + "grad_norm": 1.270586609840393, + "kl": 0.32984720543026924, + "learning_rate": 6.526965739444988e-07, + "loss": 0.0118, + "num_tokens": 14827688.0, + "reward": 0.85284423828125, + "reward_std": 0.014610693790018559, + "rewards//mean": 0.85284423828125, + "rewards//std": 0.02077576145529747, + "step": 2037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4076, + "grad_norm": 1.6034013032913208, + "kl": 0.27730931527912617, + "learning_rate": 6.523943707559832e-07, + "loss": 0.0111, + "num_tokens": 14835024.0, + "reward": 0.8248291015625, + "reward_std": 0.014901324175298214, + "rewards//mean": 0.8248291015625, + "rewards//std": 0.02482479438185692, + "step": 2038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.4078, + "grad_norm": 1.5551282167434692, + "kl": 0.30776157416403294, + "learning_rate": 6.520921061830333e-07, + "loss": 0.0103, + "num_tokens": 14842381.0, + "reward": 0.85076904296875, + "reward_std": 0.015459940768778324, + "rewards//mean": 0.85076904296875, + "rewards//std": 0.02312728390097618, + "step": 2039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.408, + "grad_norm": 1.5390475988388062, + "kl": 0.3577136695384979, + "learning_rate": 6.517897803474011e-07, + "loss": 0.0067, + "num_tokens": 14849629.0, + "reward": 0.8408203125, + "reward_std": 0.015549328178167343, + "rewards//mean": 0.8408203125, + "rewards//std": 0.023281974717974663, + "step": 2040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.4082, + "grad_norm": 1.2985643148422241, + "kl": 0.37853493355214596, + "learning_rate": 6.514873933708637e-07, + "loss": 0.0174, + "num_tokens": 14856879.0, + "reward": 0.805419921875, + "reward_std": 0.008787177503108978, + "rewards//mean": 0.805419921875, + "rewards//std": 0.01629091612994671, + "step": 2041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4084, + "grad_norm": 1.5172704458236694, + "kl": 0.3216056879609823, + "learning_rate": 6.511849453752223e-07, + "loss": 0.0129, + "num_tokens": 14864175.0, + "reward": 0.8128662109375, + "reward_std": 0.010178844444453716, + "rewards//mean": 0.8128662109375, + "rewards//std": 0.0124836890026927, + "step": 2042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4086, + "grad_norm": 1.43656587600708, + "kl": 0.2884284630417824, + "learning_rate": 6.50882436482303e-07, + "loss": 0.0115, + "num_tokens": 14871423.0, + "reward": 0.84454345703125, + "reward_std": 0.012979257851839066, + "rewards//mean": 0.84454345703125, + "rewards//std": 0.020293649286031723, + "step": 2043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4088, + "grad_norm": 1.4953258037567139, + "kl": 0.3803155794739723, + "learning_rate": 6.505798668139563e-07, + "loss": 0.0152, + "num_tokens": 14878743.0, + "reward": 0.757080078125, + "reward_std": 0.012972688302397728, + "rewards//mean": 0.757080078125, + "rewards//std": 0.017495257779955864, + "step": 2044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.409, + "grad_norm": 1.4980772733688354, + "kl": 0.37741004303097725, + "learning_rate": 6.502772364920573e-07, + "loss": 0.0151, + "num_tokens": 14886103.0, + "reward": 0.8135986328125, + "reward_std": 0.018014030531048775, + "rewards//mean": 0.8135986328125, + "rewards//std": 0.022428564727306366, + "step": 2045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4092, + "grad_norm": 1.2890230417251587, + "kl": 0.32883927412331104, + "learning_rate": 6.499745456385053e-07, + "loss": 0.0132, + "num_tokens": 14893359.0, + "reward": 0.7691650390625, + "reward_std": 0.015113857574760914, + "rewards//mean": 0.7691650390625, + "rewards//std": 0.020060038194060326, + "step": 2046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4094, + "grad_norm": 1.8553431034088135, + "kl": 0.3306463733315468, + "learning_rate": 6.496717943752243e-07, + "loss": 0.0132, + "num_tokens": 14900639.0, + "reward": 0.82086181640625, + "reward_std": 0.02199612557888031, + "rewards//mean": 0.82086181640625, + "rewards//std": 0.02336367405951023, + "step": 2047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4096, + "grad_norm": 1.4777463674545288, + "kl": 0.35052115097641945, + "learning_rate": 6.493689828241624e-07, + "loss": 0.014, + "num_tokens": 14907959.0, + "reward": 0.82379150390625, + "reward_std": 0.011656535789370537, + "rewards//mean": 0.82379150390625, + "rewards//std": 0.014518036507070065, + "step": 2048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4098, + "grad_norm": 1.9791163206100464, + "kl": 0.3711877688765526, + "learning_rate": 6.490661111072922e-07, + "loss": 0.0148, + "num_tokens": 14915199.0, + "reward": 0.78326416015625, + "reward_std": 0.013956421986222267, + "rewards//mean": 0.78326416015625, + "rewards//std": 0.019375523552298546, + "step": 2049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.41, + "grad_norm": 1.3998312950134277, + "kl": 0.30053481459617615, + "learning_rate": 6.487631793466103e-07, + "loss": 0.012, + "num_tokens": 14922471.0, + "reward": 0.82171630859375, + "reward_std": 0.01102500967681408, + "rewards//mean": 0.82171630859375, + "rewards//std": 0.016148941591382027, + "step": 2050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4102, + "grad_norm": 1.5966218709945679, + "kl": 0.3459085803478956, + "learning_rate": 6.484601876641375e-07, + "loss": 0.0138, + "num_tokens": 14929751.0, + "reward": 0.823486328125, + "reward_std": 0.020916499197483063, + "rewards//mean": 0.823486328125, + "rewards//std": 0.026936586946249008, + "step": 2051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4104, + "grad_norm": 1.6300405263900757, + "kl": 0.3167321737855673, + "learning_rate": 6.481571361819188e-07, + "loss": 0.0127, + "num_tokens": 14936879.0, + "reward": 0.86767578125, + "reward_std": 0.024501696228981018, + "rewards//mean": 0.86767578125, + "rewards//std": 0.03303217515349388, + "step": 2052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4106, + "grad_norm": 1.2420302629470825, + "kl": 0.33038718067109585, + "learning_rate": 6.478540250220233e-07, + "loss": 0.0132, + "num_tokens": 14944135.0, + "reward": 0.85675048828125, + "reward_std": 0.00970250740647316, + "rewards//mean": 0.85675048828125, + "rewards//std": 0.011245286092162132, + "step": 2053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4108, + "grad_norm": 1.6240407228469849, + "kl": 0.32596449740231037, + "learning_rate": 6.475508543065445e-07, + "loss": 0.013, + "num_tokens": 14951439.0, + "reward": 0.83819580078125, + "reward_std": 0.012671265751123428, + "rewards//mean": 0.83819580078125, + "rewards//std": 0.01466225367039442, + "step": 2054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.411, + "grad_norm": 1.4051626920700073, + "kl": 0.30330889113247395, + "learning_rate": 6.472476241575988e-07, + "loss": 0.0121, + "num_tokens": 14958655.0, + "reward": 0.8714599609375, + "reward_std": 0.01420554704964161, + "rewards//mean": 0.8714599609375, + "rewards//std": 0.021561389788985252, + "step": 2055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4112, + "grad_norm": 1.3287208080291748, + "kl": 0.28875264152884483, + "learning_rate": 6.46944334697328e-07, + "loss": 0.0116, + "num_tokens": 14965879.0, + "reward": 0.848876953125, + "reward_std": 0.01056292187422514, + "rewards//mean": 0.848876953125, + "rewards//std": 0.016497746109962463, + "step": 2056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4114, + "grad_norm": 1.364251732826233, + "kl": 0.3007897362112999, + "learning_rate": 6.466409860478966e-07, + "loss": 0.012, + "num_tokens": 14973143.0, + "reward": 0.88531494140625, + "reward_std": 0.014164997264742851, + "rewards//mean": 0.88531494140625, + "rewards//std": 0.027422182261943817, + "step": 2057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4116, + "grad_norm": 1.7004464864730835, + "kl": 0.35351515375077724, + "learning_rate": 6.463375783314938e-07, + "loss": 0.0141, + "num_tokens": 14980447.0, + "reward": 0.8055419921875, + "reward_std": 0.01284072082489729, + "rewards//mean": 0.8055419921875, + "rewards//std": 0.026634516194462776, + "step": 2058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.4118, + "grad_norm": 1.7609437704086304, + "kl": 0.3858203627169132, + "learning_rate": 6.460341116703316e-07, + "loss": 0.0107, + "num_tokens": 14987688.0, + "reward": 0.85406494140625, + "reward_std": 0.021739952266216278, + "rewards//mean": 0.85406494140625, + "rewards//std": 0.02612864226102829, + "step": 2059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.412, + "grad_norm": 1.3262202739715576, + "kl": 0.32302568666636944, + "learning_rate": 6.45730586186647e-07, + "loss": 0.0129, + "num_tokens": 14994960.0, + "reward": 0.8858642578125, + "reward_std": 0.01599188894033432, + "rewards//mean": 0.8858642578125, + "rewards//std": 0.024700086563825607, + "step": 2060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4122, + "grad_norm": 1.5140655040740967, + "kl": 0.3555286042392254, + "learning_rate": 6.454270020026995e-07, + "loss": 0.0142, + "num_tokens": 15002216.0, + "reward": 0.8328857421875, + "reward_std": 0.014709369279444218, + "rewards//mean": 0.8328857421875, + "rewards//std": 0.03106708638370037, + "step": 2061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.4124, + "grad_norm": 1.3313430547714233, + "kl": 0.3607112616300583, + "learning_rate": 6.451233592407731e-07, + "loss": 0.0113, + "num_tokens": 15009475.0, + "reward": 0.78448486328125, + "reward_std": 0.011221496388316154, + "rewards//mean": 0.78448486328125, + "rewards//std": 0.015772411599755287, + "step": 2062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4126, + "grad_norm": 1.6513029336929321, + "kl": 0.363980520516634, + "learning_rate": 6.448196580231748e-07, + "loss": 0.0146, + "num_tokens": 15016819.0, + "reward": 0.85064697265625, + "reward_std": 0.01550448127090931, + "rewards//mean": 0.85064697265625, + "rewards//std": 0.019890571013092995, + "step": 2063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4128, + "grad_norm": 1.4022269248962402, + "kl": 0.3563254326581955, + "learning_rate": 6.445158984722358e-07, + "loss": 0.0143, + "num_tokens": 15024107.0, + "reward": 0.86083984375, + "reward_std": 0.019120272248983383, + "rewards//mean": 0.86083984375, + "rewards//std": 0.026452306658029556, + "step": 2064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.413, + "grad_norm": 1.360336422920227, + "kl": 0.32426271960139275, + "learning_rate": 6.442120807103101e-07, + "loss": 0.013, + "num_tokens": 15031403.0, + "reward": 0.831787109375, + "reward_std": 0.013851637952029705, + "rewards//mean": 0.831787109375, + "rewards//std": 0.0193740576505661, + "step": 2065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4132, + "grad_norm": 1.6497997045516968, + "kl": 0.302944365888834, + "learning_rate": 6.439082048597755e-07, + "loss": 0.0121, + "num_tokens": 15038707.0, + "reward": 0.8094482421875, + "reward_std": 0.01282226387411356, + "rewards//mean": 0.8094482421875, + "rewards//std": 0.017168166115880013, + "step": 2066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.4134, + "grad_norm": 1.2973049879074097, + "kl": 0.3521454483270645, + "learning_rate": 6.436042710430332e-07, + "loss": 0.0111, + "num_tokens": 15046063.0, + "reward": 0.75531005859375, + "reward_std": 0.0109690111130476, + "rewards//mean": 0.75531005859375, + "rewards//std": 0.014201788231730461, + "step": 2067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4136, + "grad_norm": 1.5975685119628906, + "kl": 0.31363446451723576, + "learning_rate": 6.433002793825075e-07, + "loss": 0.0125, + "num_tokens": 15053431.0, + "reward": 0.8759765625, + "reward_std": 0.017449460923671722, + "rewards//mean": 0.8759765625, + "rewards//std": 0.029181944206357002, + "step": 2068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4138, + "grad_norm": 1.4470802545547485, + "kl": 0.32630570605397224, + "learning_rate": 6.429962300006467e-07, + "loss": 0.0131, + "num_tokens": 15060711.0, + "reward": 0.8291015625, + "reward_std": 0.019245371222496033, + "rewards//mean": 0.8291015625, + "rewards//std": 0.022488217800855637, + "step": 2069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.414, + "grad_norm": 1.5201152563095093, + "kl": 0.2745747081935406, + "learning_rate": 6.426921230199214e-07, + "loss": 0.011, + "num_tokens": 15067983.0, + "reward": 0.8399658203125, + "reward_std": 0.010642554610967636, + "rewards//mean": 0.8399658203125, + "rewards//std": 0.016513796523213387, + "step": 2070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4142, + "grad_norm": 1.2489662170410156, + "kl": 0.3025618679821491, + "learning_rate": 6.423879585628261e-07, + "loss": 0.0121, + "num_tokens": 15075255.0, + "reward": 0.8636474609375, + "reward_std": 0.012305624783039093, + "rewards//mean": 0.8636474609375, + "rewards//std": 0.016627075150609016, + "step": 2071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4144, + "grad_norm": 1.274716854095459, + "kl": 0.35381110198795795, + "learning_rate": 6.420837367518779e-07, + "loss": 0.0142, + "num_tokens": 15082615.0, + "reward": 0.8382568359375, + "reward_std": 0.015050425194203854, + "rewards//mean": 0.8382568359375, + "rewards//std": 0.021118752658367157, + "step": 2072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4146, + "grad_norm": 1.2677432298660278, + "kl": 0.3204325847327709, + "learning_rate": 6.417794577096178e-07, + "loss": 0.0128, + "num_tokens": 15089943.0, + "reward": 0.85931396484375, + "reward_std": 0.011789515614509583, + "rewards//mean": 0.85931396484375, + "rewards//std": 0.01712612248957157, + "step": 2073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4148, + "grad_norm": 1.4708386659622192, + "kl": 0.30672165751457214, + "learning_rate": 6.414751215586089e-07, + "loss": 0.0123, + "num_tokens": 15097303.0, + "reward": 0.82989501953125, + "reward_std": 0.010746510699391365, + "rewards//mean": 0.82989501953125, + "rewards//std": 0.020492583513259888, + "step": 2074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.415, + "grad_norm": 1.4051549434661865, + "kl": 0.2988546472042799, + "learning_rate": 6.411707284214383e-07, + "loss": 0.012, + "num_tokens": 15104575.0, + "reward": 0.86761474609375, + "reward_std": 0.018267236649990082, + "rewards//mean": 0.86761474609375, + "rewards//std": 0.029635872691869736, + "step": 2075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4152, + "grad_norm": 1.1182688474655151, + "kl": 0.25841217674314976, + "learning_rate": 6.408662784207149e-07, + "loss": 0.0103, + "num_tokens": 15111847.0, + "reward": 0.851806640625, + "reward_std": 0.011804051697254181, + "rewards//mean": 0.851806640625, + "rewards//std": 0.02197480946779251, + "step": 2076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4154, + "grad_norm": 1.3936203718185425, + "kl": 0.312256159260869, + "learning_rate": 6.405617716790714e-07, + "loss": 0.0125, + "num_tokens": 15119119.0, + "reward": 0.82525634765625, + "reward_std": 0.010253187268972397, + "rewards//mean": 0.82525634765625, + "rewards//std": 0.013006730005145073, + "step": 2077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4156, + "grad_norm": 1.482667326927185, + "kl": 0.2963226120918989, + "learning_rate": 6.402572083191631e-07, + "loss": 0.0119, + "num_tokens": 15126415.0, + "reward": 0.85150146484375, + "reward_std": 0.011601877398788929, + "rewards//mean": 0.85150146484375, + "rewards//std": 0.013753427192568779, + "step": 2078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4158, + "grad_norm": 1.9316672086715698, + "kl": 0.3287455253303051, + "learning_rate": 6.39952588463668e-07, + "loss": 0.0131, + "num_tokens": 15133727.0, + "reward": 0.81805419921875, + "reward_std": 0.0145412003621459, + "rewards//mean": 0.81805419921875, + "rewards//std": 0.02047928236424923, + "step": 2079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.416, + "grad_norm": 1.6021140813827515, + "kl": 0.3534549754112959, + "learning_rate": 6.396479122352872e-07, + "loss": 0.0141, + "num_tokens": 15140975.0, + "reward": 0.84307861328125, + "reward_std": 0.021970413625240326, + "rewards//mean": 0.84307861328125, + "rewards//std": 0.02957400120794773, + "step": 2080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4162, + "grad_norm": 1.70368230342865, + "kl": 0.34752100333571434, + "learning_rate": 6.393431797567439e-07, + "loss": 0.0139, + "num_tokens": 15148231.0, + "reward": 0.8509521484375, + "reward_std": 0.015319593250751495, + "rewards//mean": 0.8509521484375, + "rewards//std": 0.01740286499261856, + "step": 2081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.4164, + "grad_norm": 1.5190011262893677, + "kl": 0.3508720397949219, + "learning_rate": 6.390383911507844e-07, + "loss": 0.01, + "num_tokens": 15155519.0, + "reward": 0.8006591796875, + "reward_std": 0.013938600197434425, + "rewards//mean": 0.8006591796875, + "rewards//std": 0.01877851039171219, + "step": 2082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4166, + "grad_norm": 1.7284722328186035, + "kl": 0.336577657610178, + "learning_rate": 6.387335465401776e-07, + "loss": 0.0135, + "num_tokens": 15162799.0, + "reward": 0.83123779296875, + "reward_std": 0.015064781531691551, + "rewards//mean": 0.83123779296875, + "rewards//std": 0.0179908387362957, + "step": 2083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4168, + "grad_norm": 1.3945200443267822, + "kl": 0.30194857716560364, + "learning_rate": 6.384286460477149e-07, + "loss": 0.0121, + "num_tokens": 15170159.0, + "reward": 0.84381103515625, + "reward_std": 0.011483021080493927, + "rewards//mean": 0.84381103515625, + "rewards//std": 0.023349415510892868, + "step": 2084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.417, + "grad_norm": 1.4620431661605835, + "kl": 0.31571873277425766, + "learning_rate": 6.381236897962102e-07, + "loss": 0.0126, + "num_tokens": 15177415.0, + "reward": 0.85491943359375, + "reward_std": 0.014993082731962204, + "rewards//mean": 0.85491943359375, + "rewards//std": 0.01608130894601345, + "step": 2085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.4172, + "grad_norm": 2.7907378673553467, + "kl": 0.4150765649974346, + "learning_rate": 6.378186779084995e-07, + "loss": 0.0194, + "num_tokens": 15184701.0, + "reward": 0.875, + "reward_std": 0.019341671839356422, + "rewards//mean": 0.875, + "rewards//std": 0.0336388535797596, + "step": 2086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4174, + "grad_norm": 1.5162569284439087, + "kl": 0.313828082755208, + "learning_rate": 6.375136105074422e-07, + "loss": 0.0126, + "num_tokens": 15191965.0, + "reward": 0.85015869140625, + "reward_std": 0.015067866072058678, + "rewards//mean": 0.85015869140625, + "rewards//std": 0.02442297339439392, + "step": 2087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4176, + "grad_norm": 1.4779616594314575, + "kl": 0.38103680685162544, + "learning_rate": 6.372084877159187e-07, + "loss": 0.0152, + "num_tokens": 15199253.0, + "reward": 0.8319091796875, + "reward_std": 0.013499084860086441, + "rewards//mean": 0.8319091796875, + "rewards//std": 0.0259295292198658, + "step": 2088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.4178, + "grad_norm": 1.471876859664917, + "kl": 0.3058391287922859, + "learning_rate": 6.369033096568329e-07, + "loss": 0.0078, + "num_tokens": 15206525.0, + "reward": 0.86297607421875, + "reward_std": 0.017810985445976257, + "rewards//mean": 0.86297607421875, + "rewards//std": 0.03423340246081352, + "step": 2089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.418, + "grad_norm": 1.5272643566131592, + "kl": 0.3022992257028818, + "learning_rate": 6.365980764531105e-07, + "loss": 0.0129, + "num_tokens": 15213921.0, + "reward": 0.87261962890625, + "reward_std": 0.017342017963528633, + "rewards//mean": 0.87261962890625, + "rewards//std": 0.022473318502306938, + "step": 2090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.4182, + "grad_norm": 1.3648666143417358, + "kl": 0.28361560218036175, + "learning_rate": 6.362927882276989e-07, + "loss": 0.0085, + "num_tokens": 15221169.0, + "reward": 0.8009033203125, + "reward_std": 0.01272334810346365, + "rewards//mean": 0.8009033203125, + "rewards//std": 0.02886456623673439, + "step": 2091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4184, + "grad_norm": 1.6769200563430786, + "kl": 0.2863926403224468, + "learning_rate": 6.359874451035687e-07, + "loss": 0.0115, + "num_tokens": 15228465.0, + "reward": 0.82135009765625, + "reward_std": 0.012579357251524925, + "rewards//mean": 0.82135009765625, + "rewards//std": 0.02157849632203579, + "step": 2092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4186, + "grad_norm": 1.5907679796218872, + "kl": 0.334414467215538, + "learning_rate": 6.356820472037118e-07, + "loss": 0.0134, + "num_tokens": 15235673.0, + "reward": 0.85888671875, + "reward_std": 0.02120255120098591, + "rewards//mean": 0.85888671875, + "rewards//std": 0.03138533979654312, + "step": 2093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4188, + "grad_norm": 1.302498698234558, + "kl": 0.3681252468377352, + "learning_rate": 6.353765946511427e-07, + "loss": 0.0147, + "num_tokens": 15242961.0, + "reward": 0.85723876953125, + "reward_std": 0.01922716572880745, + "rewards//mean": 0.85723876953125, + "rewards//std": 0.030024083331227303, + "step": 2094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.419, + "grad_norm": 1.4988080263137817, + "kl": 0.3380241096019745, + "learning_rate": 6.350710875688972e-07, + "loss": 0.0084, + "num_tokens": 15250235.0, + "reward": 0.84161376953125, + "reward_std": 0.01750708557665348, + "rewards//mean": 0.84161376953125, + "rewards//std": 0.023467756807804108, + "step": 2095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4192, + "grad_norm": 1.3517026901245117, + "kl": 0.29910075664520264, + "learning_rate": 6.34765526080034e-07, + "loss": 0.012, + "num_tokens": 15257571.0, + "reward": 0.80389404296875, + "reward_std": 0.012138579040765762, + "rewards//mean": 0.80389404296875, + "rewards//std": 0.022916875779628754, + "step": 2096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.4194, + "grad_norm": 1.835735559463501, + "kl": 0.3858860544860363, + "learning_rate": 6.344599103076328e-07, + "loss": 0.0128, + "num_tokens": 15264807.0, + "reward": 0.85693359375, + "reward_std": 0.024730488657951355, + "rewards//mean": 0.85693359375, + "rewards//std": 0.030981454998254776, + "step": 2097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4196, + "grad_norm": 1.554247260093689, + "kl": 0.2969211433082819, + "learning_rate": 6.341542403747959e-07, + "loss": 0.0119, + "num_tokens": 15272151.0, + "reward": 0.89654541015625, + "reward_std": 0.019243283197283745, + "rewards//mean": 0.89654541015625, + "rewards//std": 0.02813388779759407, + "step": 2098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4198, + "grad_norm": 1.3855699300765991, + "kl": 0.31776274740695953, + "learning_rate": 6.338485164046471e-07, + "loss": 0.0127, + "num_tokens": 15279407.0, + "reward": 0.82818603515625, + "reward_std": 0.01490839570760727, + "rewards//mean": 0.82818603515625, + "rewards//std": 0.02996402606368065, + "step": 2099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.42, + "grad_norm": 1.5696637630462646, + "kl": 0.3552805297076702, + "learning_rate": 6.335427385203319e-07, + "loss": 0.0142, + "num_tokens": 15286615.0, + "reward": 0.82110595703125, + "reward_std": 0.018159139901399612, + "rewards//mean": 0.82110595703125, + "rewards//std": 0.023148220032453537, + "step": 2100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4202, + "grad_norm": 1.5248987674713135, + "kl": 0.375587347894907, + "learning_rate": 6.332369068450174e-07, + "loss": 0.015, + "num_tokens": 15293799.0, + "reward": 0.8519287109375, + "reward_std": 0.01585426926612854, + "rewards//mean": 0.8519287109375, + "rewards//std": 0.021198881790041924, + "step": 2101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4204, + "grad_norm": 1.2981318235397339, + "kl": 0.3121776431798935, + "learning_rate": 6.329310215018931e-07, + "loss": 0.0125, + "num_tokens": 15301063.0, + "reward": 0.783935546875, + "reward_std": 0.012201188132166862, + "rewards//mean": 0.783935546875, + "rewards//std": 0.023662477731704712, + "step": 2102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4206, + "grad_norm": 2.3082869052886963, + "kl": 0.2755323965102434, + "learning_rate": 6.326250826141688e-07, + "loss": 0.011, + "num_tokens": 15308391.0, + "reward": 0.84649658203125, + "reward_std": 0.01712639071047306, + "rewards//mean": 0.84649658203125, + "rewards//std": 0.023413510993123055, + "step": 2103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4208, + "grad_norm": 1.5335673093795776, + "kl": 0.3679174091666937, + "learning_rate": 6.323190903050774e-07, + "loss": 0.0147, + "num_tokens": 15315679.0, + "reward": 0.82470703125, + "reward_std": 0.02378033474087715, + "rewards//mean": 0.82470703125, + "rewards//std": 0.026452306658029556, + "step": 2104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.421, + "grad_norm": 1.4100496768951416, + "kl": 0.3237088993191719, + "learning_rate": 6.320130446978722e-07, + "loss": 0.0129, + "num_tokens": 15323079.0, + "reward": 0.87567138671875, + "reward_std": 0.01757415197789669, + "rewards//mean": 0.87567138671875, + "rewards//std": 0.022846750915050507, + "step": 2105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.4212, + "grad_norm": 1.6812642812728882, + "kl": 0.32482586801052094, + "learning_rate": 6.317069459158282e-07, + "loss": 0.0129, + "num_tokens": 15330349.0, + "reward": 0.84368896484375, + "reward_std": 0.014886472374200821, + "rewards//mean": 0.84368896484375, + "rewards//std": 0.02717486396431923, + "step": 2106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4214, + "grad_norm": 1.5733819007873535, + "kl": 0.31333765015006065, + "learning_rate": 6.314007940822425e-07, + "loss": 0.0125, + "num_tokens": 15337685.0, + "reward": 0.85101318359375, + "reward_std": 0.015528632327914238, + "rewards//mean": 0.85101318359375, + "rewards//std": 0.022733166813850403, + "step": 2107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4216, + "grad_norm": 1.7247388362884521, + "kl": 0.3656384013593197, + "learning_rate": 6.310945893204324e-07, + "loss": 0.0146, + "num_tokens": 15344997.0, + "reward": 0.8267822265625, + "reward_std": 0.012809637933969498, + "rewards//mean": 0.8267822265625, + "rewards//std": 0.021645475178956985, + "step": 2108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.4218, + "grad_norm": 1.483669400215149, + "kl": 0.305874090641737, + "learning_rate": 6.307883317537374e-07, + "loss": -0.0007, + "num_tokens": 15352349.0, + "reward": 0.83087158203125, + "reward_std": 0.013866787776350975, + "rewards//mean": 0.83087158203125, + "rewards//std": 0.01934659481048584, + "step": 2109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.422, + "grad_norm": 2.6099462509155273, + "kl": 0.47348035871982574, + "learning_rate": 6.30482021505518e-07, + "loss": 0.0189, + "num_tokens": 15359589.0, + "reward": 0.7850341796875, + "reward_std": 0.012253006920218468, + "rewards//mean": 0.7850341796875, + "rewards//std": 0.022379916161298752, + "step": 2110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4222, + "grad_norm": 1.8072547912597656, + "kl": 0.42136677727103233, + "learning_rate": 6.30175658699156e-07, + "loss": 0.0169, + "num_tokens": 15366901.0, + "reward": 0.88226318359375, + "reward_std": 0.010855462402105331, + "rewards//mean": 0.88226318359375, + "rewards//std": 0.0184781476855278, + "step": 2111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4224, + "grad_norm": 1.29784095287323, + "kl": 0.3034827336668968, + "learning_rate": 6.298692434580542e-07, + "loss": 0.0121, + "num_tokens": 15374165.0, + "reward": 0.86297607421875, + "reward_std": 0.013491775840520859, + "rewards//mean": 0.86297607421875, + "rewards//std": 0.027217723429203033, + "step": 2112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4226, + "grad_norm": 1.84852135181427, + "kl": 0.40097702480852604, + "learning_rate": 6.295627759056368e-07, + "loss": 0.016, + "num_tokens": 15381397.0, + "reward": 0.823974609375, + "reward_std": 0.014492429792881012, + "rewards//mean": 0.823974609375, + "rewards//std": 0.021529421210289, + "step": 2113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4228, + "grad_norm": 1.3652828931808472, + "kl": 0.3029989171773195, + "learning_rate": 6.292562561653485e-07, + "loss": 0.0121, + "num_tokens": 15388725.0, + "reward": 0.89306640625, + "reward_std": 0.012773586437106133, + "rewards//mean": 0.89306640625, + "rewards//std": 0.026788944378495216, + "step": 2114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.423, + "grad_norm": 1.3769155740737915, + "kl": 0.3503374084830284, + "learning_rate": 6.289496843606559e-07, + "loss": 0.014, + "num_tokens": 15396101.0, + "reward": 0.8162841796875, + "reward_std": 0.012213034555315971, + "rewards//mean": 0.8162841796875, + "rewards//std": 0.01792054809629917, + "step": 2115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.4232, + "grad_norm": 1.8232357501983643, + "kl": 0.3639508318156004, + "learning_rate": 6.286430606150458e-07, + "loss": -0.0098, + "num_tokens": 15403362.0, + "reward": 0.87445068359375, + "reward_std": 0.01495035458356142, + "rewards//mean": 0.87445068359375, + "rewards//std": 0.023301390931010246, + "step": 2116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4234, + "grad_norm": 1.342454433441162, + "kl": 0.3472656011581421, + "learning_rate": 6.283363850520263e-07, + "loss": 0.0139, + "num_tokens": 15410658.0, + "reward": 0.84063720703125, + "reward_std": 0.015648260712623596, + "rewards//mean": 0.84063720703125, + "rewards//std": 0.019151564687490463, + "step": 2117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4236, + "grad_norm": 1.3977584838867188, + "kl": 0.30761462077498436, + "learning_rate": 6.280296577951261e-07, + "loss": 0.0123, + "num_tokens": 15417962.0, + "reward": 0.80072021484375, + "reward_std": 0.010438394732773304, + "rewards//mean": 0.80072021484375, + "rewards//std": 0.020710812881588936, + "step": 2118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4238, + "grad_norm": 1.5480289459228516, + "kl": 0.3455258309841156, + "learning_rate": 6.277228789678953e-07, + "loss": 0.0138, + "num_tokens": 15425322.0, + "reward": 0.7833251953125, + "reward_std": 0.01709784008562565, + "rewards//mean": 0.7833251953125, + "rewards//std": 0.02025528997182846, + "step": 2119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.424, + "grad_norm": 1.2809983491897583, + "kl": 0.27849619276821613, + "learning_rate": 6.27416048693904e-07, + "loss": 0.0111, + "num_tokens": 15432634.0, + "reward": 0.83984375, + "reward_std": 0.010855261236429214, + "rewards//mean": 0.83984375, + "rewards//std": 0.024119943380355835, + "step": 2120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4242, + "grad_norm": 1.5277061462402344, + "kl": 0.289086002856493, + "learning_rate": 6.271091670967436e-07, + "loss": 0.0116, + "num_tokens": 15439978.0, + "reward": 0.86669921875, + "reward_std": 0.017399808391928673, + "rewards//mean": 0.86669921875, + "rewards//std": 0.019814416766166687, + "step": 2121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4244, + "grad_norm": 1.6286649703979492, + "kl": 0.34456759691238403, + "learning_rate": 6.268022343000257e-07, + "loss": 0.0138, + "num_tokens": 15447186.0, + "reward": 0.81939697265625, + "reward_std": 0.01346561312675476, + "rewards//mean": 0.81939697265625, + "rewards//std": 0.024505889043211937, + "step": 2122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4246, + "grad_norm": 1.755959391593933, + "kl": 0.3314257897436619, + "learning_rate": 6.26495250427383e-07, + "loss": 0.0133, + "num_tokens": 15454490.0, + "reward": 0.8724365234375, + "reward_std": 0.016346098855137825, + "rewards//mean": 0.8724365234375, + "rewards//std": 0.023991255089640617, + "step": 2123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4248, + "grad_norm": 1.6332684755325317, + "kl": 0.3337462954223156, + "learning_rate": 6.261882156024687e-07, + "loss": 0.0133, + "num_tokens": 15461874.0, + "reward": 0.8380126953125, + "reward_std": 0.017934946343302727, + "rewards//mean": 0.8380126953125, + "rewards//std": 0.025007059797644615, + "step": 2124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.425, + "grad_norm": 1.4156949520111084, + "kl": 0.3245489038527012, + "learning_rate": 6.258811299489563e-07, + "loss": 0.013, + "num_tokens": 15469202.0, + "reward": 0.8702392578125, + "reward_std": 0.014445689506828785, + "rewards//mean": 0.8702392578125, + "rewards//std": 0.020141372457146645, + "step": 2125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4252, + "grad_norm": 1.6642465591430664, + "kl": 0.4014807268977165, + "learning_rate": 6.255739935905395e-07, + "loss": 0.0161, + "num_tokens": 15476418.0, + "reward": 0.86126708984375, + "reward_std": 0.013145365752279758, + "rewards//mean": 0.86126708984375, + "rewards//std": 0.020646393299102783, + "step": 2126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4254, + "grad_norm": 1.345576524734497, + "kl": 0.3024662956595421, + "learning_rate": 6.252668066509334e-07, + "loss": 0.0121, + "num_tokens": 15483746.0, + "reward": 0.8701171875, + "reward_std": 0.0170576311647892, + "rewards//mean": 0.8701171875, + "rewards//std": 0.025723503902554512, + "step": 2127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4256, + "grad_norm": 1.4941109418869019, + "kl": 0.34049156680703163, + "learning_rate": 6.249595692538725e-07, + "loss": 0.0136, + "num_tokens": 15491010.0, + "reward": 0.79534912109375, + "reward_std": 0.02033878304064274, + "rewards//mean": 0.79534912109375, + "rewards//std": 0.029344309121370316, + "step": 2128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4258, + "grad_norm": 2.119701623916626, + "kl": 0.4201590195298195, + "learning_rate": 6.24652281523112e-07, + "loss": 0.0168, + "num_tokens": 15498298.0, + "reward": 0.8646240234375, + "reward_std": 0.02299954928457737, + "rewards//mean": 0.8646240234375, + "rewards//std": 0.03232599049806595, + "step": 2129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.426, + "grad_norm": 1.3778594732284546, + "kl": 0.23332431353628635, + "learning_rate": 6.243449435824276e-07, + "loss": 0.0093, + "num_tokens": 15505490.0, + "reward": 0.8109130859375, + "reward_std": 0.012405140325427055, + "rewards//mean": 0.8109130859375, + "rewards//std": 0.017143459990620613, + "step": 2130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4262, + "grad_norm": 1.3509931564331055, + "kl": 0.3556797690689564, + "learning_rate": 6.240375555556145e-07, + "loss": 0.0142, + "num_tokens": 15512738.0, + "reward": 0.7464599609375, + "reward_std": 0.013356196694076061, + "rewards//mean": 0.7464599609375, + "rewards//std": 0.022366385906934738, + "step": 2131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.4264, + "grad_norm": 1.5603368282318115, + "kl": 0.3083466440439224, + "learning_rate": 6.23730117566489e-07, + "loss": 0.0134, + "num_tokens": 15519909.0, + "reward": 0.8138427734375, + "reward_std": 0.01213269867002964, + "rewards//mean": 0.8138427734375, + "rewards//std": 0.02102104388177395, + "step": 2132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.4266, + "grad_norm": 1.5191072225570679, + "kl": 0.3433690667152405, + "learning_rate": 6.234226297388868e-07, + "loss": 0.015, + "num_tokens": 15527233.0, + "reward": 0.83673095703125, + "reward_std": 0.018331198021769524, + "rewards//mean": 0.83673095703125, + "rewards//std": 0.02570635825395584, + "step": 2133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4268, + "grad_norm": 1.469115138053894, + "kl": 0.34857812337577343, + "learning_rate": 6.231150921966642e-07, + "loss": 0.0139, + "num_tokens": 15534577.0, + "reward": 0.85064697265625, + "reward_std": 0.01610160805284977, + "rewards//mean": 0.85064697265625, + "rewards//std": 0.02465369552373886, + "step": 2134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.427, + "grad_norm": 1.3068501949310303, + "kl": 0.29597543738782406, + "learning_rate": 6.228075050636972e-07, + "loss": 0.0118, + "num_tokens": 15541721.0, + "reward": 0.82379150390625, + "reward_std": 0.01664726808667183, + "rewards//mean": 0.82379150390625, + "rewards//std": 0.02437085472047329, + "step": 2135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4272, + "grad_norm": 1.5902693271636963, + "kl": 0.3080125227570534, + "learning_rate": 6.22499868463882e-07, + "loss": 0.0123, + "num_tokens": 15549001.0, + "reward": 0.85931396484375, + "reward_std": 0.017526526004076004, + "rewards//mean": 0.85931396484375, + "rewards//std": 0.027991479262709618, + "step": 2136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4274, + "grad_norm": 1.8775135278701782, + "kl": 0.3253372423350811, + "learning_rate": 6.221921825211341e-07, + "loss": 0.013, + "num_tokens": 15556361.0, + "reward": 0.867919921875, + "reward_std": 0.011053239926695824, + "rewards//mean": 0.867919921875, + "rewards//std": 0.02359071746468544, + "step": 2137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.4276, + "grad_norm": 1.5808205604553223, + "kl": 0.34213182143867016, + "learning_rate": 6.2188444735939e-07, + "loss": 0.0014, + "num_tokens": 15563654.0, + "reward": 0.8658447265625, + "reward_std": 0.01982281357049942, + "rewards//mean": 0.8658447265625, + "rewards//std": 0.02809276431798935, + "step": 2138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4278, + "grad_norm": 1.5733834505081177, + "kl": 0.31283364817500114, + "learning_rate": 6.215766631026049e-07, + "loss": 0.0125, + "num_tokens": 15570934.0, + "reward": 0.86474609375, + "reward_std": 0.013857554644346237, + "rewards//mean": 0.86474609375, + "rewards//std": 0.02173794060945511, + "step": 2139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.428, + "grad_norm": 1.3001519441604614, + "kl": 0.3335474468767643, + "learning_rate": 6.212688298747545e-07, + "loss": 0.0148, + "num_tokens": 15578290.0, + "reward": 0.88409423828125, + "reward_std": 0.01808561012148857, + "rewards//mean": 0.88409423828125, + "rewards//std": 0.027463551610708237, + "step": 2140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4282, + "grad_norm": 1.547728180885315, + "kl": 0.2571494486182928, + "learning_rate": 6.209609477998338e-07, + "loss": 0.0103, + "num_tokens": 15585618.0, + "reward": 0.8695068359375, + "reward_std": 0.015240824781358242, + "rewards//mean": 0.8695068359375, + "rewards//std": 0.0224312637001276, + "step": 2141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4284, + "grad_norm": 1.2401201725006104, + "kl": 0.30947992391884327, + "learning_rate": 6.20653017001858e-07, + "loss": 0.0124, + "num_tokens": 15592986.0, + "reward": 0.893310546875, + "reward_std": 0.0176716148853302, + "rewards//mean": 0.893310546875, + "rewards//std": 0.022647857666015625, + "step": 2142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4286, + "grad_norm": 1.71807062625885, + "kl": 0.3253672756254673, + "learning_rate": 6.203450376048614e-07, + "loss": 0.013, + "num_tokens": 15600258.0, + "reward": 0.8629150390625, + "reward_std": 0.014980695210397243, + "rewards//mean": 0.8629150390625, + "rewards//std": 0.02376302145421505, + "step": 2143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4288, + "grad_norm": 1.6665146350860596, + "kl": 0.3607383631169796, + "learning_rate": 6.200370097328978e-07, + "loss": 0.0144, + "num_tokens": 15607434.0, + "reward": 0.873779296875, + "reward_std": 0.01600634679198265, + "rewards//mean": 0.873779296875, + "rewards//std": 0.029089462012052536, + "step": 2144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.429, + "grad_norm": 1.466338872909546, + "kl": 0.3477357905358076, + "learning_rate": 6.197289335100412e-07, + "loss": 0.0139, + "num_tokens": 15614754.0, + "reward": 0.86932373046875, + "reward_std": 0.01619919389486313, + "rewards//mean": 0.86932373046875, + "rewards//std": 0.021997444331645966, + "step": 2145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4292, + "grad_norm": 1.5169740915298462, + "kl": 0.35230799205601215, + "learning_rate": 6.194208090603844e-07, + "loss": 0.0141, + "num_tokens": 15621986.0, + "reward": 0.8653564453125, + "reward_std": 0.01670834794640541, + "rewards//mean": 0.8653564453125, + "rewards//std": 0.020516669377684593, + "step": 2146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4294, + "grad_norm": 1.3358334302902222, + "kl": 0.3247377462685108, + "learning_rate": 6.191126365080401e-07, + "loss": 0.013, + "num_tokens": 15629266.0, + "reward": 0.82904052734375, + "reward_std": 0.010320253670215607, + "rewards//mean": 0.82904052734375, + "rewards//std": 0.015775291249155998, + "step": 2147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4296, + "grad_norm": 1.6848008632659912, + "kl": 0.2759398799389601, + "learning_rate": 6.1880441597714e-07, + "loss": 0.011, + "num_tokens": 15636450.0, + "reward": 0.8411865234375, + "reward_std": 0.013624988496303558, + "rewards//mean": 0.8411865234375, + "rewards//std": 0.019252963364124298, + "step": 2148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.4298, + "grad_norm": 1.5469919443130493, + "kl": 0.38697353564202785, + "learning_rate": 6.184961475918355e-07, + "loss": 0.0166, + "num_tokens": 15643935.0, + "reward": 0.8685302734375, + "reward_std": 0.017136044800281525, + "rewards//mean": 0.8685302734375, + "rewards//std": 0.02749144285917282, + "step": 2149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.43, + "grad_norm": 1.5256062746047974, + "kl": 0.31390100717544556, + "learning_rate": 6.181878314762968e-07, + "loss": 0.0126, + "num_tokens": 15651183.0, + "reward": 0.84783935546875, + "reward_std": 0.01573541760444641, + "rewards//mean": 0.84783935546875, + "rewards//std": 0.020581034943461418, + "step": 2150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4302, + "grad_norm": 1.456486463546753, + "kl": 0.27414365112781525, + "learning_rate": 6.178794677547137e-07, + "loss": 0.011, + "num_tokens": 15658519.0, + "reward": 0.89007568359375, + "reward_std": 0.02193240262567997, + "rewards//mean": 0.89007568359375, + "rewards//std": 0.02539055235683918, + "step": 2151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4304, + "grad_norm": 1.2849576473236084, + "kl": 0.31138866022229195, + "learning_rate": 6.17571056551295e-07, + "loss": 0.0125, + "num_tokens": 15665839.0, + "reward": 0.87554931640625, + "reward_std": 0.017250385135412216, + "rewards//mean": 0.87554931640625, + "rewards//std": 0.024168768897652626, + "step": 2152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4306, + "grad_norm": 1.59898841381073, + "kl": 0.3653735928237438, + "learning_rate": 6.172625979902689e-07, + "loss": 0.0146, + "num_tokens": 15673103.0, + "reward": 0.82305908203125, + "reward_std": 0.016562171280384064, + "rewards//mean": 0.82305908203125, + "rewards//std": 0.021187005564570427, + "step": 2153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4308, + "grad_norm": 1.37191903591156, + "kl": 0.2916932590305805, + "learning_rate": 6.169540921958822e-07, + "loss": 0.0117, + "num_tokens": 15680335.0, + "reward": 0.85614013671875, + "reward_std": 0.015768345445394516, + "rewards//mean": 0.85614013671875, + "rewards//std": 0.019581466913223267, + "step": 2154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.515625, + "epoch": 0.431, + "grad_norm": 1.4267691373825073, + "kl": 0.4095838125795126, + "learning_rate": 6.166455392924014e-07, + "loss": -0.0183, + "num_tokens": 15687584.0, + "reward": 0.86529541015625, + "reward_std": 0.023357518017292023, + "rewards//mean": 0.86529541015625, + "rewards//std": 0.03386126458644867, + "step": 2155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4312, + "grad_norm": 1.391535997390747, + "kl": 0.2802893463522196, + "learning_rate": 6.163369394041111e-07, + "loss": 0.0112, + "num_tokens": 15694952.0, + "reward": 0.8367919921875, + "reward_std": 0.012341356836259365, + "rewards//mean": 0.8367919921875, + "rewards//std": 0.020616767928004265, + "step": 2156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.546875, + "epoch": 0.4314, + "grad_norm": 1.4479811191558838, + "kl": 0.3500138930976391, + "learning_rate": 6.160282926553158e-07, + "loss": -0.0209, + "num_tokens": 15702211.0, + "reward": 0.81719970703125, + "reward_std": 0.012039713561534882, + "rewards//mean": 0.81719970703125, + "rewards//std": 0.01748562976717949, + "step": 2157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.4316, + "grad_norm": 1.5243412256240845, + "kl": 0.36882942728698254, + "learning_rate": 6.157195991703377e-07, + "loss": 0.0179, + "num_tokens": 15709442.0, + "reward": 0.85223388671875, + "reward_std": 0.0115938950330019, + "rewards//mean": 0.85223388671875, + "rewards//std": 0.016286153346300125, + "step": 2158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4318, + "grad_norm": 1.6838074922561646, + "kl": 0.3642085939645767, + "learning_rate": 6.154108590735191e-07, + "loss": 0.0146, + "num_tokens": 15716722.0, + "reward": 0.84222412109375, + "reward_std": 0.014635035768151283, + "rewards//mean": 0.84222412109375, + "rewards//std": 0.01865750551223755, + "step": 2159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.432, + "grad_norm": 1.3627315759658813, + "kl": 0.3210041355341673, + "learning_rate": 6.151020724892204e-07, + "loss": 0.0128, + "num_tokens": 15724034.0, + "reward": 0.8253173828125, + "reward_std": 0.01874704658985138, + "rewards//mean": 0.8253173828125, + "rewards//std": 0.02677057310938835, + "step": 2160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4322, + "grad_norm": 1.449042797088623, + "kl": 0.31797507777810097, + "learning_rate": 6.147932395418205e-07, + "loss": 0.0127, + "num_tokens": 15731338.0, + "reward": 0.79443359375, + "reward_std": 0.0164569690823555, + "rewards//mean": 0.79443359375, + "rewards//std": 0.023432333022356033, + "step": 2161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.671875, + "epoch": 0.4324, + "grad_norm": 2.723618268966675, + "kl": 0.4707583822309971, + "learning_rate": 6.144843603557175e-07, + "loss": -0.0053, + "num_tokens": 15738557.0, + "reward": 0.8587646484375, + "reward_std": 0.020112695172429085, + "rewards//mean": 0.8587646484375, + "rewards//std": 0.026759261265397072, + "step": 2162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4326, + "grad_norm": 1.5602394342422485, + "kl": 0.3578193951398134, + "learning_rate": 6.141754350553279e-07, + "loss": 0.0143, + "num_tokens": 15745805.0, + "reward": 0.85870361328125, + "reward_std": 0.012838734313845634, + "rewards//mean": 0.85870361328125, + "rewards//std": 0.016956457868218422, + "step": 2163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4328, + "grad_norm": 1.398519515991211, + "kl": 0.3039713315665722, + "learning_rate": 6.138664637650866e-07, + "loss": 0.0122, + "num_tokens": 15753077.0, + "reward": 0.82080078125, + "reward_std": 0.013478047214448452, + "rewards//mean": 0.82080078125, + "rewards//std": 0.02501220442354679, + "step": 2164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.433, + "grad_norm": 1.45485520362854, + "kl": 0.34615372866392136, + "learning_rate": 6.135574466094475e-07, + "loss": 0.0138, + "num_tokens": 15760405.0, + "reward": 0.82757568359375, + "reward_std": 0.011440235190093517, + "rewards//mean": 0.82757568359375, + "rewards//std": 0.02093616873025894, + "step": 2165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.4332, + "grad_norm": 2.22511887550354, + "kl": 0.3474332019686699, + "learning_rate": 6.132483837128823e-07, + "loss": -0.0196, + "num_tokens": 15767698.0, + "reward": 0.87579345703125, + "reward_std": 0.021225977689027786, + "rewards//mean": 0.87579345703125, + "rewards//std": 0.03391977772116661, + "step": 2166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4334, + "grad_norm": 1.5881006717681885, + "kl": 0.4097900278866291, + "learning_rate": 6.129392751998816e-07, + "loss": 0.0164, + "num_tokens": 15775018.0, + "reward": 0.8648681640625, + "reward_std": 0.01523966807872057, + "rewards//mean": 0.8648681640625, + "rewards//std": 0.016318304464221, + "step": 2167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4336, + "grad_norm": 1.5697954893112183, + "kl": 0.3919224143028259, + "learning_rate": 6.126301211949545e-07, + "loss": 0.0157, + "num_tokens": 15782322.0, + "reward": 0.86749267578125, + "reward_std": 0.012865597382187843, + "rewards//mean": 0.86749267578125, + "rewards//std": 0.019178418442606926, + "step": 2168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4338, + "grad_norm": 1.3473520278930664, + "kl": 0.33633628115057945, + "learning_rate": 6.12320921822628e-07, + "loss": 0.0135, + "num_tokens": 15789658.0, + "reward": 0.86505126953125, + "reward_std": 0.011547137051820755, + "rewards//mean": 0.86505126953125, + "rewards//std": 0.015483763068914413, + "step": 2169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.434, + "grad_norm": 1.4357432126998901, + "kl": 0.3433766681700945, + "learning_rate": 6.120116772074477e-07, + "loss": 0.0142, + "num_tokens": 15796976.0, + "reward": 0.84527587890625, + "reward_std": 0.01725730299949646, + "rewards//mean": 0.84527587890625, + "rewards//std": 0.020770659670233727, + "step": 2170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4342, + "grad_norm": 1.5140691995620728, + "kl": 0.3813743107020855, + "learning_rate": 6.117023874739771e-07, + "loss": 0.0153, + "num_tokens": 15804184.0, + "reward": 0.86407470703125, + "reward_std": 0.013116591610014439, + "rewards//mean": 0.86407470703125, + "rewards//std": 0.025336239486932755, + "step": 2171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4344, + "grad_norm": 1.4339035749435425, + "kl": 0.3301283325999975, + "learning_rate": 6.113930527467983e-07, + "loss": 0.0132, + "num_tokens": 15811432.0, + "reward": 0.84942626953125, + "reward_std": 0.017224568873643875, + "rewards//mean": 0.84942626953125, + "rewards//std": 0.024899912998080254, + "step": 2172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.4346, + "grad_norm": 1.5614217519760132, + "kl": 0.2950735744088888, + "learning_rate": 6.110836731505111e-07, + "loss": 0.0103, + "num_tokens": 15818668.0, + "reward": 0.86859130859375, + "reward_std": 0.015368307940661907, + "rewards//mean": 0.86859130859375, + "rewards//std": 0.022738493978977203, + "step": 2173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4348, + "grad_norm": 1.4064451456069946, + "kl": 0.3259081933647394, + "learning_rate": 6.107742488097338e-07, + "loss": 0.013, + "num_tokens": 15826020.0, + "reward": 0.8446044921875, + "reward_std": 0.012584494426846504, + "rewards//mean": 0.8446044921875, + "rewards//std": 0.021625883877277374, + "step": 2174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.435, + "grad_norm": 1.375756025314331, + "kl": 0.3300835080444813, + "learning_rate": 6.104647798491021e-07, + "loss": 0.0128, + "num_tokens": 15833281.0, + "reward": 0.85009765625, + "reward_std": 0.022811152040958405, + "rewards//mean": 0.85009765625, + "rewards//std": 0.030808985233306885, + "step": 2175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4352, + "grad_norm": 1.6699765920639038, + "kl": 0.3051898144185543, + "learning_rate": 6.101552663932703e-07, + "loss": 0.0122, + "num_tokens": 15840489.0, + "reward": 0.8271484375, + "reward_std": 0.016371680423617363, + "rewards//mean": 0.8271484375, + "rewards//std": 0.02499767579138279, + "step": 2176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.4354, + "grad_norm": 1.5779061317443848, + "kl": 0.28795759938657284, + "learning_rate": 6.098457085669104e-07, + "loss": 0.0109, + "num_tokens": 15847787.0, + "reward": 0.834716796875, + "reward_std": 0.017809569835662842, + "rewards//mean": 0.834716796875, + "rewards//std": 0.029081134125590324, + "step": 2177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4356, + "grad_norm": 1.3604493141174316, + "kl": 0.27805171348154545, + "learning_rate": 6.095361064947123e-07, + "loss": 0.0111, + "num_tokens": 15855315.0, + "reward": 0.85894775390625, + "reward_std": 0.014092635363340378, + "rewards//mean": 0.85894775390625, + "rewards//std": 0.02132871374487877, + "step": 2178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.4358, + "grad_norm": 1.5092880725860596, + "kl": 0.32761042937636375, + "learning_rate": 6.092264603013836e-07, + "loss": 0.008, + "num_tokens": 15862552.0, + "reward": 0.8253173828125, + "reward_std": 0.014327451586723328, + "rewards//mean": 0.8253173828125, + "rewards//std": 0.02015339396893978, + "step": 2179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.436, + "grad_norm": 1.3528639078140259, + "kl": 0.334808174520731, + "learning_rate": 6.089167701116498e-07, + "loss": 0.0134, + "num_tokens": 15869776.0, + "reward": 0.86737060546875, + "reward_std": 0.016375042498111725, + "rewards//mean": 0.86737060546875, + "rewards//std": 0.03435434773564339, + "step": 2180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4362, + "grad_norm": 1.4121302366256714, + "kl": 0.31373527087271214, + "learning_rate": 6.086070360502539e-07, + "loss": 0.0125, + "num_tokens": 15877080.0, + "reward": 0.84173583984375, + "reward_std": 0.011609029956161976, + "rewards//mean": 0.84173583984375, + "rewards//std": 0.015232423320412636, + "step": 2181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4364, + "grad_norm": 1.720296025276184, + "kl": 0.28071643970906734, + "learning_rate": 6.082972582419568e-07, + "loss": 0.0112, + "num_tokens": 15884440.0, + "reward": 0.76837158203125, + "reward_std": 0.013226065784692764, + "rewards//mean": 0.76837158203125, + "rewards//std": 0.020794697105884552, + "step": 2182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4366, + "grad_norm": 1.6311025619506836, + "kl": 0.33634219877421856, + "learning_rate": 6.079874368115373e-07, + "loss": 0.0135, + "num_tokens": 15891744.0, + "reward": 0.83917236328125, + "reward_std": 0.013104695826768875, + "rewards//mean": 0.83917236328125, + "rewards//std": 0.02650768682360649, + "step": 2183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4368, + "grad_norm": 1.491748571395874, + "kl": 0.30053018033504486, + "learning_rate": 6.07677571883791e-07, + "loss": 0.012, + "num_tokens": 15899136.0, + "reward": 0.87347412109375, + "reward_std": 0.01985737308859825, + "rewards//mean": 0.87347412109375, + "rewards//std": 0.031254783272743225, + "step": 2184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.437, + "grad_norm": 1.2486289739608765, + "kl": 0.28797473944723606, + "learning_rate": 6.073676635835316e-07, + "loss": 0.0115, + "num_tokens": 15906448.0, + "reward": 0.753173828125, + "reward_std": 0.011154127307236195, + "rewards//mean": 0.753173828125, + "rewards//std": 0.02265854924917221, + "step": 2185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4372, + "grad_norm": 1.43997061252594, + "kl": 0.33218106254935265, + "learning_rate": 6.070577120355902e-07, + "loss": 0.0133, + "num_tokens": 15913776.0, + "reward": 0.85333251953125, + "reward_std": 0.013077866286039352, + "rewards//mean": 0.85333251953125, + "rewards//std": 0.02670227736234665, + "step": 2186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4374, + "grad_norm": 1.784114956855774, + "kl": 0.3424488753080368, + "learning_rate": 6.067477173648152e-07, + "loss": 0.0137, + "num_tokens": 15921120.0, + "reward": 0.86492919921875, + "reward_std": 0.017301788553595543, + "rewards//mean": 0.86492919921875, + "rewards//std": 0.030336065217852592, + "step": 2187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4376, + "grad_norm": 1.6120264530181885, + "kl": 0.36032912507653236, + "learning_rate": 6.064376796960723e-07, + "loss": 0.0144, + "num_tokens": 15928376.0, + "reward": 0.8465576171875, + "reward_std": 0.011616716161370277, + "rewards//mean": 0.8465576171875, + "rewards//std": 0.015908708795905113, + "step": 2188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4378, + "grad_norm": 1.4332990646362305, + "kl": 0.32522119395434856, + "learning_rate": 6.06127599154245e-07, + "loss": 0.013, + "num_tokens": 15935640.0, + "reward": 0.80316162109375, + "reward_std": 0.0136763546615839, + "rewards//mean": 0.80316162109375, + "rewards//std": 0.01905568316578865, + "step": 2189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.438, + "grad_norm": 1.4512227773666382, + "kl": 0.33775436505675316, + "learning_rate": 6.058174758642332e-07, + "loss": 0.0135, + "num_tokens": 15942928.0, + "reward": 0.83001708984375, + "reward_std": 0.008299920707941055, + "rewards//mean": 0.83001708984375, + "rewards//std": 0.01590145193040371, + "step": 2190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4382, + "grad_norm": 1.3644723892211914, + "kl": 0.3187594301998615, + "learning_rate": 6.055073099509549e-07, + "loss": 0.0128, + "num_tokens": 15950240.0, + "reward": 0.83929443359375, + "reward_std": 0.011737402528524399, + "rewards//mean": 0.83929443359375, + "rewards//std": 0.026207314804196358, + "step": 2191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4384, + "grad_norm": 1.8046865463256836, + "kl": 0.34237421676516533, + "learning_rate": 6.051971015393446e-07, + "loss": 0.0137, + "num_tokens": 15957544.0, + "reward": 0.8282470703125, + "reward_std": 0.01731550134718418, + "rewards//mean": 0.8282470703125, + "rewards//std": 0.022487880662083626, + "step": 2192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4386, + "grad_norm": 1.4476544857025146, + "kl": 0.30378840677440166, + "learning_rate": 6.048868507543546e-07, + "loss": 0.0122, + "num_tokens": 15964880.0, + "reward": 0.85247802734375, + "reward_std": 0.013823708519339561, + "rewards//mean": 0.85247802734375, + "rewards//std": 0.018685070797801018, + "step": 2193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4388, + "grad_norm": 1.4379322528839111, + "kl": 0.3087480254471302, + "learning_rate": 6.045765577209536e-07, + "loss": 0.0123, + "num_tokens": 15972152.0, + "reward": 0.8355712890625, + "reward_std": 0.009636862203478813, + "rewards//mean": 0.8355712890625, + "rewards//std": 0.018993321806192398, + "step": 2194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.439, + "grad_norm": 1.4730790853500366, + "kl": 0.36761027574539185, + "learning_rate": 6.042662225641276e-07, + "loss": 0.0147, + "num_tokens": 15979488.0, + "reward": 0.754638671875, + "reward_std": 0.01589655503630638, + "rewards//mean": 0.754638671875, + "rewards//std": 0.019248638302087784, + "step": 2195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4392, + "grad_norm": 1.84211003780365, + "kl": 0.3448919504880905, + "learning_rate": 6.039558454088795e-07, + "loss": 0.0138, + "num_tokens": 15986704.0, + "reward": 0.84356689453125, + "reward_std": 0.01359998807311058, + "rewards//mean": 0.84356689453125, + "rewards//std": 0.020368104800581932, + "step": 2196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4394, + "grad_norm": 1.8938422203063965, + "kl": 0.368423979729414, + "learning_rate": 6.036454263802297e-07, + "loss": 0.0147, + "num_tokens": 15993992.0, + "reward": 0.8636474609375, + "reward_std": 0.02481103129684925, + "rewards//mean": 0.8636474609375, + "rewards//std": 0.03519218787550926, + "step": 2197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4396, + "grad_norm": 1.8261628150939941, + "kl": 0.3395250737667084, + "learning_rate": 6.033349656032143e-07, + "loss": 0.0136, + "num_tokens": 16001280.0, + "reward": 0.83416748046875, + "reward_std": 0.016004573553800583, + "rewards//mean": 0.83416748046875, + "rewards//std": 0.020692532882094383, + "step": 2198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4398, + "grad_norm": 1.4308639764785767, + "kl": 0.3120981901884079, + "learning_rate": 6.03024463202887e-07, + "loss": 0.0125, + "num_tokens": 16008560.0, + "reward": 0.8377685546875, + "reward_std": 0.016924409195780754, + "rewards//mean": 0.8377685546875, + "rewards//std": 0.02364295721054077, + "step": 2199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.44, + "grad_norm": 1.5660737752914429, + "kl": 0.27765847742557526, + "learning_rate": 6.027139193043183e-07, + "loss": 0.0111, + "num_tokens": 16015904.0, + "reward": 0.8780517578125, + "reward_std": 0.016621893271803856, + "rewards//mean": 0.8780517578125, + "rewards//std": 0.02355572022497654, + "step": 2200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.4402, + "grad_norm": 1.4414695501327515, + "kl": 0.32099948078393936, + "learning_rate": 6.024033340325954e-07, + "loss": 0.0171, + "num_tokens": 16023181.0, + "reward": 0.78253173828125, + "reward_std": 0.010690493509173393, + "rewards//mean": 0.78253173828125, + "rewards//std": 0.014332293532788754, + "step": 2201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4404, + "grad_norm": 1.7776015996932983, + "kl": 0.2956379968672991, + "learning_rate": 6.020927075128216e-07, + "loss": 0.0118, + "num_tokens": 16030493.0, + "reward": 0.82647705078125, + "reward_std": 0.018744513392448425, + "rewards//mean": 0.82647705078125, + "rewards//std": 0.035548850893974304, + "step": 2202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4406, + "grad_norm": 1.3722834587097168, + "kl": 0.41473115235567093, + "learning_rate": 6.017820398701174e-07, + "loss": 0.0166, + "num_tokens": 16037821.0, + "reward": 0.8489990234375, + "reward_std": 0.018523260951042175, + "rewards//mean": 0.8489990234375, + "rewards//std": 0.02332582138478756, + "step": 2203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.4408, + "grad_norm": 1.661139726638794, + "kl": 0.42020524479448795, + "learning_rate": 6.014713312296198e-07, + "loss": 0.0167, + "num_tokens": 16045147.0, + "reward": 0.88177490234375, + "reward_std": 0.015842098742723465, + "rewards//mean": 0.88177490234375, + "rewards//std": 0.024001585319638252, + "step": 2204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.441, + "grad_norm": 1.7558039426803589, + "kl": 0.34776831790804863, + "learning_rate": 6.011605817164821e-07, + "loss": 0.0139, + "num_tokens": 16052339.0, + "reward": 0.80767822265625, + "reward_std": 0.01262994110584259, + "rewards//mean": 0.80767822265625, + "rewards//std": 0.01680130511522293, + "step": 2205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4412, + "grad_norm": 1.5696868896484375, + "kl": 0.26583814807236195, + "learning_rate": 6.008497914558743e-07, + "loss": 0.0106, + "num_tokens": 16059667.0, + "reward": 0.887939453125, + "reward_std": 0.012070976197719574, + "rewards//mean": 0.887939453125, + "rewards//std": 0.022205062210559845, + "step": 2206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4414, + "grad_norm": 1.5146180391311646, + "kl": 0.2985404245555401, + "learning_rate": 6.005389605729824e-07, + "loss": 0.0119, + "num_tokens": 16067147.0, + "reward": 0.8455810546875, + "reward_std": 0.013071340508759022, + "rewards//mean": 0.8455810546875, + "rewards//std": 0.01809539459645748, + "step": 2207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.4416, + "grad_norm": 1.4255293607711792, + "kl": 0.32731411792337894, + "learning_rate": 6.002280891930093e-07, + "loss": -0.0043, + "num_tokens": 16074372.0, + "reward": 0.85076904296875, + "reward_std": 0.020849257707595825, + "rewards//mean": 0.85076904296875, + "rewards//std": 0.025959502905607224, + "step": 2208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4418, + "grad_norm": 1.5512806177139282, + "kl": 0.3194073401391506, + "learning_rate": 5.999171774411736e-07, + "loss": 0.0128, + "num_tokens": 16081700.0, + "reward": 0.86968994140625, + "reward_std": 0.018314223736524582, + "rewards//mean": 0.86968994140625, + "rewards//std": 0.024123629555106163, + "step": 2209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.442, + "grad_norm": 1.8857237100601196, + "kl": 0.3010980747640133, + "learning_rate": 5.996062254427111e-07, + "loss": 0.012, + "num_tokens": 16089028.0, + "reward": 0.88226318359375, + "reward_std": 0.020362241193652153, + "rewards//mean": 0.88226318359375, + "rewards//std": 0.031148048117756844, + "step": 2210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4422, + "grad_norm": 1.4224797487258911, + "kl": 0.3548112064599991, + "learning_rate": 5.992952333228726e-07, + "loss": 0.0142, + "num_tokens": 16096316.0, + "reward": 0.8509521484375, + "reward_std": 0.018979575484991074, + "rewards//mean": 0.8509521484375, + "rewards//std": 0.02884148247539997, + "step": 2211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4424, + "grad_norm": 1.5199095010757446, + "kl": 0.3910178989171982, + "learning_rate": 5.989842012069264e-07, + "loss": 0.0156, + "num_tokens": 16103620.0, + "reward": 0.86944580078125, + "reward_std": 0.017741061747074127, + "rewards//mean": 0.86944580078125, + "rewards//std": 0.022475339472293854, + "step": 2212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4426, + "grad_norm": 1.7331105470657349, + "kl": 0.3157598078250885, + "learning_rate": 5.986731292201554e-07, + "loss": 0.0126, + "num_tokens": 16110876.0, + "reward": 0.8603515625, + "reward_std": 0.017351383343338966, + "rewards//mean": 0.8603515625, + "rewards//std": 0.026548271998763084, + "step": 2213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4428, + "grad_norm": 1.7581456899642944, + "kl": 0.3063448555767536, + "learning_rate": 5.983620174878601e-07, + "loss": 0.0123, + "num_tokens": 16118068.0, + "reward": 0.8433837890625, + "reward_std": 0.01286265067756176, + "rewards//mean": 0.8433837890625, + "rewards//std": 0.020430902019143105, + "step": 2214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.443, + "grad_norm": 1.456160068511963, + "kl": 0.3155167605727911, + "learning_rate": 5.980508661353556e-07, + "loss": 0.0153, + "num_tokens": 16125320.0, + "reward": 0.7955322265625, + "reward_std": 0.010730544105172157, + "rewards//mean": 0.7955322265625, + "rewards//std": 0.020519619807600975, + "step": 2215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4432, + "grad_norm": 1.8904516696929932, + "kl": 0.3633027411997318, + "learning_rate": 5.977396752879741e-07, + "loss": 0.0145, + "num_tokens": 16132536.0, + "reward": 0.86334228515625, + "reward_std": 0.018218165263533592, + "rewards//mean": 0.86334228515625, + "rewards//std": 0.02326628379523754, + "step": 2216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4434, + "grad_norm": 1.5210694074630737, + "kl": 0.3182669449597597, + "learning_rate": 5.97428445071063e-07, + "loss": 0.0127, + "num_tokens": 16139848.0, + "reward": 0.84552001953125, + "reward_std": 0.01535303145647049, + "rewards//mean": 0.84552001953125, + "rewards//std": 0.024199439212679863, + "step": 2217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4436, + "grad_norm": 1.492266058921814, + "kl": 0.3362657018005848, + "learning_rate": 5.97117175609986e-07, + "loss": 0.0135, + "num_tokens": 16147040.0, + "reward": 0.83087158203125, + "reward_std": 0.016074061393737793, + "rewards//mean": 0.83087158203125, + "rewards//std": 0.021129770204424858, + "step": 2218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4438, + "grad_norm": 1.2773387432098389, + "kl": 0.29688821732997894, + "learning_rate": 5.968058670301221e-07, + "loss": 0.0119, + "num_tokens": 16154296.0, + "reward": 0.85809326171875, + "reward_std": 0.018606701865792274, + "rewards//mean": 0.85809326171875, + "rewards//std": 0.036044467240571976, + "step": 2219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.444, + "grad_norm": 1.4432791471481323, + "kl": 0.324137756600976, + "learning_rate": 5.964945194568668e-07, + "loss": 0.013, + "num_tokens": 16161560.0, + "reward": 0.83953857421875, + "reward_std": 0.0154979033395648, + "rewards//mean": 0.83953857421875, + "rewards//std": 0.019102495163679123, + "step": 2220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4442, + "grad_norm": 1.269000768661499, + "kl": 0.363388380035758, + "learning_rate": 5.961831330156305e-07, + "loss": 0.0145, + "num_tokens": 16168832.0, + "reward": 0.85577392578125, + "reward_std": 0.012782318517565727, + "rewards//mean": 0.85577392578125, + "rewards//std": 0.018193319439888, + "step": 2221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4444, + "grad_norm": 1.2357325553894043, + "kl": 0.35842714458703995, + "learning_rate": 5.958717078318396e-07, + "loss": 0.0143, + "num_tokens": 16176152.0, + "reward": 0.8275146484375, + "reward_std": 0.012499134987592697, + "rewards//mean": 0.8275146484375, + "rewards//std": 0.019139407202601433, + "step": 2222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4446, + "grad_norm": 1.5367382764816284, + "kl": 0.39839400723576546, + "learning_rate": 5.955602440309365e-07, + "loss": 0.0159, + "num_tokens": 16183464.0, + "reward": 0.810302734375, + "reward_std": 0.014689529314637184, + "rewards//mean": 0.810302734375, + "rewards//std": 0.0229453444480896, + "step": 2223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.4448, + "grad_norm": 1.4774938821792603, + "kl": 0.31194446980953217, + "learning_rate": 5.952487417383781e-07, + "loss": 0.0129, + "num_tokens": 16190703.0, + "reward": 0.8818359375, + "reward_std": 0.014485650695860386, + "rewards//mean": 0.8818359375, + "rewards//std": 0.020705055445432663, + "step": 2224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.445, + "grad_norm": 1.5857726335525513, + "kl": 0.2945986744016409, + "learning_rate": 5.949372010796383e-07, + "loss": 0.0118, + "num_tokens": 16197975.0, + "reward": 0.8621826171875, + "reward_std": 0.013977273367345333, + "rewards//mean": 0.8621826171875, + "rewards//std": 0.017873181030154228, + "step": 2225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4452, + "grad_norm": 1.6917929649353027, + "kl": 0.3491784743964672, + "learning_rate": 5.946256221802051e-07, + "loss": 0.014, + "num_tokens": 16205279.0, + "reward": 0.8765869140625, + "reward_std": 0.011563300155103207, + "rewards//mean": 0.8765869140625, + "rewards//std": 0.016288593411445618, + "step": 2226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4454, + "grad_norm": 1.332709789276123, + "kl": 0.3121735695749521, + "learning_rate": 5.943140051655827e-07, + "loss": 0.0125, + "num_tokens": 16212535.0, + "reward": 0.82098388671875, + "reward_std": 0.012313015758991241, + "rewards//mean": 0.82098388671875, + "rewards//std": 0.01746310666203499, + "step": 2227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.4456, + "grad_norm": 1.4990259408950806, + "kl": 0.28317031636834145, + "learning_rate": 5.940023501612902e-07, + "loss": 0.01, + "num_tokens": 16219781.0, + "reward": 0.8021240234375, + "reward_std": 0.01168108731508255, + "rewards//mean": 0.8021240234375, + "rewards//std": 0.013677963986992836, + "step": 2228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4458, + "grad_norm": 1.5735620260238647, + "kl": 0.3889701794832945, + "learning_rate": 5.936906572928624e-07, + "loss": 0.0156, + "num_tokens": 16226997.0, + "reward": 0.79705810546875, + "reward_std": 0.009817452169954777, + "rewards//mean": 0.79705810546875, + "rewards//std": 0.013413529843091965, + "step": 2229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.446, + "grad_norm": 1.5192129611968994, + "kl": 0.3790886662900448, + "learning_rate": 5.933789266858488e-07, + "loss": 0.0152, + "num_tokens": 16234317.0, + "reward": 0.8565673828125, + "reward_std": 0.016529783606529236, + "rewards//mean": 0.8565673828125, + "rewards//std": 0.030141141265630722, + "step": 2230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.4462, + "grad_norm": 1.626715898513794, + "kl": 0.35440340638160706, + "learning_rate": 5.93067158465815e-07, + "loss": 0.0122, + "num_tokens": 16241593.0, + "reward": 0.82830810546875, + "reward_std": 0.013569911941885948, + "rewards//mean": 0.82830810546875, + "rewards//std": 0.02072177268564701, + "step": 2231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4464, + "grad_norm": 1.703404426574707, + "kl": 0.2843221817165613, + "learning_rate": 5.927553527583407e-07, + "loss": 0.0114, + "num_tokens": 16248873.0, + "reward": 0.83148193359375, + "reward_std": 0.012177775613963604, + "rewards//mean": 0.83148193359375, + "rewards//std": 0.017046386376023293, + "step": 2232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4466, + "grad_norm": 1.4812755584716797, + "kl": 0.2708888091146946, + "learning_rate": 5.924435096890216e-07, + "loss": 0.0108, + "num_tokens": 16256105.0, + "reward": 0.85076904296875, + "reward_std": 0.02144663780927658, + "rewards//mean": 0.85076904296875, + "rewards//std": 0.036744069308042526, + "step": 2233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.4468, + "grad_norm": 1.590750813484192, + "kl": 0.36035382747650146, + "learning_rate": 5.921316293834676e-07, + "loss": -0.0188, + "num_tokens": 16263368.0, + "reward": 0.83831787109375, + "reward_std": 0.02067079395055771, + "rewards//mean": 0.83831787109375, + "rewards//std": 0.02914554439485073, + "step": 2234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.447, + "grad_norm": 1.4605602025985718, + "kl": 0.3733503632247448, + "learning_rate": 5.918197119673046e-07, + "loss": 0.0046, + "num_tokens": 16270692.0, + "reward": 0.7685546875, + "reward_std": 0.012037011794745922, + "rewards//mean": 0.7685546875, + "rewards//std": 0.018558358773589134, + "step": 2235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4472, + "grad_norm": 1.889945149421692, + "kl": 0.4245830327272415, + "learning_rate": 5.915077575661722e-07, + "loss": 0.017, + "num_tokens": 16277964.0, + "reward": 0.85162353515625, + "reward_std": 0.016602903604507446, + "rewards//mean": 0.85162353515625, + "rewards//std": 0.023801451548933983, + "step": 2236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4474, + "grad_norm": 1.3924810886383057, + "kl": 0.30756160244345665, + "learning_rate": 5.911957663057263e-07, + "loss": 0.0123, + "num_tokens": 16285364.0, + "reward": 0.855224609375, + "reward_std": 0.013212352059781551, + "rewards//mean": 0.855224609375, + "rewards//std": 0.0198312159627676, + "step": 2237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.4476, + "grad_norm": 1.4908796548843384, + "kl": 0.28515107184648514, + "learning_rate": 5.908837383116367e-07, + "loss": 0.0115, + "num_tokens": 16292809.0, + "reward": 0.85687255859375, + "reward_std": 0.014501148834824562, + "rewards//mean": 0.85687255859375, + "rewards//std": 0.024233819916844368, + "step": 2238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4478, + "grad_norm": 1.3738642930984497, + "kl": 0.2985352724790573, + "learning_rate": 5.905716737095879e-07, + "loss": 0.0119, + "num_tokens": 16300217.0, + "reward": 0.8714599609375, + "reward_std": 0.01549053005874157, + "rewards//mean": 0.8714599609375, + "rewards//std": 0.028191737830638885, + "step": 2239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.448, + "grad_norm": 1.66134512424469, + "kl": 0.3422825466841459, + "learning_rate": 5.9025957262528e-07, + "loss": 0.0137, + "num_tokens": 16307441.0, + "reward": 0.8223876953125, + "reward_std": 0.012543021701276302, + "rewards//mean": 0.8223876953125, + "rewards//std": 0.016336847096681595, + "step": 2240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4482, + "grad_norm": 1.8841184377670288, + "kl": 0.3216011207550764, + "learning_rate": 5.899474351844269e-07, + "loss": 0.0129, + "num_tokens": 16314817.0, + "reward": 0.884033203125, + "reward_std": 0.01570514217019081, + "rewards//mean": 0.884033203125, + "rewards//std": 0.019286349415779114, + "step": 2241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.4484, + "grad_norm": 1.3184716701507568, + "kl": 0.3910026140511036, + "learning_rate": 5.896352615127578e-07, + "loss": 0.0162, + "num_tokens": 16322186.0, + "reward": 0.83026123046875, + "reward_std": 0.015952352434396744, + "rewards//mean": 0.83026123046875, + "rewards//std": 0.03339264541864395, + "step": 2242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4486, + "grad_norm": 1.4915701150894165, + "kl": 0.2943268399685621, + "learning_rate": 5.893230517360159e-07, + "loss": 0.0118, + "num_tokens": 16329570.0, + "reward": 0.85595703125, + "reward_std": 0.017673175781965256, + "rewards//mean": 0.85595703125, + "rewards//std": 0.02969614788889885, + "step": 2243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4488, + "grad_norm": 1.5037710666656494, + "kl": 0.32205628231167793, + "learning_rate": 5.890108059799595e-07, + "loss": 0.0129, + "num_tokens": 16336842.0, + "reward": 0.85284423828125, + "reward_std": 0.015719261020421982, + "rewards//mean": 0.85284423828125, + "rewards//std": 0.0172327421605587, + "step": 2244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.449, + "grad_norm": 1.4857733249664307, + "kl": 0.3531201910227537, + "learning_rate": 5.886985243703611e-07, + "loss": 0.0141, + "num_tokens": 16344082.0, + "reward": 0.83721923828125, + "reward_std": 0.01400769129395485, + "rewards//mean": 0.83721923828125, + "rewards//std": 0.024110449478030205, + "step": 2245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4492, + "grad_norm": 1.312983751296997, + "kl": 0.28358996100723743, + "learning_rate": 5.883862070330078e-07, + "loss": 0.0113, + "num_tokens": 16351258.0, + "reward": 0.8577880859375, + "reward_std": 0.013733446598052979, + "rewards//mean": 0.8577880859375, + "rewards//std": 0.02212686650454998, + "step": 2246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.4494, + "grad_norm": 1.5081782341003418, + "kl": 0.34280743822455406, + "learning_rate": 5.880738540937007e-07, + "loss": 0.0106, + "num_tokens": 16358508.0, + "reward": 0.8643798828125, + "reward_std": 0.01283862255513668, + "rewards//mean": 0.8643798828125, + "rewards//std": 0.018784958869218826, + "step": 2247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4496, + "grad_norm": 1.2976646423339844, + "kl": 0.35579196736216545, + "learning_rate": 5.877614656782559e-07, + "loss": 0.0142, + "num_tokens": 16365812.0, + "reward": 0.83734130859375, + "reward_std": 0.012529734522104263, + "rewards//mean": 0.83734130859375, + "rewards//std": 0.016342753544449806, + "step": 2248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4498, + "grad_norm": 1.4784384965896606, + "kl": 0.30964744836091995, + "learning_rate": 5.874490419125032e-07, + "loss": 0.0124, + "num_tokens": 16373156.0, + "reward": 0.885009765625, + "reward_std": 0.014992567710578442, + "rewards//mean": 0.885009765625, + "rewards//std": 0.016745463013648987, + "step": 2249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.45, + "grad_norm": 1.5536381006240845, + "kl": 0.3823833893984556, + "learning_rate": 5.871365829222868e-07, + "loss": 0.0153, + "num_tokens": 16380428.0, + "reward": 0.81219482421875, + "reward_std": 0.011456916108727455, + "rewards//mean": 0.81219482421875, + "rewards//std": 0.015814585611224174, + "step": 2250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4502, + "grad_norm": 1.1976706981658936, + "kl": 0.2903411854058504, + "learning_rate": 5.868240888334652e-07, + "loss": 0.0116, + "num_tokens": 16387820.0, + "reward": 0.84906005859375, + "reward_std": 0.010962491855025291, + "rewards//mean": 0.84906005859375, + "rewards//std": 0.01610388420522213, + "step": 2251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4504, + "grad_norm": 1.430914044380188, + "kl": 0.3215508833527565, + "learning_rate": 5.86511559771911e-07, + "loss": 0.0129, + "num_tokens": 16395036.0, + "reward": 0.822265625, + "reward_std": 0.01334737055003643, + "rewards//mean": 0.822265625, + "rewards//std": 0.017771683633327484, + "step": 2252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4506, + "grad_norm": 2.1467697620391846, + "kl": 0.3499818742275238, + "learning_rate": 5.861989958635109e-07, + "loss": 0.014, + "num_tokens": 16402316.0, + "reward": 0.86212158203125, + "reward_std": 0.026020074263215065, + "rewards//mean": 0.86212158203125, + "rewards//std": 0.03134039416909218, + "step": 2253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4508, + "grad_norm": 1.6351141929626465, + "kl": 0.36038268357515335, + "learning_rate": 5.858863972341655e-07, + "loss": 0.0144, + "num_tokens": 16409572.0, + "reward": 0.87017822265625, + "reward_std": 0.020755279809236526, + "rewards//mean": 0.87017822265625, + "rewards//std": 0.030135301873087883, + "step": 2254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.451, + "grad_norm": 1.6555827856063843, + "kl": 0.41047811321914196, + "learning_rate": 5.855737640097897e-07, + "loss": 0.0018, + "num_tokens": 16416799.0, + "reward": 0.87200927734375, + "reward_std": 0.015057300217449665, + "rewards//mean": 0.87200927734375, + "rewards//std": 0.022539233788847923, + "step": 2255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4512, + "grad_norm": 1.7411129474639893, + "kl": 0.346550514921546, + "learning_rate": 5.852610963163119e-07, + "loss": 0.0139, + "num_tokens": 16424071.0, + "reward": 0.81475830078125, + "reward_std": 0.01613600179553032, + "rewards//mean": 0.81475830078125, + "rewards//std": 0.020036915317177773, + "step": 2256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4514, + "grad_norm": 1.6893254518508911, + "kl": 0.31147428043186665, + "learning_rate": 5.849483942796747e-07, + "loss": 0.0125, + "num_tokens": 16431319.0, + "reward": 0.8189697265625, + "reward_std": 0.013128907419741154, + "rewards//mean": 0.8189697265625, + "rewards//std": 0.02482479438185692, + "step": 2257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4516, + "grad_norm": 1.5656418800354004, + "kl": 0.31710493192076683, + "learning_rate": 5.846356580258345e-07, + "loss": 0.0127, + "num_tokens": 16438623.0, + "reward": 0.8414306640625, + "reward_std": 0.013203201815485954, + "rewards//mean": 0.8414306640625, + "rewards//std": 0.01674320362508297, + "step": 2258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4518, + "grad_norm": 2.050144672393799, + "kl": 0.3252601958811283, + "learning_rate": 5.843228876807613e-07, + "loss": 0.013, + "num_tokens": 16445807.0, + "reward": 0.80792236328125, + "reward_std": 0.008598074316978455, + "rewards//mean": 0.80792236328125, + "rewards//std": 0.022793682292103767, + "step": 2259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.452, + "grad_norm": 1.0968323945999146, + "kl": 0.27977256290614605, + "learning_rate": 5.840100833704391e-07, + "loss": 0.0112, + "num_tokens": 16453111.0, + "reward": 0.87567138671875, + "reward_std": 0.011325940489768982, + "rewards//mean": 0.87567138671875, + "rewards//std": 0.015145717188715935, + "step": 2260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4522, + "grad_norm": 1.5571221113204956, + "kl": 0.3370476309210062, + "learning_rate": 5.836972452208654e-07, + "loss": 0.0135, + "num_tokens": 16460351.0, + "reward": 0.80792236328125, + "reward_std": 0.017717279493808746, + "rewards//mean": 0.80792236328125, + "rewards//std": 0.024385137483477592, + "step": 2261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4524, + "grad_norm": 1.2510086297988892, + "kl": 0.2959546223282814, + "learning_rate": 5.833843733580512e-07, + "loss": 0.0118, + "num_tokens": 16467639.0, + "reward": 0.8974609375, + "reward_std": 0.01580478809773922, + "rewards//mean": 0.8974609375, + "rewards//std": 0.026013750582933426, + "step": 2262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4526, + "grad_norm": 1.4627043008804321, + "kl": 0.3669191002845764, + "learning_rate": 5.830714679080215e-07, + "loss": 0.0147, + "num_tokens": 16474863.0, + "reward": 0.84820556640625, + "reward_std": 0.015403781086206436, + "rewards//mean": 0.84820556640625, + "rewards//std": 0.023896660655736923, + "step": 2263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4528, + "grad_norm": 1.3599046468734741, + "kl": 0.33632611483335495, + "learning_rate": 5.827585289968142e-07, + "loss": 0.0135, + "num_tokens": 16482103.0, + "reward": 0.86224365234375, + "reward_std": 0.013026932254433632, + "rewards//mean": 0.86224365234375, + "rewards//std": 0.01773405820131302, + "step": 2264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.453, + "grad_norm": 1.569414496421814, + "kl": 0.3126742374151945, + "learning_rate": 5.824455567504817e-07, + "loss": 0.0125, + "num_tokens": 16489479.0, + "reward": 0.86212158203125, + "reward_std": 0.01817181333899498, + "rewards//mean": 0.86212158203125, + "rewards//std": 0.030030133202672005, + "step": 2265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.4532, + "grad_norm": 1.508668065071106, + "kl": 0.33000399358570576, + "learning_rate": 5.821325512950885e-07, + "loss": -0.0157, + "num_tokens": 16496780.0, + "reward": 0.8394775390625, + "reward_std": 0.016134724020957947, + "rewards//mean": 0.8394775390625, + "rewards//std": 0.023945782333612442, + "step": 2266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4534, + "grad_norm": 1.502909541130066, + "kl": 0.3263779804110527, + "learning_rate": 5.818195127567135e-07, + "loss": 0.0131, + "num_tokens": 16504068.0, + "reward": 0.8330078125, + "reward_std": 0.014340918511152267, + "rewards//mean": 0.8330078125, + "rewards//std": 0.020669933408498764, + "step": 2267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4536, + "grad_norm": 1.2993170022964478, + "kl": 0.31301186978816986, + "learning_rate": 5.815064412614486e-07, + "loss": 0.0125, + "num_tokens": 16511444.0, + "reward": 0.86712646484375, + "reward_std": 0.01748591661453247, + "rewards//mean": 0.86712646484375, + "rewards//std": 0.02521466091275215, + "step": 2268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4538, + "grad_norm": 1.5685358047485352, + "kl": 0.3133993726223707, + "learning_rate": 5.81193336935399e-07, + "loss": 0.0125, + "num_tokens": 16518708.0, + "reward": 0.8360595703125, + "reward_std": 0.014208785258233547, + "rewards//mean": 0.8360595703125, + "rewards//std": 0.02072807215154171, + "step": 2269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.454, + "grad_norm": 1.418445348739624, + "kl": 0.32407495379447937, + "learning_rate": 5.808801999046829e-07, + "loss": 0.013, + "num_tokens": 16526044.0, + "reward": 0.86669921875, + "reward_std": 0.015414582565426826, + "rewards//mean": 0.86669921875, + "rewards//std": 0.02261173538863659, + "step": 2270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4542, + "grad_norm": 1.446516752243042, + "kl": 0.3383069634437561, + "learning_rate": 5.805670302954321e-07, + "loss": 0.0135, + "num_tokens": 16533372.0, + "reward": 0.84368896484375, + "reward_std": 0.014827505685389042, + "rewards//mean": 0.84368896484375, + "rewards//std": 0.024324847385287285, + "step": 2271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4544, + "grad_norm": 1.713549017906189, + "kl": 0.3043335024267435, + "learning_rate": 5.802538282337909e-07, + "loss": 0.0122, + "num_tokens": 16540660.0, + "reward": 0.858642578125, + "reward_std": 0.019773386418819427, + "rewards//mean": 0.858642578125, + "rewards//std": 0.03309352695941925, + "step": 2272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4546, + "grad_norm": 1.7240742444992065, + "kl": 0.31793593242764473, + "learning_rate": 5.799405938459174e-07, + "loss": 0.0127, + "num_tokens": 16547948.0, + "reward": 0.82232666015625, + "reward_std": 0.012002311646938324, + "rewards//mean": 0.82232666015625, + "rewards//std": 0.01744142174720764, + "step": 2273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.4548, + "grad_norm": 1.530139684677124, + "kl": 0.3404651191085577, + "learning_rate": 5.796273272579823e-07, + "loss": 0.0133, + "num_tokens": 16555209.0, + "reward": 0.8668212890625, + "reward_std": 0.019360797479748726, + "rewards//mean": 0.8668212890625, + "rewards//std": 0.02545582503080368, + "step": 2274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.455, + "grad_norm": 1.2947074174880981, + "kl": 0.30697335861623287, + "learning_rate": 5.793140285961692e-07, + "loss": 0.0123, + "num_tokens": 16562673.0, + "reward": 0.86883544921875, + "reward_std": 0.019968681037425995, + "rewards//mean": 0.86883544921875, + "rewards//std": 0.02718655951321125, + "step": 2275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4552, + "grad_norm": 1.5302540063858032, + "kl": 0.34005992114543915, + "learning_rate": 5.79000697986675e-07, + "loss": 0.0136, + "num_tokens": 16569929.0, + "reward": 0.844970703125, + "reward_std": 0.013517376035451889, + "rewards//mean": 0.844970703125, + "rewards//std": 0.018214214593172073, + "step": 2276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.4554, + "grad_norm": 1.531536340713501, + "kl": 0.3069956712424755, + "learning_rate": 5.78687335555709e-07, + "loss": 0.0112, + "num_tokens": 16577252.0, + "reward": 0.7725830078125, + "reward_std": 0.011172558180987835, + "rewards//mean": 0.7725830078125, + "rewards//std": 0.01750347577035427, + "step": 2277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.4556, + "grad_norm": 1.5746506452560425, + "kl": 0.33866576105356216, + "learning_rate": 5.783739414294937e-07, + "loss": 0.0141, + "num_tokens": 16584523.0, + "reward": 0.85211181640625, + "reward_std": 0.01486942544579506, + "rewards//mean": 0.85211181640625, + "rewards//std": 0.0240078903734684, + "step": 2278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4558, + "grad_norm": 1.4271868467330933, + "kl": 0.3380439057946205, + "learning_rate": 5.780605157342641e-07, + "loss": 0.0135, + "num_tokens": 16591795.0, + "reward": 0.81561279296875, + "reward_std": 0.01356214378029108, + "rewards//mean": 0.81561279296875, + "rewards//std": 0.01705082505941391, + "step": 2279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.456, + "grad_norm": 1.4991915225982666, + "kl": 0.34830914437770844, + "learning_rate": 5.777470585962681e-07, + "loss": 0.0202, + "num_tokens": 16599061.0, + "reward": 0.843017578125, + "reward_std": 0.012702924199402332, + "rewards//mean": 0.843017578125, + "rewards//std": 0.03639766573905945, + "step": 2280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4562, + "grad_norm": 1.3616441488265991, + "kl": 0.3304944522678852, + "learning_rate": 5.774335701417662e-07, + "loss": 0.0132, + "num_tokens": 16606269.0, + "reward": 0.81890869140625, + "reward_std": 0.019109733402729034, + "rewards//mean": 0.81890869140625, + "rewards//std": 0.032386988401412964, + "step": 2281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.4564, + "grad_norm": 1.4907299280166626, + "kl": 0.31893021054565907, + "learning_rate": 5.771200504970315e-07, + "loss": 0.0227, + "num_tokens": 16613609.0, + "reward": 0.79736328125, + "reward_std": 0.014140559360384941, + "rewards//mean": 0.79736328125, + "rewards//std": 0.022515127435326576, + "step": 2282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4566, + "grad_norm": 1.3650413751602173, + "kl": 0.33821350149810314, + "learning_rate": 5.768064997883498e-07, + "loss": 0.0135, + "num_tokens": 16620833.0, + "reward": 0.83050537109375, + "reward_std": 0.010855749249458313, + "rewards//mean": 0.83050537109375, + "rewards//std": 0.01301371119916439, + "step": 2283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4568, + "grad_norm": 1.6320968866348267, + "kl": 0.4005892910063267, + "learning_rate": 5.764929181420191e-07, + "loss": 0.016, + "num_tokens": 16628049.0, + "reward": 0.87054443359375, + "reward_std": 0.0199566800147295, + "rewards//mean": 0.87054443359375, + "rewards//std": 0.02372373268008232, + "step": 2284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.457, + "grad_norm": 1.829511284828186, + "kl": 0.35227320343255997, + "learning_rate": 5.7617930568435e-07, + "loss": 0.0141, + "num_tokens": 16635393.0, + "reward": 0.82244873046875, + "reward_std": 0.014580577611923218, + "rewards//mean": 0.82244873046875, + "rewards//std": 0.024388860911130905, + "step": 2285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4572, + "grad_norm": 1.4785434007644653, + "kl": 0.3222558796405792, + "learning_rate": 5.758656625416658e-07, + "loss": 0.0129, + "num_tokens": 16642721.0, + "reward": 0.8466796875, + "reward_std": 0.018438320606946945, + "rewards//mean": 0.8466796875, + "rewards//std": 0.02669384330511093, + "step": 2286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4574, + "grad_norm": 1.3968687057495117, + "kl": 0.4000097867101431, + "learning_rate": 5.755519888403017e-07, + "loss": 0.016, + "num_tokens": 16649985.0, + "reward": 0.88232421875, + "reward_std": 0.014865259639918804, + "rewards//mean": 0.88232421875, + "rewards//std": 0.024424292147159576, + "step": 2287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.453125, + "epoch": 0.4576, + "grad_norm": 1.373521089553833, + "kl": 0.290193073451519, + "learning_rate": 5.752382847066058e-07, + "loss": -0.0089, + "num_tokens": 16657214.0, + "reward": 0.8331298828125, + "reward_std": 0.013342236168682575, + "rewards//mean": 0.8331298828125, + "rewards//std": 0.017668744549155235, + "step": 2288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.4578, + "grad_norm": 1.379320502281189, + "kl": 0.35162681341171265, + "learning_rate": 5.749245502669375e-07, + "loss": 0.0117, + "num_tokens": 16664552.0, + "reward": 0.845703125, + "reward_std": 0.01365506649017334, + "rewards//mean": 0.845703125, + "rewards//std": 0.01605316251516342, + "step": 2289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.458, + "grad_norm": 1.4296072721481323, + "kl": 0.3174925036728382, + "learning_rate": 5.746107856476694e-07, + "loss": 0.0126, + "num_tokens": 16671887.0, + "reward": 0.8895263671875, + "reward_std": 0.01588613912463188, + "rewards//mean": 0.8895263671875, + "rewards//std": 0.023014839738607407, + "step": 2290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4582, + "grad_norm": 1.4327119588851929, + "kl": 0.3553072661161423, + "learning_rate": 5.742969909751858e-07, + "loss": 0.0142, + "num_tokens": 16679127.0, + "reward": 0.84283447265625, + "reward_std": 0.017132816836237907, + "rewards//mean": 0.84283447265625, + "rewards//std": 0.034002240747213364, + "step": 2291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4584, + "grad_norm": 1.4166162014007568, + "kl": 0.25701068341732025, + "learning_rate": 5.739831663758833e-07, + "loss": 0.0103, + "num_tokens": 16686407.0, + "reward": 0.83624267578125, + "reward_std": 0.009824207052588463, + "rewards//mean": 0.83624267578125, + "rewards//std": 0.03259524330496788, + "step": 2292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4586, + "grad_norm": 1.4018795490264893, + "kl": 0.4500226266682148, + "learning_rate": 5.7366931197617e-07, + "loss": 0.018, + "num_tokens": 16693719.0, + "reward": 0.8438720703125, + "reward_std": 0.018461355939507484, + "rewards//mean": 0.8438720703125, + "rewards//std": 0.026611773297190666, + "step": 2293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.4588, + "grad_norm": 1.3995252847671509, + "kl": 0.32126463763415813, + "learning_rate": 5.733554279024667e-07, + "loss": 0.0148, + "num_tokens": 16701074.0, + "reward": 0.8316650390625, + "reward_std": 0.012232963927090168, + "rewards//mean": 0.8316650390625, + "rewards//std": 0.015003751963376999, + "step": 2294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.459, + "grad_norm": 1.711667776107788, + "kl": 0.34632501751184464, + "learning_rate": 5.730415142812058e-07, + "loss": 0.0139, + "num_tokens": 16708330.0, + "reward": 0.86480712890625, + "reward_std": 0.017350029200315475, + "rewards//mean": 0.86480712890625, + "rewards//std": 0.019395826384425163, + "step": 2295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4592, + "grad_norm": 1.705626130104065, + "kl": 0.35680272802710533, + "learning_rate": 5.727275712388317e-07, + "loss": 0.0143, + "num_tokens": 16715578.0, + "reward": 0.8477783203125, + "reward_std": 0.01270887441933155, + "rewards//mean": 0.8477783203125, + "rewards//std": 0.01902199164032936, + "step": 2296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4594, + "grad_norm": 1.4378409385681152, + "kl": 0.29141072556376457, + "learning_rate": 5.724135989018006e-07, + "loss": 0.0117, + "num_tokens": 16722866.0, + "reward": 0.86602783203125, + "reward_std": 0.012162449769675732, + "rewards//mean": 0.86602783203125, + "rewards//std": 0.018954528495669365, + "step": 2297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.4596, + "grad_norm": 1.3135360479354858, + "kl": 0.327910378575325, + "learning_rate": 5.720995973965805e-07, + "loss": -0.013, + "num_tokens": 16730128.0, + "reward": 0.8060302734375, + "reward_std": 0.01478402316570282, + "rewards//mean": 0.8060302734375, + "rewards//std": 0.01971293054521084, + "step": 2298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4598, + "grad_norm": 1.3592333793640137, + "kl": 0.2960389629006386, + "learning_rate": 5.717855668496513e-07, + "loss": 0.0118, + "num_tokens": 16737416.0, + "reward": 0.8699951171875, + "reward_std": 0.014144688844680786, + "rewards//mean": 0.8699951171875, + "rewards//std": 0.019568031653761864, + "step": 2299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.46, + "grad_norm": 1.4737838506698608, + "kl": 0.2924044504761696, + "learning_rate": 5.714715073875043e-07, + "loss": 0.0117, + "num_tokens": 16744704.0, + "reward": 0.799072265625, + "reward_std": 0.007552264723926783, + "rewards//mean": 0.799072265625, + "rewards//std": 0.013385428115725517, + "step": 2300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4602, + "grad_norm": 1.312337875366211, + "kl": 0.3168789763003588, + "learning_rate": 5.711574191366427e-07, + "loss": 0.0127, + "num_tokens": 16751960.0, + "reward": 0.83551025390625, + "reward_std": 0.012222648598253727, + "rewards//mean": 0.83551025390625, + "rewards//std": 0.018428929150104523, + "step": 2301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4604, + "grad_norm": 1.2481316328048706, + "kl": 0.34419654682278633, + "learning_rate": 5.70843302223581e-07, + "loss": 0.0138, + "num_tokens": 16759312.0, + "reward": 0.83221435546875, + "reward_std": 0.013937104493379593, + "rewards//mean": 0.83221435546875, + "rewards//std": 0.01599920354783535, + "step": 2302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.4606, + "grad_norm": 1.4267473220825195, + "kl": 0.29921723529696465, + "learning_rate": 5.705291567748458e-07, + "loss": 0.0113, + "num_tokens": 16766591.0, + "reward": 0.84576416015625, + "reward_std": 0.016101961955428123, + "rewards//mean": 0.84576416015625, + "rewards//std": 0.032279305160045624, + "step": 2303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4608, + "grad_norm": 1.4728842973709106, + "kl": 0.27348887361586094, + "learning_rate": 5.702149829169746e-07, + "loss": 0.0109, + "num_tokens": 16773863.0, + "reward": 0.83251953125, + "reward_std": 0.010877709835767746, + "rewards//mean": 0.83251953125, + "rewards//std": 0.015985123813152313, + "step": 2304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.461, + "grad_norm": 1.5669991970062256, + "kl": 0.3218528535217047, + "learning_rate": 5.699007807765168e-07, + "loss": 0.0129, + "num_tokens": 16781055.0, + "reward": 0.8951416015625, + "reward_std": 0.013491692021489143, + "rewards//mean": 0.8951416015625, + "rewards//std": 0.023583978414535522, + "step": 2305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4612, + "grad_norm": 1.3088551759719849, + "kl": 0.3165646716952324, + "learning_rate": 5.695865504800327e-07, + "loss": 0.0127, + "num_tokens": 16788359.0, + "reward": 0.84942626953125, + "reward_std": 0.015448133461177349, + "rewards//mean": 0.84942626953125, + "rewards//std": 0.019721662625670433, + "step": 2306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4614, + "grad_norm": 1.4002454280853271, + "kl": 0.34112532064318657, + "learning_rate": 5.692722921540945e-07, + "loss": 0.0136, + "num_tokens": 16795751.0, + "reward": 0.8404541015625, + "reward_std": 0.021238867193460464, + "rewards//mean": 0.8404541015625, + "rewards//std": 0.030798912048339844, + "step": 2307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4616, + "grad_norm": 1.3804200887680054, + "kl": 0.38851927034556866, + "learning_rate": 5.689580059252852e-07, + "loss": 0.0155, + "num_tokens": 16802983.0, + "reward": 0.85772705078125, + "reward_std": 0.01223087403923273, + "rewards//mean": 0.85772705078125, + "rewards//std": 0.01589859649538994, + "step": 2308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.515625, + "epoch": 0.4618, + "grad_norm": 1.5100841522216797, + "kl": 0.3046862818300724, + "learning_rate": 5.686436919201996e-07, + "loss": -0.0133, + "num_tokens": 16810256.0, + "reward": 0.87432861328125, + "reward_std": 0.017200730741024017, + "rewards//mean": 0.87432861328125, + "rewards//std": 0.02351544052362442, + "step": 2309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.462, + "grad_norm": 1.3702367544174194, + "kl": 0.2967149596661329, + "learning_rate": 5.683293502654428e-07, + "loss": 0.004, + "num_tokens": 16817483.0, + "reward": 0.8321533203125, + "reward_std": 0.016602635383605957, + "rewards//mean": 0.8321533203125, + "rewards//std": 0.020751429721713066, + "step": 2310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4622, + "grad_norm": 2.0810530185699463, + "kl": 0.4009656235575676, + "learning_rate": 5.680149810876322e-07, + "loss": 0.016, + "num_tokens": 16824771.0, + "reward": 0.82135009765625, + "reward_std": 0.020026303827762604, + "rewards//mean": 0.82135009765625, + "rewards//std": 0.027252739295363426, + "step": 2311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.4624, + "grad_norm": 1.3901562690734863, + "kl": 0.32541342079639435, + "learning_rate": 5.677005845133951e-07, + "loss": 0.021, + "num_tokens": 16832032.0, + "reward": 0.7952880859375, + "reward_std": 0.0186227485537529, + "rewards//mean": 0.7952880859375, + "rewards//std": 0.022763527929782867, + "step": 2312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4626, + "grad_norm": 1.7509737014770508, + "kl": 0.342216145247221, + "learning_rate": 5.673861606693707e-07, + "loss": 0.0137, + "num_tokens": 16839272.0, + "reward": 0.8682861328125, + "reward_std": 0.011625850573182106, + "rewards//mean": 0.8682861328125, + "rewards//std": 0.024700086563825607, + "step": 2313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4628, + "grad_norm": 1.5638837814331055, + "kl": 0.3671966828405857, + "learning_rate": 5.670717096822088e-07, + "loss": 0.0147, + "num_tokens": 16846520.0, + "reward": 0.83807373046875, + "reward_std": 0.014391292817890644, + "rewards//mean": 0.83807373046875, + "rewards//std": 0.01888091117143631, + "step": 2314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.463, + "grad_norm": 1.4632747173309326, + "kl": 0.3829149305820465, + "learning_rate": 5.667572316785705e-07, + "loss": 0.0153, + "num_tokens": 16853856.0, + "reward": 0.81072998046875, + "reward_std": 0.01692821830511093, + "rewards//mean": 0.81072998046875, + "rewards//std": 0.023673269897699356, + "step": 2315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4632, + "grad_norm": 1.3727723360061646, + "kl": 0.30833204835653305, + "learning_rate": 5.664427267851271e-07, + "loss": 0.0123, + "num_tokens": 16861120.0, + "reward": 0.8558349609375, + "reward_std": 0.011571324430406094, + "rewards//mean": 0.8558349609375, + "rewards//std": 0.023393217474222183, + "step": 2316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.4634, + "grad_norm": 1.490371823310852, + "kl": 0.36112337559461594, + "learning_rate": 5.661281951285612e-07, + "loss": 0.0166, + "num_tokens": 16868381.0, + "reward": 0.87738037109375, + "reward_std": 0.01744011975824833, + "rewards//mean": 0.87738037109375, + "rewards//std": 0.02578573301434517, + "step": 2317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4636, + "grad_norm": 1.64262855052948, + "kl": 0.37203047052025795, + "learning_rate": 5.658136368355664e-07, + "loss": 0.0149, + "num_tokens": 16875693.0, + "reward": 0.8194580078125, + "reward_std": 0.014085326343774796, + "rewards//mean": 0.8194580078125, + "rewards//std": 0.019493624567985535, + "step": 2318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4638, + "grad_norm": 2.0669450759887695, + "kl": 0.317098043859005, + "learning_rate": 5.654990520328464e-07, + "loss": 0.0127, + "num_tokens": 16883029.0, + "reward": 0.8179931640625, + "reward_std": 0.015756797045469284, + "rewards//mean": 0.8179931640625, + "rewards//std": 0.019056973978877068, + "step": 2319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.464, + "grad_norm": 1.4603872299194336, + "kl": 0.31466234661638737, + "learning_rate": 5.651844408471162e-07, + "loss": 0.0126, + "num_tokens": 16890341.0, + "reward": 0.87408447265625, + "reward_std": 0.019324203953146935, + "rewards//mean": 0.87408447265625, + "rewards//std": 0.031625520437955856, + "step": 2320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4642, + "grad_norm": 1.9036576747894287, + "kl": 0.34548244811594486, + "learning_rate": 5.648698034051008e-07, + "loss": 0.0138, + "num_tokens": 16897669.0, + "reward": 0.821533203125, + "reward_std": 0.018507808446884155, + "rewards//mean": 0.821533203125, + "rewards//std": 0.023374119773507118, + "step": 2321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.671875, + "epoch": 0.4644, + "grad_norm": 1.4941167831420898, + "kl": 0.3216537982225418, + "learning_rate": 5.645551398335366e-07, + "loss": -0.0128, + "num_tokens": 16905008.0, + "reward": 0.85406494140625, + "reward_std": 0.017708800733089447, + "rewards//mean": 0.85406494140625, + "rewards//std": 0.02720937877893448, + "step": 2322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4646, + "grad_norm": 1.4420862197875977, + "kl": 0.3002531826496124, + "learning_rate": 5.642404502591697e-07, + "loss": 0.012, + "num_tokens": 16912248.0, + "reward": 0.84869384765625, + "reward_std": 0.014390340074896812, + "rewards//mean": 0.84869384765625, + "rewards//std": 0.022088779136538506, + "step": 2323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.4648, + "grad_norm": 1.5015455484390259, + "kl": 0.3742917813360691, + "learning_rate": 5.639257348087572e-07, + "loss": -0.0025, + "num_tokens": 16919479.0, + "reward": 0.857666015625, + "reward_std": 0.01638958603143692, + "rewards//mean": 0.857666015625, + "rewards//std": 0.020785322412848473, + "step": 2324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.465, + "grad_norm": 1.3577619791030884, + "kl": 0.35459692031145096, + "learning_rate": 5.636109936090661e-07, + "loss": -0.0029, + "num_tokens": 16926744.0, + "reward": 0.8609619140625, + "reward_std": 0.015825094655156136, + "rewards//mean": 0.8609619140625, + "rewards//std": 0.029876815155148506, + "step": 2325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.4652, + "grad_norm": 1.6476210355758667, + "kl": 0.3858598805963993, + "learning_rate": 5.632962267868746e-07, + "loss": 0.0145, + "num_tokens": 16934050.0, + "reward": 0.83251953125, + "reward_std": 0.018374236300587654, + "rewards//mean": 0.83251953125, + "rewards//std": 0.03016548976302147, + "step": 2326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4654, + "grad_norm": 1.3137147426605225, + "kl": 0.3355955444276333, + "learning_rate": 5.629814344689705e-07, + "loss": 0.0134, + "num_tokens": 16941266.0, + "reward": 0.80706787109375, + "reward_std": 0.010463542304933071, + "rewards//mean": 0.80706787109375, + "rewards//std": 0.01483978796750307, + "step": 2327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.4656, + "grad_norm": 1.3773194551467896, + "kl": 0.34161820262670517, + "learning_rate": 5.626666167821521e-07, + "loss": 0.015, + "num_tokens": 16948642.0, + "reward": 0.8568115234375, + "reward_std": 0.017545480281114578, + "rewards//mean": 0.8568115234375, + "rewards//std": 0.02206658199429512, + "step": 2328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4658, + "grad_norm": 1.7872211933135986, + "kl": 0.38665008544921875, + "learning_rate": 5.623517738532279e-07, + "loss": 0.0155, + "num_tokens": 16955938.0, + "reward": 0.87054443359375, + "reward_std": 0.019833475351333618, + "rewards//mean": 0.87054443359375, + "rewards//std": 0.02646482177078724, + "step": 2329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.466, + "grad_norm": 1.5522462129592896, + "kl": 0.30743958055973053, + "learning_rate": 5.620369058090168e-07, + "loss": 0.0123, + "num_tokens": 16963186.0, + "reward": 0.87677001953125, + "reward_std": 0.013843214139342308, + "rewards//mean": 0.87677001953125, + "rewards//std": 0.018094873055815697, + "step": 2330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4662, + "grad_norm": 1.3940316438674927, + "kl": 0.31350768357515335, + "learning_rate": 5.617220127763474e-07, + "loss": 0.0125, + "num_tokens": 16970466.0, + "reward": 0.87432861328125, + "reward_std": 0.015374984592199326, + "rewards//mean": 0.87432861328125, + "rewards//std": 0.030997512862086296, + "step": 2331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4664, + "grad_norm": 1.3733043670654297, + "kl": 0.3145250454545021, + "learning_rate": 5.614070948820585e-07, + "loss": 0.0126, + "num_tokens": 16977754.0, + "reward": 0.821533203125, + "reward_std": 0.012130447663366795, + "rewards//mean": 0.821533203125, + "rewards//std": 0.028398511931300163, + "step": 2332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4666, + "grad_norm": 1.389761209487915, + "kl": 0.33008917793631554, + "learning_rate": 5.610921522529993e-07, + "loss": 0.0132, + "num_tokens": 16984962.0, + "reward": 0.89447021484375, + "reward_std": 0.019609464332461357, + "rewards//mean": 0.89447021484375, + "rewards//std": 0.022439613938331604, + "step": 2333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4668, + "grad_norm": 1.3954981565475464, + "kl": 0.3256386537104845, + "learning_rate": 5.607771850160284e-07, + "loss": 0.013, + "num_tokens": 16992338.0, + "reward": 0.84539794921875, + "reward_std": 0.013144716620445251, + "rewards//mean": 0.84539794921875, + "rewards//std": 0.023102398961782455, + "step": 2334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.467, + "grad_norm": 1.4068701267242432, + "kl": 0.29449519142508507, + "learning_rate": 5.604621932980147e-07, + "loss": 0.0118, + "num_tokens": 16999626.0, + "reward": 0.848876953125, + "reward_std": 0.01531308051198721, + "rewards//mean": 0.848876953125, + "rewards//std": 0.037922829389572144, + "step": 2335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4672, + "grad_norm": 1.476806402206421, + "kl": 0.2992158755660057, + "learning_rate": 5.601471772258367e-07, + "loss": 0.012, + "num_tokens": 17006858.0, + "reward": 0.87030029296875, + "reward_std": 0.014557733200490475, + "rewards//mean": 0.87030029296875, + "rewards//std": 0.022262847051024437, + "step": 2336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4674, + "grad_norm": 1.6241650581359863, + "kl": 0.3169133812189102, + "learning_rate": 5.598321369263829e-07, + "loss": 0.0127, + "num_tokens": 17014194.0, + "reward": 0.86376953125, + "reward_std": 0.021749937906861305, + "rewards//mean": 0.86376953125, + "rewards//std": 0.027537930756807327, + "step": 2337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4676, + "grad_norm": 1.4931342601776123, + "kl": 0.4038800895214081, + "learning_rate": 5.595170725265516e-07, + "loss": 0.0162, + "num_tokens": 17021450.0, + "reward": 0.83447265625, + "reward_std": 0.016028208658099174, + "rewards//mean": 0.83447265625, + "rewards//std": 0.018733717501163483, + "step": 2338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4678, + "grad_norm": 1.3784924745559692, + "kl": 0.29164848290383816, + "learning_rate": 5.592019841532506e-07, + "loss": 0.0117, + "num_tokens": 17028770.0, + "reward": 0.85870361328125, + "reward_std": 0.015160935930907726, + "rewards//mean": 0.85870361328125, + "rewards//std": 0.019581466913223267, + "step": 2339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.468, + "grad_norm": 1.3264857530593872, + "kl": 0.4082622155547142, + "learning_rate": 5.588868719333974e-07, + "loss": 0.0163, + "num_tokens": 17036002.0, + "reward": 0.84564208984375, + "reward_std": 0.010901980102062225, + "rewards//mean": 0.84564208984375, + "rewards//std": 0.022670486941933632, + "step": 2340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4682, + "grad_norm": 1.5357109308242798, + "kl": 0.2977010998874903, + "learning_rate": 5.585717359939192e-07, + "loss": 0.0119, + "num_tokens": 17043298.0, + "reward": 0.81549072265625, + "reward_std": 0.01352517120540142, + "rewards//mean": 0.81549072265625, + "rewards//std": 0.024878626689314842, + "step": 2341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4684, + "grad_norm": 1.5241726636886597, + "kl": 0.38486671447753906, + "learning_rate": 5.582565764617527e-07, + "loss": 0.0154, + "num_tokens": 17050546.0, + "reward": 0.86859130859375, + "reward_std": 0.016684040427207947, + "rewards//mean": 0.86859130859375, + "rewards//std": 0.02586778998374939, + "step": 2342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4686, + "grad_norm": 1.5037719011306763, + "kl": 0.34635366685688496, + "learning_rate": 5.579413934638442e-07, + "loss": 0.0139, + "num_tokens": 17057882.0, + "reward": 0.8250732421875, + "reward_std": 0.0105352271348238, + "rewards//mean": 0.8250732421875, + "rewards//std": 0.01624019630253315, + "step": 2343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4688, + "grad_norm": 2.542478322982788, + "kl": 0.4660397656261921, + "learning_rate": 5.576261871271494e-07, + "loss": 0.0186, + "num_tokens": 17065186.0, + "reward": 0.829345703125, + "reward_std": 0.014167509973049164, + "rewards//mean": 0.829345703125, + "rewards//std": 0.017398081719875336, + "step": 2344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.469, + "grad_norm": 1.5721025466918945, + "kl": 0.3148153368383646, + "learning_rate": 5.573109575786333e-07, + "loss": 0.0126, + "num_tokens": 17072514.0, + "reward": 0.87408447265625, + "reward_std": 0.019733160734176636, + "rewards//mean": 0.87408447265625, + "rewards//std": 0.030255619436502457, + "step": 2345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4692, + "grad_norm": 1.3776987791061401, + "kl": 0.3263516929000616, + "learning_rate": 5.569957049452702e-07, + "loss": 0.0131, + "num_tokens": 17079794.0, + "reward": 0.8209228515625, + "reward_std": 0.011170506477355957, + "rewards//mean": 0.8209228515625, + "rewards//std": 0.015503914095461369, + "step": 2346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.4694, + "grad_norm": 1.3966516256332397, + "kl": 0.38271929137408733, + "learning_rate": 5.566804293540443e-07, + "loss": 0.0146, + "num_tokens": 17087147.0, + "reward": 0.85198974609375, + "reward_std": 0.015002376399934292, + "rewards//mean": 0.85198974609375, + "rewards//std": 0.020589124411344528, + "step": 2347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4696, + "grad_norm": 1.6057180166244507, + "kl": 0.3382028192281723, + "learning_rate": 5.563651309319479e-07, + "loss": 0.0135, + "num_tokens": 17094467.0, + "reward": 0.8258056640625, + "reward_std": 0.015041803941130638, + "rewards//mean": 0.8258056640625, + "rewards//std": 0.03031940758228302, + "step": 2348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4698, + "grad_norm": 1.3443845510482788, + "kl": 0.2766585499048233, + "learning_rate": 5.560498098059837e-07, + "loss": 0.0111, + "num_tokens": 17101739.0, + "reward": 0.83160400390625, + "reward_std": 0.014905640855431557, + "rewards//mean": 0.83160400390625, + "rewards//std": 0.018402624875307083, + "step": 2349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.47, + "grad_norm": 1.6523327827453613, + "kl": 0.35117669962346554, + "learning_rate": 5.557344661031627e-07, + "loss": 0.014, + "num_tokens": 17109083.0, + "reward": 0.82562255859375, + "reward_std": 0.014046436175704002, + "rewards//mean": 0.82562255859375, + "rewards//std": 0.016148941591382027, + "step": 2350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4702, + "grad_norm": 1.3744518756866455, + "kl": 0.397247027605772, + "learning_rate": 5.554190999505055e-07, + "loss": 0.0159, + "num_tokens": 17116355.0, + "reward": 0.82635498046875, + "reward_std": 0.011401279829442501, + "rewards//mean": 0.82635498046875, + "rewards//std": 0.012851007282733917, + "step": 2351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4704, + "grad_norm": 1.535327434539795, + "kl": 0.3116863276809454, + "learning_rate": 5.551037114750414e-07, + "loss": 0.0125, + "num_tokens": 17123659.0, + "reward": 0.879150390625, + "reward_std": 0.017576877027750015, + "rewards//mean": 0.879150390625, + "rewards//std": 0.03169538825750351, + "step": 2352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4706, + "grad_norm": 1.4694327116012573, + "kl": 0.36702999100089073, + "learning_rate": 5.54788300803809e-07, + "loss": 0.0147, + "num_tokens": 17130923.0, + "reward": 0.87799072265625, + "reward_std": 0.012872977182269096, + "rewards//mean": 0.87799072265625, + "rewards//std": 0.026261555030941963, + "step": 2353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4708, + "grad_norm": 1.574371576309204, + "kl": 0.3226943090558052, + "learning_rate": 5.544728680638556e-07, + "loss": 0.0129, + "num_tokens": 17138187.0, + "reward": 0.81207275390625, + "reward_std": 0.018236998468637466, + "rewards//mean": 0.81207275390625, + "rewards//std": 0.026947753503918648, + "step": 2354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.471, + "grad_norm": 1.402172327041626, + "kl": 0.2961596418172121, + "learning_rate": 5.541574133822373e-07, + "loss": 0.0118, + "num_tokens": 17145467.0, + "reward": 0.87017822265625, + "reward_std": 0.01738920621573925, + "rewards//mean": 0.87017822265625, + "rewards//std": 0.026472827419638634, + "step": 2355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4712, + "grad_norm": 1.7814453840255737, + "kl": 0.3393451049923897, + "learning_rate": 5.538419368860195e-07, + "loss": 0.0136, + "num_tokens": 17152739.0, + "reward": 0.84283447265625, + "reward_std": 0.020039178431034088, + "rewards//mean": 0.84283447265625, + "rewards//std": 0.04022177681326866, + "step": 2356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.4714, + "grad_norm": 1.5753599405288696, + "kl": 0.2751696743071079, + "learning_rate": 5.535264387022759e-07, + "loss": 0.0049, + "num_tokens": 17159916.0, + "reward": 0.87347412109375, + "reward_std": 0.012764385901391506, + "rewards//mean": 0.87347412109375, + "rewards//std": 0.029721559956669807, + "step": 2357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4716, + "grad_norm": 1.3622314929962158, + "kl": 0.2859406713396311, + "learning_rate": 5.532109189580892e-07, + "loss": 0.0114, + "num_tokens": 17167228.0, + "reward": 0.85247802734375, + "reward_std": 0.012255849316716194, + "rewards//mean": 0.85247802734375, + "rewards//std": 0.023537959903478622, + "step": 2358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4718, + "grad_norm": 1.835752010345459, + "kl": 0.30109792202711105, + "learning_rate": 5.528953777805507e-07, + "loss": 0.012, + "num_tokens": 17174428.0, + "reward": 0.87841796875, + "reward_std": 0.020807169377803802, + "rewards//mean": 0.87841796875, + "rewards//std": 0.025137772783637047, + "step": 2359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.472, + "grad_norm": 1.469710111618042, + "kl": 0.36336956545710564, + "learning_rate": 5.525798152967605e-07, + "loss": 0.0149, + "num_tokens": 17181672.0, + "reward": 0.8682861328125, + "reward_std": 0.013986273668706417, + "rewards//mean": 0.8682861328125, + "rewards//std": 0.024856483563780785, + "step": 2360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.4722, + "grad_norm": 1.6603418588638306, + "kl": 0.39175924472510815, + "learning_rate": 5.522642316338268e-07, + "loss": 0.0147, + "num_tokens": 17189054.0, + "reward": 0.866943359375, + "reward_std": 0.014693505130708218, + "rewards//mean": 0.866943359375, + "rewards//std": 0.0229453444480896, + "step": 2361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4724, + "grad_norm": 1.2261914014816284, + "kl": 0.33290868252515793, + "learning_rate": 5.519486269188669e-07, + "loss": 0.0133, + "num_tokens": 17196374.0, + "reward": 0.82470703125, + "reward_std": 0.01543046347796917, + "rewards//mean": 0.82470703125, + "rewards//std": 0.019667185842990875, + "step": 2362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.4726, + "grad_norm": 1.472251057624817, + "kl": 0.32916848734021187, + "learning_rate": 5.516330012790062e-07, + "loss": 0.0182, + "num_tokens": 17203707.0, + "reward": 0.761474609375, + "reward_std": 0.011230330914258957, + "rewards//mean": 0.761474609375, + "rewards//std": 0.01684640534222126, + "step": 2363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4728, + "grad_norm": 1.4784951210021973, + "kl": 0.3048204034566879, + "learning_rate": 5.513173548413789e-07, + "loss": 0.0122, + "num_tokens": 17211011.0, + "reward": 0.83331298828125, + "reward_std": 0.012530307285487652, + "rewards//mean": 0.83331298828125, + "rewards//std": 0.017758795991539955, + "step": 2364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.473, + "grad_norm": 1.5451799631118774, + "kl": 0.37326720356941223, + "learning_rate": 5.51001687733127e-07, + "loss": 0.0149, + "num_tokens": 17218283.0, + "reward": 0.81243896484375, + "reward_std": 0.01196964830160141, + "rewards//mean": 0.81243896484375, + "rewards//std": 0.01864289492368698, + "step": 2365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.4732, + "grad_norm": 1.5157819986343384, + "kl": 0.4176044128835201, + "learning_rate": 5.506860000814017e-07, + "loss": 0.0162, + "num_tokens": 17225550.0, + "reward": 0.83087158203125, + "reward_std": 0.014503121376037598, + "rewards//mean": 0.83087158203125, + "rewards//std": 0.023142987862229347, + "step": 2366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4734, + "grad_norm": 1.6973224878311157, + "kl": 0.34885939583182335, + "learning_rate": 5.503702920133614e-07, + "loss": 0.014, + "num_tokens": 17232838.0, + "reward": 0.8323974609375, + "reward_std": 0.015544988214969635, + "rewards//mean": 0.8323974609375, + "rewards//std": 0.019975343719124794, + "step": 2367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4736, + "grad_norm": 1.603991150856018, + "kl": 0.32326309382915497, + "learning_rate": 5.500545636561736e-07, + "loss": 0.0129, + "num_tokens": 17240062.0, + "reward": 0.8631591796875, + "reward_std": 0.017568835988640785, + "rewards//mean": 0.8631591796875, + "rewards//std": 0.02099510282278061, + "step": 2368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4738, + "grad_norm": 1.3744122982025146, + "kl": 0.28130432963371277, + "learning_rate": 5.497388151370135e-07, + "loss": 0.0113, + "num_tokens": 17247214.0, + "reward": 0.82159423828125, + "reward_std": 0.01311366818845272, + "rewards//mean": 0.82159423828125, + "rewards//std": 0.014007531106472015, + "step": 2369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.474, + "grad_norm": 1.8509761095046997, + "kl": 0.3337865322828293, + "learning_rate": 5.494230465830647e-07, + "loss": 0.0134, + "num_tokens": 17254702.0, + "reward": 0.80731201171875, + "reward_std": 0.017451170831918716, + "rewards//mean": 0.80731201171875, + "rewards//std": 0.020564846694469452, + "step": 2370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4742, + "grad_norm": 1.415333867073059, + "kl": 0.3465006351470947, + "learning_rate": 5.491072581215186e-07, + "loss": 0.0139, + "num_tokens": 17261982.0, + "reward": 0.74029541015625, + "reward_std": 0.01667964830994606, + "rewards//mean": 0.74029541015625, + "rewards//std": 0.0286964550614357, + "step": 2371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4744, + "grad_norm": 1.9059627056121826, + "kl": 0.31619839556515217, + "learning_rate": 5.487914498795747e-07, + "loss": 0.0126, + "num_tokens": 17269182.0, + "reward": 0.822509765625, + "reward_std": 0.014867372810840607, + "rewards//mean": 0.822509765625, + "rewards//std": 0.01953590102493763, + "step": 2372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.4746, + "grad_norm": 1.3196181058883667, + "kl": 0.3631968880072236, + "learning_rate": 5.484756219844407e-07, + "loss": 0.0171, + "num_tokens": 17276518.0, + "reward": 0.80938720703125, + "reward_std": 0.013136066496372223, + "rewards//mean": 0.80938720703125, + "rewards//std": 0.0167643241584301, + "step": 2373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4748, + "grad_norm": 1.843230962753296, + "kl": 0.3804176952689886, + "learning_rate": 5.48159774563332e-07, + "loss": 0.0152, + "num_tokens": 17283806.0, + "reward": 0.80157470703125, + "reward_std": 0.012708366848528385, + "rewards//mean": 0.80157470703125, + "rewards//std": 0.022853374481201172, + "step": 2374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.475, + "grad_norm": 1.3226146697998047, + "kl": 0.3170162308961153, + "learning_rate": 5.478439077434717e-07, + "loss": 0.0127, + "num_tokens": 17290990.0, + "reward": 0.8818359375, + "reward_std": 0.018570497632026672, + "rewards//mean": 0.8818359375, + "rewards//std": 0.02632840722799301, + "step": 2375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4752, + "grad_norm": 2.2722573280334473, + "kl": 0.4203906450420618, + "learning_rate": 5.475280216520912e-07, + "loss": 0.0168, + "num_tokens": 17298246.0, + "reward": 0.80548095703125, + "reward_std": 0.018382323905825615, + "rewards//mean": 0.80548095703125, + "rewards//std": 0.023221347481012344, + "step": 2376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4754, + "grad_norm": 1.4559471607208252, + "kl": 0.3309335447847843, + "learning_rate": 5.472121164164295e-07, + "loss": 0.0132, + "num_tokens": 17305542.0, + "reward": 0.80609130859375, + "reward_std": 0.014265151694417, + "rewards//mean": 0.80609130859375, + "rewards//std": 0.024413054808974266, + "step": 2377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.4756, + "grad_norm": 1.4365839958190918, + "kl": 0.2968854885548353, + "learning_rate": 5.468961921637326e-07, + "loss": 0.0116, + "num_tokens": 17312825.0, + "reward": 0.8204345703125, + "reward_std": 0.01488316711038351, + "rewards//mean": 0.8204345703125, + "rewards//std": 0.038994044065475464, + "step": 2378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.4758, + "grad_norm": 1.6646250486373901, + "kl": 0.43174179643392563, + "learning_rate": 5.465802490212554e-07, + "loss": -0.0012, + "num_tokens": 17320081.0, + "reward": 0.82659912109375, + "reward_std": 0.014929205179214478, + "rewards//mean": 0.82659912109375, + "rewards//std": 0.025578254833817482, + "step": 2379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.476, + "grad_norm": 1.4206526279449463, + "kl": 0.3550833482295275, + "learning_rate": 5.462642871162592e-07, + "loss": 0.0142, + "num_tokens": 17327321.0, + "reward": 0.86834716796875, + "reward_std": 0.0171637162566185, + "rewards//mean": 0.86834716796875, + "rewards//std": 0.02459406480193138, + "step": 2380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4762, + "grad_norm": 1.4568432569503784, + "kl": 0.29813707806169987, + "learning_rate": 5.459483065760138e-07, + "loss": 0.0119, + "num_tokens": 17334633.0, + "reward": 0.88616943359375, + "reward_std": 0.012764301151037216, + "rewards//mean": 0.88616943359375, + "rewards//std": 0.020085962489247322, + "step": 2381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4764, + "grad_norm": 1.5534189939498901, + "kl": 0.33170517161488533, + "learning_rate": 5.456323075277959e-07, + "loss": 0.0133, + "num_tokens": 17341849.0, + "reward": 0.83172607421875, + "reward_std": 0.012380847707390785, + "rewards//mean": 0.83172607421875, + "rewards//std": 0.01710931956768036, + "step": 2382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4766, + "grad_norm": 1.4521749019622803, + "kl": 0.33807599544525146, + "learning_rate": 5.453162900988901e-07, + "loss": 0.0135, + "num_tokens": 17349193.0, + "reward": 0.84002685546875, + "reward_std": 0.014961311593651772, + "rewards//mean": 0.84002685546875, + "rewards//std": 0.026258673518896103, + "step": 2383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.4768, + "grad_norm": 1.543644905090332, + "kl": 0.31746718659996986, + "learning_rate": 5.45000254416588e-07, + "loss": 0.0252, + "num_tokens": 17356516.0, + "reward": 0.7825927734375, + "reward_std": 0.012128467671573162, + "rewards//mean": 0.7825927734375, + "rewards//std": 0.018522026017308235, + "step": 2384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.477, + "grad_norm": 1.4048370122909546, + "kl": 0.2985537722706795, + "learning_rate": 5.446842006081888e-07, + "loss": 0.0119, + "num_tokens": 17363868.0, + "reward": 0.81878662109375, + "reward_std": 0.014385750517249107, + "rewards//mean": 0.81878662109375, + "rewards//std": 0.018237365409731865, + "step": 2385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4772, + "grad_norm": 1.5494420528411865, + "kl": 0.34720510989427567, + "learning_rate": 5.443681288009991e-07, + "loss": 0.0139, + "num_tokens": 17371084.0, + "reward": 0.8646240234375, + "reward_std": 0.017165720462799072, + "rewards//mean": 0.8646240234375, + "rewards//std": 0.028401443734765053, + "step": 2386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4774, + "grad_norm": 1.2972042560577393, + "kl": 0.34015797823667526, + "learning_rate": 5.440520391223322e-07, + "loss": 0.0136, + "num_tokens": 17378404.0, + "reward": 0.81976318359375, + "reward_std": 0.01716548204421997, + "rewards//mean": 0.81976318359375, + "rewards//std": 0.0303605068475008, + "step": 2387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4776, + "grad_norm": 1.625765085220337, + "kl": 0.3211927078664303, + "learning_rate": 5.437359316995093e-07, + "loss": 0.0128, + "num_tokens": 17385636.0, + "reward": 0.832275390625, + "reward_std": 0.015352964401245117, + "rewards//mean": 0.832275390625, + "rewards//std": 0.025523822754621506, + "step": 2388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4778, + "grad_norm": 1.4123836755752563, + "kl": 0.28652566112577915, + "learning_rate": 5.434198066598584e-07, + "loss": 0.0115, + "num_tokens": 17392948.0, + "reward": 0.84619140625, + "reward_std": 0.016989439725875854, + "rewards//mean": 0.84619140625, + "rewards//std": 0.02468077465891838, + "step": 2389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.478, + "grad_norm": 1.534118413925171, + "kl": 0.3555862456560135, + "learning_rate": 5.431036641307145e-07, + "loss": 0.0142, + "num_tokens": 17400164.0, + "reward": 0.85406494140625, + "reward_std": 0.016357293352484703, + "rewards//mean": 0.85406494140625, + "rewards//std": 0.022578153759241104, + "step": 2390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.4782, + "grad_norm": 1.3815406560897827, + "kl": 0.2699141912162304, + "learning_rate": 5.427875042394199e-07, + "loss": 0.0122, + "num_tokens": 17407481.0, + "reward": 0.86785888671875, + "reward_std": 0.017113016918301582, + "rewards//mean": 0.86785888671875, + "rewards//std": 0.02380208671092987, + "step": 2391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4784, + "grad_norm": 1.362116813659668, + "kl": 0.3144776411354542, + "learning_rate": 5.424713271133236e-07, + "loss": 0.0126, + "num_tokens": 17414745.0, + "reward": 0.8677978515625, + "reward_std": 0.015899334102869034, + "rewards//mean": 0.8677978515625, + "rewards//std": 0.021284397691488266, + "step": 2392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.4786, + "grad_norm": 1.9109947681427002, + "kl": 0.35705954395234585, + "learning_rate": 5.421551328797819e-07, + "loss": 0.0145, + "num_tokens": 17422028.0, + "reward": 0.87408447265625, + "reward_std": 0.017532572150230408, + "rewards//mean": 0.87408447265625, + "rewards//std": 0.023599620908498764, + "step": 2393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4788, + "grad_norm": 1.574676513671875, + "kl": 0.3092162571847439, + "learning_rate": 5.418389216661578e-07, + "loss": 0.0124, + "num_tokens": 17429428.0, + "reward": 0.87164306640625, + "reward_std": 0.0214284285902977, + "rewards//mean": 0.87164306640625, + "rewards//std": 0.027497153729200363, + "step": 2394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.479, + "grad_norm": 1.3515756130218506, + "kl": 0.3060918413102627, + "learning_rate": 5.41522693599821e-07, + "loss": 0.0122, + "num_tokens": 17436644.0, + "reward": 0.84906005859375, + "reward_std": 0.013988031074404716, + "rewards//mean": 0.84906005859375, + "rewards//std": 0.018073946237564087, + "step": 2395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4792, + "grad_norm": 1.5401685237884521, + "kl": 0.3261805512011051, + "learning_rate": 5.412064488081481e-07, + "loss": 0.013, + "num_tokens": 17443996.0, + "reward": 0.8287353515625, + "reward_std": 0.011919111013412476, + "rewards//mean": 0.8287353515625, + "rewards//std": 0.01809874176979065, + "step": 2396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.546875, + "epoch": 0.4794, + "grad_norm": 1.4727890491485596, + "kl": 0.3266275245696306, + "learning_rate": 5.408901874185225e-07, + "loss": -0.018, + "num_tokens": 17451263.0, + "reward": 0.85870361328125, + "reward_std": 0.017522722482681274, + "rewards//mean": 0.85870361328125, + "rewards//std": 0.0261211097240448, + "step": 2397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4796, + "grad_norm": 1.3248127698898315, + "kl": 0.3098595719784498, + "learning_rate": 5.405739095583344e-07, + "loss": 0.0124, + "num_tokens": 17458391.0, + "reward": 0.790771484375, + "reward_std": 0.014775514602661133, + "rewards//mean": 0.790771484375, + "rewards//std": 0.025073861703276634, + "step": 2398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.4798, + "grad_norm": 1.4922069311141968, + "kl": 0.2958532124757767, + "learning_rate": 5.402576153549804e-07, + "loss": 0.012, + "num_tokens": 17465655.0, + "reward": 0.856689453125, + "reward_std": 0.01340450718998909, + "rewards//mean": 0.856689453125, + "rewards//std": 0.02383585087954998, + "step": 2399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.48, + "grad_norm": 1.5653762817382812, + "kl": 0.30993527546525, + "learning_rate": 5.399413049358637e-07, + "loss": 0.0124, + "num_tokens": 17472959.0, + "reward": 0.86883544921875, + "reward_std": 0.017756611108779907, + "rewards//mean": 0.86883544921875, + "rewards//std": 0.022367315366864204, + "step": 2400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4802, + "grad_norm": 1.5497336387634277, + "kl": 0.28866592049598694, + "learning_rate": 5.396249784283942e-07, + "loss": 0.0115, + "num_tokens": 17480295.0, + "reward": 0.86187744140625, + "reward_std": 0.012434168718755245, + "rewards//mean": 0.86187744140625, + "rewards//std": 0.01754353754222393, + "step": 2401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4804, + "grad_norm": 1.7036911249160767, + "kl": 0.45162950083613396, + "learning_rate": 5.393086359599881e-07, + "loss": 0.0181, + "num_tokens": 17487519.0, + "reward": 0.85552978515625, + "reward_std": 0.017902448773384094, + "rewards//mean": 0.85552978515625, + "rewards//std": 0.03042127564549446, + "step": 2402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4806, + "grad_norm": 1.5525100231170654, + "kl": 0.32990336418151855, + "learning_rate": 5.389922776580681e-07, + "loss": 0.0132, + "num_tokens": 17494791.0, + "reward": 0.84320068359375, + "reward_std": 0.013010159134864807, + "rewards//mean": 0.84320068359375, + "rewards//std": 0.022903000935912132, + "step": 2403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.4808, + "grad_norm": 1.2748855352401733, + "kl": 0.2743702754378319, + "learning_rate": 5.386759036500634e-07, + "loss": 0.0055, + "num_tokens": 17502067.0, + "reward": 0.846435546875, + "reward_std": 0.012663032859563828, + "rewards//mean": 0.846435546875, + "rewards//std": 0.021540669724345207, + "step": 2404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.481, + "grad_norm": 1.3825390338897705, + "kl": 0.336236834526062, + "learning_rate": 5.383595140634093e-07, + "loss": 0.0134, + "num_tokens": 17509363.0, + "reward": 0.87286376953125, + "reward_std": 0.015880491584539413, + "rewards//mean": 0.87286376953125, + "rewards//std": 0.021698124706745148, + "step": 2405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4812, + "grad_norm": 1.3312467336654663, + "kl": 0.37822291254997253, + "learning_rate": 5.380431090255475e-07, + "loss": 0.0151, + "num_tokens": 17516595.0, + "reward": 0.85382080078125, + "reward_std": 0.014232274144887924, + "rewards//mean": 0.85382080078125, + "rewards//std": 0.02140451967716217, + "step": 2406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4814, + "grad_norm": 1.2577091455459595, + "kl": 0.40733293257653713, + "learning_rate": 5.377266886639259e-07, + "loss": 0.0163, + "num_tokens": 17523907.0, + "reward": 0.83746337890625, + "reward_std": 0.016516495496034622, + "rewards//mean": 0.83746337890625, + "rewards//std": 0.02318415977060795, + "step": 2407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4816, + "grad_norm": 1.3953183889389038, + "kl": 0.3512003030627966, + "learning_rate": 5.374102531059987e-07, + "loss": 0.014, + "num_tokens": 17531115.0, + "reward": 0.84527587890625, + "reward_std": 0.014412648975849152, + "rewards//mean": 0.84527587890625, + "rewards//std": 0.0214814692735672, + "step": 2408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4818, + "grad_norm": 1.5185785293579102, + "kl": 0.3365940786898136, + "learning_rate": 5.370938024792261e-07, + "loss": 0.0135, + "num_tokens": 17538387.0, + "reward": 0.85693359375, + "reward_std": 0.014014898799359798, + "rewards//mean": 0.85693359375, + "rewards//std": 0.01722494326531887, + "step": 2409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.482, + "grad_norm": 1.3935163021087646, + "kl": 0.4216582179069519, + "learning_rate": 5.367773369110741e-07, + "loss": 0.0169, + "num_tokens": 17545667.0, + "reward": 0.82177734375, + "reward_std": 0.013771893456578255, + "rewards//mean": 0.82177734375, + "rewards//std": 0.024710197001695633, + "step": 2410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4822, + "grad_norm": 1.5633444786071777, + "kl": 0.35166187956929207, + "learning_rate": 5.364608565290154e-07, + "loss": 0.0141, + "num_tokens": 17552907.0, + "reward": 0.823486328125, + "reward_std": 0.018559707328677177, + "rewards//mean": 0.823486328125, + "rewards//std": 0.03520702198147774, + "step": 2411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4824, + "grad_norm": 1.248695969581604, + "kl": 0.32478718273341656, + "learning_rate": 5.361443614605278e-07, + "loss": 0.013, + "num_tokens": 17560227.0, + "reward": 0.82672119140625, + "reward_std": 0.011821997351944447, + "rewards//mean": 0.82672119140625, + "rewards//std": 0.012359845452010632, + "step": 2412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4826, + "grad_norm": 1.8152087926864624, + "kl": 0.34219413809478283, + "learning_rate": 5.358278518330959e-07, + "loss": 0.0137, + "num_tokens": 17567619.0, + "reward": 0.82232666015625, + "reward_std": 0.011763770133256912, + "rewards//mean": 0.82232666015625, + "rewards//std": 0.01948770135641098, + "step": 2413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.4828, + "grad_norm": 1.427032232284546, + "kl": 0.343020873144269, + "learning_rate": 5.355113277742095e-07, + "loss": 0.0068, + "num_tokens": 17574922.0, + "reward": 0.82562255859375, + "reward_std": 0.01341967098414898, + "rewards//mean": 0.82562255859375, + "rewards//std": 0.025698702782392502, + "step": 2414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.483, + "grad_norm": 1.3851693868637085, + "kl": 0.28300685435533524, + "learning_rate": 5.351947894113645e-07, + "loss": 0.0113, + "num_tokens": 17582218.0, + "reward": 0.7955322265625, + "reward_std": 0.01764923706650734, + "rewards//mean": 0.7955322265625, + "rewards//std": 0.03626662865281105, + "step": 2415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.4832, + "grad_norm": 1.4973065853118896, + "kl": 0.3447548411786556, + "learning_rate": 5.348782368720625e-07, + "loss": 0.0137, + "num_tokens": 17589519.0, + "reward": 0.8294677734375, + "reward_std": 0.016405753791332245, + "rewards//mean": 0.8294677734375, + "rewards//std": 0.01763787493109703, + "step": 2416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4834, + "grad_norm": 1.3788809776306152, + "kl": 0.31608189456164837, + "learning_rate": 5.34561670283811e-07, + "loss": 0.0126, + "num_tokens": 17596767.0, + "reward": 0.84356689453125, + "reward_std": 0.015581324696540833, + "rewards//mean": 0.84356689453125, + "rewards//std": 0.022781724110245705, + "step": 2417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4836, + "grad_norm": 1.5034992694854736, + "kl": 0.32395802438259125, + "learning_rate": 5.342450897741228e-07, + "loss": 0.013, + "num_tokens": 17604111.0, + "reward": 0.86395263671875, + "reward_std": 0.016605503857135773, + "rewards//mean": 0.86395263671875, + "rewards//std": 0.020488150417804718, + "step": 2418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4838, + "grad_norm": 1.2850298881530762, + "kl": 0.33184175938367844, + "learning_rate": 5.339284954705165e-07, + "loss": 0.0133, + "num_tokens": 17611367.0, + "reward": 0.82025146484375, + "reward_std": 0.011805547401309013, + "rewards//mean": 0.82025146484375, + "rewards//std": 0.019003180786967278, + "step": 2419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.484, + "grad_norm": 2.032029628753662, + "kl": 0.3359858989715576, + "learning_rate": 5.336118875005164e-07, + "loss": 0.0134, + "num_tokens": 17618711.0, + "reward": 0.8475341796875, + "reward_std": 0.01425875723361969, + "rewards//mean": 0.8475341796875, + "rewards//std": 0.02464609593153, + "step": 2420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4842, + "grad_norm": 1.5763049125671387, + "kl": 0.40729560889303684, + "learning_rate": 5.33295265991652e-07, + "loss": 0.0163, + "num_tokens": 17625983.0, + "reward": 0.86572265625, + "reward_std": 0.015242833644151688, + "rewards//mean": 0.86572265625, + "rewards//std": 0.03364965319633484, + "step": 2421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4844, + "grad_norm": 1.382408618927002, + "kl": 0.3205743134021759, + "learning_rate": 5.329786310714582e-07, + "loss": 0.0128, + "num_tokens": 17633335.0, + "reward": 0.8125, + "reward_std": 0.011979279108345509, + "rewards//mean": 0.8125, + "rewards//std": 0.018920255824923515, + "step": 2422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4846, + "grad_norm": 1.3991276025772095, + "kl": 0.40574508160352707, + "learning_rate": 5.326619828674761e-07, + "loss": 0.0162, + "num_tokens": 17640599.0, + "reward": 0.7958984375, + "reward_std": 0.012045057490468025, + "rewards//mean": 0.7958984375, + "rewards//std": 0.01735801063477993, + "step": 2423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4848, + "grad_norm": 1.4956269264221191, + "kl": 0.28723564371466637, + "learning_rate": 5.323453215072509e-07, + "loss": 0.0115, + "num_tokens": 17647927.0, + "reward": 0.837646484375, + "reward_std": 0.01367952674627304, + "rewards//mean": 0.837646484375, + "rewards//std": 0.02575055882334709, + "step": 2424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.485, + "grad_norm": 1.3034106492996216, + "kl": 0.281150184571743, + "learning_rate": 5.320286471183343e-07, + "loss": 0.0112, + "num_tokens": 17655231.0, + "reward": 0.82965087890625, + "reward_std": 0.01704344153404236, + "rewards//mean": 0.82965087890625, + "rewards//std": 0.026880258694291115, + "step": 2425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4852, + "grad_norm": 1.6178908348083496, + "kl": 0.3057237509638071, + "learning_rate": 5.317119598282822e-07, + "loss": 0.0122, + "num_tokens": 17662407.0, + "reward": 0.879150390625, + "reward_std": 0.016376886516809464, + "rewards//mean": 0.879150390625, + "rewards//std": 0.026428259909152985, + "step": 2426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4854, + "grad_norm": 1.2185665369033813, + "kl": 0.3059656545519829, + "learning_rate": 5.313952597646567e-07, + "loss": 0.0122, + "num_tokens": 17669703.0, + "reward": 0.85211181640625, + "reward_std": 0.01373366080224514, + "rewards//mean": 0.85211181640625, + "rewards//std": 0.02129177562892437, + "step": 2427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4856, + "grad_norm": 1.6007750034332275, + "kl": 0.38194920867681503, + "learning_rate": 5.310785470550242e-07, + "loss": 0.0153, + "num_tokens": 17676975.0, + "reward": 0.842041015625, + "reward_std": 0.01618068479001522, + "rewards//mean": 0.842041015625, + "rewards//std": 0.02076200395822525, + "step": 2428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4858, + "grad_norm": 2.6506593227386475, + "kl": 0.2897163052111864, + "learning_rate": 5.307618218269568e-07, + "loss": 0.0116, + "num_tokens": 17684327.0, + "reward": 0.8310546875, + "reward_std": 0.012150288559496403, + "rewards//mean": 0.8310546875, + "rewards//std": 0.02844218909740448, + "step": 2429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.486, + "grad_norm": 1.7132470607757568, + "kl": 0.4406417962163687, + "learning_rate": 5.304450842080312e-07, + "loss": 0.0067, + "num_tokens": 17691660.0, + "reward": 0.817138671875, + "reward_std": 0.017038613557815552, + "rewards//mean": 0.817138671875, + "rewards//std": 0.021405315026640892, + "step": 2430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4862, + "grad_norm": 1.2794716358184814, + "kl": 0.37018309347331524, + "learning_rate": 5.301283343258292e-07, + "loss": 0.0148, + "num_tokens": 17698900.0, + "reward": 0.84063720703125, + "reward_std": 0.011736609041690826, + "rewards//mean": 0.84063720703125, + "rewards//std": 0.018716638907790184, + "step": 2431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.4864, + "grad_norm": 1.2610632181167603, + "kl": 0.30829809233546257, + "learning_rate": 5.298115723079379e-07, + "loss": 0.0116, + "num_tokens": 17706232.0, + "reward": 0.83709716796875, + "reward_std": 0.018376298248767853, + "rewards//mean": 0.83709716796875, + "rewards//std": 0.021301016211509705, + "step": 2432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.375, + "epoch": 0.4866, + "grad_norm": 1.61681067943573, + "kl": 0.3859193027019501, + "learning_rate": 5.294947982819487e-07, + "loss": -0.0198, + "num_tokens": 17713504.0, + "reward": 0.823974609375, + "reward_std": 0.01958591490983963, + "rewards//mean": 0.823974609375, + "rewards//std": 0.02453683316707611, + "step": 2433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.4868, + "grad_norm": 1.5021650791168213, + "kl": 0.2877177344635129, + "learning_rate": 5.291780123754585e-07, + "loss": -0.0056, + "num_tokens": 17720728.0, + "reward": 0.87835693359375, + "reward_std": 0.017588511109352112, + "rewards//mean": 0.87835693359375, + "rewards//std": 0.025693990290164948, + "step": 2434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.487, + "grad_norm": 1.4490991830825806, + "kl": 0.29645819775760174, + "learning_rate": 5.28861214716068e-07, + "loss": 0.0119, + "num_tokens": 17728128.0, + "reward": 0.84332275390625, + "reward_std": 0.014605783857405186, + "rewards//mean": 0.84332275390625, + "rewards//std": 0.021689053624868393, + "step": 2435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4872, + "grad_norm": 1.4308723211288452, + "kl": 0.32342731580138206, + "learning_rate": 5.28544405431384e-07, + "loss": 0.0129, + "num_tokens": 17735352.0, + "reward": 0.76708984375, + "reward_std": 0.011330840177834034, + "rewards//mean": 0.76708984375, + "rewards//std": 0.02221187949180603, + "step": 2436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4874, + "grad_norm": 1.6606202125549316, + "kl": 0.35923074185848236, + "learning_rate": 5.282275846490169e-07, + "loss": 0.0144, + "num_tokens": 17742624.0, + "reward": 0.837646484375, + "reward_std": 0.018804270774126053, + "rewards//mean": 0.837646484375, + "rewards//std": 0.023734018206596375, + "step": 2437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4876, + "grad_norm": 1.486526370048523, + "kl": 0.32195258140563965, + "learning_rate": 5.27910752496582e-07, + "loss": 0.0129, + "num_tokens": 17749952.0, + "reward": 0.8560791015625, + "reward_std": 0.01703120395541191, + "rewards//mean": 0.8560791015625, + "rewards//std": 0.02727917954325676, + "step": 2438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4878, + "grad_norm": 1.4538146257400513, + "kl": 0.41517477110028267, + "learning_rate": 5.275939091016992e-07, + "loss": 0.0166, + "num_tokens": 17757280.0, + "reward": 0.86407470703125, + "reward_std": 0.017957130447030067, + "rewards//mean": 0.86407470703125, + "rewards//std": 0.0246383398771286, + "step": 2439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.488, + "grad_norm": 1.3913723230361938, + "kl": 0.2739825639873743, + "learning_rate": 5.272770545919933e-07, + "loss": 0.011, + "num_tokens": 17764616.0, + "reward": 0.84967041015625, + "reward_std": 0.016100727021694183, + "rewards//mean": 0.84967041015625, + "rewards//std": 0.024793295189738274, + "step": 2440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4882, + "grad_norm": 1.349457859992981, + "kl": 0.29920180700719357, + "learning_rate": 5.26960189095093e-07, + "loss": 0.012, + "num_tokens": 17771920.0, + "reward": 0.87542724609375, + "reward_std": 0.011670125648379326, + "rewards//mean": 0.87542724609375, + "rewards//std": 0.021487105637788773, + "step": 2441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.4884, + "grad_norm": 1.738070011138916, + "kl": 0.30409330502152443, + "learning_rate": 5.266433127386318e-07, + "loss": 0.0125, + "num_tokens": 17779167.0, + "reward": 0.87347412109375, + "reward_std": 0.015081746503710747, + "rewards//mean": 0.87347412109375, + "rewards//std": 0.020700577646493912, + "step": 2442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5625, + "epoch": 0.4886, + "grad_norm": 1.4198371171951294, + "kl": 0.38771067559719086, + "learning_rate": 5.263264256502474e-07, + "loss": -0.0182, + "num_tokens": 17786507.0, + "reward": 0.83282470703125, + "reward_std": 0.010583756491541862, + "rewards//mean": 0.83282470703125, + "rewards//std": 0.02149907872080803, + "step": 2443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4888, + "grad_norm": 1.4541324377059937, + "kl": 0.3152114022523165, + "learning_rate": 5.260095279575818e-07, + "loss": 0.0126, + "num_tokens": 17793731.0, + "reward": 0.8511962890625, + "reward_std": 0.013361049816012383, + "rewards//mean": 0.8511962890625, + "rewards//std": 0.018916653469204903, + "step": 2444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.489, + "grad_norm": 1.4207019805908203, + "kl": 0.2980138212442398, + "learning_rate": 5.256926197882815e-07, + "loss": 0.0119, + "num_tokens": 17800987.0, + "reward": 0.88153076171875, + "reward_std": 0.019375251606106758, + "rewards//mean": 0.88153076171875, + "rewards//std": 0.026107197627425194, + "step": 2445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4892, + "grad_norm": 1.4105933904647827, + "kl": 0.3326982147991657, + "learning_rate": 5.253757012699971e-07, + "loss": 0.0133, + "num_tokens": 17808243.0, + "reward": 0.86834716796875, + "reward_std": 0.012042349204421043, + "rewards//mean": 0.86834716796875, + "rewards//std": 0.02359897829592228, + "step": 2446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4894, + "grad_norm": 1.441537857055664, + "kl": 0.3179023116827011, + "learning_rate": 5.250587725303831e-07, + "loss": 0.0127, + "num_tokens": 17815547.0, + "reward": 0.8775634765625, + "reward_std": 0.01482466608285904, + "rewards//mean": 0.8775634765625, + "rewards//std": 0.02212139405310154, + "step": 2447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4896, + "grad_norm": 1.5504956245422363, + "kl": 0.33214567601680756, + "learning_rate": 5.247418336970987e-07, + "loss": 0.0133, + "num_tokens": 17822843.0, + "reward": 0.8355712890625, + "reward_std": 0.013388611376285553, + "rewards//mean": 0.8355712890625, + "rewards//std": 0.021198881790041924, + "step": 2448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4898, + "grad_norm": 1.6977349519729614, + "kl": 0.36613139510154724, + "learning_rate": 5.244248848978067e-07, + "loss": 0.0146, + "num_tokens": 17830107.0, + "reward": 0.8350830078125, + "reward_std": 0.012476840987801552, + "rewards//mean": 0.8350830078125, + "rewards//std": 0.01988724060356617, + "step": 2449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.49, + "grad_norm": 1.5199769735336304, + "kl": 0.30573801323771477, + "learning_rate": 5.241079262601737e-07, + "loss": 0.0122, + "num_tokens": 17837459.0, + "reward": 0.80322265625, + "reward_std": 0.01175666507333517, + "rewards//mean": 0.80322265625, + "rewards//std": 0.020955048501491547, + "step": 2450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.4902, + "grad_norm": 1.7421553134918213, + "kl": 0.32654862850904465, + "learning_rate": 5.237909579118712e-07, + "loss": 0.0081, + "num_tokens": 17844829.0, + "reward": 0.85986328125, + "reward_std": 0.01739785075187683, + "rewards//mean": 0.85986328125, + "rewards//std": 0.023370232433080673, + "step": 2451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.4904, + "grad_norm": 1.3423073291778564, + "kl": 0.34000544250011444, + "learning_rate": 5.234739799805734e-07, + "loss": 0.0099, + "num_tokens": 17852100.0, + "reward": 0.8701171875, + "reward_std": 0.013748230412602425, + "rewards//mean": 0.8701171875, + "rewards//std": 0.020147761330008507, + "step": 2452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.4906, + "grad_norm": 1.4938795566558838, + "kl": 0.35115967877209187, + "learning_rate": 5.231569925939595e-07, + "loss": -0.017, + "num_tokens": 17859361.0, + "reward": 0.8079833984375, + "reward_std": 0.016785386949777603, + "rewards//mean": 0.8079833984375, + "rewards//std": 0.02097490429878235, + "step": 2453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.4908, + "grad_norm": 1.185342788696289, + "kl": 0.29285736940801144, + "learning_rate": 5.228399958797116e-07, + "loss": 0.0035, + "num_tokens": 17866610.0, + "reward": 0.83203125, + "reward_std": 0.010754084214568138, + "rewards//mean": 0.83203125, + "rewards//std": 0.02274523302912712, + "step": 2454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.491, + "grad_norm": 1.4636207818984985, + "kl": 0.35156381502747536, + "learning_rate": 5.225229899655163e-07, + "loss": 0.0141, + "num_tokens": 17873842.0, + "reward": 0.8330078125, + "reward_std": 0.011291463859379292, + "rewards//mean": 0.8330078125, + "rewards//std": 0.020171789452433586, + "step": 2455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4912, + "grad_norm": 1.3069612979888916, + "kl": 0.35451213642954826, + "learning_rate": 5.222059749790631e-07, + "loss": 0.0142, + "num_tokens": 17881106.0, + "reward": 0.86285400390625, + "reward_std": 0.01091049239039421, + "rewards//mean": 0.86285400390625, + "rewards//std": 0.013887057080864906, + "step": 2456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.4914, + "grad_norm": 5.773964881896973, + "kl": 0.7758874446153641, + "learning_rate": 5.21888951048046e-07, + "loss": 0.0297, + "num_tokens": 17888428.0, + "reward": 0.85357666015625, + "reward_std": 0.02123301848769188, + "rewards//mean": 0.85357666015625, + "rewards//std": 0.02908211015164852, + "step": 2457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4916, + "grad_norm": 1.2508341073989868, + "kl": 0.2703809607774019, + "learning_rate": 5.215719183001619e-07, + "loss": 0.0108, + "num_tokens": 17895820.0, + "reward": 0.82049560546875, + "reward_std": 0.013874966651201248, + "rewards//mean": 0.82049560546875, + "rewards//std": 0.021703705191612244, + "step": 2458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4918, + "grad_norm": 1.4305568933486938, + "kl": 0.35099021159112453, + "learning_rate": 5.212548768631117e-07, + "loss": 0.014, + "num_tokens": 17903076.0, + "reward": 0.83148193359375, + "reward_std": 0.01584121771156788, + "rewards//mean": 0.83148193359375, + "rewards//std": 0.024928469210863113, + "step": 2459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.492, + "grad_norm": 1.9902188777923584, + "kl": 0.37375036999583244, + "learning_rate": 5.209378268645997e-07, + "loss": 0.015, + "num_tokens": 17910308.0, + "reward": 0.81634521484375, + "reward_std": 0.021371889859437943, + "rewards//mean": 0.81634521484375, + "rewards//std": 0.031098438426852226, + "step": 2460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4922, + "grad_norm": 1.2564178705215454, + "kl": 0.3316455949097872, + "learning_rate": 5.206207684323335e-07, + "loss": 0.0133, + "num_tokens": 17917620.0, + "reward": 0.81842041015625, + "reward_std": 0.008419603109359741, + "rewards//mean": 0.81842041015625, + "rewards//std": 0.012757610529661179, + "step": 2461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4924, + "grad_norm": 1.745385766029358, + "kl": 0.4183713123202324, + "learning_rate": 5.203037016940245e-07, + "loss": 0.0167, + "num_tokens": 17924948.0, + "reward": 0.82733154296875, + "reward_std": 0.017618710175156593, + "rewards//mean": 0.82733154296875, + "rewards//std": 0.03193370997905731, + "step": 2462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4926, + "grad_norm": 1.3479903936386108, + "kl": 0.4044133722782135, + "learning_rate": 5.199866267773867e-07, + "loss": 0.0162, + "num_tokens": 17932164.0, + "reward": 0.8765869140625, + "reward_std": 0.01905045099556446, + "rewards//mean": 0.8765869140625, + "rewards//std": 0.02761012129485607, + "step": 2463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4928, + "grad_norm": 1.519234299659729, + "kl": 0.3458719626069069, + "learning_rate": 5.196695438101379e-07, + "loss": 0.0138, + "num_tokens": 17939428.0, + "reward": 0.88763427734375, + "reward_std": 0.02188662625849247, + "rewards//mean": 0.88763427734375, + "rewards//std": 0.0330192856490612, + "step": 2464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.493, + "grad_norm": 1.5630336999893188, + "kl": 0.42059579864144325, + "learning_rate": 5.193524529199994e-07, + "loss": 0.0083, + "num_tokens": 17946693.0, + "reward": 0.8702392578125, + "reward_std": 0.01445108000189066, + "rewards//mean": 0.8702392578125, + "rewards//std": 0.01869448646903038, + "step": 2465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.4932, + "grad_norm": 1.432262659072876, + "kl": 0.35185693204402924, + "learning_rate": 5.19035354234695e-07, + "loss": 0.0146, + "num_tokens": 17954004.0, + "reward": 0.8594970703125, + "reward_std": 0.012780074030160904, + "rewards//mean": 0.8594970703125, + "rewards//std": 0.01958658918738365, + "step": 2466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4934, + "grad_norm": 1.3563830852508545, + "kl": 0.29664307832717896, + "learning_rate": 5.187182478819523e-07, + "loss": 0.0119, + "num_tokens": 17961180.0, + "reward": 0.87933349609375, + "reward_std": 0.014906766824424267, + "rewards//mean": 0.87933349609375, + "rewards//std": 0.024979425594210625, + "step": 2467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4936, + "grad_norm": 1.5116056203842163, + "kl": 0.3970937915146351, + "learning_rate": 5.184011339895015e-07, + "loss": 0.0159, + "num_tokens": 17968388.0, + "reward": 0.80718994140625, + "reward_std": 0.013270912691950798, + "rewards//mean": 0.80718994140625, + "rewards//std": 0.01522248238325119, + "step": 2468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4938, + "grad_norm": 1.5531452894210815, + "kl": 0.3109089843928814, + "learning_rate": 5.180840126850763e-07, + "loss": 0.0124, + "num_tokens": 17975604.0, + "reward": 0.82470703125, + "reward_std": 0.01236984133720398, + "rewards//mean": 0.82470703125, + "rewards//std": 0.023255953565239906, + "step": 2469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.494, + "grad_norm": 1.5437852144241333, + "kl": 0.340716402977705, + "learning_rate": 5.177668840964127e-07, + "loss": 0.0145, + "num_tokens": 17982884.0, + "reward": 0.83575439453125, + "reward_std": 0.018989989534020424, + "rewards//mean": 0.83575439453125, + "rewards//std": 0.02236325480043888, + "step": 2470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4942, + "grad_norm": 1.3741662502288818, + "kl": 0.34095484763383865, + "learning_rate": 5.174497483512505e-07, + "loss": 0.0136, + "num_tokens": 17990188.0, + "reward": 0.86627197265625, + "reward_std": 0.01811748556792736, + "rewards//mean": 0.86627197265625, + "rewards//std": 0.024849403649568558, + "step": 2471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4944, + "grad_norm": 1.3483935594558716, + "kl": 0.32666089944541454, + "learning_rate": 5.171326055773317e-07, + "loss": 0.0131, + "num_tokens": 17997420.0, + "reward": 0.8482666015625, + "reward_std": 0.017124388366937637, + "rewards//mean": 0.8482666015625, + "rewards//std": 0.02674115262925625, + "step": 2472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4946, + "grad_norm": 1.381690502166748, + "kl": 0.3107441086322069, + "learning_rate": 5.168154559024014e-07, + "loss": 0.0124, + "num_tokens": 18004708.0, + "reward": 0.84088134765625, + "reward_std": 0.017672089859843254, + "rewards//mean": 0.84088134765625, + "rewards//std": 0.023159334436058998, + "step": 2473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.4948, + "grad_norm": 1.2838901281356812, + "kl": 0.35342003405094147, + "learning_rate": 5.164982994542076e-07, + "loss": 0.0065, + "num_tokens": 18011950.0, + "reward": 0.76422119140625, + "reward_std": 0.014082306995987892, + "rewards//mean": 0.76422119140625, + "rewards//std": 0.02157989889383316, + "step": 2474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.495, + "grad_norm": 1.3614048957824707, + "kl": 0.2811114024370909, + "learning_rate": 5.161811363605005e-07, + "loss": 0.0112, + "num_tokens": 18019230.0, + "reward": 0.81707763671875, + "reward_std": 0.011752134189009666, + "rewards//mean": 0.81707763671875, + "rewards//std": 0.016463646665215492, + "step": 2475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.4952, + "grad_norm": 1.4451591968536377, + "kl": 0.3102306239306927, + "learning_rate": 5.158639667490338e-07, + "loss": 0.0119, + "num_tokens": 18026445.0, + "reward": 0.856689453125, + "reward_std": 0.01791883446276188, + "rewards//mean": 0.856689453125, + "rewards//std": 0.03063851036131382, + "step": 2476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4954, + "grad_norm": 1.4708482027053833, + "kl": 0.37262230552732944, + "learning_rate": 5.155467907475631e-07, + "loss": 0.0149, + "num_tokens": 18033669.0, + "reward": 0.78759765625, + "reward_std": 0.009519336745142937, + "rewards//mean": 0.78759765625, + "rewards//std": 0.015648234635591507, + "step": 2477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4956, + "grad_norm": 1.432774305343628, + "kl": 0.3458769656717777, + "learning_rate": 5.152296084838471e-07, + "loss": 0.0138, + "num_tokens": 18040997.0, + "reward": 0.86749267578125, + "reward_std": 0.016008634120225906, + "rewards//mean": 0.86749267578125, + "rewards//std": 0.020603088662028313, + "step": 2478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4958, + "grad_norm": 1.6252479553222656, + "kl": 0.2812929432839155, + "learning_rate": 5.149124200856465e-07, + "loss": 0.0113, + "num_tokens": 18048325.0, + "reward": 0.837646484375, + "reward_std": 0.01659276895225048, + "rewards//mean": 0.837646484375, + "rewards//std": 0.019940832629799843, + "step": 2479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.496, + "grad_norm": 1.383831262588501, + "kl": 0.31608686223626137, + "learning_rate": 5.145952256807249e-07, + "loss": 0.0126, + "num_tokens": 18055645.0, + "reward": 0.82623291015625, + "reward_std": 0.009102700278162956, + "rewards//mean": 0.82623291015625, + "rewards//std": 0.015390606597065926, + "step": 2480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4962, + "grad_norm": 1.8000696897506714, + "kl": 0.3719646669924259, + "learning_rate": 5.142780253968481e-07, + "loss": 0.0149, + "num_tokens": 18062933.0, + "reward": 0.8616943359375, + "reward_std": 0.017277073115110397, + "rewards//mean": 0.8616943359375, + "rewards//std": 0.023945782333612442, + "step": 2481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4964, + "grad_norm": 1.5421584844589233, + "kl": 0.32729310169816017, + "learning_rate": 5.139608193617844e-07, + "loss": 0.0131, + "num_tokens": 18070277.0, + "reward": 0.868896484375, + "reward_std": 0.019020887091755867, + "rewards//mean": 0.868896484375, + "rewards//std": 0.0386064350605011, + "step": 2482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4966, + "grad_norm": 1.2899117469787598, + "kl": 0.34749123454093933, + "learning_rate": 5.136436077033044e-07, + "loss": 0.0139, + "num_tokens": 18077461.0, + "reward": 0.8660888671875, + "reward_std": 0.012987465597689152, + "rewards//mean": 0.8660888671875, + "rewards//std": 0.020862014964222908, + "step": 2483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4968, + "grad_norm": 1.6240595579147339, + "kl": 0.32015368714928627, + "learning_rate": 5.133263905491808e-07, + "loss": 0.0128, + "num_tokens": 18084757.0, + "reward": 0.86505126953125, + "reward_std": 0.015358938835561275, + "rewards//mean": 0.86505126953125, + "rewards//std": 0.022808287292718887, + "step": 2484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.497, + "grad_norm": 1.468000054359436, + "kl": 0.27886853739619255, + "learning_rate": 5.130091680271886e-07, + "loss": 0.0088, + "num_tokens": 18092085.0, + "reward": 0.85955810546875, + "reward_std": 0.013237418606877327, + "rewards//mean": 0.85955810546875, + "rewards//std": 0.02051030471920967, + "step": 2485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4972, + "grad_norm": 1.4427543878555298, + "kl": 0.307031461969018, + "learning_rate": 5.126919402651052e-07, + "loss": 0.0123, + "num_tokens": 18099389.0, + "reward": 0.8671875, + "reward_std": 0.016612470149993896, + "rewards//mean": 0.8671875, + "rewards//std": 0.021451938897371292, + "step": 2486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4974, + "grad_norm": 1.5445204973220825, + "kl": 0.3536435514688492, + "learning_rate": 5.123747073907097e-07, + "loss": 0.0141, + "num_tokens": 18106605.0, + "reward": 0.82293701171875, + "reward_std": 0.013373296707868576, + "rewards//mean": 0.82293701171875, + "rewards//std": 0.01948847994208336, + "step": 2487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4976, + "grad_norm": 1.691998839378357, + "kl": 0.38501379638910294, + "learning_rate": 5.120574695317836e-07, + "loss": 0.0154, + "num_tokens": 18114005.0, + "reward": 0.8052978515625, + "reward_std": 0.017846569418907166, + "rewards//mean": 0.8052978515625, + "rewards//std": 0.03012305498123169, + "step": 2488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4978, + "grad_norm": 1.3393886089324951, + "kl": 0.3007230628281832, + "learning_rate": 5.117402268161101e-07, + "loss": 0.012, + "num_tokens": 18121301.0, + "reward": 0.80010986328125, + "reward_std": 0.010365630500018597, + "rewards//mean": 0.80010986328125, + "rewards//std": 0.018773168325424194, + "step": 2489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.498, + "grad_norm": 2.2504794597625732, + "kl": 0.44110723212361336, + "learning_rate": 5.114229793714748e-07, + "loss": 0.0176, + "num_tokens": 18128549.0, + "reward": 0.79119873046875, + "reward_std": 0.015690838918089867, + "rewards//mean": 0.79119873046875, + "rewards//std": 0.02012285776436329, + "step": 2490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4982, + "grad_norm": 1.334198236465454, + "kl": 0.352440781891346, + "learning_rate": 5.111057273256647e-07, + "loss": 0.0141, + "num_tokens": 18135813.0, + "reward": 0.79217529296875, + "reward_std": 0.021525457501411438, + "rewards//mean": 0.79217529296875, + "rewards//std": 0.02812689170241356, + "step": 2491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.4984, + "grad_norm": 1.5251280069351196, + "kl": 0.3004196062684059, + "learning_rate": 5.107884708064689e-07, + "loss": 0.0069, + "num_tokens": 18143037.0, + "reward": 0.88299560546875, + "reward_std": 0.013184449635446072, + "rewards//mean": 0.88299560546875, + "rewards//std": 0.026661429554224014, + "step": 2492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.4986, + "grad_norm": 1.4848906993865967, + "kl": 0.3071694280952215, + "learning_rate": 5.104712099416785e-07, + "loss": 0.0127, + "num_tokens": 18150289.0, + "reward": 0.80548095703125, + "reward_std": 0.014776867814362049, + "rewards//mean": 0.80548095703125, + "rewards//std": 0.02281094342470169, + "step": 2493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4988, + "grad_norm": 1.5109986066818237, + "kl": 0.3143600169569254, + "learning_rate": 5.101539448590858e-07, + "loss": 0.0126, + "num_tokens": 18157577.0, + "reward": 0.8629150390625, + "reward_std": 0.016501376405358315, + "rewards//mean": 0.8629150390625, + "rewards//std": 0.029893023893237114, + "step": 2494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.499, + "grad_norm": 1.3776297569274902, + "kl": 0.392249945551157, + "learning_rate": 5.098366756864855e-07, + "loss": 0.0157, + "num_tokens": 18164825.0, + "reward": 0.83270263671875, + "reward_std": 0.013872220180928707, + "rewards//mean": 0.83270263671875, + "rewards//std": 0.017365748062729836, + "step": 2495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4992, + "grad_norm": 1.8267264366149902, + "kl": 0.36212643049657345, + "learning_rate": 5.095194025516732e-07, + "loss": 0.0145, + "num_tokens": 18172089.0, + "reward": 0.83514404296875, + "reward_std": 0.018962353467941284, + "rewards//mean": 0.83514404296875, + "rewards//std": 0.021215565502643585, + "step": 2496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4994, + "grad_norm": 1.5768439769744873, + "kl": 0.3415113240480423, + "learning_rate": 5.09202125582447e-07, + "loss": 0.0137, + "num_tokens": 18179345.0, + "reward": 0.83868408203125, + "reward_std": 0.015074783936142921, + "rewards//mean": 0.83868408203125, + "rewards//std": 0.02050144597887993, + "step": 2497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.4996, + "grad_norm": 1.3317993879318237, + "kl": 0.31351035088300705, + "learning_rate": 5.088848449066054e-07, + "loss": 0.0125, + "num_tokens": 18186585.0, + "reward": 0.772216796875, + "reward_std": 0.010734660550951958, + "rewards//mean": 0.772216796875, + "rewards//std": 0.013493558391928673, + "step": 2498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.625, + "epoch": 0.4998, + "grad_norm": 1.5387516021728516, + "kl": 0.3655826710164547, + "learning_rate": 5.085675606519497e-07, + "loss": 0.0152, + "num_tokens": 18193817.0, + "reward": 0.85443115234375, + "reward_std": 0.01802353374660015, + "rewards//mean": 0.85443115234375, + "rewards//std": 0.02596941404044628, + "step": 2499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5, + "grad_norm": 1.6970332860946655, + "kl": 0.28404594399034977, + "learning_rate": 5.082502729462812e-07, + "loss": 0.0114, + "num_tokens": 18201169.0, + "reward": 0.835205078125, + "reward_std": 0.014337750151753426, + "rewards//mean": 0.835205078125, + "rewards//std": 0.020337704569101334, + "step": 2500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5002, + "grad_norm": 1.664189100265503, + "kl": 0.27189221791923046, + "learning_rate": 5.07932981917404e-07, + "loss": 0.0109, + "num_tokens": 18208537.0, + "reward": 0.83099365234375, + "reward_std": 0.015516924671828747, + "rewards//mean": 0.83099365234375, + "rewards//std": 0.028170451521873474, + "step": 2501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.5004, + "grad_norm": 1.5161992311477661, + "kl": 0.3659998346120119, + "learning_rate": 5.076156876931225e-07, + "loss": 0.0155, + "num_tokens": 18215832.0, + "reward": 0.810302734375, + "reward_std": 0.018072128295898438, + "rewards//mean": 0.810302734375, + "rewards//std": 0.028398511931300163, + "step": 2502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5006, + "grad_norm": 1.5065944194793701, + "kl": 0.3036002442240715, + "learning_rate": 5.072983904012429e-07, + "loss": 0.0121, + "num_tokens": 18223104.0, + "reward": 0.864501953125, + "reward_std": 0.013578206300735474, + "rewards//mean": 0.864501953125, + "rewards//std": 0.022292152047157288, + "step": 2503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5008, + "grad_norm": 1.305537462234497, + "kl": 0.3162405602633953, + "learning_rate": 5.069810901695727e-07, + "loss": 0.0126, + "num_tokens": 18230360.0, + "reward": 0.80877685546875, + "reward_std": 0.01842300221323967, + "rewards//mean": 0.80877685546875, + "rewards//std": 0.024709505960345268, + "step": 2504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.501, + "grad_norm": 1.6361618041992188, + "kl": 0.31991603039205074, + "learning_rate": 5.0666378712592e-07, + "loss": 0.0128, + "num_tokens": 18237584.0, + "reward": 0.79620361328125, + "reward_std": 0.012535194866359234, + "rewards//mean": 0.79620361328125, + "rewards//std": 0.023474207147955894, + "step": 2505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5012, + "grad_norm": 1.4212653636932373, + "kl": 0.2736660558730364, + "learning_rate": 5.063464813980948e-07, + "loss": 0.0109, + "num_tokens": 18244864.0, + "reward": 0.87451171875, + "reward_std": 0.014563138596713543, + "rewards//mean": 0.87451171875, + "rewards//std": 0.025718795135617256, + "step": 2506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5014, + "grad_norm": 1.4605894088745117, + "kl": 0.2892151139676571, + "learning_rate": 5.060291731139076e-07, + "loss": 0.0116, + "num_tokens": 18252152.0, + "reward": 0.89459228515625, + "reward_std": 0.02084616757929325, + "rewards//mean": 0.89459228515625, + "rewards//std": 0.026240795850753784, + "step": 2507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5016, + "grad_norm": 1.5518438816070557, + "kl": 0.3667466100305319, + "learning_rate": 5.057118624011702e-07, + "loss": 0.0147, + "num_tokens": 18259408.0, + "reward": 0.86993408203125, + "reward_std": 0.018128875643014908, + "rewards//mean": 0.86993408203125, + "rewards//std": 0.02610371820628643, + "step": 2508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5018, + "grad_norm": 1.4614241123199463, + "kl": 0.3556336238980293, + "learning_rate": 5.053945493876952e-07, + "loss": 0.0142, + "num_tokens": 18266712.0, + "reward": 0.7991943359375, + "reward_std": 0.017554014921188354, + "rewards//mean": 0.7991943359375, + "rewards//std": 0.019496729597449303, + "step": 2509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.502, + "grad_norm": 1.3432754278182983, + "kl": 0.34678639099001884, + "learning_rate": 5.050772342012966e-07, + "loss": 0.0139, + "num_tokens": 18274000.0, + "reward": 0.8638916015625, + "reward_std": 0.014489364810287952, + "rewards//mean": 0.8638916015625, + "rewards//std": 0.016161708161234856, + "step": 2510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5022, + "grad_norm": 1.6794023513793945, + "kl": 0.3714215848594904, + "learning_rate": 5.047599169697883e-07, + "loss": 0.0149, + "num_tokens": 18281392.0, + "reward": 0.81866455078125, + "reward_std": 0.01744101010262966, + "rewards//mean": 0.81866455078125, + "rewards//std": 0.02784671261906624, + "step": 2511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5024, + "grad_norm": 1.4645781517028809, + "kl": 0.3606887012720108, + "learning_rate": 5.044425978209863e-07, + "loss": 0.0144, + "num_tokens": 18288640.0, + "reward": 0.801025390625, + "reward_std": 0.013673251494765282, + "rewards//mean": 0.801025390625, + "rewards//std": 0.018764397129416466, + "step": 2512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5026, + "grad_norm": 1.4050153493881226, + "kl": 0.31582161039114, + "learning_rate": 5.041252768827063e-07, + "loss": 0.0126, + "num_tokens": 18295944.0, + "reward": 0.79364013671875, + "reward_std": 0.010188146494328976, + "rewards//mean": 0.79364013671875, + "rewards//std": 0.018669672310352325, + "step": 2513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5028, + "grad_norm": 1.4700336456298828, + "kl": 0.2843754179775715, + "learning_rate": 5.038079542827653e-07, + "loss": 0.0114, + "num_tokens": 18303288.0, + "reward": 0.8466796875, + "reward_std": 0.01309184730052948, + "rewards//mean": 0.8466796875, + "rewards//std": 0.02518109418451786, + "step": 2514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.503, + "grad_norm": 1.5974093675613403, + "kl": 0.3298306465148926, + "learning_rate": 5.034906301489807e-07, + "loss": 0.0132, + "num_tokens": 18310552.0, + "reward": 0.8758544921875, + "reward_std": 0.017764326184988022, + "rewards//mean": 0.8758544921875, + "rewards//std": 0.03967445343732834, + "step": 2515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5032, + "grad_norm": 1.542440414428711, + "kl": 0.3778974115848541, + "learning_rate": 5.03173304609171e-07, + "loss": 0.0151, + "num_tokens": 18317880.0, + "reward": 0.82598876953125, + "reward_std": 0.015265412628650665, + "rewards//mean": 0.82598876953125, + "rewards//std": 0.0174405537545681, + "step": 2516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.5034, + "grad_norm": 1.6297078132629395, + "kl": 0.34090383164584637, + "learning_rate": 5.028559777911541e-07, + "loss": 0.0031, + "num_tokens": 18325167.0, + "reward": 0.85443115234375, + "reward_std": 0.01819046586751938, + "rewards//mean": 0.85443115234375, + "rewards//std": 0.02121627889573574, + "step": 2517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5036, + "grad_norm": 2.176295518875122, + "kl": 0.3333977907896042, + "learning_rate": 5.025386498227501e-07, + "loss": 0.0133, + "num_tokens": 18332471.0, + "reward": 0.7686767578125, + "reward_std": 0.014084410853683949, + "rewards//mean": 0.7686767578125, + "rewards//std": 0.017791688442230225, + "step": 2518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5038, + "grad_norm": 1.347231388092041, + "kl": 0.253862788900733, + "learning_rate": 5.022213208317781e-07, + "loss": 0.0102, + "num_tokens": 18339815.0, + "reward": 0.8271484375, + "reward_std": 0.016712650656700134, + "rewards//mean": 0.8271484375, + "rewards//std": 0.026346798986196518, + "step": 2519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.504, + "grad_norm": 1.5608599185943604, + "kl": 0.40344130247831345, + "learning_rate": 5.019039909460583e-07, + "loss": 0.0161, + "num_tokens": 18347015.0, + "reward": 0.8057861328125, + "reward_std": 0.012875773943960667, + "rewards//mean": 0.8057861328125, + "rewards//std": 0.017336631193757057, + "step": 2520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5042, + "grad_norm": 1.4194999933242798, + "kl": 0.28998011723160744, + "learning_rate": 5.015866602934111e-07, + "loss": 0.0116, + "num_tokens": 18354359.0, + "reward": 0.84893798828125, + "reward_std": 0.013471592217683792, + "rewards//mean": 0.84893798828125, + "rewards//std": 0.030322089791297913, + "step": 2521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5044, + "grad_norm": 1.6156260967254639, + "kl": 0.3455333858728409, + "learning_rate": 5.012693290016575e-07, + "loss": 0.0138, + "num_tokens": 18361639.0, + "reward": 0.86859130859375, + "reward_std": 0.016667373478412628, + "rewards//mean": 0.86859130859375, + "rewards//std": 0.020662516355514526, + "step": 2522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5046, + "grad_norm": 12.026933670043945, + "kl": 0.7123313657939434, + "learning_rate": 5.009519971986182e-07, + "loss": 0.0285, + "num_tokens": 18368911.0, + "reward": 0.85455322265625, + "reward_std": 0.012653336860239506, + "rewards//mean": 0.85455322265625, + "rewards//std": 0.023206347599625587, + "step": 2523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5048, + "grad_norm": 1.690216302871704, + "kl": 0.32261858880519867, + "learning_rate": 5.006346650121147e-07, + "loss": 0.0129, + "num_tokens": 18376215.0, + "reward": 0.86669921875, + "reward_std": 0.01760723814368248, + "rewards//mean": 0.86669921875, + "rewards//std": 0.020885584875941277, + "step": 2524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.505, + "grad_norm": 1.506559133529663, + "kl": 0.33113570138812065, + "learning_rate": 5.003173325699681e-07, + "loss": 0.0132, + "num_tokens": 18383543.0, + "reward": 0.85064697265625, + "reward_std": 0.014469427987933159, + "rewards//mean": 0.85064697265625, + "rewards//std": 0.017562510445713997, + "step": 2525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.5052, + "grad_norm": 1.3611457347869873, + "kl": 0.3438654188066721, + "learning_rate": 5e-07, + "loss": 0.0068, + "num_tokens": 18390815.0, + "reward": 0.85382080078125, + "reward_std": 0.014803800731897354, + "rewards//mean": 0.85382080078125, + "rewards//std": 0.022334806621074677, + "step": 2526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5054, + "grad_norm": 1.6908848285675049, + "kl": 0.3502159155905247, + "learning_rate": 4.996826674300319e-07, + "loss": 0.014, + "num_tokens": 18398031.0, + "reward": 0.87225341796875, + "reward_std": 0.014274800196290016, + "rewards//mean": 0.87225341796875, + "rewards//std": 0.016033228486776352, + "step": 2527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5056, + "grad_norm": 1.540298342704773, + "kl": 0.30370636098086834, + "learning_rate": 4.993653349878853e-07, + "loss": 0.0121, + "num_tokens": 18405327.0, + "reward": 0.86358642578125, + "reward_std": 0.014659545384347439, + "rewards//mean": 0.86358642578125, + "rewards//std": 0.02090650238096714, + "step": 2528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5058, + "grad_norm": 1.9112333059310913, + "kl": 0.37531768158078194, + "learning_rate": 4.990480028013818e-07, + "loss": 0.015, + "num_tokens": 18412615.0, + "reward": 0.88031005859375, + "reward_std": 0.017065241932868958, + "rewards//mean": 0.88031005859375, + "rewards//std": 0.031790703535079956, + "step": 2529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.506, + "grad_norm": 1.4429312944412231, + "kl": 0.2877156976610422, + "learning_rate": 4.987306709983425e-07, + "loss": 0.0115, + "num_tokens": 18419863.0, + "reward": 0.88177490234375, + "reward_std": 0.015316426753997803, + "rewards//mean": 0.88177490234375, + "rewards//std": 0.02650483138859272, + "step": 2530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5062, + "grad_norm": 1.4735426902770996, + "kl": 0.3648957870900631, + "learning_rate": 4.984133397065888e-07, + "loss": 0.0146, + "num_tokens": 18427119.0, + "reward": 0.880126953125, + "reward_std": 0.011722972616553307, + "rewards//mean": 0.880126953125, + "rewards//std": 0.01931145042181015, + "step": 2531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5064, + "grad_norm": 1.4006205797195435, + "kl": 0.35694146901369095, + "learning_rate": 4.980960090539417e-07, + "loss": 0.0143, + "num_tokens": 18434327.0, + "reward": 0.83831787109375, + "reward_std": 0.010723037645220757, + "rewards//mean": 0.83831787109375, + "rewards//std": 0.015210543759167194, + "step": 2532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5066, + "grad_norm": 1.3360198736190796, + "kl": 0.34854552149772644, + "learning_rate": 4.97778679168222e-07, + "loss": 0.0139, + "num_tokens": 18441615.0, + "reward": 0.84515380859375, + "reward_std": 0.019385993480682373, + "rewards//mean": 0.84515380859375, + "rewards//std": 0.03216185048222542, + "step": 2533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5068, + "grad_norm": 1.5594667196273804, + "kl": 0.3359116278588772, + "learning_rate": 4.9746135017725e-07, + "loss": 0.0134, + "num_tokens": 18448831.0, + "reward": 0.87396240234375, + "reward_std": 0.012179547920823097, + "rewards//mean": 0.87396240234375, + "rewards//std": 0.020696189254522324, + "step": 2534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.507, + "grad_norm": 1.3587212562561035, + "kl": 0.30691610649228096, + "learning_rate": 4.971440222088458e-07, + "loss": 0.0123, + "num_tokens": 18456239.0, + "reward": 0.8466796875, + "reward_std": 0.014400442130863667, + "rewards//mean": 0.8466796875, + "rewards//std": 0.025506025180220604, + "step": 2535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5072, + "grad_norm": 2.1314735412597656, + "kl": 0.47689451090991497, + "learning_rate": 4.968266953908291e-07, + "loss": 0.0191, + "num_tokens": 18463591.0, + "reward": 0.8826904296875, + "reward_std": 0.01618649810552597, + "rewards//mean": 0.8826904296875, + "rewards//std": 0.02054026536643505, + "step": 2536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5074, + "grad_norm": 1.2839010953903198, + "kl": 0.33494390174746513, + "learning_rate": 4.965093698510192e-07, + "loss": 0.0134, + "num_tokens": 18470927.0, + "reward": 0.88946533203125, + "reward_std": 0.012620697729289532, + "rewards//mean": 0.88946533203125, + "rewards//std": 0.014824478887021542, + "step": 2537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5076, + "grad_norm": 1.2809703350067139, + "kl": 0.32214609161019325, + "learning_rate": 4.961920457172346e-07, + "loss": 0.0129, + "num_tokens": 18478223.0, + "reward": 0.86407470703125, + "reward_std": 0.017608489841222763, + "rewards//mean": 0.86407470703125, + "rewards//std": 0.031483039259910583, + "step": 2538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5078, + "grad_norm": 1.451487421989441, + "kl": 0.3222263492643833, + "learning_rate": 4.958747231172937e-07, + "loss": 0.0129, + "num_tokens": 18485455.0, + "reward": 0.7685546875, + "reward_std": 0.01537287887185812, + "rewards//mean": 0.7685546875, + "rewards//std": 0.019350603222846985, + "step": 2539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.508, + "grad_norm": 1.5121768712997437, + "kl": 0.36999642103910446, + "learning_rate": 4.955574021790137e-07, + "loss": 0.0169, + "num_tokens": 18492740.0, + "reward": 0.760009765625, + "reward_std": 0.014181312173604965, + "rewards//mean": 0.760009765625, + "rewards//std": 0.022594323381781578, + "step": 2540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5082, + "grad_norm": 1.3279898166656494, + "kl": 0.32579879090189934, + "learning_rate": 4.952400830302116e-07, + "loss": 0.013, + "num_tokens": 18499964.0, + "reward": 0.81842041015625, + "reward_std": 0.014958702027797699, + "rewards//mean": 0.81842041015625, + "rewards//std": 0.024159997701644897, + "step": 2541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.5084, + "grad_norm": 1.425186038017273, + "kl": 0.36565443128347397, + "learning_rate": 4.949227657987035e-07, + "loss": 0.0062, + "num_tokens": 18507329.0, + "reward": 0.82696533203125, + "reward_std": 0.017516065388917923, + "rewards//mean": 0.82696533203125, + "rewards//std": 0.024505270645022392, + "step": 2542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5086, + "grad_norm": 1.2871025800704956, + "kl": 0.35219037160277367, + "learning_rate": 4.946054506123048e-07, + "loss": 0.0141, + "num_tokens": 18514657.0, + "reward": 0.87286376953125, + "reward_std": 0.01774539053440094, + "rewards//mean": 0.87286376953125, + "rewards//std": 0.030384927988052368, + "step": 2543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5088, + "grad_norm": 1.4123767614364624, + "kl": 0.32391561567783356, + "learning_rate": 4.942881375988299e-07, + "loss": 0.013, + "num_tokens": 18521913.0, + "reward": 0.8336181640625, + "reward_std": 0.011704141274094582, + "rewards//mean": 0.8336181640625, + "rewards//std": 0.01665618270635605, + "step": 2544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.509, + "grad_norm": 1.466060996055603, + "kl": 0.36901630088686943, + "learning_rate": 4.939708268860924e-07, + "loss": 0.0148, + "num_tokens": 18529113.0, + "reward": 0.83306884765625, + "reward_std": 0.020078860223293304, + "rewards//mean": 0.83306884765625, + "rewards//std": 0.028058458119630814, + "step": 2545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.5092, + "grad_norm": 1.327351689338684, + "kl": 0.31039347127079964, + "learning_rate": 4.936535186019052e-07, + "loss": 0.0128, + "num_tokens": 18536471.0, + "reward": 0.85723876953125, + "reward_std": 0.01605370081961155, + "rewards//mean": 0.85723876953125, + "rewards//std": 0.024119237437844276, + "step": 2546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5094, + "grad_norm": 1.4039108753204346, + "kl": 0.3390297908335924, + "learning_rate": 4.933362128740799e-07, + "loss": 0.0136, + "num_tokens": 18543735.0, + "reward": 0.85906982421875, + "reward_std": 0.01184672862291336, + "rewards//mean": 0.85906982421875, + "rewards//std": 0.0172503013163805, + "step": 2547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.5096, + "grad_norm": 1.46710205078125, + "kl": 0.38289695605635643, + "learning_rate": 4.930189098304274e-07, + "loss": 0.016, + "num_tokens": 18550934.0, + "reward": 0.83770751953125, + "reward_std": 0.016054635867476463, + "rewards//mean": 0.83770751953125, + "rewards//std": 0.02313513681292534, + "step": 2548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5098, + "grad_norm": 2.081663131713867, + "kl": 0.3777558207511902, + "learning_rate": 4.92701609598757e-07, + "loss": 0.0151, + "num_tokens": 18558126.0, + "reward": 0.88751220703125, + "reward_std": 0.026765892282128334, + "rewards//mean": 0.88751220703125, + "rewards//std": 0.03366893529891968, + "step": 2549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.51, + "grad_norm": 1.2773789167404175, + "kl": 0.3304411992430687, + "learning_rate": 4.923843123068775e-07, + "loss": 0.0132, + "num_tokens": 18565406.0, + "reward": 0.82305908203125, + "reward_std": 0.01202135719358921, + "rewards//mean": 0.82305908203125, + "rewards//std": 0.015385687351226807, + "step": 2550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5102, + "grad_norm": 1.6257920265197754, + "kl": 0.33093650080263615, + "learning_rate": 4.92067018082596e-07, + "loss": 0.0132, + "num_tokens": 18572694.0, + "reward": 0.83563232421875, + "reward_std": 0.020606037229299545, + "rewards//mean": 0.83563232421875, + "rewards//std": 0.03792557492852211, + "step": 2551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5104, + "grad_norm": 1.6751083135604858, + "kl": 0.36950071156024933, + "learning_rate": 4.917497270537187e-07, + "loss": 0.0148, + "num_tokens": 18579934.0, + "reward": 0.8670654296875, + "reward_std": 0.017902132123708725, + "rewards//mean": 0.8670654296875, + "rewards//std": 0.024596910923719406, + "step": 2552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5106, + "grad_norm": 1.282873511314392, + "kl": 0.36655695736408234, + "learning_rate": 4.914324393480503e-07, + "loss": 0.0147, + "num_tokens": 18587174.0, + "reward": 0.8514404296875, + "reward_std": 0.01412886194884777, + "rewards//mean": 0.8514404296875, + "rewards//std": 0.01909823529422283, + "step": 2553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5108, + "grad_norm": 1.2247756719589233, + "kl": 0.3223291393369436, + "learning_rate": 4.911151550933945e-07, + "loss": 0.0129, + "num_tokens": 18594326.0, + "reward": 0.8551025390625, + "reward_std": 0.014218662865459919, + "rewards//mean": 0.8551025390625, + "rewards//std": 0.020466435700654984, + "step": 2554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.511, + "grad_norm": 1.6806451082229614, + "kl": 0.3319327346980572, + "learning_rate": 4.90797874417553e-07, + "loss": 0.0072, + "num_tokens": 18601642.0, + "reward": 0.867431640625, + "reward_std": 0.02329161949455738, + "rewards//mean": 0.867431640625, + "rewards//std": 0.031000016257166862, + "step": 2555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5112, + "grad_norm": 1.566069483757019, + "kl": 0.43433549627661705, + "learning_rate": 4.904805974483266e-07, + "loss": 0.0174, + "num_tokens": 18609026.0, + "reward": 0.83123779296875, + "reward_std": 0.01304744090884924, + "rewards//mean": 0.83123779296875, + "rewards//std": 0.017036614939570427, + "step": 2556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5114, + "grad_norm": 1.678958773612976, + "kl": 0.3258916810154915, + "learning_rate": 4.901633243135143e-07, + "loss": 0.013, + "num_tokens": 18616338.0, + "reward": 0.834716796875, + "reward_std": 0.01242737378925085, + "rewards//mean": 0.834716796875, + "rewards//std": 0.013847896829247475, + "step": 2557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5116, + "grad_norm": 1.758636236190796, + "kl": 0.308117613196373, + "learning_rate": 4.89846055140914e-07, + "loss": 0.0123, + "num_tokens": 18623554.0, + "reward": 0.8271484375, + "reward_std": 0.014124071225523949, + "rewards//mean": 0.8271484375, + "rewards//std": 0.021372759714722633, + "step": 2558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5118, + "grad_norm": 1.3491096496582031, + "kl": 0.3302165810018778, + "learning_rate": 4.895287900583216e-07, + "loss": 0.0132, + "num_tokens": 18630818.0, + "reward": 0.84429931640625, + "reward_std": 0.015291258692741394, + "rewards//mean": 0.84429931640625, + "rewards//std": 0.01866075024008751, + "step": 2559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.512, + "grad_norm": 1.5185033082962036, + "kl": 0.3764263615012169, + "learning_rate": 4.892115291935309e-07, + "loss": 0.0151, + "num_tokens": 18638202.0, + "reward": 0.83349609375, + "reward_std": 0.022016558796167374, + "rewards//mean": 0.83349609375, + "rewards//std": 0.034411199390888214, + "step": 2560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5122, + "grad_norm": 1.6022475957870483, + "kl": 0.43431079015135765, + "learning_rate": 4.888942726743353e-07, + "loss": 0.0174, + "num_tokens": 18645474.0, + "reward": 0.842529296875, + "reward_std": 0.017160480841994286, + "rewards//mean": 0.842529296875, + "rewards//std": 0.021775512024760246, + "step": 2561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5124, + "grad_norm": 1.206511378288269, + "kl": 0.30093994550406933, + "learning_rate": 4.885770206285252e-07, + "loss": 0.012, + "num_tokens": 18652706.0, + "reward": 0.85772705078125, + "reward_std": 0.01593134179711342, + "rewards//mean": 0.85772705078125, + "rewards//std": 0.02911384403705597, + "step": 2562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5126, + "grad_norm": 1.6722506284713745, + "kl": 0.32341347448527813, + "learning_rate": 4.882597731838898e-07, + "loss": 0.0129, + "num_tokens": 18659994.0, + "reward": 0.82373046875, + "reward_std": 0.021686460822820663, + "rewards//mean": 0.82373046875, + "rewards//std": 0.030277688056230545, + "step": 2563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5128, + "grad_norm": 1.7148358821868896, + "kl": 0.3841906897723675, + "learning_rate": 4.879425304682163e-07, + "loss": 0.0154, + "num_tokens": 18667202.0, + "reward": 0.87530517578125, + "reward_std": 0.01674170047044754, + "rewards//mean": 0.87530517578125, + "rewards//std": 0.02609095722436905, + "step": 2564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.513, + "grad_norm": 1.567955732345581, + "kl": 0.3609989397227764, + "learning_rate": 4.876252926092902e-07, + "loss": 0.0144, + "num_tokens": 18674466.0, + "reward": 0.75836181640625, + "reward_std": 0.014041273854672909, + "rewards//mean": 0.75836181640625, + "rewards//std": 0.020855030044913292, + "step": 2565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5132, + "grad_norm": 1.4394855499267578, + "kl": 0.38701654598116875, + "learning_rate": 4.873080597348947e-07, + "loss": 0.0155, + "num_tokens": 18681802.0, + "reward": 0.85137939453125, + "reward_std": 0.014440951868891716, + "rewards//mean": 0.85137939453125, + "rewards//std": 0.019517973065376282, + "step": 2566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5134, + "grad_norm": 1.5476173162460327, + "kl": 0.3479260466992855, + "learning_rate": 4.869908319728113e-07, + "loss": 0.0139, + "num_tokens": 18689002.0, + "reward": 0.8475341796875, + "reward_std": 0.016805412247776985, + "rewards//mean": 0.8475341796875, + "rewards//std": 0.021485432982444763, + "step": 2567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5136, + "grad_norm": 1.6740385293960571, + "kl": 0.43860577046871185, + "learning_rate": 4.866736094508191e-07, + "loss": 0.0175, + "num_tokens": 18696362.0, + "reward": 0.86431884765625, + "reward_std": 0.016291745007038116, + "rewards//mean": 0.86431884765625, + "rewards//std": 0.022328706458210945, + "step": 2568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5138, + "grad_norm": 1.5752202272415161, + "kl": 0.3853962831199169, + "learning_rate": 4.863563922966956e-07, + "loss": 0.0154, + "num_tokens": 18703658.0, + "reward": 0.84747314453125, + "reward_std": 0.018708601593971252, + "rewards//mean": 0.84747314453125, + "rewards//std": 0.027031322941184044, + "step": 2569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.514, + "grad_norm": 1.605536699295044, + "kl": 0.4105445183813572, + "learning_rate": 4.860391806382156e-07, + "loss": 0.0164, + "num_tokens": 18710986.0, + "reward": 0.85858154296875, + "reward_std": 0.01198972761631012, + "rewards//mean": 0.85858154296875, + "rewards//std": 0.01559675857424736, + "step": 2570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5142, + "grad_norm": 1.5200084447860718, + "kl": 0.3388242870569229, + "learning_rate": 4.857219746031519e-07, + "loss": 0.0136, + "num_tokens": 18718226.0, + "reward": 0.8563232421875, + "reward_std": 0.015397089533507824, + "rewards//mean": 0.8563232421875, + "rewards//std": 0.024473514407873154, + "step": 2571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5144, + "grad_norm": 1.7249723672866821, + "kl": 0.31983195804059505, + "learning_rate": 4.854047743192752e-07, + "loss": 0.0128, + "num_tokens": 18725522.0, + "reward": 0.84478759765625, + "reward_std": 0.01774788275361061, + "rewards//mean": 0.84478759765625, + "rewards//std": 0.025527307763695717, + "step": 2572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5146, + "grad_norm": 1.7777760028839111, + "kl": 0.27463530376553535, + "learning_rate": 4.850875799143536e-07, + "loss": 0.011, + "num_tokens": 18732738.0, + "reward": 0.85308837890625, + "reward_std": 0.02045869082212448, + "rewards//mean": 0.85308837890625, + "rewards//std": 0.024789631366729736, + "step": 2573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5148, + "grad_norm": 1.8283127546310425, + "kl": 0.3482130356132984, + "learning_rate": 4.84770391516153e-07, + "loss": 0.0139, + "num_tokens": 18740082.0, + "reward": 0.8516845703125, + "reward_std": 0.014721740037202835, + "rewards//mean": 0.8516845703125, + "rewards//std": 0.021121619269251823, + "step": 2574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.515, + "grad_norm": 1.5196006298065186, + "kl": 0.32229218259453773, + "learning_rate": 4.84453209252437e-07, + "loss": 0.0129, + "num_tokens": 18747386.0, + "reward": 0.86810302734375, + "reward_std": 0.015457307919859886, + "rewards//mean": 0.86810302734375, + "rewards//std": 0.030143337324261665, + "step": 2575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5152, + "grad_norm": 1.5005295276641846, + "kl": 0.35130928829312325, + "learning_rate": 4.841360332509662e-07, + "loss": 0.0141, + "num_tokens": 18754730.0, + "reward": 0.85833740234375, + "reward_std": 0.014901273883879185, + "rewards//mean": 0.85833740234375, + "rewards//std": 0.020766286179423332, + "step": 2576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5154, + "grad_norm": 1.4097487926483154, + "kl": 0.35549708269536495, + "learning_rate": 4.838188636394996e-07, + "loss": 0.0142, + "num_tokens": 18762034.0, + "reward": 0.827392578125, + "reward_std": 0.014502542093396187, + "rewards//mean": 0.827392578125, + "rewards//std": 0.02585381455719471, + "step": 2577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5156, + "grad_norm": 1.5946385860443115, + "kl": 0.3197228033095598, + "learning_rate": 4.835017005457925e-07, + "loss": 0.0128, + "num_tokens": 18769370.0, + "reward": 0.8262939453125, + "reward_std": 0.01283184252679348, + "rewards//mean": 0.8262939453125, + "rewards//std": 0.024948880076408386, + "step": 2578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5158, + "grad_norm": 1.8998478651046753, + "kl": 0.31559765338897705, + "learning_rate": 4.831845440975987e-07, + "loss": 0.0126, + "num_tokens": 18776786.0, + "reward": 0.8519287109375, + "reward_std": 0.013680608943104744, + "rewards//mean": 0.8519287109375, + "rewards//std": 0.02084168791770935, + "step": 2579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.516, + "grad_norm": 1.3541088104248047, + "kl": 0.3510809913277626, + "learning_rate": 4.828673944226683e-07, + "loss": 0.0114, + "num_tokens": 18784069.0, + "reward": 0.78387451171875, + "reward_std": 0.011016166768968105, + "rewards//mean": 0.78387451171875, + "rewards//std": 0.020915189757943153, + "step": 2580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5162, + "grad_norm": 1.5288509130477905, + "kl": 0.3761742189526558, + "learning_rate": 4.825502516487496e-07, + "loss": 0.015, + "num_tokens": 18791293.0, + "reward": 0.82421875, + "reward_std": 0.01970573514699936, + "rewards//mean": 0.82421875, + "rewards//std": 0.02790054678916931, + "step": 2581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.5164, + "grad_norm": 1.5121437311172485, + "kl": 0.3629744425415993, + "learning_rate": 4.822331159035873e-07, + "loss": 0.0109, + "num_tokens": 18798562.0, + "reward": 0.84234619140625, + "reward_std": 0.015473540872335434, + "rewards//mean": 0.84234619140625, + "rewards//std": 0.021814320236444473, + "step": 2582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5166, + "grad_norm": 1.4003859758377075, + "kl": 0.36019396409392357, + "learning_rate": 4.819159873149239e-07, + "loss": 0.0144, + "num_tokens": 18805866.0, + "reward": 0.85296630859375, + "reward_std": 0.011898335069417953, + "rewards//mean": 0.85296630859375, + "rewards//std": 0.018154172226786613, + "step": 2583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5168, + "grad_norm": 1.462808609008789, + "kl": 0.3144405819475651, + "learning_rate": 4.815988660104985e-07, + "loss": 0.0126, + "num_tokens": 18813018.0, + "reward": 0.8009033203125, + "reward_std": 0.016140539199113846, + "rewards//mean": 0.8009033203125, + "rewards//std": 0.029098566621541977, + "step": 2584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.517, + "grad_norm": 1.5739498138427734, + "kl": 0.4157866891473532, + "learning_rate": 4.812817521180478e-07, + "loss": 0.0166, + "num_tokens": 18820298.0, + "reward": 0.79998779296875, + "reward_std": 0.01526925154030323, + "rewards//mean": 0.79998779296875, + "rewards//std": 0.03315424919128418, + "step": 2585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5172, + "grad_norm": 1.4999189376831055, + "kl": 0.3139990586787462, + "learning_rate": 4.809646457653051e-07, + "loss": 0.0126, + "num_tokens": 18827610.0, + "reward": 0.83355712890625, + "reward_std": 0.015277761965990067, + "rewards//mean": 0.83355712890625, + "rewards//std": 0.017848074436187744, + "step": 2586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5174, + "grad_norm": 1.5336309671401978, + "kl": 0.32522708363831043, + "learning_rate": 4.806475470800008e-07, + "loss": 0.013, + "num_tokens": 18834906.0, + "reward": 0.77313232421875, + "reward_std": 0.011693628504872322, + "rewards//mean": 0.77313232421875, + "rewards//std": 0.014554484747350216, + "step": 2587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5176, + "grad_norm": 1.676614761352539, + "kl": 0.3486713506281376, + "learning_rate": 4.803304561898621e-07, + "loss": 0.0139, + "num_tokens": 18842266.0, + "reward": 0.83380126953125, + "reward_std": 0.014659540727734566, + "rewards//mean": 0.83380126953125, + "rewards//std": 0.017784349620342255, + "step": 2588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5178, + "grad_norm": 1.7569876909255981, + "kl": 0.34687814489006996, + "learning_rate": 4.800133732226135e-07, + "loss": 0.0139, + "num_tokens": 18849546.0, + "reward": 0.83782958984375, + "reward_std": 0.016992030665278435, + "rewards//mean": 0.83782958984375, + "rewards//std": 0.020099524408578873, + "step": 2589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.518, + "grad_norm": 1.3806123733520508, + "kl": 0.3027472738176584, + "learning_rate": 4.796962983059757e-07, + "loss": 0.0121, + "num_tokens": 18856826.0, + "reward": 0.85955810546875, + "reward_std": 0.016247520223259926, + "rewards//mean": 0.85955810546875, + "rewards//std": 0.024532437324523926, + "step": 2590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.5182, + "grad_norm": 1.3084439039230347, + "kl": 0.3316862992942333, + "learning_rate": 4.793792315676664e-07, + "loss": 0.0097, + "num_tokens": 18864158.0, + "reward": 0.87286376953125, + "reward_std": 0.013062400743365288, + "rewards//mean": 0.87286376953125, + "rewards//std": 0.01946205087006092, + "step": 2591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5184, + "grad_norm": 1.4166147708892822, + "kl": 0.3789817579090595, + "learning_rate": 4.790621731354002e-07, + "loss": 0.0152, + "num_tokens": 18871430.0, + "reward": 0.84002685546875, + "reward_std": 0.01309283822774887, + "rewards//mean": 0.84002685546875, + "rewards//std": 0.022057918831706047, + "step": 2592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5186, + "grad_norm": 1.6026620864868164, + "kl": 0.33677417412400246, + "learning_rate": 4.787451231368882e-07, + "loss": 0.0135, + "num_tokens": 18878670.0, + "reward": 0.86627197265625, + "reward_std": 0.018008306622505188, + "rewards//mean": 0.86627197265625, + "rewards//std": 0.033143747597932816, + "step": 2593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5188, + "grad_norm": 2.12988018989563, + "kl": 0.44354144111275673, + "learning_rate": 4.784280816998382e-07, + "loss": 0.0177, + "num_tokens": 18885942.0, + "reward": 0.8446044921875, + "reward_std": 0.017622198909521103, + "rewards//mean": 0.8446044921875, + "rewards//std": 0.03894742950797081, + "step": 2594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.519, + "grad_norm": 1.3389376401901245, + "kl": 0.3337762914597988, + "learning_rate": 4.78111048951954e-07, + "loss": 0.0134, + "num_tokens": 18893230.0, + "reward": 0.8284912109375, + "reward_std": 0.017700878903269768, + "rewards//mean": 0.8284912109375, + "rewards//std": 0.023517129942774773, + "step": 2595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5192, + "grad_norm": 1.5416620969772339, + "kl": 0.32528194785118103, + "learning_rate": 4.777940250209369e-07, + "loss": 0.013, + "num_tokens": 18900502.0, + "reward": 0.88214111328125, + "reward_std": 0.01693771779537201, + "rewards//mean": 0.88214111328125, + "rewards//std": 0.022889114916324615, + "step": 2596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5194, + "grad_norm": 1.523207426071167, + "kl": 0.3584433440119028, + "learning_rate": 4.774770100344838e-07, + "loss": 0.0143, + "num_tokens": 18907742.0, + "reward": 0.86053466796875, + "reward_std": 0.016129937022924423, + "rewards//mean": 0.86053466796875, + "rewards//std": 0.01898006722331047, + "step": 2597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5196, + "grad_norm": 1.613010048866272, + "kl": 0.3130437098443508, + "learning_rate": 4.771600041202883e-07, + "loss": 0.0125, + "num_tokens": 18915006.0, + "reward": 0.81304931640625, + "reward_std": 0.016246646642684937, + "rewards//mean": 0.81304931640625, + "rewards//std": 0.021897433325648308, + "step": 2598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.5198, + "grad_norm": 1.5724773406982422, + "kl": 0.37368506751954556, + "learning_rate": 4.768430074060405e-07, + "loss": 0.015, + "num_tokens": 18922273.0, + "reward": 0.86767578125, + "reward_std": 0.015596535056829453, + "rewards//mean": 0.86767578125, + "rewards//std": 0.02072259411215782, + "step": 2599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5625, + "epoch": 0.52, + "grad_norm": 1.966259241104126, + "kl": 0.4441455081105232, + "learning_rate": 4.7652602001942655e-07, + "loss": -0.0107, + "num_tokens": 18929477.0, + "reward": 0.84124755859375, + "reward_std": 0.028441719710826874, + "rewards//mean": 0.84124755859375, + "rewards//std": 0.03296514227986336, + "step": 2600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.5202, + "grad_norm": 1.6453505754470825, + "kl": 0.3524177595973015, + "learning_rate": 4.762090420881288e-07, + "loss": -0.0017, + "num_tokens": 18936697.0, + "reward": 0.82135009765625, + "reward_std": 0.014050496742129326, + "rewards//mean": 0.82135009765625, + "rewards//std": 0.020719582214951515, + "step": 2601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5204, + "grad_norm": 1.6017374992370605, + "kl": 0.33024074882268906, + "learning_rate": 4.758920737398263e-07, + "loss": 0.0132, + "num_tokens": 18943937.0, + "reward": 0.850341796875, + "reward_std": 0.01377837173640728, + "rewards//mean": 0.850341796875, + "rewards//std": 0.014892864041030407, + "step": 2602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5206, + "grad_norm": 1.6459745168685913, + "kl": 0.3574538268148899, + "learning_rate": 4.7557511510219335e-07, + "loss": 0.0143, + "num_tokens": 18951129.0, + "reward": 0.830078125, + "reward_std": 0.01772329770028591, + "rewards//mean": 0.830078125, + "rewards//std": 0.026621157303452492, + "step": 2603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5208, + "grad_norm": 1.8509798049926758, + "kl": 0.29672596603631973, + "learning_rate": 4.7525816630290126e-07, + "loss": 0.0119, + "num_tokens": 18958513.0, + "reward": 0.7901611328125, + "reward_std": 0.018890097737312317, + "rewards//mean": 0.7901611328125, + "rewards//std": 0.02299378253519535, + "step": 2604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.521, + "grad_norm": 1.3735578060150146, + "kl": 0.3087340258061886, + "learning_rate": 4.7494122746961687e-07, + "loss": 0.0123, + "num_tokens": 18965777.0, + "reward": 0.86749267578125, + "reward_std": 0.0173137616366148, + "rewards//mean": 0.86749267578125, + "rewards//std": 0.028162389993667603, + "step": 2605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5212, + "grad_norm": 1.5899903774261475, + "kl": 0.38091477006673813, + "learning_rate": 4.7462429873000293e-07, + "loss": 0.0152, + "num_tokens": 18973097.0, + "reward": 0.826171875, + "reward_std": 0.014357205480337143, + "rewards//mean": 0.826171875, + "rewards//std": 0.02446887455880642, + "step": 2606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5214, + "grad_norm": 1.4616235494613647, + "kl": 0.36095903627574444, + "learning_rate": 4.743073802117185e-07, + "loss": 0.0144, + "num_tokens": 18980353.0, + "reward": 0.8460693359375, + "reward_std": 0.013393433764576912, + "rewards//mean": 0.8460693359375, + "rewards//std": 0.027442943304777145, + "step": 2607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5216, + "grad_norm": 1.4156144857406616, + "kl": 0.38728547655045986, + "learning_rate": 4.7399047204241823e-07, + "loss": 0.0155, + "num_tokens": 18987633.0, + "reward": 0.832275390625, + "reward_std": 0.020026277750730515, + "rewards//mean": 0.832275390625, + "rewards//std": 0.028063921257853508, + "step": 2608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5218, + "grad_norm": 1.6353765726089478, + "kl": 0.33486228436231613, + "learning_rate": 4.7367357434975274e-07, + "loss": 0.0134, + "num_tokens": 18994993.0, + "reward": 0.87908935546875, + "reward_std": 0.016682498157024384, + "rewards//mean": 0.87908935546875, + "rewards//std": 0.022856025025248528, + "step": 2609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.522, + "grad_norm": 1.4903204441070557, + "kl": 0.31787881441414356, + "learning_rate": 4.733566872613682e-07, + "loss": 0.0127, + "num_tokens": 19002217.0, + "reward": 0.843505859375, + "reward_std": 0.01333874836564064, + "rewards//mean": 0.843505859375, + "rewards//std": 0.024576283991336823, + "step": 2610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.5222, + "grad_norm": 1.6295806169509888, + "kl": 0.3363189026713371, + "learning_rate": 4.7303981090490706e-07, + "loss": 0.0077, + "num_tokens": 19009438.0, + "reward": 0.87213134765625, + "reward_std": 0.016990456730127335, + "rewards//mean": 0.87213134765625, + "rewards//std": 0.024372097104787827, + "step": 2611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5224, + "grad_norm": 1.404524326324463, + "kl": 0.2924652732908726, + "learning_rate": 4.727229454080067e-07, + "loss": 0.0117, + "num_tokens": 19016718.0, + "reward": 0.83343505859375, + "reward_std": 0.015619929879903793, + "rewards//mean": 0.83343505859375, + "rewards//std": 0.0203554667532444, + "step": 2612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5226, + "grad_norm": 1.7227839231491089, + "kl": 0.3538152538239956, + "learning_rate": 4.724060908983008e-07, + "loss": 0.0142, + "num_tokens": 19023926.0, + "reward": 0.84967041015625, + "reward_std": 0.014365484938025475, + "rewards//mean": 0.84967041015625, + "rewards//std": 0.023561101406812668, + "step": 2613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5228, + "grad_norm": 1.3167616128921509, + "kl": 0.2893805466592312, + "learning_rate": 4.7208924750341805e-07, + "loss": 0.0116, + "num_tokens": 19031190.0, + "reward": 0.8548583984375, + "reward_std": 0.013887177221477032, + "rewards//mean": 0.8548583984375, + "rewards//std": 0.01892305538058281, + "step": 2614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.523, + "grad_norm": 1.2675045728683472, + "kl": 0.35584751702845097, + "learning_rate": 4.717724153509832e-07, + "loss": 0.0174, + "num_tokens": 19038455.0, + "reward": 0.85614013671875, + "reward_std": 0.016220059245824814, + "rewards//mean": 0.85614013671875, + "rewards//std": 0.018669672310352325, + "step": 2615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5232, + "grad_norm": 1.7630419731140137, + "kl": 0.3659247402101755, + "learning_rate": 4.7145559456861594e-07, + "loss": 0.0146, + "num_tokens": 19045703.0, + "reward": 0.833251953125, + "reward_std": 0.013744554482400417, + "rewards//mean": 0.833251953125, + "rewards//std": 0.020456448197364807, + "step": 2616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5234, + "grad_norm": 1.6611042022705078, + "kl": 0.33219318836927414, + "learning_rate": 4.711387852839319e-07, + "loss": 0.0133, + "num_tokens": 19052967.0, + "reward": 0.84716796875, + "reward_std": 0.015387913212180138, + "rewards//mean": 0.84716796875, + "rewards//std": 0.020617136731743813, + "step": 2617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5236, + "grad_norm": 1.7673321962356567, + "kl": 0.3465627655386925, + "learning_rate": 4.708219876245416e-07, + "loss": 0.0139, + "num_tokens": 19060135.0, + "reward": 0.8707275390625, + "reward_std": 0.015200222842395306, + "rewards//mean": 0.8707275390625, + "rewards//std": 0.026472613215446472, + "step": 2618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5238, + "grad_norm": 1.5100841522216797, + "kl": 0.31114339642226696, + "learning_rate": 4.7050520171805133e-07, + "loss": 0.0124, + "num_tokens": 19067399.0, + "reward": 0.8516845703125, + "reward_std": 0.01809905655682087, + "rewards//mean": 0.8516845703125, + "rewards//std": 0.024997374042868614, + "step": 2619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.524, + "grad_norm": 1.373276948928833, + "kl": 0.30649175122380257, + "learning_rate": 4.7018842769206214e-07, + "loss": 0.0123, + "num_tokens": 19074703.0, + "reward": 0.83538818359375, + "reward_std": 0.014583289623260498, + "rewards//mean": 0.83538818359375, + "rewards//std": 0.02139391005039215, + "step": 2620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.5242, + "grad_norm": 1.5985198020935059, + "kl": 0.3531603030860424, + "learning_rate": 4.698716656741708e-07, + "loss": 0.0113, + "num_tokens": 19081962.0, + "reward": 0.858642578125, + "reward_std": 0.020242147147655487, + "rewards//mean": 0.858642578125, + "rewards//std": 0.03253998979926109, + "step": 2621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5244, + "grad_norm": 1.3561499118804932, + "kl": 0.2922902051359415, + "learning_rate": 4.6955491579196893e-07, + "loss": 0.0117, + "num_tokens": 19089178.0, + "reward": 0.870849609375, + "reward_std": 0.01655631512403488, + "rewards//mean": 0.870849609375, + "rewards//std": 0.024497317150235176, + "step": 2622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5246, + "grad_norm": 1.5487885475158691, + "kl": 0.33271494321525097, + "learning_rate": 4.692381781730432e-07, + "loss": 0.0133, + "num_tokens": 19096458.0, + "reward": 0.8486328125, + "reward_std": 0.018014926463365555, + "rewards//mean": 0.8486328125, + "rewards//std": 0.026392724364995956, + "step": 2623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5248, + "grad_norm": 1.3647114038467407, + "kl": 0.36995625868439674, + "learning_rate": 4.6892145294497576e-07, + "loss": 0.0148, + "num_tokens": 19103778.0, + "reward": 0.87506103515625, + "reward_std": 0.017025336623191833, + "rewards//mean": 0.87506103515625, + "rewards//std": 0.027130262926220894, + "step": 2624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.525, + "grad_norm": 1.2942396402359009, + "kl": 0.3521488718688488, + "learning_rate": 4.686047402353433e-07, + "loss": 0.0141, + "num_tokens": 19111074.0, + "reward": 0.85211181640625, + "reward_std": 0.01653655245900154, + "rewards//mean": 0.85211181640625, + "rewards//std": 0.023947283625602722, + "step": 2625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5252, + "grad_norm": 1.2363684177398682, + "kl": 0.2921108677983284, + "learning_rate": 4.682880401717177e-07, + "loss": 0.0117, + "num_tokens": 19118354.0, + "reward": 0.80755615234375, + "reward_std": 0.01172527763992548, + "rewards//mean": 0.80755615234375, + "rewards//std": 0.023880183696746826, + "step": 2626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5254, + "grad_norm": 1.831997036933899, + "kl": 0.34725948981940746, + "learning_rate": 4.679713528816658e-07, + "loss": 0.0139, + "num_tokens": 19125562.0, + "reward": 0.86029052734375, + "reward_std": 0.015306422486901283, + "rewards//mean": 0.86029052734375, + "rewards//std": 0.024634039029479027, + "step": 2627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5256, + "grad_norm": 1.5622282028198242, + "kl": 0.3289667945355177, + "learning_rate": 4.676546784927491e-07, + "loss": 0.0132, + "num_tokens": 19132906.0, + "reward": 0.8798828125, + "reward_std": 0.014692737720906734, + "rewards//mean": 0.8798828125, + "rewards//std": 0.016022957861423492, + "step": 2628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5258, + "grad_norm": 1.820204734802246, + "kl": 0.3091072626411915, + "learning_rate": 4.67338017132524e-07, + "loss": 0.0124, + "num_tokens": 19140194.0, + "reward": 0.76611328125, + "reward_std": 0.012311168946325779, + "rewards//mean": 0.76611328125, + "rewards//std": 0.017490066587924957, + "step": 2629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.526, + "grad_norm": 1.54385507106781, + "kl": 0.34463027864694595, + "learning_rate": 4.670213689285417e-07, + "loss": 0.0138, + "num_tokens": 19147434.0, + "reward": 0.83477783203125, + "reward_std": 0.015279427170753479, + "rewards//mean": 0.83477783203125, + "rewards//std": 0.022430842742323875, + "step": 2630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5262, + "grad_norm": 1.5614347457885742, + "kl": 0.37768444791436195, + "learning_rate": 4.66704734008348e-07, + "loss": 0.0151, + "num_tokens": 19154738.0, + "reward": 0.86541748046875, + "reward_std": 0.02238403633236885, + "rewards//mean": 0.86541748046875, + "rewards//std": 0.02822789177298546, + "step": 2631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5264, + "grad_norm": 1.3344188928604126, + "kl": 0.2828488126397133, + "learning_rate": 4.6638811249948365e-07, + "loss": 0.0113, + "num_tokens": 19162010.0, + "reward": 0.8349609375, + "reward_std": 0.013054712675511837, + "rewards//mean": 0.8349609375, + "rewards//std": 0.021259134635329247, + "step": 2632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5266, + "grad_norm": 1.295770525932312, + "kl": 0.3434773236513138, + "learning_rate": 4.6607150452948336e-07, + "loss": 0.0137, + "num_tokens": 19169346.0, + "reward": 0.7645263671875, + "reward_std": 0.012489317916333675, + "rewards//mean": 0.7645263671875, + "rewards//std": 0.015096287243068218, + "step": 2633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5268, + "grad_norm": 1.4845235347747803, + "kl": 0.37647124379873276, + "learning_rate": 4.657549102258771e-07, + "loss": 0.0151, + "num_tokens": 19176618.0, + "reward": 0.84228515625, + "reward_std": 0.013338612392544746, + "rewards//mean": 0.84228515625, + "rewards//std": 0.02248283103108406, + "step": 2634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.527, + "grad_norm": 1.6711368560791016, + "kl": 0.4147661626338959, + "learning_rate": 4.6543832971618885e-07, + "loss": 0.0166, + "num_tokens": 19183898.0, + "reward": 0.84930419921875, + "reward_std": 0.012647108174860477, + "rewards//mean": 0.84930419921875, + "rewards//std": 0.0161854587495327, + "step": 2635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5272, + "grad_norm": 1.7783865928649902, + "kl": 0.3271982092410326, + "learning_rate": 4.6512176312793735e-07, + "loss": 0.0131, + "num_tokens": 19191218.0, + "reward": 0.8414306640625, + "reward_std": 0.01916699856519699, + "rewards//mean": 0.8414306640625, + "rewards//std": 0.0273013673722744, + "step": 2636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5274, + "grad_norm": 1.350386619567871, + "kl": 0.29958065412938595, + "learning_rate": 4.648052105886354e-07, + "loss": 0.012, + "num_tokens": 19198522.0, + "reward": 0.7916259765625, + "reward_std": 0.00928786676377058, + "rewards//mean": 0.7916259765625, + "rewards//std": 0.03023541159927845, + "step": 2637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5276, + "grad_norm": 1.421650767326355, + "kl": 0.3632173538208008, + "learning_rate": 4.644886722257904e-07, + "loss": 0.0145, + "num_tokens": 19205714.0, + "reward": 0.82562255859375, + "reward_std": 0.019522197544574738, + "rewards//mean": 0.82562255859375, + "rewards//std": 0.029572466388344765, + "step": 2638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5278, + "grad_norm": 1.7714627981185913, + "kl": 0.3741338811814785, + "learning_rate": 4.641721481669041e-07, + "loss": 0.015, + "num_tokens": 19212994.0, + "reward": 0.84649658203125, + "reward_std": 0.018882228061556816, + "rewards//mean": 0.84649658203125, + "rewards//std": 0.02350642718374729, + "step": 2639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.528, + "grad_norm": 1.4831340312957764, + "kl": 0.3812151625752449, + "learning_rate": 4.638556385394721e-07, + "loss": 0.0152, + "num_tokens": 19220410.0, + "reward": 0.829833984375, + "reward_std": 0.011442378163337708, + "rewards//mean": 0.829833984375, + "rewards//std": 0.019248638302087784, + "step": 2640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5282, + "grad_norm": 1.515166163444519, + "kl": 0.33844752982258797, + "learning_rate": 4.6353914347098467e-07, + "loss": 0.0135, + "num_tokens": 19227722.0, + "reward": 0.82891845703125, + "reward_std": 0.012985968962311745, + "rewards//mean": 0.82891845703125, + "rewards//std": 0.0178828127682209, + "step": 2641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5284, + "grad_norm": 1.3095166683197021, + "kl": 0.32731708884239197, + "learning_rate": 4.6322266308892577e-07, + "loss": 0.0131, + "num_tokens": 19235018.0, + "reward": 0.854248046875, + "reward_std": 0.015045281499624252, + "rewards//mean": 0.854248046875, + "rewards//std": 0.02087833546102047, + "step": 2642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.5286, + "grad_norm": 2.1537086963653564, + "kl": 0.43506084755063057, + "learning_rate": 4.6290619752077394e-07, + "loss": 0.006, + "num_tokens": 19242275.0, + "reward": 0.83880615234375, + "reward_std": 0.015976212918758392, + "rewards//mean": 0.83880615234375, + "rewards//std": 0.020496277138590813, + "step": 2643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5288, + "grad_norm": 1.5872241258621216, + "kl": 0.35215385630726814, + "learning_rate": 4.6258974689400113e-07, + "loss": 0.0141, + "num_tokens": 19249603.0, + "reward": 0.8450927734375, + "reward_std": 0.01716696098446846, + "rewards//mean": 0.8450927734375, + "rewards//std": 0.0246288925409317, + "step": 2644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.529, + "grad_norm": 1.5017027854919434, + "kl": 0.29951250180602074, + "learning_rate": 4.6227331133607394e-07, + "loss": 0.012, + "num_tokens": 19256779.0, + "reward": 0.90185546875, + "reward_std": 0.01978517882525921, + "rewards//mean": 0.90185546875, + "rewards//std": 0.0327005535364151, + "step": 2645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5292, + "grad_norm": 1.6424251794815063, + "kl": 0.5355696715414524, + "learning_rate": 4.6195689097445236e-07, + "loss": 0.0214, + "num_tokens": 19263987.0, + "reward": 0.8543701171875, + "reward_std": 0.016207918524742126, + "rewards//mean": 0.8543701171875, + "rewards//std": 0.026083195582032204, + "step": 2646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.5294, + "grad_norm": 2.052316665649414, + "kl": 0.5444790571928024, + "learning_rate": 4.6164048593659065e-07, + "loss": 0.0222, + "num_tokens": 19271264.0, + "reward": 0.85491943359375, + "reward_std": 0.02397274784743786, + "rewards//mean": 0.85491943359375, + "rewards//std": 0.03288054093718529, + "step": 2647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5296, + "grad_norm": 1.4650753736495972, + "kl": 0.3730703741312027, + "learning_rate": 4.6132409634993645e-07, + "loss": 0.0149, + "num_tokens": 19278576.0, + "reward": 0.8424072265625, + "reward_std": 0.015881093218922615, + "rewards//mean": 0.8424072265625, + "rewards//std": 0.01738894172012806, + "step": 2648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5298, + "grad_norm": 1.5674200057983398, + "kl": 0.35117897018790245, + "learning_rate": 4.610077223419318e-07, + "loss": 0.014, + "num_tokens": 19285824.0, + "reward": 0.84722900390625, + "reward_std": 0.015490178018808365, + "rewards//mean": 0.84722900390625, + "rewards//std": 0.01619013398885727, + "step": 2649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.53, + "grad_norm": 3.2938716411590576, + "kl": 0.2955870423465967, + "learning_rate": 4.606913640400117e-07, + "loss": 0.0118, + "num_tokens": 19293272.0, + "reward": 0.844970703125, + "reward_std": 0.009766868315637112, + "rewards//mean": 0.844970703125, + "rewards//std": 0.023774802684783936, + "step": 2650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.5302, + "grad_norm": 1.7181826829910278, + "kl": 0.342176228761673, + "learning_rate": 4.6037502157160567e-07, + "loss": 0.0134, + "num_tokens": 19300566.0, + "reward": 0.85101318359375, + "reward_std": 0.013885491527616978, + "rewards//mean": 0.85101318359375, + "rewards//std": 0.02173088863492012, + "step": 2651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5304, + "grad_norm": 1.5063197612762451, + "kl": 0.33522002026438713, + "learning_rate": 4.6005869506413615e-07, + "loss": 0.0134, + "num_tokens": 19307934.0, + "reward": 0.828857421875, + "reward_std": 0.011655833572149277, + "rewards//mean": 0.828857421875, + "rewards//std": 0.018582813441753387, + "step": 2652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5306, + "grad_norm": 1.668357014656067, + "kl": 0.3673313446342945, + "learning_rate": 4.5974238464501954e-07, + "loss": 0.0147, + "num_tokens": 19315150.0, + "reward": 0.850341796875, + "reward_std": 0.018780790269374847, + "rewards//mean": 0.850341796875, + "rewards//std": 0.021952755749225616, + "step": 2653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5308, + "grad_norm": 1.4650684595108032, + "kl": 0.40973762422800064, + "learning_rate": 4.594260904416655e-07, + "loss": 0.0164, + "num_tokens": 19322414.0, + "reward": 0.83245849609375, + "reward_std": 0.018694691359996796, + "rewards//mean": 0.83245849609375, + "rewards//std": 0.024090977385640144, + "step": 2654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.531, + "grad_norm": 1.6133620738983154, + "kl": 0.3694038689136505, + "learning_rate": 4.591098125814776e-07, + "loss": 0.0148, + "num_tokens": 19329750.0, + "reward": 0.886474609375, + "reward_std": 0.01660846173763275, + "rewards//mean": 0.886474609375, + "rewards//std": 0.02644658274948597, + "step": 2655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5312, + "grad_norm": 1.3443831205368042, + "kl": 0.30214711651206017, + "learning_rate": 4.58793551191852e-07, + "loss": 0.0121, + "num_tokens": 19337006.0, + "reward": 0.84759521484375, + "reward_std": 0.013445498421788216, + "rewards//mean": 0.84759521484375, + "rewards//std": 0.02324545383453369, + "step": 2656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5314, + "grad_norm": 1.4504008293151855, + "kl": 0.37457485496997833, + "learning_rate": 4.584773064001792e-07, + "loss": 0.015, + "num_tokens": 19344310.0, + "reward": 0.85394287109375, + "reward_std": 0.01518944464623928, + "rewards//mean": 0.85394287109375, + "rewards//std": 0.021220559254288673, + "step": 2657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5316, + "grad_norm": 1.1812673807144165, + "kl": 0.2517544459551573, + "learning_rate": 4.5816107833384233e-07, + "loss": 0.0101, + "num_tokens": 19351630.0, + "reward": 0.8526611328125, + "reward_std": 0.009731203317642212, + "rewards//mean": 0.8526611328125, + "rewards//std": 0.013980082236230373, + "step": 2658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.5318, + "grad_norm": 1.7638002634048462, + "kl": 0.428161583840847, + "learning_rate": 4.5784486712021817e-07, + "loss": 0.0179, + "num_tokens": 19358925.0, + "reward": 0.8873291015625, + "reward_std": 0.02094094827771187, + "rewards//mean": 0.8873291015625, + "rewards//std": 0.02636718936264515, + "step": 2659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.532, + "grad_norm": 1.4634912014007568, + "kl": 0.3801167979836464, + "learning_rate": 4.575286728866764e-07, + "loss": 0.0163, + "num_tokens": 19366211.0, + "reward": 0.770751953125, + "reward_std": 0.014205412939190865, + "rewards//mean": 0.770751953125, + "rewards//std": 0.020773665979504585, + "step": 2660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5625, + "epoch": 0.5322, + "grad_norm": 1.2986844778060913, + "kl": 0.3691694922745228, + "learning_rate": 4.5721249576058027e-07, + "loss": -0.0175, + "num_tokens": 19373383.0, + "reward": 0.85302734375, + "reward_std": 0.019095826894044876, + "rewards//mean": 0.85302734375, + "rewards//std": 0.028843844309449196, + "step": 2661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5324, + "grad_norm": 1.4949411153793335, + "kl": 0.36177531257271767, + "learning_rate": 4.568963358692856e-07, + "loss": 0.0145, + "num_tokens": 19380807.0, + "reward": 0.798583984375, + "reward_std": 0.015015674754977226, + "rewards//mean": 0.798583984375, + "rewards//std": 0.02975623868405819, + "step": 2662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5326, + "grad_norm": 1.4479343891143799, + "kl": 0.3270976357161999, + "learning_rate": 4.565801933401417e-07, + "loss": 0.0131, + "num_tokens": 19388079.0, + "reward": 0.89031982421875, + "reward_std": 0.017566604539752007, + "rewards//mean": 0.89031982421875, + "rewards//std": 0.026578404009342194, + "step": 2663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5328, + "grad_norm": 1.3303974866867065, + "kl": 0.33441925421357155, + "learning_rate": 4.562640683004907e-07, + "loss": 0.0134, + "num_tokens": 19395383.0, + "reward": 0.84295654296875, + "reward_std": 0.013823047280311584, + "rewards//mean": 0.84295654296875, + "rewards//std": 0.017346560955047607, + "step": 2664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.533, + "grad_norm": 1.489678978919983, + "kl": 0.27959903329610825, + "learning_rate": 4.5594796087766787e-07, + "loss": 0.0112, + "num_tokens": 19402711.0, + "reward": 0.8846435546875, + "reward_std": 0.016389179974794388, + "rewards//mean": 0.8846435546875, + "rewards//std": 0.02347847819328308, + "step": 2665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5332, + "grad_norm": 1.7343658208847046, + "kl": 0.3494054637849331, + "learning_rate": 4.55631871199001e-07, + "loss": 0.014, + "num_tokens": 19410063.0, + "reward": 0.78887939453125, + "reward_std": 0.011822298169136047, + "rewards//mean": 0.78887939453125, + "rewards//std": 0.014422840438783169, + "step": 2666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5334, + "grad_norm": 1.77515709400177, + "kl": 0.3671058714389801, + "learning_rate": 4.553157993918112e-07, + "loss": 0.0147, + "num_tokens": 19417455.0, + "reward": 0.82012939453125, + "reward_std": 0.017081521451473236, + "rewards//mean": 0.82012939453125, + "rewards//std": 0.026073545217514038, + "step": 2667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5336, + "grad_norm": 1.289597511291504, + "kl": 0.40744630992412567, + "learning_rate": 4.5499974558341206e-07, + "loss": 0.0163, + "num_tokens": 19424719.0, + "reward": 0.86712646484375, + "reward_std": 0.013838870450854301, + "rewards//mean": 0.86712646484375, + "rewards//std": 0.01933172158896923, + "step": 2668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5338, + "grad_norm": 1.6415482759475708, + "kl": 0.34078336507081985, + "learning_rate": 4.5468370990110997e-07, + "loss": 0.0136, + "num_tokens": 19431935.0, + "reward": 0.8653564453125, + "reward_std": 0.018648862838745117, + "rewards//mean": 0.8653564453125, + "rewards//std": 0.026657240465283394, + "step": 2669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.534, + "grad_norm": 1.4361060857772827, + "kl": 0.3432545382529497, + "learning_rate": 4.543676924722042e-07, + "loss": 0.0137, + "num_tokens": 19439303.0, + "reward": 0.83929443359375, + "reward_std": 0.012843593955039978, + "rewards//mean": 0.83929443359375, + "rewards//std": 0.01696092076599598, + "step": 2670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5342, + "grad_norm": 1.7010449171066284, + "kl": 0.3585103526711464, + "learning_rate": 4.540516934239863e-07, + "loss": 0.0143, + "num_tokens": 19446495.0, + "reward": 0.88616943359375, + "reward_std": 0.013881441205739975, + "rewards//mean": 0.88616943359375, + "rewards//std": 0.019122296944260597, + "step": 2671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5344, + "grad_norm": 3.3750734329223633, + "kl": 0.5111861061304808, + "learning_rate": 4.5373571288374097e-07, + "loss": 0.0204, + "num_tokens": 19453783.0, + "reward": 0.838623046875, + "reward_std": 0.01905292645096779, + "rewards//mean": 0.838623046875, + "rewards//std": 0.0243386123329401, + "step": 2672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5346, + "grad_norm": 1.9567049741744995, + "kl": 0.37949665635824203, + "learning_rate": 4.534197509787448e-07, + "loss": 0.0152, + "num_tokens": 19461087.0, + "reward": 0.858154296875, + "reward_std": 0.018202684819698334, + "rewards//mean": 0.858154296875, + "rewards//std": 0.02342587150633335, + "step": 2673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5348, + "grad_norm": 1.4115294218063354, + "kl": 0.2991499863564968, + "learning_rate": 4.5310380783626747e-07, + "loss": 0.012, + "num_tokens": 19468503.0, + "reward": 0.86883544921875, + "reward_std": 0.012324618175625801, + "rewards//mean": 0.86883544921875, + "rewards//std": 0.019915670156478882, + "step": 2674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.535, + "grad_norm": 1.7299349308013916, + "kl": 0.33318985626101494, + "learning_rate": 4.527878835835706e-07, + "loss": 0.0133, + "num_tokens": 19475783.0, + "reward": 0.851806640625, + "reward_std": 0.011518262326717377, + "rewards//mean": 0.851806640625, + "rewards//std": 0.020726976916193962, + "step": 2675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5352, + "grad_norm": 1.8755791187286377, + "kl": 0.3142531141638756, + "learning_rate": 4.5247197834790873e-07, + "loss": 0.0126, + "num_tokens": 19483047.0, + "reward": 0.80291748046875, + "reward_std": 0.01789286360144615, + "rewards//mean": 0.80291748046875, + "rewards//std": 0.025381607934832573, + "step": 2676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5354, + "grad_norm": 1.5216519832611084, + "kl": 0.290446512401104, + "learning_rate": 4.5215609225652817e-07, + "loss": 0.0116, + "num_tokens": 19490247.0, + "reward": 0.84027099609375, + "reward_std": 0.017211750149726868, + "rewards//mean": 0.84027099609375, + "rewards//std": 0.020161187276244164, + "step": 2677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5356, + "grad_norm": 1.5558490753173828, + "kl": 0.36084816232323647, + "learning_rate": 4.5184022543666806e-07, + "loss": 0.0144, + "num_tokens": 19497543.0, + "reward": 0.862548828125, + "reward_std": 0.01980038359761238, + "rewards//mean": 0.862548828125, + "rewards//std": 0.035888370126485825, + "step": 2678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5358, + "grad_norm": 1.4888286590576172, + "kl": 0.33441014401614666, + "learning_rate": 4.5152437801555926e-07, + "loss": 0.0134, + "num_tokens": 19504783.0, + "reward": 0.82904052734375, + "reward_std": 0.01762983575463295, + "rewards//mean": 0.82904052734375, + "rewards//std": 0.024859147146344185, + "step": 2679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.536, + "grad_norm": 1.4828790426254272, + "kl": 0.29810551926493645, + "learning_rate": 4.512085501204253e-07, + "loss": 0.0143, + "num_tokens": 19512144.0, + "reward": 0.87640380859375, + "reward_std": 0.0214396882802248, + "rewards//mean": 0.87640380859375, + "rewards//std": 0.02676965296268463, + "step": 2680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5362, + "grad_norm": 1.309980869293213, + "kl": 0.35364517010748386, + "learning_rate": 4.508927418784814e-07, + "loss": 0.0141, + "num_tokens": 19519472.0, + "reward": 0.8427734375, + "reward_std": 0.019127314910292625, + "rewards//mean": 0.8427734375, + "rewards//std": 0.02327156998217106, + "step": 2681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5364, + "grad_norm": 1.639060139656067, + "kl": 0.3081929814070463, + "learning_rate": 4.5057695341693536e-07, + "loss": 0.0123, + "num_tokens": 19526848.0, + "reward": 0.86181640625, + "reward_std": 0.012177985161542892, + "rewards//mean": 0.86181640625, + "rewards//std": 0.015319748781621456, + "step": 2682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5366, + "grad_norm": 1.4143695831298828, + "kl": 0.31415414437651634, + "learning_rate": 4.502611848629865e-07, + "loss": 0.0126, + "num_tokens": 19534256.0, + "reward": 0.898681640625, + "reward_std": 0.015679379925131798, + "rewards//mean": 0.898681640625, + "rewards//std": 0.023050658404827118, + "step": 2683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5368, + "grad_norm": 1.376062273979187, + "kl": 0.3902102205902338, + "learning_rate": 4.499454363438264e-07, + "loss": 0.0156, + "num_tokens": 19541600.0, + "reward": 0.8558349609375, + "reward_std": 0.01803385466337204, + "rewards//mean": 0.8558349609375, + "rewards//std": 0.02566429413855076, + "step": 2684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.537, + "grad_norm": 1.7212233543395996, + "kl": 0.35629962384700775, + "learning_rate": 4.496297079866386e-07, + "loss": 0.0143, + "num_tokens": 19548848.0, + "reward": 0.83538818359375, + "reward_std": 0.014241941273212433, + "rewards//mean": 0.83538818359375, + "rewards//std": 0.019473714753985405, + "step": 2685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5372, + "grad_norm": 1.473036766052246, + "kl": 0.3668597601354122, + "learning_rate": 4.4931399991859833e-07, + "loss": 0.0147, + "num_tokens": 19556088.0, + "reward": 0.7926025390625, + "reward_std": 0.015906283631920815, + "rewards//mean": 0.7926025390625, + "rewards//std": 0.019284386187791824, + "step": 2686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5374, + "grad_norm": 1.3229801654815674, + "kl": 0.32170593179762363, + "learning_rate": 4.489983122668729e-07, + "loss": 0.0129, + "num_tokens": 19563328.0, + "reward": 0.83197021484375, + "reward_std": 0.01232241466641426, + "rewards//mean": 0.83197021484375, + "rewards//std": 0.020033137872815132, + "step": 2687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.5376, + "grad_norm": 5.033291816711426, + "kl": 0.7153502628207207, + "learning_rate": 4.486826451586211e-07, + "loss": 0.0286, + "num_tokens": 19570567.0, + "reward": 0.848876953125, + "reward_std": 0.014902085065841675, + "rewards//mean": 0.848876953125, + "rewards//std": 0.020146258175373077, + "step": 2688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5378, + "grad_norm": 1.5385273694992065, + "kl": 0.3440232500433922, + "learning_rate": 4.483669987209938e-07, + "loss": 0.0138, + "num_tokens": 19577831.0, + "reward": 0.870849609375, + "reward_std": 0.015786461532115936, + "rewards//mean": 0.870849609375, + "rewards//std": 0.020266123116016388, + "step": 2689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.538, + "grad_norm": 1.5432813167572021, + "kl": 0.32677095010876656, + "learning_rate": 4.4805137308113315e-07, + "loss": 0.0128, + "num_tokens": 19585146.0, + "reward": 0.86395263671875, + "reward_std": 0.014960480853915215, + "rewards//mean": 0.86395263671875, + "rewards//std": 0.022751139476895332, + "step": 2690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5382, + "grad_norm": 1.5505954027175903, + "kl": 0.39079340919852257, + "learning_rate": 4.477357683661733e-07, + "loss": 0.0156, + "num_tokens": 19592514.0, + "reward": 0.85723876953125, + "reward_std": 0.013508303090929985, + "rewards//mean": 0.85723876953125, + "rewards//std": 0.016157375648617744, + "step": 2691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5384, + "grad_norm": 1.355397343635559, + "kl": 0.2620339132845402, + "learning_rate": 4.474201847032396e-07, + "loss": 0.0105, + "num_tokens": 19599778.0, + "reward": 0.87835693359375, + "reward_std": 0.015768572688102722, + "rewards//mean": 0.87835693359375, + "rewards//std": 0.02520865760743618, + "step": 2692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5386, + "grad_norm": 1.8493931293487549, + "kl": 0.40624765679240227, + "learning_rate": 4.4710462221944936e-07, + "loss": 0.0162, + "num_tokens": 19607066.0, + "reward": 0.86505126953125, + "reward_std": 0.017033226788043976, + "rewards//mean": 0.86505126953125, + "rewards//std": 0.02227100543677807, + "step": 2693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5388, + "grad_norm": 1.7341822385787964, + "kl": 0.3599294126033783, + "learning_rate": 4.4678908104191076e-07, + "loss": 0.0144, + "num_tokens": 19614394.0, + "reward": 0.831787109375, + "reward_std": 0.010146189481019974, + "rewards//mean": 0.831787109375, + "rewards//std": 0.013111201114952564, + "step": 2694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.539, + "grad_norm": 1.4192075729370117, + "kl": 0.301291698589921, + "learning_rate": 4.464735612977242e-07, + "loss": 0.0121, + "num_tokens": 19621658.0, + "reward": 0.8551025390625, + "reward_std": 0.01387939602136612, + "rewards//mean": 0.8551025390625, + "rewards//std": 0.021003752946853638, + "step": 2695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5392, + "grad_norm": 1.700320839881897, + "kl": 0.3231286685913801, + "learning_rate": 4.4615806311398055e-07, + "loss": 0.0129, + "num_tokens": 19628938.0, + "reward": 0.8673095703125, + "reward_std": 0.014345229603350163, + "rewards//mean": 0.8673095703125, + "rewards//std": 0.024547627195715904, + "step": 2696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5394, + "grad_norm": 1.545517086982727, + "kl": 0.4259725585579872, + "learning_rate": 4.458425866177627e-07, + "loss": 0.017, + "num_tokens": 19636226.0, + "reward": 0.8385009765625, + "reward_std": 0.01805395632982254, + "rewards//mean": 0.8385009765625, + "rewards//std": 0.023336201906204224, + "step": 2697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.5396, + "grad_norm": 1.1576826572418213, + "kl": 0.3151168189942837, + "learning_rate": 4.4552713193614443e-07, + "loss": 0.006, + "num_tokens": 19643531.0, + "reward": 0.86822509765625, + "reward_std": 0.015074710361659527, + "rewards//mean": 0.86822509765625, + "rewards//std": 0.027745964005589485, + "step": 2698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5398, + "grad_norm": 1.6574174165725708, + "kl": 0.3342438992112875, + "learning_rate": 4.45211699196191e-07, + "loss": 0.0134, + "num_tokens": 19650771.0, + "reward": 0.82647705078125, + "reward_std": 0.013663307763636112, + "rewards//mean": 0.82647705078125, + "rewards//std": 0.017850618809461594, + "step": 2699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.54, + "grad_norm": 1.5350004434585571, + "kl": 0.37087193690240383, + "learning_rate": 4.448962885249586e-07, + "loss": 0.0148, + "num_tokens": 19658019.0, + "reward": 0.83551025390625, + "reward_std": 0.012116564437747002, + "rewards//mean": 0.83551025390625, + "rewards//std": 0.016574528068304062, + "step": 2700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5402, + "grad_norm": 1.3204401731491089, + "kl": 0.3117235116660595, + "learning_rate": 4.445809000494945e-07, + "loss": 0.0125, + "num_tokens": 19665363.0, + "reward": 0.83673095703125, + "reward_std": 0.012175751850008965, + "rewards//mean": 0.83673095703125, + "rewards//std": 0.015550101175904274, + "step": 2701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5404, + "grad_norm": 1.4787158966064453, + "kl": 0.3417541943490505, + "learning_rate": 4.442655338968373e-07, + "loss": 0.0137, + "num_tokens": 19672691.0, + "reward": 0.83514404296875, + "reward_std": 0.018753577023744583, + "rewards//mean": 0.83514404296875, + "rewards//std": 0.02755269967019558, + "step": 2702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5406, + "grad_norm": 1.4613667726516724, + "kl": 0.34986563958227634, + "learning_rate": 4.439501901940163e-07, + "loss": 0.014, + "num_tokens": 19680067.0, + "reward": 0.83892822265625, + "reward_std": 0.016468264162540436, + "rewards//mean": 0.83892822265625, + "rewards//std": 0.023154104128479958, + "step": 2703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5408, + "grad_norm": 1.4533134698867798, + "kl": 0.3433993197977543, + "learning_rate": 4.436348690680521e-07, + "loss": 0.0137, + "num_tokens": 19687363.0, + "reward": 0.801513671875, + "reward_std": 0.009066836908459663, + "rewards//mean": 0.801513671875, + "rewards//std": 0.01756434142589569, + "step": 2704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.541, + "grad_norm": 1.7319462299346924, + "kl": 0.33829156681895256, + "learning_rate": 4.4331957064595575e-07, + "loss": 0.0135, + "num_tokens": 19694643.0, + "reward": 0.80548095703125, + "reward_std": 0.019258026033639908, + "rewards//mean": 0.80548095703125, + "rewards//std": 0.029324183240532875, + "step": 2705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5412, + "grad_norm": 1.4829121828079224, + "kl": 0.29513346403837204, + "learning_rate": 4.430042950547297e-07, + "loss": 0.0118, + "num_tokens": 19701915.0, + "reward": 0.83843994140625, + "reward_std": 0.013369082473218441, + "rewards//mean": 0.83843994140625, + "rewards//std": 0.021669501438736916, + "step": 2706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5414, + "grad_norm": 1.640621542930603, + "kl": 0.3212854787707329, + "learning_rate": 4.4268904242136667e-07, + "loss": 0.0129, + "num_tokens": 19709171.0, + "reward": 0.86468505859375, + "reward_std": 0.018472949042916298, + "rewards//mean": 0.86468505859375, + "rewards//std": 0.02480672299861908, + "step": 2707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5416, + "grad_norm": 1.8245973587036133, + "kl": 0.3300775773823261, + "learning_rate": 4.4237381287285064e-07, + "loss": 0.0132, + "num_tokens": 19716467.0, + "reward": 0.802001953125, + "reward_std": 0.014215769246220589, + "rewards//mean": 0.802001953125, + "rewards//std": 0.023795168846845627, + "step": 2708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.5418, + "grad_norm": 1.61478853225708, + "kl": 0.3991621062159538, + "learning_rate": 4.420586065361557e-07, + "loss": 0.0131, + "num_tokens": 19723806.0, + "reward": 0.8551025390625, + "reward_std": 0.015090454369783401, + "rewards//mean": 0.8551025390625, + "rewards//std": 0.020560890436172485, + "step": 2709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.542, + "grad_norm": 1.6226297616958618, + "kl": 0.3333650901913643, + "learning_rate": 4.4174342353824736e-07, + "loss": 0.0133, + "num_tokens": 19731158.0, + "reward": 0.7901611328125, + "reward_std": 0.013000745326280594, + "rewards//mean": 0.7901611328125, + "rewards//std": 0.017489634454250336, + "step": 2710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5422, + "grad_norm": 1.5433886051177979, + "kl": 0.31092025712132454, + "learning_rate": 4.4142826400608085e-07, + "loss": 0.0124, + "num_tokens": 19738422.0, + "reward": 0.84686279296875, + "reward_std": 0.016266612336039543, + "rewards//mean": 0.84686279296875, + "rewards//std": 0.020197942852973938, + "step": 2711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5424, + "grad_norm": 1.5495643615722656, + "kl": 0.3527500033378601, + "learning_rate": 4.411131280666027e-07, + "loss": 0.0141, + "num_tokens": 19745694.0, + "reward": 0.846923828125, + "reward_std": 0.018130049109458923, + "rewards//mean": 0.846923828125, + "rewards//std": 0.023713599890470505, + "step": 2712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5426, + "grad_norm": 1.8187979459762573, + "kl": 0.40419798716902733, + "learning_rate": 4.407980158467495e-07, + "loss": 0.0162, + "num_tokens": 19752910.0, + "reward": 0.864013671875, + "reward_std": 0.01586887240409851, + "rewards//mean": 0.864013671875, + "rewards//std": 0.026327257975935936, + "step": 2713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5428, + "grad_norm": 1.4459409713745117, + "kl": 0.3083775248378515, + "learning_rate": 4.4048292747344844e-07, + "loss": 0.0123, + "num_tokens": 19760198.0, + "reward": 0.88525390625, + "reward_std": 0.018439264968037605, + "rewards//mean": 0.88525390625, + "rewards//std": 0.0288186427205801, + "step": 2714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.543, + "grad_norm": 1.5114833116531372, + "kl": 0.3524801451712847, + "learning_rate": 4.4016786307361715e-07, + "loss": 0.0141, + "num_tokens": 19767502.0, + "reward": 0.8502197265625, + "reward_std": 0.015140203759074211, + "rewards//mean": 0.8502197265625, + "rewards//std": 0.01848275400698185, + "step": 2715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5432, + "grad_norm": 1.7345218658447266, + "kl": 0.35105688497424126, + "learning_rate": 4.398528227741633e-07, + "loss": 0.014, + "num_tokens": 19774790.0, + "reward": 0.8045654296875, + "reward_std": 0.013687700033187866, + "rewards//mean": 0.8045654296875, + "rewards//std": 0.022433962672948837, + "step": 2716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5434, + "grad_norm": 1.814107894897461, + "kl": 0.33516454696655273, + "learning_rate": 4.3953780670198534e-07, + "loss": 0.0134, + "num_tokens": 19782070.0, + "reward": 0.83856201171875, + "reward_std": 0.01586761325597763, + "rewards//mean": 0.83856201171875, + "rewards//std": 0.019476046785712242, + "step": 2717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5436, + "grad_norm": 1.3642008304595947, + "kl": 0.3364564124494791, + "learning_rate": 4.392228149839716e-07, + "loss": 0.0135, + "num_tokens": 19789350.0, + "reward": 0.87933349609375, + "reward_std": 0.015231659635901451, + "rewards//mean": 0.87933349609375, + "rewards//std": 0.029513027518987656, + "step": 2718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5438, + "grad_norm": 1.6224645376205444, + "kl": 0.3229537382721901, + "learning_rate": 4.389078477470007e-07, + "loss": 0.0129, + "num_tokens": 19796638.0, + "reward": 0.86749267578125, + "reward_std": 0.015420828014612198, + "rewards//mean": 0.86749267578125, + "rewards//std": 0.020544227212667465, + "step": 2719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.544, + "grad_norm": 1.776145577430725, + "kl": 0.2956127468496561, + "learning_rate": 4.385929051179414e-07, + "loss": 0.0118, + "num_tokens": 19803894.0, + "reward": 0.82281494140625, + "reward_std": 0.014886821620166302, + "rewards//mean": 0.82281494140625, + "rewards//std": 0.02097661979496479, + "step": 2720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5442, + "grad_norm": 1.3904169797897339, + "kl": 0.3155914843082428, + "learning_rate": 4.382779872236526e-07, + "loss": 0.0126, + "num_tokens": 19811214.0, + "reward": 0.83489990234375, + "reward_std": 0.011847756803035736, + "rewards//mean": 0.83489990234375, + "rewards//std": 0.014735372737050056, + "step": 2721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5444, + "grad_norm": 1.6091359853744507, + "kl": 0.3316914699971676, + "learning_rate": 4.3796309419098315e-07, + "loss": 0.0133, + "num_tokens": 19818550.0, + "reward": 0.85430908203125, + "reward_std": 0.0130733922123909, + "rewards//mean": 0.85430908203125, + "rewards//std": 0.020572207868099213, + "step": 2722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5446, + "grad_norm": 1.4407312870025635, + "kl": 0.34029263257980347, + "learning_rate": 4.37648226146772e-07, + "loss": 0.0136, + "num_tokens": 19825830.0, + "reward": 0.83111572265625, + "reward_std": 0.012468839064240456, + "rewards//mean": 0.83111572265625, + "rewards//std": 0.018633149564266205, + "step": 2723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.5448, + "grad_norm": 1.4840686321258545, + "kl": 0.3392169699072838, + "learning_rate": 4.3733338321784777e-07, + "loss": 0.0179, + "num_tokens": 19833031.0, + "reward": 0.86956787109375, + "reward_std": 0.019271986559033394, + "rewards//mean": 0.86956787109375, + "rewards//std": 0.025378623977303505, + "step": 2724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.545, + "grad_norm": 1.6220941543579102, + "kl": 0.3373652659356594, + "learning_rate": 4.3701856553102943e-07, + "loss": 0.0105, + "num_tokens": 19840335.0, + "reward": 0.8564453125, + "reward_std": 0.01871020719408989, + "rewards//mean": 0.8564453125, + "rewards//std": 0.02632840722799301, + "step": 2725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5452, + "grad_norm": 1.6873087882995605, + "kl": 0.39296796172857285, + "learning_rate": 4.367037732131253e-07, + "loss": 0.0157, + "num_tokens": 19847575.0, + "reward": 0.82965087890625, + "reward_std": 0.013576554134488106, + "rewards//mean": 0.82965087890625, + "rewards//std": 0.016857072710990906, + "step": 2726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5454, + "grad_norm": 1.4121664762496948, + "kl": 0.3096821494400501, + "learning_rate": 4.363890063909338e-07, + "loss": 0.0124, + "num_tokens": 19854903.0, + "reward": 0.8504638671875, + "reward_std": 0.013636471703648567, + "rewards//mean": 0.8504638671875, + "rewards//std": 0.018182188272476196, + "step": 2727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5456, + "grad_norm": 1.3571301698684692, + "kl": 0.3073268048465252, + "learning_rate": 4.360742651912428e-07, + "loss": 0.0123, + "num_tokens": 19862295.0, + "reward": 0.82147216796875, + "reward_std": 0.01230904832482338, + "rewards//mean": 0.82147216796875, + "rewards//std": 0.0185018889605999, + "step": 2728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5458, + "grad_norm": 1.800833821296692, + "kl": 0.36724260635674, + "learning_rate": 4.357595497408303e-07, + "loss": 0.0147, + "num_tokens": 19869479.0, + "reward": 0.83770751953125, + "reward_std": 0.018116889521479607, + "rewards//mean": 0.83770751953125, + "rewards//std": 0.02544831670820713, + "step": 2729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.546, + "grad_norm": 1.6675333976745605, + "kl": 0.31828783079981804, + "learning_rate": 4.354448601664633e-07, + "loss": 0.0127, + "num_tokens": 19876735.0, + "reward": 0.83380126953125, + "reward_std": 0.01878710277378559, + "rewards//mean": 0.83380126953125, + "rewards//std": 0.02914814092218876, + "step": 2730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5462, + "grad_norm": 1.946384072303772, + "kl": 0.3478122279047966, + "learning_rate": 4.3513019659489906e-07, + "loss": 0.0139, + "num_tokens": 19884031.0, + "reward": 0.83612060546875, + "reward_std": 0.0199592188000679, + "rewards//mean": 0.83612060546875, + "rewards//std": 0.03328002989292145, + "step": 2731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5464, + "grad_norm": 1.5026661157608032, + "kl": 0.29767439141869545, + "learning_rate": 4.3481555915288384e-07, + "loss": 0.0119, + "num_tokens": 19891343.0, + "reward": 0.81085205078125, + "reward_std": 0.010988411493599415, + "rewards//mean": 0.81085205078125, + "rewards//std": 0.016960028558969498, + "step": 2732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5466, + "grad_norm": 1.4059712886810303, + "kl": 0.38082095235586166, + "learning_rate": 4.345009479671535e-07, + "loss": 0.0152, + "num_tokens": 19898775.0, + "reward": 0.87506103515625, + "reward_std": 0.011072736233472824, + "rewards//mean": 0.87506103515625, + "rewards//std": 0.01913021132349968, + "step": 2733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5468, + "grad_norm": 1.5722942352294922, + "kl": 0.3056444749236107, + "learning_rate": 4.3418636316443365e-07, + "loss": 0.0122, + "num_tokens": 19905959.0, + "reward": 0.80328369140625, + "reward_std": 0.011432883329689503, + "rewards//mean": 0.80328369140625, + "rewards//std": 0.017885353416204453, + "step": 2734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.547, + "grad_norm": 1.5244019031524658, + "kl": 0.30251445807516575, + "learning_rate": 4.338718048714387e-07, + "loss": 0.0121, + "num_tokens": 19913159.0, + "reward": 0.89398193359375, + "reward_std": 0.016276268288493156, + "rewards//mean": 0.89398193359375, + "rewards//std": 0.021853147074580193, + "step": 2735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5472, + "grad_norm": 1.538846492767334, + "kl": 0.33809137530624866, + "learning_rate": 4.3355727321487297e-07, + "loss": 0.0135, + "num_tokens": 19920423.0, + "reward": 0.86444091796875, + "reward_std": 0.016644861549139023, + "rewards//mean": 0.86444091796875, + "rewards//std": 0.028971027582883835, + "step": 2736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5474, + "grad_norm": 1.6497489213943481, + "kl": 0.32137641310691833, + "learning_rate": 4.332427683214295e-07, + "loss": 0.0129, + "num_tokens": 19927751.0, + "reward": 0.8536376953125, + "reward_std": 0.013233836740255356, + "rewards//mean": 0.8536376953125, + "rewards//std": 0.018544895574450493, + "step": 2737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5476, + "grad_norm": 1.4601142406463623, + "kl": 0.3837617412209511, + "learning_rate": 4.329282903177911e-07, + "loss": 0.0154, + "num_tokens": 19935071.0, + "reward": 0.83843994140625, + "reward_std": 0.012792302295565605, + "rewards//mean": 0.83843994140625, + "rewards//std": 0.01730724610388279, + "step": 2738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.5478, + "grad_norm": 1.4437788724899292, + "kl": 0.3626781478524208, + "learning_rate": 4.3261383933062916e-07, + "loss": 0.007, + "num_tokens": 19942358.0, + "reward": 0.85198974609375, + "reward_std": 0.0165222380310297, + "rewards//mean": 0.85198974609375, + "rewards//std": 0.027683697640895844, + "step": 2739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.548, + "grad_norm": 1.489400029182434, + "kl": 0.3329983986914158, + "learning_rate": 4.32299415486605e-07, + "loss": 0.0133, + "num_tokens": 19949678.0, + "reward": 0.79486083984375, + "reward_std": 0.01718454249203205, + "rewards//mean": 0.79486083984375, + "rewards//std": 0.027865733951330185, + "step": 2740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5482, + "grad_norm": 1.5667369365692139, + "kl": 0.32434090226888657, + "learning_rate": 4.31985018912368e-07, + "loss": 0.013, + "num_tokens": 19956966.0, + "reward": 0.83013916015625, + "reward_std": 0.019133444875478745, + "rewards//mean": 0.83013916015625, + "rewards//std": 0.025016970932483673, + "step": 2741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5484, + "grad_norm": 1.106978178024292, + "kl": 0.3092767335474491, + "learning_rate": 4.316706497345572e-07, + "loss": 0.0124, + "num_tokens": 19964182.0, + "reward": 0.84295654296875, + "reward_std": 0.014331587590277195, + "rewards//mean": 0.84295654296875, + "rewards//std": 0.026210203766822815, + "step": 2742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5486, + "grad_norm": 1.6046087741851807, + "kl": 0.30752732418477535, + "learning_rate": 4.313563080798006e-07, + "loss": 0.0123, + "num_tokens": 19971494.0, + "reward": 0.82952880859375, + "reward_std": 0.014105590060353279, + "rewards//mean": 0.82952880859375, + "rewards//std": 0.02486523613333702, + "step": 2743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5488, + "grad_norm": 1.7004222869873047, + "kl": 0.3518461436033249, + "learning_rate": 4.3104199407471477e-07, + "loss": 0.0141, + "num_tokens": 19978814.0, + "reward": 0.865234375, + "reward_std": 0.018173029646277428, + "rewards//mean": 0.865234375, + "rewards//std": 0.022347765043377876, + "step": 2744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.549, + "grad_norm": 2.53899884223938, + "kl": 0.4737774543464184, + "learning_rate": 4.3072770784590564e-07, + "loss": 0.019, + "num_tokens": 19986070.0, + "reward": 0.8570556640625, + "reward_std": 0.016823632642626762, + "rewards//mean": 0.8570556640625, + "rewards//std": 0.02792413718998432, + "step": 2745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5492, + "grad_norm": 1.4666095972061157, + "kl": 0.334476713091135, + "learning_rate": 4.304134495199674e-07, + "loss": 0.0134, + "num_tokens": 19993334.0, + "reward": 0.7298583984375, + "reward_std": 0.013706637546420097, + "rewards//mean": 0.7298583984375, + "rewards//std": 0.025125425308942795, + "step": 2746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5494, + "grad_norm": 1.4567115306854248, + "kl": 0.31262429244816303, + "learning_rate": 4.3009921922348334e-07, + "loss": 0.0125, + "num_tokens": 20000590.0, + "reward": 0.83984375, + "reward_std": 0.013786444440484047, + "rewards//mean": 0.83984375, + "rewards//std": 0.020763462409377098, + "step": 2747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5496, + "grad_norm": 1.6048052310943604, + "kl": 0.37082360312342644, + "learning_rate": 4.297850170830255e-07, + "loss": 0.0148, + "num_tokens": 20007974.0, + "reward": 0.8564453125, + "reward_std": 0.017338117584586143, + "rewards//mean": 0.8564453125, + "rewards//std": 0.021451938897371292, + "step": 2748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5498, + "grad_norm": 1.5085581541061401, + "kl": 0.3799843154847622, + "learning_rate": 4.294708432251543e-07, + "loss": 0.0152, + "num_tokens": 20015238.0, + "reward": 0.85223388671875, + "reward_std": 0.015791786834597588, + "rewards//mean": 0.85223388671875, + "rewards//std": 0.02174968831241131, + "step": 2749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.55, + "grad_norm": 1.7861210107803345, + "kl": 0.3592930808663368, + "learning_rate": 4.291566977764191e-07, + "loss": 0.0144, + "num_tokens": 20022446.0, + "reward": 0.86297607421875, + "reward_std": 0.02023966982960701, + "rewards//mean": 0.86297607421875, + "rewards//std": 0.028623035177588463, + "step": 2750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5502, + "grad_norm": 1.7468249797821045, + "kl": 0.31582991033792496, + "learning_rate": 4.2884258086335745e-07, + "loss": 0.0126, + "num_tokens": 20029862.0, + "reward": 0.8292236328125, + "reward_std": 0.014619304798543453, + "rewards//mean": 0.8292236328125, + "rewards//std": 0.019580405205488205, + "step": 2751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5504, + "grad_norm": 1.3514702320098877, + "kl": 0.32217870838940144, + "learning_rate": 4.285284926124959e-07, + "loss": 0.0129, + "num_tokens": 20037054.0, + "reward": 0.7911376953125, + "reward_std": 0.009992348030209541, + "rewards//mean": 0.7911376953125, + "rewards//std": 0.01980792172253132, + "step": 2752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5506, + "grad_norm": 1.65932035446167, + "kl": 0.337037093937397, + "learning_rate": 4.2821443315034875e-07, + "loss": 0.0135, + "num_tokens": 20044350.0, + "reward": 0.8499755859375, + "reward_std": 0.015466149896383286, + "rewards//mean": 0.8499755859375, + "rewards//std": 0.019558746367692947, + "step": 2753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5508, + "grad_norm": 1.3724931478500366, + "kl": 0.3307435642927885, + "learning_rate": 4.2790040260341954e-07, + "loss": 0.0132, + "num_tokens": 20051622.0, + "reward": 0.87744140625, + "reward_std": 0.013324389234185219, + "rewards//mean": 0.87744140625, + "rewards//std": 0.019826635718345642, + "step": 2754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.551, + "grad_norm": 1.6092426776885986, + "kl": 0.33821072429418564, + "learning_rate": 4.2758640109819944e-07, + "loss": 0.0135, + "num_tokens": 20058814.0, + "reward": 0.82232666015625, + "reward_std": 0.015836820006370544, + "rewards//mean": 0.82232666015625, + "rewards//std": 0.023219391703605652, + "step": 2755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5512, + "grad_norm": 1.422500491142273, + "kl": 0.35956227593123913, + "learning_rate": 4.272724287611684e-07, + "loss": 0.0144, + "num_tokens": 20066054.0, + "reward": 0.857177734375, + "reward_std": 0.015701688826084137, + "rewards//mean": 0.857177734375, + "rewards//std": 0.027567598968744278, + "step": 2756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.5514, + "grad_norm": 1.7018121480941772, + "kl": 0.3430070150643587, + "learning_rate": 4.2695848571879424e-07, + "loss": 0.0109, + "num_tokens": 20073375.0, + "reward": 0.83953857421875, + "reward_std": 0.012093514204025269, + "rewards//mean": 0.83953857421875, + "rewards//std": 0.016737211495637894, + "step": 2757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5625, + "epoch": 0.5516, + "grad_norm": 1.4041357040405273, + "kl": 0.3081559296697378, + "learning_rate": 4.2664457209753333e-07, + "loss": -0.0177, + "num_tokens": 20080811.0, + "reward": 0.80859375, + "reward_std": 0.011838339269161224, + "rewards//mean": 0.80859375, + "rewards//std": 0.021799135953187943, + "step": 2758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5518, + "grad_norm": 1.7474884986877441, + "kl": 0.2651078524067998, + "learning_rate": 4.2633068802383004e-07, + "loss": 0.0106, + "num_tokens": 20088155.0, + "reward": 0.8316650390625, + "reward_std": 0.015468253754079342, + "rewards//mean": 0.8316650390625, + "rewards//std": 0.02918790839612484, + "step": 2759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.552, + "grad_norm": 1.3968881368637085, + "kl": 0.3713431544601917, + "learning_rate": 4.2601683362411685e-07, + "loss": 0.0149, + "num_tokens": 20095403.0, + "reward": 0.83050537109375, + "reward_std": 0.015291988849639893, + "rewards//mean": 0.83050537109375, + "rewards//std": 0.018250642344355583, + "step": 2760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.5522, + "grad_norm": 1.5197815895080566, + "kl": 0.3314190171658993, + "learning_rate": 4.257030090248142e-07, + "loss": 0.0154, + "num_tokens": 20102711.0, + "reward": 0.8466796875, + "reward_std": 0.016835985705256462, + "rewards//mean": 0.8466796875, + "rewards//std": 0.024389559403061867, + "step": 2761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5524, + "grad_norm": 1.5484774112701416, + "kl": 0.33518281020224094, + "learning_rate": 4.2538921435233053e-07, + "loss": 0.0134, + "num_tokens": 20110039.0, + "reward": 0.88165283203125, + "reward_std": 0.015652703121304512, + "rewards//mean": 0.88165283203125, + "rewards//std": 0.02518763206899166, + "step": 2762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5526, + "grad_norm": 1.323158860206604, + "kl": 0.3304385021328926, + "learning_rate": 4.2507544973306255e-07, + "loss": 0.0132, + "num_tokens": 20117279.0, + "reward": 0.88848876953125, + "reward_std": 0.022814270108938217, + "rewards//mean": 0.88848876953125, + "rewards//std": 0.029585260897874832, + "step": 2763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5528, + "grad_norm": 1.410201072692871, + "kl": 0.30685376934707165, + "learning_rate": 4.2476171529339435e-07, + "loss": 0.0123, + "num_tokens": 20124703.0, + "reward": 0.82379150390625, + "reward_std": 0.011657475493848324, + "rewards//mean": 0.82379150390625, + "rewards//std": 0.03172778710722923, + "step": 2764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.553, + "grad_norm": 1.8339194059371948, + "kl": 0.4622834287583828, + "learning_rate": 4.244480111596983e-07, + "loss": 0.0185, + "num_tokens": 20132047.0, + "reward": 0.87847900390625, + "reward_std": 0.016460556536912918, + "rewards//mean": 0.87847900390625, + "rewards//std": 0.02458852343261242, + "step": 2765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.5532, + "grad_norm": 1.2579768896102905, + "kl": 0.29419286362826824, + "learning_rate": 4.241343374583342e-07, + "loss": 0.0175, + "num_tokens": 20139352.0, + "reward": 0.79248046875, + "reward_std": 0.014246577396988869, + "rewards//mean": 0.79248046875, + "rewards//std": 0.029285505414009094, + "step": 2766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5534, + "grad_norm": 1.4790464639663696, + "kl": 0.2810200359672308, + "learning_rate": 4.2382069431565e-07, + "loss": 0.0112, + "num_tokens": 20146576.0, + "reward": 0.8809814453125, + "reward_std": 0.020594045519828796, + "rewards//mean": 0.8809814453125, + "rewards//std": 0.02491001784801483, + "step": 2767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.5536, + "grad_norm": 1.942155122756958, + "kl": 0.38821282237768173, + "learning_rate": 4.23507081857981e-07, + "loss": 0.0149, + "num_tokens": 20153854.0, + "reward": 0.84490966796875, + "reward_std": 0.01824694126844406, + "rewards//mean": 0.84490966796875, + "rewards//std": 0.027819519862532616, + "step": 2768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.5538, + "grad_norm": 1.6518149375915527, + "kl": 0.36377599090337753, + "learning_rate": 4.2319350021165036e-07, + "loss": 0.013, + "num_tokens": 20161028.0, + "reward": 0.868896484375, + "reward_std": 0.014827638864517212, + "rewards//mean": 0.868896484375, + "rewards//std": 0.018764397129416466, + "step": 2769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.554, + "grad_norm": 1.5683567523956299, + "kl": 0.2663705740123987, + "learning_rate": 4.2287994950296844e-07, + "loss": 0.0107, + "num_tokens": 20168340.0, + "reward": 0.85662841796875, + "reward_std": 0.015246082097291946, + "rewards//mean": 0.85662841796875, + "rewards//std": 0.021118303760886192, + "step": 2770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5542, + "grad_norm": 1.537927508354187, + "kl": 0.4096350334584713, + "learning_rate": 4.2256642985823387e-07, + "loss": 0.0164, + "num_tokens": 20175684.0, + "reward": 0.77923583984375, + "reward_std": 0.011808895505964756, + "rewards//mean": 0.77923583984375, + "rewards//std": 0.015840407460927963, + "step": 2771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5544, + "grad_norm": 1.7638427019119263, + "kl": 0.3936031609773636, + "learning_rate": 4.222529414037319e-07, + "loss": 0.0157, + "num_tokens": 20182932.0, + "reward": 0.86541748046875, + "reward_std": 0.02181086502969265, + "rewards//mean": 0.86541748046875, + "rewards//std": 0.03430849313735962, + "step": 2772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.5546, + "grad_norm": 1.505298376083374, + "kl": 0.30963293835520744, + "learning_rate": 4.21939484265736e-07, + "loss": 0.0116, + "num_tokens": 20190226.0, + "reward": 0.8623046875, + "reward_std": 0.014531875029206276, + "rewards//mean": 0.8623046875, + "rewards//std": 0.02084495685994625, + "step": 2773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5548, + "grad_norm": 1.561962366104126, + "kl": 0.30350567400455475, + "learning_rate": 4.216260585705064e-07, + "loss": 0.0121, + "num_tokens": 20197610.0, + "reward": 0.85870361328125, + "reward_std": 0.012093020603060722, + "rewards//mean": 0.85870361328125, + "rewards//std": 0.016985002905130386, + "step": 2774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.555, + "grad_norm": 1.4363222122192383, + "kl": 0.31764471158385277, + "learning_rate": 4.2131266444429105e-07, + "loss": 0.0127, + "num_tokens": 20204938.0, + "reward": 0.8304443359375, + "reward_std": 0.014518562704324722, + "rewards//mean": 0.8304443359375, + "rewards//std": 0.021153129637241364, + "step": 2775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5552, + "grad_norm": 1.5499671697616577, + "kl": 0.3349105343222618, + "learning_rate": 4.20999302013325e-07, + "loss": 0.0134, + "num_tokens": 20212162.0, + "reward": 0.86358642578125, + "reward_std": 0.014282376505434513, + "rewards//mean": 0.86358642578125, + "rewards//std": 0.02131948620080948, + "step": 2776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5554, + "grad_norm": 1.3748054504394531, + "kl": 0.32171138375997543, + "learning_rate": 4.206859714038308e-07, + "loss": 0.0129, + "num_tokens": 20219498.0, + "reward": 0.8814697265625, + "reward_std": 0.012910484336316586, + "rewards//mean": 0.8814697265625, + "rewards//std": 0.026293598115444183, + "step": 2777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5556, + "grad_norm": 1.5053303241729736, + "kl": 0.35489567182958126, + "learning_rate": 4.203726727420178e-07, + "loss": 0.0142, + "num_tokens": 20226746.0, + "reward": 0.8800048828125, + "reward_std": 0.019375529140233994, + "rewards//mean": 0.8800048828125, + "rewards//std": 0.026561668142676353, + "step": 2778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5558, + "grad_norm": 1.4945917129516602, + "kl": 0.3664313741028309, + "learning_rate": 4.200594061540826e-07, + "loss": 0.0147, + "num_tokens": 20234106.0, + "reward": 0.8712158203125, + "reward_std": 0.013360193930566311, + "rewards//mean": 0.8712158203125, + "rewards//std": 0.0214402936398983, + "step": 2779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.556, + "grad_norm": 1.5688635110855103, + "kl": 0.29879194125533104, + "learning_rate": 4.1974617176620913e-07, + "loss": 0.012, + "num_tokens": 20241418.0, + "reward": 0.8831787109375, + "reward_std": 0.015536638908088207, + "rewards//mean": 0.8831787109375, + "rewards//std": 0.02231217548251152, + "step": 2780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5562, + "grad_norm": 1.4208317995071411, + "kl": 0.32808441296219826, + "learning_rate": 4.19432969704568e-07, + "loss": 0.0131, + "num_tokens": 20248618.0, + "reward": 0.83563232421875, + "reward_std": 0.01247791014611721, + "rewards//mean": 0.83563232421875, + "rewards//std": 0.018470773473381996, + "step": 2781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5564, + "grad_norm": 1.234885334968567, + "kl": 0.30000941827893257, + "learning_rate": 4.191198000953171e-07, + "loss": 0.012, + "num_tokens": 20255946.0, + "reward": 0.890625, + "reward_std": 0.012247921898961067, + "rewards//mean": 0.890625, + "rewards//std": 0.02308347076177597, + "step": 2782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5566, + "grad_norm": 1.3190668821334839, + "kl": 0.32640076242387295, + "learning_rate": 4.188066630646009e-07, + "loss": 0.0131, + "num_tokens": 20263258.0, + "reward": 0.79193115234375, + "reward_std": 0.01033921167254448, + "rewards//mean": 0.79193115234375, + "rewards//std": 0.017305497080087662, + "step": 2783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.5568, + "grad_norm": 1.337037444114685, + "kl": 0.3159500528126955, + "learning_rate": 4.184935587385513e-07, + "loss": 0.0098, + "num_tokens": 20270542.0, + "reward": 0.87640380859375, + "reward_std": 0.012852530926465988, + "rewards//mean": 0.87640380859375, + "rewards//std": 0.030515672639012337, + "step": 2784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.557, + "grad_norm": 1.2950804233551025, + "kl": 0.2250597244128585, + "learning_rate": 4.1818048724328636e-07, + "loss": 0.009, + "num_tokens": 20277814.0, + "reward": 0.78497314453125, + "reward_std": 0.012090697884559631, + "rewards//mean": 0.78497314453125, + "rewards//std": 0.015984058380126953, + "step": 2785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5572, + "grad_norm": 2.215226650238037, + "kl": 0.2976020984351635, + "learning_rate": 4.1786744870491154e-07, + "loss": 0.0119, + "num_tokens": 20285142.0, + "reward": 0.77947998046875, + "reward_std": 0.01535598374903202, + "rewards//mean": 0.77947998046875, + "rewards//std": 0.02515154518187046, + "step": 2786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5574, + "grad_norm": 1.3898580074310303, + "kl": 0.38083716109395027, + "learning_rate": 4.175544432495184e-07, + "loss": 0.0152, + "num_tokens": 20292422.0, + "reward": 0.86334228515625, + "reward_std": 0.011078935116529465, + "rewards//mean": 0.86334228515625, + "rewards//std": 0.016969842836260796, + "step": 2787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5576, + "grad_norm": 1.5103408098220825, + "kl": 0.3508564792573452, + "learning_rate": 4.1724147100318573e-07, + "loss": 0.014, + "num_tokens": 20299702.0, + "reward": 0.8336181640625, + "reward_std": 0.021226251497864723, + "rewards//mean": 0.8336181640625, + "rewards//std": 0.02831175923347473, + "step": 2788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5578, + "grad_norm": 1.6677861213684082, + "kl": 0.31503043696284294, + "learning_rate": 4.169285320919786e-07, + "loss": 0.0126, + "num_tokens": 20306934.0, + "reward": 0.87091064453125, + "reward_std": 0.018412472680211067, + "rewards//mean": 0.87091064453125, + "rewards//std": 0.026323160156607628, + "step": 2789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.558, + "grad_norm": 1.3668177127838135, + "kl": 0.36333697102963924, + "learning_rate": 4.166156266419489e-07, + "loss": 0.0145, + "num_tokens": 20314182.0, + "reward": 0.8118896484375, + "reward_std": 0.014691433869302273, + "rewards//mean": 0.8118896484375, + "rewards//std": 0.02250133827328682, + "step": 2790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5582, + "grad_norm": 1.5684947967529297, + "kl": 0.3354495204985142, + "learning_rate": 4.1630275477913465e-07, + "loss": 0.0134, + "num_tokens": 20321358.0, + "reward": 0.84259033203125, + "reward_std": 0.013827082701027393, + "rewards//mean": 0.84259033203125, + "rewards//std": 0.022773750126361847, + "step": 2791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5584, + "grad_norm": 1.3521233797073364, + "kl": 0.39145586639642715, + "learning_rate": 4.1598991662956096e-07, + "loss": 0.0157, + "num_tokens": 20328574.0, + "reward": 0.86590576171875, + "reward_std": 0.013701280578970909, + "rewards//mean": 0.86590576171875, + "rewards//std": 0.014633316546678543, + "step": 2792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5586, + "grad_norm": 1.5390161275863647, + "kl": 0.38397680781781673, + "learning_rate": 4.1567711231923876e-07, + "loss": 0.0154, + "num_tokens": 20335830.0, + "reward": 0.80877685546875, + "reward_std": 0.014160681515932083, + "rewards//mean": 0.80877685546875, + "rewards//std": 0.018577009439468384, + "step": 2793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5588, + "grad_norm": 1.9830619096755981, + "kl": 0.3666081614792347, + "learning_rate": 4.1536434197416556e-07, + "loss": 0.0147, + "num_tokens": 20343086.0, + "reward": 0.88055419921875, + "reward_std": 0.018012087792158127, + "rewards//mean": 0.88055419921875, + "rewards//std": 0.021393202245235443, + "step": 2794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.559, + "grad_norm": 1.6048403978347778, + "kl": 0.39951574243605137, + "learning_rate": 4.1505160572032534e-07, + "loss": 0.016, + "num_tokens": 20350342.0, + "reward": 0.86181640625, + "reward_std": 0.010467847809195518, + "rewards//mean": 0.86181640625, + "rewards//std": 0.02172679640352726, + "step": 2795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5592, + "grad_norm": 1.3772729635238647, + "kl": 0.2744261156767607, + "learning_rate": 4.1473890368368805e-07, + "loss": 0.011, + "num_tokens": 20357638.0, + "reward": 0.87371826171875, + "reward_std": 0.01632080227136612, + "rewards//mean": 0.87371826171875, + "rewards//std": 0.020196443423628807, + "step": 2796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5594, + "grad_norm": 1.1827619075775146, + "kl": 0.26487937942147255, + "learning_rate": 4.1442623599021035e-07, + "loss": 0.0106, + "num_tokens": 20365022.0, + "reward": 0.878662109375, + "reward_std": 0.01888996921479702, + "rewards//mean": 0.878662109375, + "rewards//std": 0.02828742191195488, + "step": 2797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5596, + "grad_norm": 1.53508460521698, + "kl": 0.351377310231328, + "learning_rate": 4.141136027658344e-07, + "loss": 0.0141, + "num_tokens": 20372302.0, + "reward": 0.79766845703125, + "reward_std": 0.015303075313568115, + "rewards//mean": 0.79766845703125, + "rewards//std": 0.02747347205877304, + "step": 2798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5598, + "grad_norm": 1.4329859018325806, + "kl": 0.3517420366406441, + "learning_rate": 4.138010041364891e-07, + "loss": 0.0141, + "num_tokens": 20379542.0, + "reward": 0.7823486328125, + "reward_std": 0.014248628169298172, + "rewards//mean": 0.7823486328125, + "rewards//std": 0.01725260354578495, + "step": 2799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.56, + "grad_norm": 1.3489972352981567, + "kl": 0.36160561442375183, + "learning_rate": 4.134884402280889e-07, + "loss": 0.0145, + "num_tokens": 20386806.0, + "reward": 0.8477783203125, + "reward_std": 0.017842307686805725, + "rewards//mean": 0.8477783203125, + "rewards//std": 0.026883427053689957, + "step": 2800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5602, + "grad_norm": 1.4205838441848755, + "kl": 0.32463364861905575, + "learning_rate": 4.131759111665348e-07, + "loss": 0.013, + "num_tokens": 20394078.0, + "reward": 0.88165283203125, + "reward_std": 0.014882158488035202, + "rewards//mean": 0.88165283203125, + "rewards//std": 0.0328717902302742, + "step": 2801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.5604, + "grad_norm": 1.4347597360610962, + "kl": 0.31151482462882996, + "learning_rate": 4.128634170777132e-07, + "loss": 0.012, + "num_tokens": 20401396.0, + "reward": 0.83575439453125, + "reward_std": 0.01663202978670597, + "rewards//mean": 0.83575439453125, + "rewards//std": 0.01818166859447956, + "step": 2802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5606, + "grad_norm": 1.446862816810608, + "kl": 0.34031396731734276, + "learning_rate": 4.1255095808749687e-07, + "loss": 0.0136, + "num_tokens": 20408636.0, + "reward": 0.84796142578125, + "reward_std": 0.012229925952851772, + "rewards//mean": 0.84796142578125, + "rewards//std": 0.01808650605380535, + "step": 2803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5608, + "grad_norm": 1.7699978351593018, + "kl": 0.32277972623705864, + "learning_rate": 4.12238534321744e-07, + "loss": 0.0129, + "num_tokens": 20416028.0, + "reward": 0.86676025390625, + "reward_std": 0.017418470233678818, + "rewards//mean": 0.86676025390625, + "rewards//std": 0.03485609218478203, + "step": 2804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.561, + "grad_norm": 1.4007513523101807, + "kl": 0.29013893380761147, + "learning_rate": 4.1192614590629916e-07, + "loss": 0.0116, + "num_tokens": 20423324.0, + "reward": 0.82379150390625, + "reward_std": 0.011977458372712135, + "rewards//mean": 0.82379150390625, + "rewards//std": 0.024020498618483543, + "step": 2805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5612, + "grad_norm": 2.0372164249420166, + "kl": 0.34981747902929783, + "learning_rate": 4.1161379296699204e-07, + "loss": 0.014, + "num_tokens": 20430588.0, + "reward": 0.87567138671875, + "reward_std": 0.01878545805811882, + "rewards//mean": 0.87567138671875, + "rewards//std": 0.027739960700273514, + "step": 2806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5614, + "grad_norm": 1.566158413887024, + "kl": 0.3279149681329727, + "learning_rate": 4.113014756296388e-07, + "loss": 0.0131, + "num_tokens": 20437820.0, + "reward": 0.86065673828125, + "reward_std": 0.018016736954450607, + "rewards//mean": 0.86065673828125, + "rewards//std": 0.021877378225326538, + "step": 2807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5616, + "grad_norm": 1.6627511978149414, + "kl": 0.3093036934733391, + "learning_rate": 4.1098919402004037e-07, + "loss": 0.0124, + "num_tokens": 20445076.0, + "reward": 0.877197265625, + "reward_std": 0.01730480045080185, + "rewards//mean": 0.877197265625, + "rewards//std": 0.023155493661761284, + "step": 2808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5618, + "grad_norm": 1.7826014757156372, + "kl": 0.343082208186388, + "learning_rate": 4.1067694826398403e-07, + "loss": 0.0137, + "num_tokens": 20452316.0, + "reward": 0.88848876953125, + "reward_std": 0.013890914618968964, + "rewards//mean": 0.88848876953125, + "rewards//std": 0.025628510862588882, + "step": 2809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.562, + "grad_norm": 1.473008155822754, + "kl": 0.3383033014833927, + "learning_rate": 4.1036473848724227e-07, + "loss": 0.0135, + "num_tokens": 20459636.0, + "reward": 0.77117919921875, + "reward_std": 0.013959741219878197, + "rewards//mean": 0.77117919921875, + "rewards//std": 0.019398165866732597, + "step": 2810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5622, + "grad_norm": 1.5157546997070312, + "kl": 0.3324814010411501, + "learning_rate": 4.1005256481557306e-07, + "loss": 0.0133, + "num_tokens": 20466972.0, + "reward": 0.844970703125, + "reward_std": 0.011129520833492279, + "rewards//mean": 0.844970703125, + "rewards//std": 0.01409063208848238, + "step": 2811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5624, + "grad_norm": 1.6646212339401245, + "kl": 0.3928990550339222, + "learning_rate": 4.0974042737472005e-07, + "loss": 0.0157, + "num_tokens": 20474308.0, + "reward": 0.85968017578125, + "reward_std": 0.024531036615371704, + "rewards//mean": 0.85968017578125, + "rewards//std": 0.039033375680446625, + "step": 2812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5626, + "grad_norm": 1.3289926052093506, + "kl": 0.34570737928152084, + "learning_rate": 4.0942832629041197e-07, + "loss": 0.0138, + "num_tokens": 20481516.0, + "reward": 0.86285400390625, + "reward_std": 0.014953899197280407, + "rewards//mean": 0.86285400390625, + "rewards//std": 0.02283150516450405, + "step": 2813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5628, + "grad_norm": 1.252249002456665, + "kl": 0.38286472484469414, + "learning_rate": 4.0911626168836334e-07, + "loss": 0.0153, + "num_tokens": 20488892.0, + "reward": 0.8428955078125, + "reward_std": 0.012599430046975613, + "rewards//mean": 0.8428955078125, + "rewards//std": 0.016912322491407394, + "step": 2814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.563, + "grad_norm": 1.6350476741790771, + "kl": 0.29029135406017303, + "learning_rate": 4.0880423369427353e-07, + "loss": 0.0116, + "num_tokens": 20496212.0, + "reward": 0.8448486328125, + "reward_std": 0.012375125661492348, + "rewards//mean": 0.8448486328125, + "rewards//std": 0.01953086256980896, + "step": 2815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5632, + "grad_norm": 1.8755707740783691, + "kl": 0.37867461517453194, + "learning_rate": 4.084922424338276e-07, + "loss": 0.0151, + "num_tokens": 20503420.0, + "reward": 0.85528564453125, + "reward_std": 0.016553450375795364, + "rewards//mean": 0.85528564453125, + "rewards//std": 0.021546903997659683, + "step": 2816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5634, + "grad_norm": 2.6315250396728516, + "kl": 0.5275305323302746, + "learning_rate": 4.0818028803269545e-07, + "loss": 0.0211, + "num_tokens": 20510772.0, + "reward": 0.81072998046875, + "reward_std": 0.00918223150074482, + "rewards//mean": 0.81072998046875, + "rewards//std": 0.014143042266368866, + "step": 2817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5636, + "grad_norm": 1.566871166229248, + "kl": 0.367403369396925, + "learning_rate": 4.078683706165323e-07, + "loss": 0.0147, + "num_tokens": 20518100.0, + "reward": 0.853759765625, + "reward_std": 0.012064030393958092, + "rewards//mean": 0.853759765625, + "rewards//std": 0.03316663205623627, + "step": 2818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5638, + "grad_norm": 1.7556754350662231, + "kl": 0.30601877719163895, + "learning_rate": 4.075564903109784e-07, + "loss": 0.0122, + "num_tokens": 20525612.0, + "reward": 0.85687255859375, + "reward_std": 0.013694776222109795, + "rewards//mean": 0.85687255859375, + "rewards//std": 0.02198023349046707, + "step": 2819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.564, + "grad_norm": 1.4032723903656006, + "kl": 0.32547534443438053, + "learning_rate": 4.072446472416592e-07, + "loss": 0.013, + "num_tokens": 20532884.0, + "reward": 0.83984375, + "reward_std": 0.012541882693767548, + "rewards//mean": 0.83984375, + "rewards//std": 0.01605316251516342, + "step": 2820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5642, + "grad_norm": 1.9697409868240356, + "kl": 0.3883112035691738, + "learning_rate": 4.0693284153418497e-07, + "loss": 0.0155, + "num_tokens": 20540204.0, + "reward": 0.85162353515625, + "reward_std": 0.013769319280982018, + "rewards//mean": 0.85162353515625, + "rewards//std": 0.018182501196861267, + "step": 2821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.5644, + "grad_norm": 1.4517942667007446, + "kl": 0.330961175262928, + "learning_rate": 4.0662107331415107e-07, + "loss": 0.0122, + "num_tokens": 20547443.0, + "reward": 0.80902099609375, + "reward_std": 0.010593228042125702, + "rewards//mean": 0.80902099609375, + "rewards//std": 0.01809738203883171, + "step": 2822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5646, + "grad_norm": 1.9392869472503662, + "kl": 0.3429803606122732, + "learning_rate": 4.0630934270713755e-07, + "loss": 0.0137, + "num_tokens": 20554675.0, + "reward": 0.8509521484375, + "reward_std": 0.02042984962463379, + "rewards//mean": 0.8509521484375, + "rewards//std": 0.027011506259441376, + "step": 2823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.5648, + "grad_norm": 1.546613097190857, + "kl": 0.3475144747644663, + "learning_rate": 4.0599764983870974e-07, + "loss": 0.0161, + "num_tokens": 20561883.0, + "reward": 0.82977294921875, + "reward_std": 0.01583215408027172, + "rewards//mean": 0.82977294921875, + "rewards//std": 0.023091912269592285, + "step": 2824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.565, + "grad_norm": 1.629164695739746, + "kl": 0.28161269426345825, + "learning_rate": 4.0568599483441745e-07, + "loss": 0.0113, + "num_tokens": 20569179.0, + "reward": 0.87152099609375, + "reward_std": 0.020219434052705765, + "rewards//mean": 0.87152099609375, + "rewards//std": 0.03009810671210289, + "step": 2825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5652, + "grad_norm": 1.4630507230758667, + "kl": 0.3141976594924927, + "learning_rate": 4.0537437781979505e-07, + "loss": 0.0126, + "num_tokens": 20576451.0, + "reward": 0.8531494140625, + "reward_std": 0.012983300723135471, + "rewards//mean": 0.8531494140625, + "rewards//std": 0.015942927449941635, + "step": 2826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5654, + "grad_norm": 3.415698528289795, + "kl": 0.6482616662979126, + "learning_rate": 4.0506279892036185e-07, + "loss": 0.0259, + "num_tokens": 20583723.0, + "reward": 0.85858154296875, + "reward_std": 0.018932124599814415, + "rewards//mean": 0.85858154296875, + "rewards//std": 0.02560250833630562, + "step": 2827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5656, + "grad_norm": 1.5941650867462158, + "kl": 0.2847920563071966, + "learning_rate": 4.0475125826162193e-07, + "loss": 0.0114, + "num_tokens": 20591027.0, + "reward": 0.822265625, + "reward_std": 0.011682480573654175, + "rewards//mean": 0.822265625, + "rewards//std": 0.01980830356478691, + "step": 2828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.5658, + "grad_norm": 1.5102460384368896, + "kl": 0.363676480948925, + "learning_rate": 4.0443975596906376e-07, + "loss": 0.0128, + "num_tokens": 20598345.0, + "reward": 0.86346435546875, + "reward_std": 0.01937355101108551, + "rewards//mean": 0.86346435546875, + "rewards//std": 0.03745728358626366, + "step": 2829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.566, + "grad_norm": 1.7639837265014648, + "kl": 0.46550631150603294, + "learning_rate": 4.041282921681605e-07, + "loss": 0.0186, + "num_tokens": 20605641.0, + "reward": 0.826904296875, + "reward_std": 0.012748455628752708, + "rewards//mean": 0.826904296875, + "rewards//std": 0.018438884988427162, + "step": 2830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5662, + "grad_norm": 1.7467375993728638, + "kl": 0.3022360373288393, + "learning_rate": 4.038168669843697e-07, + "loss": 0.0121, + "num_tokens": 20612889.0, + "reward": 0.7822265625, + "reward_std": 0.012990603223443031, + "rewards//mean": 0.7822265625, + "rewards//std": 0.01661660149693489, + "step": 2831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5664, + "grad_norm": 1.6225764751434326, + "kl": 0.4199977181851864, + "learning_rate": 4.0350548054313336e-07, + "loss": 0.0168, + "num_tokens": 20620185.0, + "reward": 0.8035888671875, + "reward_std": 0.011245017871260643, + "rewards//mean": 0.8035888671875, + "rewards//std": 0.022696930915117264, + "step": 2832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5666, + "grad_norm": 1.8331947326660156, + "kl": 0.30884215235710144, + "learning_rate": 4.031941329698778e-07, + "loss": 0.0124, + "num_tokens": 20627441.0, + "reward": 0.8477783203125, + "reward_std": 0.01641494780778885, + "rewards//mean": 0.8477783203125, + "rewards//std": 0.02403916046023369, + "step": 2833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5668, + "grad_norm": 1.3026154041290283, + "kl": 0.32174983248114586, + "learning_rate": 4.028828243900141e-07, + "loss": 0.0129, + "num_tokens": 20634801.0, + "reward": 0.8428955078125, + "reward_std": 0.012283504009246826, + "rewards//mean": 0.8428955078125, + "rewards//std": 0.015939129516482353, + "step": 2834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.567, + "grad_norm": 1.6999746561050415, + "kl": 0.3238196559250355, + "learning_rate": 4.02571554928937e-07, + "loss": 0.013, + "num_tokens": 20642057.0, + "reward": 0.8631591796875, + "reward_std": 0.016583500429987907, + "rewards//mean": 0.8631591796875, + "rewards//std": 0.031442925333976746, + "step": 2835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5672, + "grad_norm": 2.054150342941284, + "kl": 0.416274506598711, + "learning_rate": 4.0226032471202597e-07, + "loss": 0.0167, + "num_tokens": 20649385.0, + "reward": 0.81182861328125, + "reward_std": 0.016186248511075974, + "rewards//mean": 0.81182861328125, + "rewards//std": 0.02182750217616558, + "step": 2836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5674, + "grad_norm": 1.3579628467559814, + "kl": 0.37036143243312836, + "learning_rate": 4.019491338646444e-07, + "loss": 0.0148, + "num_tokens": 20656785.0, + "reward": 0.8404541015625, + "reward_std": 0.014748409390449524, + "rewards//mean": 0.8404541015625, + "rewards//std": 0.020766014233231544, + "step": 2837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5676, + "grad_norm": 1.514999270439148, + "kl": 0.34981758892536163, + "learning_rate": 4.016379825121401e-07, + "loss": 0.014, + "num_tokens": 20664009.0, + "reward": 0.801513671875, + "reward_std": 0.01607712171971798, + "rewards//mean": 0.801513671875, + "rewards//std": 0.027646558359265327, + "step": 2838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5678, + "grad_norm": 1.5553749799728394, + "kl": 0.37025028839707375, + "learning_rate": 4.013268707798447e-07, + "loss": 0.0148, + "num_tokens": 20671225.0, + "reward": 0.80084228515625, + "reward_std": 0.014976250939071178, + "rewards//mean": 0.80084228515625, + "rewards//std": 0.03082464262843132, + "step": 2839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.568, + "grad_norm": 1.6633905172348022, + "kl": 0.3298432230949402, + "learning_rate": 4.010157987930738e-07, + "loss": 0.0132, + "num_tokens": 20678481.0, + "reward": 0.851318359375, + "reward_std": 0.014021502807736397, + "rewards//mean": 0.851318359375, + "rewards//std": 0.017272334545850754, + "step": 2840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5682, + "grad_norm": 1.4739923477172852, + "kl": 0.3877205513417721, + "learning_rate": 4.0070476667712736e-07, + "loss": 0.0155, + "num_tokens": 20685737.0, + "reward": 0.826171875, + "reward_std": 0.01195044070482254, + "rewards//mean": 0.826171875, + "rewards//std": 0.0170624740421772, + "step": 2841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5684, + "grad_norm": 1.5083255767822266, + "kl": 0.3829157743602991, + "learning_rate": 4.00393774557289e-07, + "loss": 0.0153, + "num_tokens": 20692921.0, + "reward": 0.78094482421875, + "reward_std": 0.013124346733093262, + "rewards//mean": 0.78094482421875, + "rewards//std": 0.017680199816823006, + "step": 2842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5686, + "grad_norm": 1.8624745607376099, + "kl": 0.3602810427546501, + "learning_rate": 4.000828225588264e-07, + "loss": 0.0144, + "num_tokens": 20700233.0, + "reward": 0.85186767578125, + "reward_std": 0.019822105765342712, + "rewards//mean": 0.85186767578125, + "rewards//std": 0.02336820960044861, + "step": 2843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5688, + "grad_norm": 1.2839207649230957, + "kl": 0.3662825934588909, + "learning_rate": 3.9977191080699087e-07, + "loss": 0.0147, + "num_tokens": 20707473.0, + "reward": 0.869384765625, + "reward_std": 0.014685485512018204, + "rewards//mean": 0.869384765625, + "rewards//std": 0.022987527772784233, + "step": 2844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.569, + "grad_norm": 1.5736945867538452, + "kl": 0.34276210702955723, + "learning_rate": 3.9946103942701775e-07, + "loss": 0.0137, + "num_tokens": 20714745.0, + "reward": 0.8209228515625, + "reward_std": 0.01815829426050186, + "rewards//mean": 0.8209228515625, + "rewards//std": 0.024756405502557755, + "step": 2845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5692, + "grad_norm": 1.6121872663497925, + "kl": 0.37798054702579975, + "learning_rate": 3.9915020854412585e-07, + "loss": 0.0151, + "num_tokens": 20722001.0, + "reward": 0.85943603515625, + "reward_std": 0.016528967767953873, + "rewards//mean": 0.85943603515625, + "rewards//std": 0.020511042326688766, + "step": 2846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5694, + "grad_norm": 2.4130430221557617, + "kl": 0.5073033757507801, + "learning_rate": 3.9883941828351796e-07, + "loss": 0.0203, + "num_tokens": 20729265.0, + "reward": 0.84906005859375, + "reward_std": 0.013025444000959396, + "rewards//mean": 0.84906005859375, + "rewards//std": 0.016839103773236275, + "step": 2847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5696, + "grad_norm": 1.7820531129837036, + "kl": 0.3296133503317833, + "learning_rate": 3.9852866877038017e-07, + "loss": 0.0132, + "num_tokens": 20736489.0, + "reward": 0.84088134765625, + "reward_std": 0.018777986988425255, + "rewards//mean": 0.84088134765625, + "rewards//std": 0.033827271312475204, + "step": 2848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5698, + "grad_norm": 1.569543719291687, + "kl": 0.29168411903083324, + "learning_rate": 3.9821796012988264e-07, + "loss": 0.0117, + "num_tokens": 20743777.0, + "reward": 0.86395263671875, + "reward_std": 0.01635039411485195, + "rewards//mean": 0.86395263671875, + "rewards//std": 0.020594270899891853, + "step": 2849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.57, + "grad_norm": 1.5350381135940552, + "kl": 0.3173026815056801, + "learning_rate": 3.9790729248717843e-07, + "loss": 0.0126, + "num_tokens": 20751037.0, + "reward": 0.843505859375, + "reward_std": 0.01787247508764267, + "rewards//mean": 0.843505859375, + "rewards//std": 0.030748983845114708, + "step": 2850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.5702, + "grad_norm": 1.6817926168441772, + "kl": 0.37751685455441475, + "learning_rate": 3.9759666596740473e-07, + "loss": 0.0155, + "num_tokens": 20758276.0, + "reward": 0.85223388671875, + "reward_std": 0.01357355434447527, + "rewards//mean": 0.85223388671875, + "rewards//std": 0.020594270899891853, + "step": 2851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5704, + "grad_norm": 1.4915727376937866, + "kl": 0.30867163464426994, + "learning_rate": 3.972860806956816e-07, + "loss": 0.0123, + "num_tokens": 20765516.0, + "reward": 0.8404541015625, + "reward_std": 0.012216243892908096, + "rewards//mean": 0.8404541015625, + "rewards//std": 0.01702294871211052, + "step": 2852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.5706, + "grad_norm": 1.5448062419891357, + "kl": 0.2844315655529499, + "learning_rate": 3.9697553679711307e-07, + "loss": 0.0127, + "num_tokens": 20772816.0, + "reward": 0.85040283203125, + "reward_std": 0.012564954347908497, + "rewards//mean": 0.85040283203125, + "rewards//std": 0.019081881269812584, + "step": 2853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5708, + "grad_norm": 1.837639331817627, + "kl": 0.49733118899166584, + "learning_rate": 3.9666503439678576e-07, + "loss": 0.0199, + "num_tokens": 20780160.0, + "reward": 0.857421875, + "reward_std": 0.015667008236050606, + "rewards//mean": 0.857421875, + "rewards//std": 0.017288103699684143, + "step": 2854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.571, + "grad_norm": 1.995194911956787, + "kl": 0.3536922913044691, + "learning_rate": 3.9635457361977045e-07, + "loss": 0.0141, + "num_tokens": 20787360.0, + "reward": 0.86029052734375, + "reward_std": 0.01671062968671322, + "rewards//mean": 0.86029052734375, + "rewards//std": 0.025936750695109367, + "step": 2855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5712, + "grad_norm": 1.4854780435562134, + "kl": 0.3596052713692188, + "learning_rate": 3.960441545911204e-07, + "loss": 0.0144, + "num_tokens": 20794648.0, + "reward": 0.83544921875, + "reward_std": 0.011721225455403328, + "rewards//mean": 0.83544921875, + "rewards//std": 0.014118536375463009, + "step": 2856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5714, + "grad_norm": 2.0939042568206787, + "kl": 0.3317020982503891, + "learning_rate": 3.9573377743587246e-07, + "loss": 0.0133, + "num_tokens": 20802008.0, + "reward": 0.89544677734375, + "reward_std": 0.018353993073105812, + "rewards//mean": 0.89544677734375, + "rewards//std": 0.022975590080022812, + "step": 2857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5716, + "grad_norm": 1.4843899011611938, + "kl": 0.38137348368763924, + "learning_rate": 3.954234422790465e-07, + "loss": 0.0153, + "num_tokens": 20809344.0, + "reward": 0.81378173828125, + "reward_std": 0.01654823310673237, + "rewards//mean": 0.81378173828125, + "rewards//std": 0.027010034769773483, + "step": 2858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5718, + "grad_norm": 1.5401244163513184, + "kl": 0.3450390100479126, + "learning_rate": 3.951131492456454e-07, + "loss": 0.0138, + "num_tokens": 20816824.0, + "reward": 0.881591796875, + "reward_std": 0.017643343657255173, + "rewards//mean": 0.881591796875, + "rewards//std": 0.021596815437078476, + "step": 2859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.572, + "grad_norm": 1.4022468328475952, + "kl": 0.36840004473924637, + "learning_rate": 3.948028984606554e-07, + "loss": 0.0147, + "num_tokens": 20824072.0, + "reward": 0.84356689453125, + "reward_std": 0.010434623807668686, + "rewards//mean": 0.84356689453125, + "rewards//std": 0.018047964200377464, + "step": 2860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5722, + "grad_norm": 2.1388063430786133, + "kl": 0.4299214631319046, + "learning_rate": 3.9449269004904516e-07, + "loss": 0.0172, + "num_tokens": 20831360.0, + "reward": 0.884765625, + "reward_std": 0.019303858280181885, + "rewards//mean": 0.884765625, + "rewards//std": 0.032466404139995575, + "step": 2861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.5724, + "grad_norm": 1.700238823890686, + "kl": 0.3352585230022669, + "learning_rate": 3.941825241357669e-07, + "loss": 0.0152, + "num_tokens": 20838603.0, + "reward": 0.821044921875, + "reward_std": 0.012723580002784729, + "rewards//mean": 0.821044921875, + "rewards//std": 0.01922345533967018, + "step": 2862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.5726, + "grad_norm": 1.3640984296798706, + "kl": 0.32723918184638023, + "learning_rate": 3.9387240084575514e-07, + "loss": 0.0142, + "num_tokens": 20845944.0, + "reward": 0.856689453125, + "reward_std": 0.01813451200723648, + "rewards//mean": 0.856689453125, + "rewards//std": 0.02429877407848835, + "step": 2863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.5728, + "grad_norm": 1.4968308210372925, + "kl": 0.3087011035531759, + "learning_rate": 3.935623203039277e-07, + "loss": 0.0128, + "num_tokens": 20853222.0, + "reward": 0.86474609375, + "reward_std": 0.022357050329446793, + "rewards//mean": 0.86474609375, + "rewards//std": 0.02711247280240059, + "step": 2864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.573, + "grad_norm": 1.6382538080215454, + "kl": 0.3609756529331207, + "learning_rate": 3.9325228263518484e-07, + "loss": 0.0143, + "num_tokens": 20860525.0, + "reward": 0.86834716796875, + "reward_std": 0.016185611486434937, + "rewards//mean": 0.86834716796875, + "rewards//std": 0.019283896312117577, + "step": 2865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5732, + "grad_norm": 1.7651410102844238, + "kl": 0.34922398813068867, + "learning_rate": 3.9294228796440986e-07, + "loss": 0.014, + "num_tokens": 20867757.0, + "reward": 0.773193359375, + "reward_std": 0.015338817611336708, + "rewards//mean": 0.773193359375, + "rewards//std": 0.0202182624489069, + "step": 2866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5734, + "grad_norm": 1.3254914283752441, + "kl": 0.275520995259285, + "learning_rate": 3.9263233641646836e-07, + "loss": 0.011, + "num_tokens": 20874997.0, + "reward": 0.8494873046875, + "reward_std": 0.013493809849023819, + "rewards//mean": 0.8494873046875, + "rewards//std": 0.016247650608420372, + "step": 2867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5736, + "grad_norm": 1.5371038913726807, + "kl": 0.3219088949263096, + "learning_rate": 3.923224281162091e-07, + "loss": 0.0129, + "num_tokens": 20882261.0, + "reward": 0.85455322265625, + "reward_std": 0.016029544174671173, + "rewards//mean": 0.85455322265625, + "rewards//std": 0.02102707326412201, + "step": 2868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5738, + "grad_norm": 1.2816070318222046, + "kl": 0.32157453149557114, + "learning_rate": 3.920125631884627e-07, + "loss": 0.0129, + "num_tokens": 20889509.0, + "reward": 0.87298583984375, + "reward_std": 0.016658175736665726, + "rewards//mean": 0.87298583984375, + "rewards//std": 0.026374291628599167, + "step": 2869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.574, + "grad_norm": 1.5030608177185059, + "kl": 0.3073342088609934, + "learning_rate": 3.917027417580431e-07, + "loss": 0.0123, + "num_tokens": 20896869.0, + "reward": 0.85906982421875, + "reward_std": 0.014194365590810776, + "rewards//mean": 0.85906982421875, + "rewards//std": 0.019860869273543358, + "step": 2870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5742, + "grad_norm": 1.5507841110229492, + "kl": 0.3809580132365227, + "learning_rate": 3.913929639497462e-07, + "loss": 0.0152, + "num_tokens": 20904093.0, + "reward": 0.8614501953125, + "reward_std": 0.01433117501437664, + "rewards//mean": 0.8614501953125, + "rewards//std": 0.018983755260705948, + "step": 2871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5744, + "grad_norm": 1.5048463344573975, + "kl": 0.3269166089594364, + "learning_rate": 3.910832298883503e-07, + "loss": 0.0131, + "num_tokens": 20911325.0, + "reward": 0.8721923828125, + "reward_std": 0.016879864037036896, + "rewards//mean": 0.8721923828125, + "rewards//std": 0.02308838814496994, + "step": 2872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5746, + "grad_norm": 1.6518902778625488, + "kl": 0.3442311566323042, + "learning_rate": 3.907735396986165e-07, + "loss": 0.0138, + "num_tokens": 20918661.0, + "reward": 0.85174560546875, + "reward_std": 0.016138087958097458, + "rewards//mean": 0.85174560546875, + "rewards//std": 0.018887322396039963, + "step": 2873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.5748, + "grad_norm": 4.26786994934082, + "kl": 0.4847493879497051, + "learning_rate": 3.904638935052876e-07, + "loss": 0.0196, + "num_tokens": 20925975.0, + "reward": 0.83258056640625, + "reward_std": 0.015257328748703003, + "rewards//mean": 0.83258056640625, + "rewards//std": 0.020550856366753578, + "step": 2874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.575, + "grad_norm": 1.330775260925293, + "kl": 0.3518476076424122, + "learning_rate": 3.9015429143308957e-07, + "loss": 0.0141, + "num_tokens": 20933279.0, + "reward": 0.87445068359375, + "reward_std": 0.013755006715655327, + "rewards//mean": 0.87445068359375, + "rewards//std": 0.01838616468012333, + "step": 2875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5752, + "grad_norm": 2.3287582397460938, + "kl": 0.34044358506798744, + "learning_rate": 3.8984473360672967e-07, + "loss": 0.0136, + "num_tokens": 20940639.0, + "reward": 0.8646240234375, + "reward_std": 0.02130088582634926, + "rewards//mean": 0.8646240234375, + "rewards//std": 0.03088529407978058, + "step": 2876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5754, + "grad_norm": 1.3543697595596313, + "kl": 0.32948085479438305, + "learning_rate": 3.89535220150898e-07, + "loss": 0.0132, + "num_tokens": 20947903.0, + "reward": 0.86981201171875, + "reward_std": 0.012571027502417564, + "rewards//mean": 0.86981201171875, + "rewards//std": 0.01899760402739048, + "step": 2877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5756, + "grad_norm": 1.7967994213104248, + "kl": 0.4750402234494686, + "learning_rate": 3.8922575119026635e-07, + "loss": 0.019, + "num_tokens": 20955199.0, + "reward": 0.82574462890625, + "reward_std": 0.017112944275140762, + "rewards//mean": 0.82574462890625, + "rewards//std": 0.020334633067250252, + "step": 2878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5758, + "grad_norm": 1.4433784484863281, + "kl": 0.3202957659959793, + "learning_rate": 3.8891632684948895e-07, + "loss": 0.0128, + "num_tokens": 20962463.0, + "reward": 0.7720947265625, + "reward_std": 0.016974229365587234, + "rewards//mean": 0.7720947265625, + "rewards//std": 0.023190442472696304, + "step": 2879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.576, + "grad_norm": 1.6899242401123047, + "kl": 0.351572223007679, + "learning_rate": 3.886069472532017e-07, + "loss": 0.0141, + "num_tokens": 20969727.0, + "reward": 0.83734130859375, + "reward_std": 0.020752161741256714, + "rewards//mean": 0.83734130859375, + "rewards//std": 0.026896024122834206, + "step": 2880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5762, + "grad_norm": 1.7071269750595093, + "kl": 0.3570438250899315, + "learning_rate": 3.882976125260229e-07, + "loss": 0.0143, + "num_tokens": 20976999.0, + "reward": 0.84991455078125, + "reward_std": 0.013385903090238571, + "rewards//mean": 0.84991455078125, + "rewards//std": 0.01821327768266201, + "step": 2881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.5764, + "grad_norm": 1.9000529050827026, + "kl": 0.3521097656339407, + "learning_rate": 3.879883227925523e-07, + "loss": 0.0138, + "num_tokens": 20984296.0, + "reward": 0.85076904296875, + "reward_std": 0.01717042364180088, + "rewards//mean": 0.85076904296875, + "rewards//std": 0.018586784601211548, + "step": 2882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5766, + "grad_norm": 1.3157691955566406, + "kl": 0.32391694374382496, + "learning_rate": 3.87679078177372e-07, + "loss": 0.013, + "num_tokens": 20991600.0, + "reward": 0.82122802734375, + "reward_std": 0.012087035924196243, + "rewards//mean": 0.82122802734375, + "rewards//std": 0.01823902502655983, + "step": 2883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.5768, + "grad_norm": 1.4436371326446533, + "kl": 0.29010348953306675, + "learning_rate": 3.8736987880504546e-07, + "loss": 0.0088, + "num_tokens": 20998947.0, + "reward": 0.8642578125, + "reward_std": 0.017912402749061584, + "rewards//mean": 0.8642578125, + "rewards//std": 0.021144898608326912, + "step": 2884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.577, + "grad_norm": 1.566665768623352, + "kl": 0.3598574995994568, + "learning_rate": 3.870607248001184e-07, + "loss": 0.0144, + "num_tokens": 21006315.0, + "reward": 0.83343505859375, + "reward_std": 0.01424839161336422, + "rewards//mean": 0.83343505859375, + "rewards//std": 0.02458113431930542, + "step": 2885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5772, + "grad_norm": 1.5148273706436157, + "kl": 0.2936252951622009, + "learning_rate": 3.8675161628711773e-07, + "loss": 0.0117, + "num_tokens": 21013571.0, + "reward": 0.85101318359375, + "reward_std": 0.015438312664628029, + "rewards//mean": 0.85101318359375, + "rewards//std": 0.022529829293489456, + "step": 2886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5774, + "grad_norm": 2.175304651260376, + "kl": 0.40180307626724243, + "learning_rate": 3.8644255339055266e-07, + "loss": 0.0161, + "num_tokens": 21020827.0, + "reward": 0.81402587890625, + "reward_std": 0.01270057912915945, + "rewards//mean": 0.81402587890625, + "rewards//std": 0.01598689891397953, + "step": 2887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5776, + "grad_norm": 1.6606175899505615, + "kl": 0.3080577477812767, + "learning_rate": 3.861335362349134e-07, + "loss": 0.0123, + "num_tokens": 21028059.0, + "reward": 0.871337890625, + "reward_std": 0.013755547814071178, + "rewards//mean": 0.871337890625, + "rewards//std": 0.01768801175057888, + "step": 2888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5778, + "grad_norm": 2.1492481231689453, + "kl": 0.35315701365470886, + "learning_rate": 3.8582456494467206e-07, + "loss": 0.0141, + "num_tokens": 21035435.0, + "reward": 0.84466552734375, + "reward_std": 0.017820367589592934, + "rewards//mean": 0.84466552734375, + "rewards//std": 0.02201739139854908, + "step": 2889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.578, + "grad_norm": 1.4003452062606812, + "kl": 0.30910225957632065, + "learning_rate": 3.8551563964428247e-07, + "loss": 0.0124, + "num_tokens": 21042627.0, + "reward": 0.85003662109375, + "reward_std": 0.01736542209982872, + "rewards//mean": 0.85003662109375, + "rewards//std": 0.023964976891875267, + "step": 2890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5782, + "grad_norm": 1.4902961254119873, + "kl": 0.366636348888278, + "learning_rate": 3.852067604581794e-07, + "loss": 0.0147, + "num_tokens": 21049851.0, + "reward": 0.86810302734375, + "reward_std": 0.016958557069301605, + "rewards//mean": 0.86810302734375, + "rewards//std": 0.023268885910511017, + "step": 2891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5784, + "grad_norm": 1.450570821762085, + "kl": 0.3268718458712101, + "learning_rate": 3.848979275107796e-07, + "loss": 0.0131, + "num_tokens": 21057227.0, + "reward": 0.8531494140625, + "reward_std": 0.015100855380296707, + "rewards//mean": 0.8531494140625, + "rewards//std": 0.03114689514040947, + "step": 2892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5786, + "grad_norm": 1.651654839515686, + "kl": 0.428241241723299, + "learning_rate": 3.845891409264807e-07, + "loss": 0.0171, + "num_tokens": 21064483.0, + "reward": 0.8590087890625, + "reward_std": 0.01977067068219185, + "rewards//mean": 0.8590087890625, + "rewards//std": 0.029795637354254723, + "step": 2893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5788, + "grad_norm": 1.6949905157089233, + "kl": 0.3528018482029438, + "learning_rate": 3.8428040082966217e-07, + "loss": 0.0141, + "num_tokens": 21071739.0, + "reward": 0.8408203125, + "reward_std": 0.015214340761303902, + "rewards//mean": 0.8408203125, + "rewards//std": 0.021676573902368546, + "step": 2894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.579, + "grad_norm": 1.4699780941009521, + "kl": 0.35454971343278885, + "learning_rate": 3.839717073446842e-07, + "loss": 0.0142, + "num_tokens": 21078971.0, + "reward": 0.8665771484375, + "reward_std": 0.0184504222124815, + "rewards//mean": 0.8665771484375, + "rewards//std": 0.026062294840812683, + "step": 2895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.5792, + "grad_norm": 1.671282172203064, + "kl": 0.3236817866563797, + "learning_rate": 3.8366306059588876e-07, + "loss": 0.0176, + "num_tokens": 21086147.0, + "reward": 0.87115478515625, + "reward_std": 0.019631164148449898, + "rewards//mean": 0.87115478515625, + "rewards//std": 0.025491703301668167, + "step": 2896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.5794, + "grad_norm": 1.653166651725769, + "kl": 0.35496271401643753, + "learning_rate": 3.8335446070759855e-07, + "loss": 0.0048, + "num_tokens": 21093553.0, + "reward": 0.85772705078125, + "reward_std": 0.017272384837269783, + "rewards//mean": 0.85772705078125, + "rewards//std": 0.0265920702368021, + "step": 2897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5796, + "grad_norm": 3.5541982650756836, + "kl": 0.5978864282369614, + "learning_rate": 3.8304590780411766e-07, + "loss": 0.0239, + "num_tokens": 21100873.0, + "reward": 0.8555908203125, + "reward_std": 0.015555327758193016, + "rewards//mean": 0.8555908203125, + "rewards//std": 0.026346512138843536, + "step": 2898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5798, + "grad_norm": 1.8323098421096802, + "kl": 0.37283468060195446, + "learning_rate": 3.8273740200973103e-07, + "loss": 0.0149, + "num_tokens": 21108049.0, + "reward": 0.8555908203125, + "reward_std": 0.016757164150476456, + "rewards//mean": 0.8555908203125, + "rewards//std": 0.026529734954237938, + "step": 2899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.58, + "grad_norm": 1.5213063955307007, + "kl": 0.2822359539568424, + "learning_rate": 3.8242894344870495e-07, + "loss": 0.0113, + "num_tokens": 21115345.0, + "reward": 0.859130859375, + "reward_std": 0.012352509424090385, + "rewards//mean": 0.859130859375, + "rewards//std": 0.01665845327079296, + "step": 2900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5802, + "grad_norm": 1.7228693962097168, + "kl": 0.3337738774716854, + "learning_rate": 3.821205322452863e-07, + "loss": 0.0134, + "num_tokens": 21122609.0, + "reward": 0.82684326171875, + "reward_std": 0.015171612612903118, + "rewards//mean": 0.82684326171875, + "rewards//std": 0.021214138716459274, + "step": 2901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5804, + "grad_norm": 1.675292730331421, + "kl": 0.37006600201129913, + "learning_rate": 3.8181216852370324e-07, + "loss": 0.0148, + "num_tokens": 21129825.0, + "reward": 0.87109375, + "reward_std": 0.018396252766251564, + "rewards//mean": 0.87109375, + "rewards//std": 0.025123316794633865, + "step": 2902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5806, + "grad_norm": 1.517289638519287, + "kl": 0.3030075505375862, + "learning_rate": 3.8150385240816455e-07, + "loss": 0.0121, + "num_tokens": 21137009.0, + "reward": 0.8594970703125, + "reward_std": 0.015266655012965202, + "rewards//mean": 0.8594970703125, + "rewards//std": 0.023581411689519882, + "step": 2903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5808, + "grad_norm": 1.625970721244812, + "kl": 0.33640630543231964, + "learning_rate": 3.811955840228599e-07, + "loss": 0.0135, + "num_tokens": 21144257.0, + "reward": 0.84912109375, + "reward_std": 0.015574105083942413, + "rewards//mean": 0.84912109375, + "rewards//std": 0.025157036259770393, + "step": 2904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.581, + "grad_norm": 1.6897028684616089, + "kl": 0.3875657916069031, + "learning_rate": 3.808873634919599e-07, + "loss": 0.0155, + "num_tokens": 21151625.0, + "reward": 0.82073974609375, + "reward_std": 0.013956956565380096, + "rewards//mean": 0.82073974609375, + "rewards//std": 0.020659584552049637, + "step": 2905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.5812, + "grad_norm": 1.3966155052185059, + "kl": 0.35023365914821625, + "learning_rate": 3.805791909396155e-07, + "loss": 0.0196, + "num_tokens": 21158943.0, + "reward": 0.76824951171875, + "reward_std": 0.009715900756418705, + "rewards//mean": 0.76824951171875, + "rewards//std": 0.020124360918998718, + "step": 2906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5814, + "grad_norm": 1.7761036157608032, + "kl": 0.3847304694354534, + "learning_rate": 3.8027106648995875e-07, + "loss": 0.0154, + "num_tokens": 21166375.0, + "reward": 0.82586669921875, + "reward_std": 0.013101045973598957, + "rewards//mean": 0.82586669921875, + "rewards//std": 0.01842482015490532, + "step": 2907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5816, + "grad_norm": 1.730652928352356, + "kl": 0.3179916311055422, + "learning_rate": 3.799629902671021e-07, + "loss": 0.0127, + "num_tokens": 21173735.0, + "reward": 0.84344482421875, + "reward_std": 0.011501655913889408, + "rewards//mean": 0.84344482421875, + "rewards//std": 0.01584518700838089, + "step": 2908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5818, + "grad_norm": 1.62855863571167, + "kl": 0.27623927779495716, + "learning_rate": 3.7965496239513874e-07, + "loss": 0.011, + "num_tokens": 21181007.0, + "reward": 0.85235595703125, + "reward_std": 0.016314178705215454, + "rewards//mean": 0.85235595703125, + "rewards//std": 0.02565920539200306, + "step": 2909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.582, + "grad_norm": 1.46992027759552, + "kl": 0.3431815542280674, + "learning_rate": 3.7934698299814196e-07, + "loss": 0.0137, + "num_tokens": 21188271.0, + "reward": 0.82476806640625, + "reward_std": 0.015911126509308815, + "rewards//mean": 0.82476806640625, + "rewards//std": 0.01703217253088951, + "step": 2910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.5822, + "grad_norm": 1.6905187368392944, + "kl": 0.4002700736746192, + "learning_rate": 3.790390522001662e-07, + "loss": -0.014, + "num_tokens": 21195558.0, + "reward": 0.8382568359375, + "reward_std": 0.021059563383460045, + "rewards//mean": 0.8382568359375, + "rewards//std": 0.02796747162938118, + "step": 2911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5824, + "grad_norm": 1.3810511827468872, + "kl": 0.300725843757391, + "learning_rate": 3.787311701252457e-07, + "loss": 0.012, + "num_tokens": 21203014.0, + "reward": 0.85400390625, + "reward_std": 0.019547084346413612, + "rewards//mean": 0.85400390625, + "rewards//std": 0.03587908670306206, + "step": 2912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5826, + "grad_norm": 1.782904863357544, + "kl": 0.39416565746068954, + "learning_rate": 3.784233368973952e-07, + "loss": 0.0158, + "num_tokens": 21210238.0, + "reward": 0.8211669921875, + "reward_std": 0.013917792588472366, + "rewards//mean": 0.8211669921875, + "rewards//std": 0.027581598609685898, + "step": 2913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5828, + "grad_norm": 1.4534296989440918, + "kl": 0.3489321991801262, + "learning_rate": 3.7811555264061024e-07, + "loss": 0.014, + "num_tokens": 21217574.0, + "reward": 0.849365234375, + "reward_std": 0.0208866149187088, + "rewards//mean": 0.849365234375, + "rewards//std": 0.024605832993984222, + "step": 2914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.583, + "grad_norm": 1.6439255475997925, + "kl": 0.3459588438272476, + "learning_rate": 3.7780781747886594e-07, + "loss": 0.0138, + "num_tokens": 21224822.0, + "reward": 0.80926513671875, + "reward_std": 0.021596763283014297, + "rewards//mean": 0.80926513671875, + "rewards//std": 0.0322008952498436, + "step": 2915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5832, + "grad_norm": 1.3755948543548584, + "kl": 0.3068405259400606, + "learning_rate": 3.7750013153611827e-07, + "loss": 0.0123, + "num_tokens": 21232078.0, + "reward": 0.87506103515625, + "reward_std": 0.009809325449168682, + "rewards//mean": 0.87506103515625, + "rewards//std": 0.016374217346310616, + "step": 2916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5834, + "grad_norm": 1.6864771842956543, + "kl": 0.4028734862804413, + "learning_rate": 3.7719249493630297e-07, + "loss": 0.0161, + "num_tokens": 21239390.0, + "reward": 0.8468017578125, + "reward_std": 0.022065982222557068, + "rewards//mean": 0.8468017578125, + "rewards//std": 0.030068732798099518, + "step": 2917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5836, + "grad_norm": 1.8024131059646606, + "kl": 0.30348160304129124, + "learning_rate": 3.768849078033359e-07, + "loss": 0.0121, + "num_tokens": 21246734.0, + "reward": 0.81231689453125, + "reward_std": 0.015689771622419357, + "rewards//mean": 0.81231689453125, + "rewards//std": 0.026986485347151756, + "step": 2918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5838, + "grad_norm": 1.5154041051864624, + "kl": 0.28760317899286747, + "learning_rate": 3.7657737026111335e-07, + "loss": 0.0115, + "num_tokens": 21254102.0, + "reward": 0.8421630859375, + "reward_std": 0.011192502453923225, + "rewards//mean": 0.8421630859375, + "rewards//std": 0.012366731651127338, + "step": 2919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.584, + "grad_norm": 1.507631540298462, + "kl": 0.30736459977924824, + "learning_rate": 3.762698824335112e-07, + "loss": 0.0123, + "num_tokens": 21261318.0, + "reward": 0.80908203125, + "reward_std": 0.01777060702443123, + "rewards//mean": 0.80908203125, + "rewards//std": 0.026277761906385422, + "step": 2920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5842, + "grad_norm": 1.3876992464065552, + "kl": 0.3542071506381035, + "learning_rate": 3.7596244444438574e-07, + "loss": 0.0142, + "num_tokens": 21268550.0, + "reward": 0.8326416015625, + "reward_std": 0.013923238031566143, + "rewards//mean": 0.8326416015625, + "rewards//std": 0.01936585269868374, + "step": 2921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5844, + "grad_norm": 1.643754243850708, + "kl": 0.3184722140431404, + "learning_rate": 3.7565505641757266e-07, + "loss": 0.0127, + "num_tokens": 21275814.0, + "reward": 0.8648681640625, + "reward_std": 0.017238471657037735, + "rewards//mean": 0.8648681640625, + "rewards//std": 0.023122457787394524, + "step": 2922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.5846, + "grad_norm": 1.2418384552001953, + "kl": 0.3308109622448683, + "learning_rate": 3.7534771847688814e-07, + "loss": 0.0098, + "num_tokens": 21283052.0, + "reward": 0.87945556640625, + "reward_std": 0.019181156530976295, + "rewards//mean": 0.87945556640625, + "rewards//std": 0.03047993592917919, + "step": 2923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.5848, + "grad_norm": 1.5341547727584839, + "kl": 0.30699026212096214, + "learning_rate": 3.750404307461276e-07, + "loss": 0.0079, + "num_tokens": 21290287.0, + "reward": 0.85595703125, + "reward_std": 0.022712014615535736, + "rewards//mean": 0.85595703125, + "rewards//std": 0.036923713982105255, + "step": 2924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.585, + "grad_norm": 1.3651939630508423, + "kl": 0.29263130761682987, + "learning_rate": 3.7473319334906673e-07, + "loss": 0.0117, + "num_tokens": 21297591.0, + "reward": 0.856689453125, + "reward_std": 0.015358371660113335, + "rewards//mean": 0.856689453125, + "rewards//std": 0.021461816504597664, + "step": 2925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5852, + "grad_norm": 1.945060133934021, + "kl": 0.32548391073942184, + "learning_rate": 3.744260064094604e-07, + "loss": 0.013, + "num_tokens": 21304895.0, + "reward": 0.84716796875, + "reward_std": 0.01952008344233036, + "rewards//mean": 0.84716796875, + "rewards//std": 0.027458660304546356, + "step": 2926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.5854, + "grad_norm": 1.6340919733047485, + "kl": 0.32058765552937984, + "learning_rate": 3.7411887005104395e-07, + "loss": 0.0136, + "num_tokens": 21312195.0, + "reward": 0.8121337890625, + "reward_std": 0.015559937804937363, + "rewards//mean": 0.8121337890625, + "rewards//std": 0.018762381747364998, + "step": 2927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5856, + "grad_norm": 2.0043115615844727, + "kl": 0.39395728148519993, + "learning_rate": 3.7381178439753135e-07, + "loss": 0.0158, + "num_tokens": 21319563.0, + "reward": 0.8292236328125, + "reward_std": 0.012595362029969692, + "rewards//mean": 0.8292236328125, + "rewards//std": 0.01911724917590618, + "step": 2928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5858, + "grad_norm": 1.6649210453033447, + "kl": 0.3608947917819023, + "learning_rate": 3.73504749572617e-07, + "loss": 0.0144, + "num_tokens": 21326835.0, + "reward": 0.8197021484375, + "reward_std": 0.02053481712937355, + "rewards//mean": 0.8197021484375, + "rewards//std": 0.031865671277046204, + "step": 2929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.586, + "grad_norm": 1.4522360563278198, + "kl": 0.46155978739261627, + "learning_rate": 3.7319776569997434e-07, + "loss": 0.0185, + "num_tokens": 21334107.0, + "reward": 0.87664794921875, + "reward_std": 0.017367498949170113, + "rewards//mean": 0.87664794921875, + "rewards//std": 0.02373330108821392, + "step": 2930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5862, + "grad_norm": 1.588176965713501, + "kl": 0.4415661823004484, + "learning_rate": 3.728908329032566e-07, + "loss": 0.0177, + "num_tokens": 21341339.0, + "reward": 0.86614990234375, + "reward_std": 0.013865359127521515, + "rewards//mean": 0.86614990234375, + "rewards//std": 0.02829056605696678, + "step": 2931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5864, + "grad_norm": 1.7298270463943481, + "kl": 0.39550740644335747, + "learning_rate": 3.7258395130609606e-07, + "loss": 0.0158, + "num_tokens": 21348611.0, + "reward": 0.771728515625, + "reward_std": 0.009401939809322357, + "rewards//mean": 0.771728515625, + "rewards//std": 0.014663414098322392, + "step": 2932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5866, + "grad_norm": 1.5209332704544067, + "kl": 0.40530458837747574, + "learning_rate": 3.722771210321048e-07, + "loss": 0.0162, + "num_tokens": 21355947.0, + "reward": 0.8126220703125, + "reward_std": 0.017847422510385513, + "rewards//mean": 0.8126220703125, + "rewards//std": 0.025572113692760468, + "step": 2933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5868, + "grad_norm": 1.4470336437225342, + "kl": 0.33577729389071465, + "learning_rate": 3.719703422048739e-07, + "loss": 0.0134, + "num_tokens": 21363179.0, + "reward": 0.80523681640625, + "reward_std": 0.01818227209150791, + "rewards//mean": 0.80523681640625, + "rewards//std": 0.020170195028185844, + "step": 2934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.587, + "grad_norm": 1.6886703968048096, + "kl": 0.4155691973865032, + "learning_rate": 3.716636149479737e-07, + "loss": 0.0166, + "num_tokens": 21370451.0, + "reward": 0.85546875, + "reward_std": 0.015114277601242065, + "rewards//mean": 0.85546875, + "rewards//std": 0.016396503895521164, + "step": 2935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5872, + "grad_norm": 1.3670481443405151, + "kl": 0.33472771383821964, + "learning_rate": 3.7135693938495426e-07, + "loss": 0.0134, + "num_tokens": 21377707.0, + "reward": 0.85986328125, + "reward_std": 0.01261800155043602, + "rewards//mean": 0.85986328125, + "rewards//std": 0.015771571546792984, + "step": 2936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.5874, + "grad_norm": 1.5653796195983887, + "kl": 0.40544213354587555, + "learning_rate": 3.710503156393441e-07, + "loss": 0.0096, + "num_tokens": 21384979.0, + "reward": 0.822265625, + "reward_std": 0.01791950687766075, + "rewards//mean": 0.822265625, + "rewards//std": 0.028064999729394913, + "step": 2937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5876, + "grad_norm": 1.4330828189849854, + "kl": 0.3085546940565109, + "learning_rate": 3.7074374383465146e-07, + "loss": 0.0123, + "num_tokens": 21392435.0, + "reward": 0.89556884765625, + "reward_std": 0.015038450248539448, + "rewards//mean": 0.89556884765625, + "rewards//std": 0.021398155018687248, + "step": 2938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5878, + "grad_norm": 6.176050186157227, + "kl": 0.7992909587919712, + "learning_rate": 3.704372240943633e-07, + "loss": 0.032, + "num_tokens": 21399675.0, + "reward": 0.86639404296875, + "reward_std": 0.014218630269169807, + "rewards//mean": 0.86639404296875, + "rewards//std": 0.023511577397584915, + "step": 2939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.588, + "grad_norm": 1.817883849143982, + "kl": 0.3728683069348335, + "learning_rate": 3.701307565419458e-07, + "loss": 0.0149, + "num_tokens": 21406979.0, + "reward": 0.88079833984375, + "reward_std": 0.01804790459573269, + "rewards//mean": 0.88079833984375, + "rewards//std": 0.025410814210772514, + "step": 2940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5882, + "grad_norm": 1.515778660774231, + "kl": 0.34318842738866806, + "learning_rate": 3.6982434130084396e-07, + "loss": 0.0137, + "num_tokens": 21414163.0, + "reward": 0.8568115234375, + "reward_std": 0.013716425746679306, + "rewards//mean": 0.8568115234375, + "rewards//std": 0.025009481236338615, + "step": 2941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5884, + "grad_norm": 1.324209451675415, + "kl": 0.3136742580682039, + "learning_rate": 3.69517978494482e-07, + "loss": 0.0125, + "num_tokens": 21421459.0, + "reward": 0.84063720703125, + "reward_std": 0.01692243665456772, + "rewards//mean": 0.84063720703125, + "rewards//std": 0.018521513789892197, + "step": 2942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5886, + "grad_norm": 1.4108978509902954, + "kl": 0.36340594850480556, + "learning_rate": 3.6921166824626257e-07, + "loss": 0.0145, + "num_tokens": 21428843.0, + "reward": 0.812744140625, + "reward_std": 0.011877110227942467, + "rewards//mean": 0.812744140625, + "rewards//std": 0.014941574074327946, + "step": 2943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5888, + "grad_norm": 1.6192622184753418, + "kl": 0.37894843332469463, + "learning_rate": 3.689054106795677e-07, + "loss": 0.0152, + "num_tokens": 21436107.0, + "reward": 0.80389404296875, + "reward_std": 0.012358007952570915, + "rewards//mean": 0.80389404296875, + "rewards//std": 0.018755421042442322, + "step": 2944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.589, + "grad_norm": 1.596458077430725, + "kl": 0.3262626249343157, + "learning_rate": 3.685992059177576e-07, + "loss": 0.0131, + "num_tokens": 21443427.0, + "reward": 0.84674072265625, + "reward_std": 0.023477379232645035, + "rewards//mean": 0.84674072265625, + "rewards//std": 0.03562328964471817, + "step": 2945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.5892, + "grad_norm": 3.18048095703125, + "kl": 0.49971077777445316, + "learning_rate": 3.6829305408417166e-07, + "loss": 0.0231, + "num_tokens": 21450752.0, + "reward": 0.8016357421875, + "reward_std": 0.013953637331724167, + "rewards//mean": 0.8016357421875, + "rewards//std": 0.020605016499757767, + "step": 2946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5894, + "grad_norm": 1.6151759624481201, + "kl": 0.369115486741066, + "learning_rate": 3.679869553021278e-07, + "loss": 0.0148, + "num_tokens": 21457912.0, + "reward": 0.8695068359375, + "reward_std": 0.018091101199388504, + "rewards//mean": 0.8695068359375, + "rewards//std": 0.02249595709145069, + "step": 2947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5896, + "grad_norm": 1.8225599527359009, + "kl": 0.3040959853678942, + "learning_rate": 3.676809096949226e-07, + "loss": 0.0122, + "num_tokens": 21465160.0, + "reward": 0.8575439453125, + "reward_std": 0.01510573923587799, + "rewards//mean": 0.8575439453125, + "rewards//std": 0.021224573254585266, + "step": 2948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.5898, + "grad_norm": 1.279245138168335, + "kl": 0.33714798651635647, + "learning_rate": 3.6737491738583117e-07, + "loss": 0.0134, + "num_tokens": 21472447.0, + "reward": 0.85150146484375, + "reward_std": 0.017755568027496338, + "rewards//mean": 0.85150146484375, + "rewards//std": 0.02118486352264881, + "step": 2949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.59, + "grad_norm": 1.673782229423523, + "kl": 0.3813757412135601, + "learning_rate": 3.67068978498107e-07, + "loss": 0.0153, + "num_tokens": 21479743.0, + "reward": 0.84136962890625, + "reward_std": 0.01438809372484684, + "rewards//mean": 0.84136962890625, + "rewards//std": 0.02152652107179165, + "step": 2950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.5902, + "grad_norm": 1.844822883605957, + "kl": 0.39631787315011024, + "learning_rate": 3.6676309315498255e-07, + "loss": -0.0128, + "num_tokens": 21486956.0, + "reward": 0.841796875, + "reward_std": 0.02044687047600746, + "rewards//mean": 0.841796875, + "rewards//std": 0.027533533051609993, + "step": 2951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5904, + "grad_norm": 1.5438815355300903, + "kl": 0.30816027894616127, + "learning_rate": 3.6645726147966817e-07, + "loss": 0.0123, + "num_tokens": 21494220.0, + "reward": 0.84759521484375, + "reward_std": 0.02103826403617859, + "rewards//mean": 0.84759521484375, + "rewards//std": 0.03338947147130966, + "step": 2952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5906, + "grad_norm": 1.438666820526123, + "kl": 0.29324026219546795, + "learning_rate": 3.6615148359535295e-07, + "loss": 0.0117, + "num_tokens": 21501564.0, + "reward": 0.8426513671875, + "reward_std": 0.015418825671076775, + "rewards//mean": 0.8426513671875, + "rewards//std": 0.0232243612408638, + "step": 2953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5908, + "grad_norm": 1.4039586782455444, + "kl": 0.35435326024889946, + "learning_rate": 3.6584575962520405e-07, + "loss": 0.0142, + "num_tokens": 21508804.0, + "reward": 0.848388671875, + "reward_std": 0.014363281428813934, + "rewards//mean": 0.848388671875, + "rewards//std": 0.018931452184915543, + "step": 2954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.591, + "grad_norm": 1.514977216720581, + "kl": 0.4224591627717018, + "learning_rate": 3.6554008969236715e-07, + "loss": 0.0014, + "num_tokens": 21516117.0, + "reward": 0.8690185546875, + "reward_std": 0.020930692553520203, + "rewards//mean": 0.8690185546875, + "rewards//std": 0.02404923364520073, + "step": 2955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5912, + "grad_norm": 1.8010073900222778, + "kl": 0.34423481300473213, + "learning_rate": 3.652344739199661e-07, + "loss": 0.0138, + "num_tokens": 21523397.0, + "reward": 0.86077880859375, + "reward_std": 0.01652872934937477, + "rewards//mean": 0.86077880859375, + "rewards//std": 0.023003244772553444, + "step": 2956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5914, + "grad_norm": 1.552305817604065, + "kl": 0.345169372856617, + "learning_rate": 3.649289124311028e-07, + "loss": 0.0138, + "num_tokens": 21530645.0, + "reward": 0.8243408203125, + "reward_std": 0.012823425233364105, + "rewards//mean": 0.8243408203125, + "rewards//std": 0.021462874487042427, + "step": 2957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5916, + "grad_norm": 3.2133114337921143, + "kl": 0.5806980691850185, + "learning_rate": 3.6462340534885736e-07, + "loss": 0.0232, + "num_tokens": 21537973.0, + "reward": 0.84674072265625, + "reward_std": 0.017862267792224884, + "rewards//mean": 0.84674072265625, + "rewards//std": 0.022625036537647247, + "step": 2958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.5918, + "grad_norm": 1.519640564918518, + "kl": 0.34986070916056633, + "learning_rate": 3.6431795279628816e-07, + "loss": 0.0141, + "num_tokens": 21545289.0, + "reward": 0.85076904296875, + "reward_std": 0.011196313425898552, + "rewards//mean": 0.85076904296875, + "rewards//std": 0.015995418652892113, + "step": 2959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.592, + "grad_norm": 1.7067031860351562, + "kl": 0.33822400495409966, + "learning_rate": 3.640125548964312e-07, + "loss": 0.0135, + "num_tokens": 21552601.0, + "reward": 0.86358642578125, + "reward_std": 0.014488797634840012, + "rewards//mean": 0.86358642578125, + "rewards//std": 0.02207849733531475, + "step": 2960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5922, + "grad_norm": 1.8923449516296387, + "kl": 0.35046125017106533, + "learning_rate": 3.6370721177230115e-07, + "loss": 0.014, + "num_tokens": 21559809.0, + "reward": 0.78424072265625, + "reward_std": 0.015242666937410831, + "rewards//mean": 0.78424072265625, + "rewards//std": 0.025120830163359642, + "step": 2961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5924, + "grad_norm": 1.5283607244491577, + "kl": 0.37829704210162163, + "learning_rate": 3.634019235468896e-07, + "loss": 0.0151, + "num_tokens": 21567041.0, + "reward": 0.87109375, + "reward_std": 0.018723588436841965, + "rewards//mean": 0.87109375, + "rewards//std": 0.022097086533904076, + "step": 2962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.5926, + "grad_norm": 1.8142694234848022, + "kl": 0.3025163635611534, + "learning_rate": 3.630966903431671e-07, + "loss": 0.0118, + "num_tokens": 21574248.0, + "reward": 0.85858154296875, + "reward_std": 0.014600392431020737, + "rewards//mean": 0.85858154296875, + "rewards//std": 0.023460015654563904, + "step": 2963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5928, + "grad_norm": 1.4930665493011475, + "kl": 0.3011648338288069, + "learning_rate": 3.627915122840812e-07, + "loss": 0.012, + "num_tokens": 21581560.0, + "reward": 0.84649658203125, + "reward_std": 0.012593405321240425, + "rewards//mean": 0.84649658203125, + "rewards//std": 0.02074805460870266, + "step": 2964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.593, + "grad_norm": 1.8341681957244873, + "kl": 0.3725258391350508, + "learning_rate": 3.624863894925579e-07, + "loss": 0.0118, + "num_tokens": 21588808.0, + "reward": 0.85137939453125, + "reward_std": 0.01882185786962509, + "rewards//mean": 0.85137939453125, + "rewards//std": 0.03287731856107712, + "step": 2965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.5932, + "grad_norm": 2.1421196460723877, + "kl": 0.5539847686886787, + "learning_rate": 3.621813220915004e-07, + "loss": 0.0136, + "num_tokens": 21596000.0, + "reward": 0.8284912109375, + "reward_std": 0.017647050321102142, + "rewards//mean": 0.8284912109375, + "rewards//std": 0.024515537545084953, + "step": 2966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5934, + "grad_norm": 1.6330207586288452, + "kl": 0.3741891533136368, + "learning_rate": 3.6187631020378984e-07, + "loss": 0.015, + "num_tokens": 21603288.0, + "reward": 0.8341064453125, + "reward_std": 0.01577010750770569, + "rewards//mean": 0.8341064453125, + "rewards//std": 0.018804289400577545, + "step": 2967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5936, + "grad_norm": 1.439928412437439, + "kl": 0.37623613327741623, + "learning_rate": 3.615713539522851e-07, + "loss": 0.015, + "num_tokens": 21610608.0, + "reward": 0.837158203125, + "reward_std": 0.019920848309993744, + "rewards//mean": 0.837158203125, + "rewards//std": 0.023239022120833397, + "step": 2968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5938, + "grad_norm": 1.6073613166809082, + "kl": 0.35939048416912556, + "learning_rate": 3.6126645345982237e-07, + "loss": 0.0144, + "num_tokens": 21617880.0, + "reward": 0.8203125, + "reward_std": 0.013731449842453003, + "rewards//mean": 0.8203125, + "rewards//std": 0.01840108260512352, + "step": 2969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.594, + "grad_norm": 2.2032904624938965, + "kl": 0.44732179678976536, + "learning_rate": 3.609616088492157e-07, + "loss": 0.0179, + "num_tokens": 21625160.0, + "reward": 0.88262939453125, + "reward_std": 0.01838713511824608, + "rewards//mean": 0.88262939453125, + "rewards//std": 0.02386179380118847, + "step": 2970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.5942, + "grad_norm": 2.0494065284729004, + "kl": 0.38026319071650505, + "learning_rate": 3.6065682024325617e-07, + "loss": 0.0157, + "num_tokens": 21632421.0, + "reward": 0.88238525390625, + "reward_std": 0.014770815148949623, + "rewards//mean": 0.88238525390625, + "rewards//std": 0.027657438069581985, + "step": 2971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5944, + "grad_norm": 1.479422688484192, + "kl": 0.37483353167772293, + "learning_rate": 3.603520877647129e-07, + "loss": 0.015, + "num_tokens": 21639709.0, + "reward": 0.81805419921875, + "reward_std": 0.011172345839440823, + "rewards//mean": 0.81805419921875, + "rewards//std": 0.014395526610314846, + "step": 2972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5946, + "grad_norm": 1.2989877462387085, + "kl": 0.32579773850739, + "learning_rate": 3.6004741153633187e-07, + "loss": 0.013, + "num_tokens": 21646941.0, + "reward": 0.8477783203125, + "reward_std": 0.012802168726921082, + "rewards//mean": 0.8477783203125, + "rewards//std": 0.01730167120695114, + "step": 2973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5948, + "grad_norm": 1.4219582080841064, + "kl": 0.3298843093216419, + "learning_rate": 3.597427916808369e-07, + "loss": 0.0132, + "num_tokens": 21654309.0, + "reward": 0.82110595703125, + "reward_std": 0.016567446291446686, + "rewards//mean": 0.82110595703125, + "rewards//std": 0.020005159080028534, + "step": 2974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.595, + "grad_norm": 1.7139782905578613, + "kl": 0.3252757955342531, + "learning_rate": 3.594382283209286e-07, + "loss": 0.013, + "num_tokens": 21661621.0, + "reward": 0.850341796875, + "reward_std": 0.01809871569275856, + "rewards//mean": 0.850341796875, + "rewards//std": 0.027151526883244514, + "step": 2975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5952, + "grad_norm": 1.4530359506607056, + "kl": 0.40019598603248596, + "learning_rate": 3.591337215792851e-07, + "loss": 0.016, + "num_tokens": 21668973.0, + "reward": 0.8280029296875, + "reward_std": 0.013497299514710903, + "rewards//mean": 0.8280029296875, + "rewards//std": 0.01834794506430626, + "step": 2976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5954, + "grad_norm": 1.5787149667739868, + "kl": 0.37987814098596573, + "learning_rate": 3.5882927157856167e-07, + "loss": 0.0152, + "num_tokens": 21676277.0, + "reward": 0.84417724609375, + "reward_std": 0.014125879853963852, + "rewards//mean": 0.84417724609375, + "rewards//std": 0.01901114545762539, + "step": 2977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5956, + "grad_norm": 1.4039676189422607, + "kl": 0.3030433375388384, + "learning_rate": 3.585248784413909e-07, + "loss": 0.0121, + "num_tokens": 21683517.0, + "reward": 0.85601806640625, + "reward_std": 0.013163486495614052, + "rewards//mean": 0.85601806640625, + "rewards//std": 0.02446756139397621, + "step": 2978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.5958, + "grad_norm": 1.388590693473816, + "kl": 0.3562259953469038, + "learning_rate": 3.58220542290382e-07, + "loss": -0.0028, + "num_tokens": 21690867.0, + "reward": 0.80645751953125, + "reward_std": 0.01446653064340353, + "rewards//mean": 0.80645751953125, + "rewards//std": 0.020680824294686317, + "step": 2979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.596, + "grad_norm": 1.6951837539672852, + "kl": 0.31632228568196297, + "learning_rate": 3.5791626324812185e-07, + "loss": 0.0127, + "num_tokens": 21698083.0, + "reward": 0.832763671875, + "reward_std": 0.016126856207847595, + "rewards//mean": 0.832763671875, + "rewards//std": 0.03519326075911522, + "step": 2980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5962, + "grad_norm": 1.5941575765609741, + "kl": 0.35830302350223064, + "learning_rate": 3.5761204143717385e-07, + "loss": 0.0143, + "num_tokens": 21705323.0, + "reward": 0.87017822265625, + "reward_std": 0.01757602021098137, + "rewards//mean": 0.87017822265625, + "rewards//std": 0.03319486230611801, + "step": 2981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5964, + "grad_norm": 1.459850549697876, + "kl": 0.32493964210152626, + "learning_rate": 3.5730787698007846e-07, + "loss": 0.013, + "num_tokens": 21712523.0, + "reward": 0.85662841796875, + "reward_std": 0.01642356626689434, + "rewards//mean": 0.85662841796875, + "rewards//std": 0.022159254178404808, + "step": 2982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5966, + "grad_norm": 1.9380810260772705, + "kl": 0.31391463428735733, + "learning_rate": 3.5700376999935334e-07, + "loss": 0.0126, + "num_tokens": 21719843.0, + "reward": 0.83990478515625, + "reward_std": 0.01312628760933876, + "rewards//mean": 0.83990478515625, + "rewards//std": 0.021480059251189232, + "step": 2983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5968, + "grad_norm": 1.5701638460159302, + "kl": 0.35112371668219566, + "learning_rate": 3.566997206174923e-07, + "loss": 0.014, + "num_tokens": 21727187.0, + "reward": 0.846435546875, + "reward_std": 0.018921229988336563, + "rewards//mean": 0.846435546875, + "rewards//std": 0.027960164472460747, + "step": 2984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.597, + "grad_norm": 1.5637813806533813, + "kl": 0.3444247171282768, + "learning_rate": 3.5639572895696687e-07, + "loss": 0.0138, + "num_tokens": 21734515.0, + "reward": 0.79400634765625, + "reward_std": 0.013328911736607552, + "rewards//mean": 0.79400634765625, + "rewards//std": 0.019490810111165047, + "step": 2985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.5972, + "grad_norm": 1.5260043144226074, + "kl": 0.38871704787015915, + "learning_rate": 3.5609179514022446e-07, + "loss": 0.0164, + "num_tokens": 21741769.0, + "reward": 0.8056640625, + "reward_std": 0.0162210576236248, + "rewards//mean": 0.8056640625, + "rewards//std": 0.021620633080601692, + "step": 2986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5974, + "grad_norm": 1.48805570602417, + "kl": 0.31183744594454765, + "learning_rate": 3.5578791928968993e-07, + "loss": 0.0125, + "num_tokens": 21749289.0, + "reward": 0.84222412109375, + "reward_std": 0.0162675678730011, + "rewards//mean": 0.84222412109375, + "rewards//std": 0.021817095577716827, + "step": 2987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5976, + "grad_norm": 1.4852451086044312, + "kl": 0.3475417122244835, + "learning_rate": 3.554841015277641e-07, + "loss": 0.0139, + "num_tokens": 21756505.0, + "reward": 0.81719970703125, + "reward_std": 0.014131655916571617, + "rewards//mean": 0.81719970703125, + "rewards//std": 0.014618827030062675, + "step": 2988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5978, + "grad_norm": 1.2326151132583618, + "kl": 0.325382549315691, + "learning_rate": 3.551803419768251e-07, + "loss": 0.013, + "num_tokens": 21763745.0, + "reward": 0.87457275390625, + "reward_std": 0.014001308009028435, + "rewards//mean": 0.87457275390625, + "rewards//std": 0.02504720725119114, + "step": 2989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.598, + "grad_norm": 1.4311009645462036, + "kl": 0.2896106541156769, + "learning_rate": 3.5487664075922686e-07, + "loss": 0.0116, + "num_tokens": 21771049.0, + "reward": 0.81951904296875, + "reward_std": 0.011013247072696686, + "rewards//mean": 0.81951904296875, + "rewards//std": 0.012218187563121319, + "step": 2990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5982, + "grad_norm": 1.547890543937683, + "kl": 0.3428103178739548, + "learning_rate": 3.5457299799730045e-07, + "loss": 0.0137, + "num_tokens": 21778297.0, + "reward": 0.83172607421875, + "reward_std": 0.01722554862499237, + "rewards//mean": 0.83172607421875, + "rewards//std": 0.021432790905237198, + "step": 2991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5984, + "grad_norm": 1.419264316558838, + "kl": 0.3023490309715271, + "learning_rate": 3.5426941381335296e-07, + "loss": 0.0121, + "num_tokens": 21785577.0, + "reward": 0.84124755859375, + "reward_std": 0.01861720159649849, + "rewards//mean": 0.84124755859375, + "rewards//std": 0.024063313379883766, + "step": 2992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5986, + "grad_norm": 1.3857088088989258, + "kl": 0.33449213579297066, + "learning_rate": 3.5396588832966824e-07, + "loss": 0.0134, + "num_tokens": 21792881.0, + "reward": 0.82928466796875, + "reward_std": 0.012911747209727764, + "rewards//mean": 0.82928466796875, + "rewards//std": 0.02895430289208889, + "step": 2993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5988, + "grad_norm": 1.8412383794784546, + "kl": 0.2992273196578026, + "learning_rate": 3.536624216685062e-07, + "loss": 0.012, + "num_tokens": 21800137.0, + "reward": 0.8631591796875, + "reward_std": 0.01407705619931221, + "rewards//mean": 0.8631591796875, + "rewards//std": 0.020050982013344765, + "step": 2994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.599, + "grad_norm": 1.5400482416152954, + "kl": 0.3439771234989166, + "learning_rate": 3.5335901395210326e-07, + "loss": 0.0138, + "num_tokens": 21807457.0, + "reward": 0.82177734375, + "reward_std": 0.017960794270038605, + "rewards//mean": 0.82177734375, + "rewards//std": 0.02102428302168846, + "step": 2995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.5992, + "grad_norm": 1.4635058641433716, + "kl": 0.343129426240921, + "learning_rate": 3.530556653026721e-07, + "loss": 0.0117, + "num_tokens": 21814755.0, + "reward": 0.8492431640625, + "reward_std": 0.01882820576429367, + "rewards//mean": 0.8492431640625, + "rewards//std": 0.026261338964104652, + "step": 2996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.5994, + "grad_norm": 1.4880867004394531, + "kl": 0.35018736869096756, + "learning_rate": 3.5275237584240123e-07, + "loss": 0.0114, + "num_tokens": 21822143.0, + "reward": 0.8184814453125, + "reward_std": 0.014569338411092758, + "rewards//mean": 0.8184814453125, + "rewards//std": 0.016688868403434753, + "step": 2997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5996, + "grad_norm": 1.5421135425567627, + "kl": 0.3473391905426979, + "learning_rate": 3.5244914569345574e-07, + "loss": 0.0139, + "num_tokens": 21829423.0, + "reward": 0.868408203125, + "reward_std": 0.018198605626821518, + "rewards//mean": 0.868408203125, + "rewards//std": 0.032240886241197586, + "step": 2998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.5998, + "grad_norm": 1.3126617670059204, + "kl": 0.3494596164673567, + "learning_rate": 3.521459749779768e-07, + "loss": 0.014, + "num_tokens": 21836759.0, + "reward": 0.8255615234375, + "reward_std": 0.009042780846357346, + "rewards//mean": 0.8255615234375, + "rewards//std": 0.01219912339001894, + "step": 2999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6, + "grad_norm": 1.8571648597717285, + "kl": 0.3510334640741348, + "learning_rate": 3.518428638180813e-07, + "loss": 0.014, + "num_tokens": 21843951.0, + "reward": 0.83453369140625, + "reward_std": 0.017101876437664032, + "rewards//mean": 0.83453369140625, + "rewards//std": 0.02462051622569561, + "step": 3000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6002, + "grad_norm": 1.5001989603042603, + "kl": 0.37381186336278915, + "learning_rate": 3.5153981233586274e-07, + "loss": 0.015, + "num_tokens": 21851175.0, + "reward": 0.8564453125, + "reward_std": 0.012205657549202442, + "rewards//mean": 0.8564453125, + "rewards//std": 0.01922503113746643, + "step": 3001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6004, + "grad_norm": 1.4324592351913452, + "kl": 0.2801688816398382, + "learning_rate": 3.512368206533898e-07, + "loss": 0.0112, + "num_tokens": 21858447.0, + "reward": 0.84637451171875, + "reward_std": 0.009466591291129589, + "rewards//mean": 0.84637451171875, + "rewards//std": 0.021179860457777977, + "step": 3002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6006, + "grad_norm": 1.3910950422286987, + "kl": 0.2989783752709627, + "learning_rate": 3.509338888927079e-07, + "loss": 0.012, + "num_tokens": 21865719.0, + "reward": 0.86273193359375, + "reward_std": 0.015030784532427788, + "rewards//mean": 0.86273193359375, + "rewards//std": 0.018634773790836334, + "step": 3003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6008, + "grad_norm": 1.878029227256775, + "kl": 0.3723164238035679, + "learning_rate": 3.506310171758375e-07, + "loss": 0.0149, + "num_tokens": 21872991.0, + "reward": 0.84881591796875, + "reward_std": 0.021579984575510025, + "rewards//mean": 0.84881591796875, + "rewards//std": 0.02650882862508297, + "step": 3004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.601, + "grad_norm": 1.3357951641082764, + "kl": 0.34801856614649296, + "learning_rate": 3.503282056247757e-07, + "loss": 0.0139, + "num_tokens": 21880263.0, + "reward": 0.8585205078125, + "reward_std": 0.01621771790087223, + "rewards//mean": 0.8585205078125, + "rewards//std": 0.02549147978425026, + "step": 3005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6012, + "grad_norm": 1.5808337926864624, + "kl": 0.3402007035911083, + "learning_rate": 3.500254543614947e-07, + "loss": 0.0136, + "num_tokens": 21887527.0, + "reward": 0.81787109375, + "reward_std": 0.013473191298544407, + "rewards//mean": 0.81787109375, + "rewards//std": 0.01830209791660309, + "step": 3006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6014, + "grad_norm": 1.6095752716064453, + "kl": 0.31563943810760975, + "learning_rate": 3.4972276350794284e-07, + "loss": 0.0126, + "num_tokens": 21894791.0, + "reward": 0.87982177734375, + "reward_std": 0.018371207639575005, + "rewards//mean": 0.87982177734375, + "rewards//std": 0.023424498736858368, + "step": 3007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.6016, + "grad_norm": 1.9664093255996704, + "kl": 0.3780068978667259, + "learning_rate": 3.494201331860438e-07, + "loss": 0.0058, + "num_tokens": 21901996.0, + "reward": 0.85028076171875, + "reward_std": 0.018260054290294647, + "rewards//mean": 0.85028076171875, + "rewards//std": 0.02482319436967373, + "step": 3008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6018, + "grad_norm": 1.5906028747558594, + "kl": 0.36933067440986633, + "learning_rate": 3.4911756351769716e-07, + "loss": 0.0148, + "num_tokens": 21909276.0, + "reward": 0.85186767578125, + "reward_std": 0.01898638904094696, + "rewards//mean": 0.85186767578125, + "rewards//std": 0.026441359892487526, + "step": 3009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.602, + "grad_norm": 1.2715675830841064, + "kl": 0.3318915292620659, + "learning_rate": 3.488150546247778e-07, + "loss": 0.0027, + "num_tokens": 21916592.0, + "reward": 0.785888671875, + "reward_std": 0.014535004273056984, + "rewards//mean": 0.785888671875, + "rewards//std": 0.01721615344285965, + "step": 3010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6022, + "grad_norm": 1.589215874671936, + "kl": 0.33902323991060257, + "learning_rate": 3.485126066291364e-07, + "loss": 0.0136, + "num_tokens": 21923800.0, + "reward": 0.861083984375, + "reward_std": 0.01414177380502224, + "rewards//mean": 0.861083984375, + "rewards//std": 0.020420897752046585, + "step": 3011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6024, + "grad_norm": 1.3457368612289429, + "kl": 0.3998603355139494, + "learning_rate": 3.48210219652599e-07, + "loss": 0.016, + "num_tokens": 21931184.0, + "reward": 0.83966064453125, + "reward_std": 0.014407023787498474, + "rewards//mean": 0.83966064453125, + "rewards//std": 0.020403748378157616, + "step": 3012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6026, + "grad_norm": 1.4524621963500977, + "kl": 0.3110523857176304, + "learning_rate": 3.4790789381696685e-07, + "loss": 0.0124, + "num_tokens": 21938480.0, + "reward": 0.80419921875, + "reward_std": 0.015667878091335297, + "rewards//mean": 0.80419921875, + "rewards//std": 0.022877952083945274, + "step": 3013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6028, + "grad_norm": 1.2942944765090942, + "kl": 0.31726062670350075, + "learning_rate": 3.4760562924401706e-07, + "loss": 0.0127, + "num_tokens": 21945864.0, + "reward": 0.89794921875, + "reward_std": 0.014993307180702686, + "rewards//mean": 0.89794921875, + "rewards//std": 0.018182605504989624, + "step": 3014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.603, + "grad_norm": 1.608015775680542, + "kl": 0.28753551468253136, + "learning_rate": 3.4730342605550134e-07, + "loss": 0.0115, + "num_tokens": 21953176.0, + "reward": 0.83209228515625, + "reward_std": 0.01348588801920414, + "rewards//mean": 0.83209228515625, + "rewards//std": 0.020804159343242645, + "step": 3015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.6032, + "grad_norm": 1.3770439624786377, + "kl": 0.3148358017206192, + "learning_rate": 3.470012843731476e-07, + "loss": 0.0158, + "num_tokens": 21960514.0, + "reward": 0.85601806640625, + "reward_std": 0.014266576617956161, + "rewards//mean": 0.85601806640625, + "rewards//std": 0.023103054612874985, + "step": 3016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6034, + "grad_norm": 1.5141490697860718, + "kl": 0.37450597807765007, + "learning_rate": 3.4669920431865795e-07, + "loss": 0.015, + "num_tokens": 21967850.0, + "reward": 0.843994140625, + "reward_std": 0.013175624422729015, + "rewards//mean": 0.843994140625, + "rewards//std": 0.018660852685570717, + "step": 3017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6036, + "grad_norm": 1.5539861917495728, + "kl": 0.31211279705166817, + "learning_rate": 3.463971860137107e-07, + "loss": 0.0125, + "num_tokens": 21975058.0, + "reward": 0.87103271484375, + "reward_std": 0.01706107147037983, + "rewards//mean": 0.87103271484375, + "rewards//std": 0.023432252928614616, + "step": 3018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6038, + "grad_norm": 1.7348886728286743, + "kl": 0.2999376505613327, + "learning_rate": 3.460952295799584e-07, + "loss": 0.012, + "num_tokens": 21982306.0, + "reward": 0.8671875, + "reward_std": 0.015326336026191711, + "rewards//mean": 0.8671875, + "rewards//std": 0.02343750186264515, + "step": 3019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.604, + "grad_norm": 1.4608145952224731, + "kl": 0.341806773096323, + "learning_rate": 3.457933351390293e-07, + "loss": 0.0137, + "num_tokens": 21989530.0, + "reward": 0.881103515625, + "reward_std": 0.014996114186942577, + "rewards//mean": 0.881103515625, + "rewards//std": 0.026225866749882698, + "step": 3020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6042, + "grad_norm": 1.622542142868042, + "kl": 0.3352331668138504, + "learning_rate": 3.454915028125263e-07, + "loss": 0.0134, + "num_tokens": 21996826.0, + "reward": 0.84808349609375, + "reward_std": 0.017252307385206223, + "rewards//mean": 0.84808349609375, + "rewards//std": 0.02699265629053116, + "step": 3021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6044, + "grad_norm": 1.7174720764160156, + "kl": 0.358975812792778, + "learning_rate": 3.451897327220276e-07, + "loss": 0.0144, + "num_tokens": 22004242.0, + "reward": 0.87249755859375, + "reward_std": 0.014151498675346375, + "rewards//mean": 0.87249755859375, + "rewards//std": 0.017404930666089058, + "step": 3022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6046, + "grad_norm": 1.7542831897735596, + "kl": 0.38708954490721226, + "learning_rate": 3.448880249890859e-07, + "loss": 0.0155, + "num_tokens": 22011538.0, + "reward": 0.868896484375, + "reward_std": 0.017890730872750282, + "rewards//mean": 0.868896484375, + "rewards//std": 0.025102822110056877, + "step": 3023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6048, + "grad_norm": 1.473271369934082, + "kl": 0.3198715876787901, + "learning_rate": 3.445863797352293e-07, + "loss": 0.0128, + "num_tokens": 22018874.0, + "reward": 0.8583984375, + "reward_std": 0.016113940626382828, + "rewards//mean": 0.8583984375, + "rewards//std": 0.03024967759847641, + "step": 3024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.605, + "grad_norm": 1.7140024900436401, + "kl": 0.33183047734200954, + "learning_rate": 3.4428479708196033e-07, + "loss": 0.0133, + "num_tokens": 22026130.0, + "reward": 0.8458251953125, + "reward_std": 0.010790698230266571, + "rewards//mean": 0.8458251953125, + "rewards//std": 0.018068606033921242, + "step": 3025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6052, + "grad_norm": 2.2411446571350098, + "kl": 0.3457157723605633, + "learning_rate": 3.439832771507565e-07, + "loss": 0.0138, + "num_tokens": 22033394.0, + "reward": 0.80938720703125, + "reward_std": 0.020502697676420212, + "rewards//mean": 0.80938720703125, + "rewards//std": 0.0384952612221241, + "step": 3026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6054, + "grad_norm": 1.4937134981155396, + "kl": 0.3172860164195299, + "learning_rate": 3.4368182006307e-07, + "loss": 0.0127, + "num_tokens": 22040714.0, + "reward": 0.841552734375, + "reward_std": 0.012113437987864017, + "rewards//mean": 0.841552734375, + "rewards//std": 0.015853900462388992, + "step": 3027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.6056, + "grad_norm": 1.4634733200073242, + "kl": 0.30743489041924477, + "learning_rate": 3.433804259403276e-07, + "loss": 0.0121, + "num_tokens": 22048016.0, + "reward": 0.81451416015625, + "reward_std": 0.014078628271818161, + "rewards//mean": 0.81451416015625, + "rewards//std": 0.024309907108545303, + "step": 3028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.6058, + "grad_norm": 1.5046985149383545, + "kl": 0.3531029522418976, + "learning_rate": 3.430790949039309e-07, + "loss": -0.0154, + "num_tokens": 22055325.0, + "reward": 0.82464599609375, + "reward_std": 0.01566402241587639, + "rewards//mean": 0.82464599609375, + "rewards//std": 0.020149169489741325, + "step": 3029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.606, + "grad_norm": 2.0606911182403564, + "kl": 0.4355045147240162, + "learning_rate": 3.4277782707525603e-07, + "loss": 0.0174, + "num_tokens": 22062589.0, + "reward": 0.81842041015625, + "reward_std": 0.011445049196481705, + "rewards//mean": 0.81842041015625, + "rewards//std": 0.018636398017406464, + "step": 3030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6062, + "grad_norm": 1.5967622995376587, + "kl": 0.4187221862375736, + "learning_rate": 3.4247662257565366e-07, + "loss": 0.0167, + "num_tokens": 22069885.0, + "reward": 0.87347412109375, + "reward_std": 0.01712528057396412, + "rewards//mean": 0.87347412109375, + "rewards//std": 0.0198280680924654, + "step": 3031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.6064, + "grad_norm": 1.4546068906784058, + "kl": 0.3201816603541374, + "learning_rate": 3.421754815264488e-07, + "loss": 0.0124, + "num_tokens": 22077202.0, + "reward": 0.86846923828125, + "reward_std": 0.014014553278684616, + "rewards//mean": 0.86846923828125, + "rewards//std": 0.024676403030753136, + "step": 3032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6066, + "grad_norm": 1.5593740940093994, + "kl": 0.39326708018779755, + "learning_rate": 3.418744040489412e-07, + "loss": 0.0157, + "num_tokens": 22084522.0, + "reward": 0.86273193359375, + "reward_std": 0.019494496285915375, + "rewards//mean": 0.86273193359375, + "rewards//std": 0.03181735798716545, + "step": 3033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6068, + "grad_norm": 1.4797545671463013, + "kl": 0.3712471704930067, + "learning_rate": 3.415733902644046e-07, + "loss": 0.0148, + "num_tokens": 22091754.0, + "reward": 0.8751220703125, + "reward_std": 0.021354660391807556, + "rewards//mean": 0.8751220703125, + "rewards//std": 0.03347261622548103, + "step": 3034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.607, + "grad_norm": 2.356367588043213, + "kl": 0.45881910622119904, + "learning_rate": 3.4127244029408756e-07, + "loss": 0.0184, + "num_tokens": 22099074.0, + "reward": 0.85015869140625, + "reward_std": 0.013228103518486023, + "rewards//mean": 0.85015869140625, + "rewards//std": 0.019479932263493538, + "step": 3035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6072, + "grad_norm": 2.0371570587158203, + "kl": 0.32808071561157703, + "learning_rate": 3.4097155425921256e-07, + "loss": 0.0131, + "num_tokens": 22106442.0, + "reward": 0.84912109375, + "reward_std": 0.013345079496502876, + "rewards//mean": 0.84912109375, + "rewards//std": 0.018394500017166138, + "step": 3036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6074, + "grad_norm": 1.9407976865768433, + "kl": 0.3981231562793255, + "learning_rate": 3.4067073228097655e-07, + "loss": 0.0159, + "num_tokens": 22113730.0, + "reward": 0.843994140625, + "reward_std": 0.01816372759640217, + "rewards//mean": 0.843994140625, + "rewards//std": 0.021360008046030998, + "step": 3037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6076, + "grad_norm": 1.6989836692810059, + "kl": 0.33682263270020485, + "learning_rate": 3.4036997448055036e-07, + "loss": 0.0135, + "num_tokens": 22121026.0, + "reward": 0.8385009765625, + "reward_std": 0.01697877049446106, + "rewards//mean": 0.8385009765625, + "rewards//std": 0.027737028896808624, + "step": 3038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.6078, + "grad_norm": 1.4349865913391113, + "kl": 0.3970720134675503, + "learning_rate": 3.4006928097907954e-07, + "loss": 0.0075, + "num_tokens": 22128336.0, + "reward": 0.81365966796875, + "reward_std": 0.013982699252665043, + "rewards//mean": 0.81365966796875, + "rewards//std": 0.023205695673823357, + "step": 3039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.608, + "grad_norm": 1.9754716157913208, + "kl": 0.4053019620478153, + "learning_rate": 3.397686518976831e-07, + "loss": 0.0162, + "num_tokens": 22135632.0, + "reward": 0.831787109375, + "reward_std": 0.015270471572875977, + "rewards//mean": 0.831787109375, + "rewards//std": 0.01934903860092163, + "step": 3040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6082, + "grad_norm": 2.4181103706359863, + "kl": 0.4260733239352703, + "learning_rate": 3.394680873574546e-07, + "loss": 0.017, + "num_tokens": 22142968.0, + "reward": 0.8260498046875, + "reward_std": 0.019888412207365036, + "rewards//mean": 0.8260498046875, + "rewards//std": 0.023560861125588417, + "step": 3041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6084, + "grad_norm": 1.4847118854522705, + "kl": 0.35669256187975407, + "learning_rate": 3.391675874794612e-07, + "loss": 0.0143, + "num_tokens": 22150336.0, + "reward": 0.83551025390625, + "reward_std": 0.014733768999576569, + "rewards//mean": 0.83551025390625, + "rewards//std": 0.01632421836256981, + "step": 3042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.6086, + "grad_norm": 1.4312434196472168, + "kl": 0.3748391531407833, + "learning_rate": 3.388671523847445e-07, + "loss": 0.02, + "num_tokens": 22157606.0, + "reward": 0.8853759765625, + "reward_std": 0.017557337880134583, + "rewards//mean": 0.8853759765625, + "rewards//std": 0.022350136190652847, + "step": 3043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.6088, + "grad_norm": 1.3830490112304688, + "kl": 0.32212546840310097, + "learning_rate": 3.3856678219431944e-07, + "loss": 0.0072, + "num_tokens": 22164930.0, + "reward": 0.8162841796875, + "reward_std": 0.012126155197620392, + "rewards//mean": 0.8162841796875, + "rewards//std": 0.01325987372547388, + "step": 3044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.609, + "grad_norm": 1.714254379272461, + "kl": 0.34840404614806175, + "learning_rate": 3.382664770291752e-07, + "loss": 0.0139, + "num_tokens": 22172130.0, + "reward": 0.81744384765625, + "reward_std": 0.01690104976296425, + "rewards//mean": 0.81744384765625, + "rewards//std": 0.024638954550027847, + "step": 3045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6092, + "grad_norm": 1.4695897102355957, + "kl": 0.35719627141952515, + "learning_rate": 3.3796623701027473e-07, + "loss": 0.0143, + "num_tokens": 22179386.0, + "reward": 0.8665771484375, + "reward_std": 0.01614203304052353, + "rewards//mean": 0.8665771484375, + "rewards//std": 0.02171250805258751, + "step": 3046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6094, + "grad_norm": 1.772377848625183, + "kl": 0.31295995227992535, + "learning_rate": 3.376660622585545e-07, + "loss": 0.0125, + "num_tokens": 22186666.0, + "reward": 0.8526611328125, + "reward_std": 0.01739485189318657, + "rewards//mean": 0.8526611328125, + "rewards//std": 0.02209126390516758, + "step": 3047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6096, + "grad_norm": 1.4793728590011597, + "kl": 0.2843288704752922, + "learning_rate": 3.373659528949251e-07, + "loss": 0.0114, + "num_tokens": 22193906.0, + "reward": 0.86700439453125, + "reward_std": 0.01580766960978508, + "rewards//mean": 0.86700439453125, + "rewards//std": 0.023234380409121513, + "step": 3048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6098, + "grad_norm": 1.52200448513031, + "kl": 0.29007086902856827, + "learning_rate": 3.370659090402703e-07, + "loss": 0.0116, + "num_tokens": 22201210.0, + "reward": 0.85382080078125, + "reward_std": 0.01540780533105135, + "rewards//mean": 0.85382080078125, + "rewards//std": 0.01970784179866314, + "step": 3049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.61, + "grad_norm": 3.13236927986145, + "kl": 0.5349430739879608, + "learning_rate": 3.36765930815448e-07, + "loss": 0.0214, + "num_tokens": 22208410.0, + "reward": 0.8724365234375, + "reward_std": 0.01972714066505432, + "rewards//mean": 0.8724365234375, + "rewards//std": 0.02561233565211296, + "step": 3050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6102, + "grad_norm": 1.4027254581451416, + "kl": 0.2810220569372177, + "learning_rate": 3.3646601834128916e-07, + "loss": 0.0112, + "num_tokens": 22215722.0, + "reward": 0.80340576171875, + "reward_std": 0.0182915311306715, + "rewards//mean": 0.80340576171875, + "rewards//std": 0.02736636996269226, + "step": 3051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6104, + "grad_norm": 1.620347261428833, + "kl": 0.4378087632358074, + "learning_rate": 3.361661717385986e-07, + "loss": 0.0175, + "num_tokens": 22223050.0, + "reward": 0.85552978515625, + "reward_std": 0.012302394956350327, + "rewards//mean": 0.85552978515625, + "rewards//std": 0.016521470621228218, + "step": 3052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6106, + "grad_norm": 1.421624779701233, + "kl": 0.305895671248436, + "learning_rate": 3.358663911281544e-07, + "loss": 0.0122, + "num_tokens": 22230258.0, + "reward": 0.872802734375, + "reward_std": 0.015071751549839973, + "rewards//mean": 0.872802734375, + "rewards//std": 0.02722279727458954, + "step": 3053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6108, + "grad_norm": 1.5686466693878174, + "kl": 0.3263969048857689, + "learning_rate": 3.3556667663070835e-07, + "loss": 0.0131, + "num_tokens": 22237530.0, + "reward": 0.82379150390625, + "reward_std": 0.013304457068443298, + "rewards//mean": 0.82379150390625, + "rewards//std": 0.01801690272986889, + "step": 3054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.611, + "grad_norm": 2.0873851776123047, + "kl": 0.46971597522497177, + "learning_rate": 3.3526702836698515e-07, + "loss": 0.0188, + "num_tokens": 22244738.0, + "reward": 0.880126953125, + "reward_std": 0.01744995266199112, + "rewards//mean": 0.880126953125, + "rewards//std": 0.01873856410384178, + "step": 3055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6112, + "grad_norm": 1.7398004531860352, + "kl": 0.3678518533706665, + "learning_rate": 3.349674464576834e-07, + "loss": 0.0147, + "num_tokens": 22251938.0, + "reward": 0.7625732421875, + "reward_std": 0.01619039475917816, + "rewards//mean": 0.7625732421875, + "rewards//std": 0.022102223709225655, + "step": 3056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.6114, + "grad_norm": 1.3417413234710693, + "kl": 0.30895385704934597, + "learning_rate": 3.3466793102347433e-07, + "loss": 0.0095, + "num_tokens": 22259184.0, + "reward": 0.854248046875, + "reward_std": 0.01261981576681137, + "rewards//mean": 0.854248046875, + "rewards//std": 0.024138763546943665, + "step": 3057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6116, + "grad_norm": 1.634548306465149, + "kl": 0.3159082680940628, + "learning_rate": 3.34368482185003e-07, + "loss": 0.0126, + "num_tokens": 22266464.0, + "reward": 0.826171875, + "reward_std": 0.02079668641090393, + "rewards//mean": 0.826171875, + "rewards//std": 0.027586262673139572, + "step": 3058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6118, + "grad_norm": 1.2872613668441772, + "kl": 0.3313405532389879, + "learning_rate": 3.3406910006288716e-07, + "loss": 0.0133, + "num_tokens": 22273696.0, + "reward": 0.8570556640625, + "reward_std": 0.01302928663790226, + "rewards//mean": 0.8570556640625, + "rewards//std": 0.022985881194472313, + "step": 3059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.612, + "grad_norm": 1.6337963342666626, + "kl": 0.3707265593111515, + "learning_rate": 3.337697847777179e-07, + "loss": 0.0148, + "num_tokens": 22280952.0, + "reward": 0.84869384765625, + "reward_std": 0.016554679721593857, + "rewards//mean": 0.84869384765625, + "rewards//std": 0.01987229846417904, + "step": 3060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.6122, + "grad_norm": 1.7843784093856812, + "kl": 0.4716687574982643, + "learning_rate": 3.3347053645005965e-07, + "loss": 0.0179, + "num_tokens": 22288164.0, + "reward": 0.85162353515625, + "reward_std": 0.02116832137107849, + "rewards//mean": 0.85162353515625, + "rewards//std": 0.02721049077808857, + "step": 3061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6124, + "grad_norm": 1.8329256772994995, + "kl": 0.3640912529081106, + "learning_rate": 3.331713552004492e-07, + "loss": 0.0146, + "num_tokens": 22295588.0, + "reward": 0.81353759765625, + "reward_std": 0.021031761541962624, + "rewards//mean": 0.81353759765625, + "rewards//std": 0.02908419258892536, + "step": 3062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.6126, + "grad_norm": 1.5270607471466064, + "kl": 0.34832598827779293, + "learning_rate": 3.3287224114939704e-07, + "loss": 0.0165, + "num_tokens": 22302847.0, + "reward": 0.8846435546875, + "reward_std": 0.015255522914230824, + "rewards//mean": 0.8846435546875, + "rewards//std": 0.02559104934334755, + "step": 3063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6128, + "grad_norm": 1.422326922416687, + "kl": 0.32069697231054306, + "learning_rate": 3.325731944173861e-07, + "loss": 0.0128, + "num_tokens": 22310079.0, + "reward": 0.858642578125, + "reward_std": 0.01677737385034561, + "rewards//mean": 0.858642578125, + "rewards//std": 0.020680183544754982, + "step": 3064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.613, + "grad_norm": 1.7135026454925537, + "kl": 0.3495893497020006, + "learning_rate": 3.3227421512487255e-07, + "loss": 0.014, + "num_tokens": 22317303.0, + "reward": 0.8328857421875, + "reward_std": 0.018902797251939774, + "rewards//mean": 0.8328857421875, + "rewards//std": 0.026018114760518074, + "step": 3065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6132, + "grad_norm": 1.7645801305770874, + "kl": 0.3843367751687765, + "learning_rate": 3.319753033922849e-07, + "loss": 0.0154, + "num_tokens": 22324607.0, + "reward": 0.84661865234375, + "reward_std": 0.011611121706664562, + "rewards//mean": 0.84661865234375, + "rewards//std": 0.018085667863488197, + "step": 3066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.6134, + "grad_norm": 1.573747992515564, + "kl": 0.36503518000245094, + "learning_rate": 3.316764593400251e-07, + "loss": 0.0154, + "num_tokens": 22331890.0, + "reward": 0.821044921875, + "reward_std": 0.014591732062399387, + "rewards//mean": 0.821044921875, + "rewards//std": 0.01923605054616928, + "step": 3067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6136, + "grad_norm": 2.0186514854431152, + "kl": 0.2693955861032009, + "learning_rate": 3.313776830884672e-07, + "loss": 0.0108, + "num_tokens": 22339178.0, + "reward": 0.86944580078125, + "reward_std": 0.012577731162309647, + "rewards//mean": 0.86944580078125, + "rewards//std": 0.02802012860774994, + "step": 3068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.6138, + "grad_norm": 2.145810127258301, + "kl": 0.32978126779198647, + "learning_rate": 3.3107897475795855e-07, + "loss": 0.0138, + "num_tokens": 22346641.0, + "reward": 0.77880859375, + "reward_std": 0.011103234253823757, + "rewards//mean": 0.77880859375, + "rewards//std": 0.020628880709409714, + "step": 3069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.614, + "grad_norm": 1.4964467287063599, + "kl": 0.31294400431215763, + "learning_rate": 3.307803344688185e-07, + "loss": 0.0125, + "num_tokens": 22353889.0, + "reward": 0.84228515625, + "reward_std": 0.01489957980811596, + "rewards//mean": 0.84228515625, + "rewards//std": 0.019054194912314415, + "step": 3070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6142, + "grad_norm": 1.6943626403808594, + "kl": 0.3781398218125105, + "learning_rate": 3.3048176234133963e-07, + "loss": 0.0151, + "num_tokens": 22361129.0, + "reward": 0.82196044921875, + "reward_std": 0.015412552282214165, + "rewards//mean": 0.82196044921875, + "rewards//std": 0.023966871201992035, + "step": 3071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6144, + "grad_norm": 3.007279396057129, + "kl": 0.5922277811914682, + "learning_rate": 3.3018325849578656e-07, + "loss": 0.0237, + "num_tokens": 22368377.0, + "reward": 0.8677978515625, + "reward_std": 0.012966310605406761, + "rewards//mean": 0.8677978515625, + "rewards//std": 0.020114298909902573, + "step": 3072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6146, + "grad_norm": 1.312986135482788, + "kl": 0.32692781277000904, + "learning_rate": 3.298848230523967e-07, + "loss": 0.0131, + "num_tokens": 22375633.0, + "reward": 0.89111328125, + "reward_std": 0.013861662708222866, + "rewards//mean": 0.89111328125, + "rewards//std": 0.01944425143301487, + "step": 3073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6148, + "grad_norm": 1.5390777587890625, + "kl": 0.30499932914972305, + "learning_rate": 3.295864561313797e-07, + "loss": 0.0122, + "num_tokens": 22382913.0, + "reward": 0.85223388671875, + "reward_std": 0.014796929433941841, + "rewards//mean": 0.85223388671875, + "rewards//std": 0.021626846864819527, + "step": 3074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.615, + "grad_norm": 1.9355518817901611, + "kl": 0.40959756076335907, + "learning_rate": 3.2928815785291786e-07, + "loss": 0.0164, + "num_tokens": 22390161.0, + "reward": 0.85064697265625, + "reward_std": 0.016905806958675385, + "rewards//mean": 0.85064697265625, + "rewards//std": 0.018594112247228622, + "step": 3075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.6152, + "grad_norm": 1.8447678089141846, + "kl": 0.3544553406536579, + "learning_rate": 3.2898992833716563e-07, + "loss": -0.0174, + "num_tokens": 22397486.0, + "reward": 0.83575439453125, + "reward_std": 0.020364969968795776, + "rewards//mean": 0.83575439453125, + "rewards//std": 0.029027404263615608, + "step": 3076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6154, + "grad_norm": 1.7436493635177612, + "kl": 0.35274479165673256, + "learning_rate": 3.2869176770424973e-07, + "loss": 0.0141, + "num_tokens": 22404742.0, + "reward": 0.80633544921875, + "reward_std": 0.013101148419082165, + "rewards//mean": 0.80633544921875, + "rewards//std": 0.023835133761167526, + "step": 3077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6156, + "grad_norm": 1.5162209272384644, + "kl": 0.37881900370121, + "learning_rate": 3.2839367607426937e-07, + "loss": 0.0152, + "num_tokens": 22412046.0, + "reward": 0.84710693359375, + "reward_std": 0.011460689827799797, + "rewards//mean": 0.84710693359375, + "rewards//std": 0.01875138469040394, + "step": 3078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.6158, + "grad_norm": 1.2915667295455933, + "kl": 0.3635083958506584, + "learning_rate": 3.2809565356729575e-07, + "loss": 0.015, + "num_tokens": 22419417.0, + "reward": 0.878662109375, + "reward_std": 0.016497325152158737, + "rewards//mean": 0.878662109375, + "rewards//std": 0.023436209186911583, + "step": 3079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.616, + "grad_norm": 1.5257506370544434, + "kl": 0.37829137966036797, + "learning_rate": 3.2779770030337235e-07, + "loss": 0.0151, + "num_tokens": 22426609.0, + "reward": 0.80340576171875, + "reward_std": 0.018032262101769447, + "rewards//mean": 0.80340576171875, + "rewards//std": 0.023458724841475487, + "step": 3080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6162, + "grad_norm": 1.9142439365386963, + "kl": 0.4329465329647064, + "learning_rate": 3.274998164025148e-07, + "loss": 0.0173, + "num_tokens": 22433921.0, + "reward": 0.85589599609375, + "reward_std": 0.020810943096876144, + "rewards//mean": 0.85589599609375, + "rewards//std": 0.024519475176930428, + "step": 3081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6164, + "grad_norm": 1.707417368888855, + "kl": 0.3906662240624428, + "learning_rate": 3.272020019847104e-07, + "loss": 0.0156, + "num_tokens": 22441209.0, + "reward": 0.8477783203125, + "reward_std": 0.01524885930120945, + "rewards//mean": 0.8477783203125, + "rewards//std": 0.017634442076086998, + "step": 3082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6166, + "grad_norm": 1.6967262029647827, + "kl": 0.35809094831347466, + "learning_rate": 3.2690425716991897e-07, + "loss": 0.0143, + "num_tokens": 22448585.0, + "reward": 0.8612060546875, + "reward_std": 0.016644522547721863, + "rewards//mean": 0.8612060546875, + "rewards//std": 0.020123327150940895, + "step": 3083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6168, + "grad_norm": 1.6525285243988037, + "kl": 0.3646965026855469, + "learning_rate": 3.26606582078072e-07, + "loss": 0.0146, + "num_tokens": 22455849.0, + "reward": 0.875732421875, + "reward_std": 0.013463289476931095, + "rewards//mean": 0.875732421875, + "rewards//std": 0.020680183544754982, + "step": 3084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.617, + "grad_norm": 1.4334410429000854, + "kl": 0.33620419912040234, + "learning_rate": 3.263089768290731e-07, + "loss": 0.0134, + "num_tokens": 22463057.0, + "reward": 0.76678466796875, + "reward_std": 0.01181553490459919, + "rewards//mean": 0.76678466796875, + "rewards//std": 0.019620083272457123, + "step": 3085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6172, + "grad_norm": 1.5048273801803589, + "kl": 0.30089518427848816, + "learning_rate": 3.260114415427975e-07, + "loss": 0.012, + "num_tokens": 22470401.0, + "reward": 0.864013671875, + "reward_std": 0.014081129804253578, + "rewards//mean": 0.864013671875, + "rewards//std": 0.023311864584684372, + "step": 3086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6174, + "grad_norm": 1.2968826293945312, + "kl": 0.2672660183161497, + "learning_rate": 3.257139763390925e-07, + "loss": 0.0107, + "num_tokens": 22477745.0, + "reward": 0.87432861328125, + "reward_std": 0.015908725559711456, + "rewards//mean": 0.87432861328125, + "rewards//std": 0.024255670607089996, + "step": 3087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6176, + "grad_norm": 1.528407335281372, + "kl": 0.40446479246020317, + "learning_rate": 3.254165813377769e-07, + "loss": 0.0162, + "num_tokens": 22484961.0, + "reward": 0.84832763671875, + "reward_std": 0.018954351544380188, + "rewards//mean": 0.84832763671875, + "rewards//std": 0.0255017951130867, + "step": 3088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6178, + "grad_norm": 2.0331737995147705, + "kl": 0.44450512528419495, + "learning_rate": 3.251192566586416e-07, + "loss": 0.0178, + "num_tokens": 22492241.0, + "reward": 0.82635498046875, + "reward_std": 0.015352720394730568, + "rewards//mean": 0.82635498046875, + "rewards//std": 0.02053980529308319, + "step": 3089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.618, + "grad_norm": 1.7928073406219482, + "kl": 0.3594125397503376, + "learning_rate": 3.2482200242144874e-07, + "loss": 0.0144, + "num_tokens": 22499545.0, + "reward": 0.8660888671875, + "reward_std": 0.017775828018784523, + "rewards//mean": 0.8660888671875, + "rewards//std": 0.03198894485831261, + "step": 3090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6182, + "grad_norm": 1.821291446685791, + "kl": 0.4050298873335123, + "learning_rate": 3.245248187459323e-07, + "loss": 0.0162, + "num_tokens": 22506961.0, + "reward": 0.81597900390625, + "reward_std": 0.015679452568292618, + "rewards//mean": 0.81597900390625, + "rewards//std": 0.025440581142902374, + "step": 3091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.6184, + "grad_norm": 1.7790590524673462, + "kl": 0.3807833530008793, + "learning_rate": 3.2422770575179793e-07, + "loss": 0.0159, + "num_tokens": 22514181.0, + "reward": 0.862548828125, + "reward_std": 0.01418247539550066, + "rewards//mean": 0.862548828125, + "rewards//std": 0.020726976916193962, + "step": 3092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6186, + "grad_norm": 1.3703858852386475, + "kl": 0.31190432980656624, + "learning_rate": 3.239306635587226e-07, + "loss": 0.0125, + "num_tokens": 22521429.0, + "reward": 0.83685302734375, + "reward_std": 0.01396610401570797, + "rewards//mean": 0.83685302734375, + "rewards//std": 0.023080766201019287, + "step": 3093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6188, + "grad_norm": 1.2478046417236328, + "kl": 0.3241423200815916, + "learning_rate": 3.2363369228635504e-07, + "loss": 0.013, + "num_tokens": 22528629.0, + "reward": 0.8382568359375, + "reward_std": 0.009911554865539074, + "rewards//mean": 0.8382568359375, + "rewards//std": 0.019669879227876663, + "step": 3094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.619, + "grad_norm": 1.5343419313430786, + "kl": 0.37444423511624336, + "learning_rate": 3.233367920543151e-07, + "loss": 0.015, + "num_tokens": 22536069.0, + "reward": 0.83782958984375, + "reward_std": 0.02212078869342804, + "rewards//mean": 0.83782958984375, + "rewards//std": 0.039358947426080704, + "step": 3095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6192, + "grad_norm": 1.7717018127441406, + "kl": 0.28286999836564064, + "learning_rate": 3.2303996298219413e-07, + "loss": 0.0113, + "num_tokens": 22543317.0, + "reward": 0.86328125, + "reward_std": 0.012435798533260822, + "rewards//mean": 0.86328125, + "rewards//std": 0.015916794538497925, + "step": 3096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6194, + "grad_norm": 1.3392540216445923, + "kl": 0.35743037052452564, + "learning_rate": 3.2274320518955493e-07, + "loss": 0.0143, + "num_tokens": 22550565.0, + "reward": 0.790771484375, + "reward_std": 0.016917463392019272, + "rewards//mean": 0.790771484375, + "rewards//std": 0.02394736371934414, + "step": 3097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6196, + "grad_norm": 1.4931399822235107, + "kl": 0.2923269420862198, + "learning_rate": 3.2244651879593156e-07, + "loss": 0.0117, + "num_tokens": 22557861.0, + "reward": 0.8681640625, + "reward_std": 0.017024565488100052, + "rewards//mean": 0.8681640625, + "rewards//std": 0.02520032413303852, + "step": 3098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6198, + "grad_norm": 1.3382201194763184, + "kl": 0.30181629583239555, + "learning_rate": 3.221499039208291e-07, + "loss": 0.0121, + "num_tokens": 22565117.0, + "reward": 0.88787841796875, + "reward_std": 0.01920054852962494, + "rewards//mean": 0.88787841796875, + "rewards//std": 0.0257957112044096, + "step": 3099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.62, + "grad_norm": 1.6283109188079834, + "kl": 0.3326738439500332, + "learning_rate": 3.2185336068372415e-07, + "loss": 0.0133, + "num_tokens": 22572485.0, + "reward": 0.864013671875, + "reward_std": 0.020035576075315475, + "rewards//mean": 0.864013671875, + "rewards//std": 0.028424086049199104, + "step": 3100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6202, + "grad_norm": 1.7307711839675903, + "kl": 0.281473970040679, + "learning_rate": 3.215568892040641e-07, + "loss": 0.0113, + "num_tokens": 22579869.0, + "reward": 0.85296630859375, + "reward_std": 0.010605362243950367, + "rewards//mean": 0.85296630859375, + "rewards//std": 0.014303747564554214, + "step": 3101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.6204, + "grad_norm": 1.7639178037643433, + "kl": 0.42637334391474724, + "learning_rate": 3.2126048960126785e-07, + "loss": 0.0155, + "num_tokens": 22587059.0, + "reward": 0.82012939453125, + "reward_std": 0.013591835275292397, + "rewards//mean": 0.82012939453125, + "rewards//std": 0.022254686802625656, + "step": 3102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6206, + "grad_norm": 1.7594624757766724, + "kl": 0.45530661568045616, + "learning_rate": 3.2096416199472494e-07, + "loss": 0.0182, + "num_tokens": 22594355.0, + "reward": 0.83270263671875, + "reward_std": 0.014015795662999153, + "rewards//mean": 0.83270263671875, + "rewards//std": 0.016913551837205887, + "step": 3103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6208, + "grad_norm": 1.4459807872772217, + "kl": 0.3585219793021679, + "learning_rate": 3.2066790650379624e-07, + "loss": 0.0143, + "num_tokens": 22601643.0, + "reward": 0.84039306640625, + "reward_std": 0.01672377437353134, + "rewards//mean": 0.84039306640625, + "rewards//std": 0.02542867884039879, + "step": 3104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.621, + "grad_norm": 1.5053757429122925, + "kl": 0.3139904960989952, + "learning_rate": 3.2037172324781326e-07, + "loss": 0.0126, + "num_tokens": 22608907.0, + "reward": 0.777099609375, + "reward_std": 0.015205681324005127, + "rewards//mean": 0.777099609375, + "rewards//std": 0.021864313632249832, + "step": 3105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.6212, + "grad_norm": 1.3494985103607178, + "kl": 0.40479090064764023, + "learning_rate": 3.2007561234607877e-07, + "loss": 0.0189, + "num_tokens": 22616214.0, + "reward": 0.84832763671875, + "reward_std": 0.013074111193418503, + "rewards//mean": 0.84832763671875, + "rewards//std": 0.025425702333450317, + "step": 3106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6214, + "grad_norm": 1.6217941045761108, + "kl": 0.31110082380473614, + "learning_rate": 3.1977957391786614e-07, + "loss": 0.0124, + "num_tokens": 22623494.0, + "reward": 0.86126708984375, + "reward_std": 0.020085632801055908, + "rewards//mean": 0.86126708984375, + "rewards//std": 0.025939086452126503, + "step": 3107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6216, + "grad_norm": 1.3838361501693726, + "kl": 0.29674101434648037, + "learning_rate": 3.1948360808241944e-07, + "loss": 0.0119, + "num_tokens": 22630742.0, + "reward": 0.85052490234375, + "reward_std": 0.014642378315329552, + "rewards//mean": 0.85052490234375, + "rewards//std": 0.02081288769841194, + "step": 3108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6218, + "grad_norm": 1.607269048690796, + "kl": 0.33622901514172554, + "learning_rate": 3.191877149589539e-07, + "loss": 0.0134, + "num_tokens": 22638038.0, + "reward": 0.839599609375, + "reward_std": 0.014852971769869328, + "rewards//mean": 0.839599609375, + "rewards//std": 0.01640942320227623, + "step": 3109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.622, + "grad_norm": 1.3786283731460571, + "kl": 0.38516442477703094, + "learning_rate": 3.188918946666551e-07, + "loss": 0.0154, + "num_tokens": 22645374.0, + "reward": 0.78973388671875, + "reward_std": 0.010813255794346333, + "rewards//mean": 0.78973388671875, + "rewards//std": 0.012343913316726685, + "step": 3110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6222, + "grad_norm": 1.323130488395691, + "kl": 0.3066776227205992, + "learning_rate": 3.1859614732467954e-07, + "loss": 0.0123, + "num_tokens": 22652670.0, + "reward": 0.85235595703125, + "reward_std": 0.013505296781659126, + "rewards//mean": 0.85235595703125, + "rewards//std": 0.015424992889165878, + "step": 3111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.6224, + "grad_norm": 1.5225157737731934, + "kl": 0.4053703173995018, + "learning_rate": 3.1830047305215415e-07, + "loss": 0.0014, + "num_tokens": 22659939.0, + "reward": 0.85101318359375, + "reward_std": 0.014330266043543816, + "rewards//mean": 0.85101318359375, + "rewards//std": 0.0184781476855278, + "step": 3112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6226, + "grad_norm": 1.6676939725875854, + "kl": 0.33058081939816475, + "learning_rate": 3.1800487196817645e-07, + "loss": 0.0132, + "num_tokens": 22667243.0, + "reward": 0.83673095703125, + "reward_std": 0.0179214458912611, + "rewards//mean": 0.83673095703125, + "rewards//std": 0.01937786675989628, + "step": 3113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6228, + "grad_norm": 2.257286787033081, + "kl": 0.42098929174244404, + "learning_rate": 3.177093441918145e-07, + "loss": 0.0168, + "num_tokens": 22674547.0, + "reward": 0.82806396484375, + "reward_std": 0.0195939764380455, + "rewards//mean": 0.82806396484375, + "rewards//std": 0.028565332293510437, + "step": 3114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.623, + "grad_norm": 1.57742178440094, + "kl": 0.3404693640768528, + "learning_rate": 3.1741388984210703e-07, + "loss": 0.0136, + "num_tokens": 22682043.0, + "reward": 0.80316162109375, + "reward_std": 0.014517908915877342, + "rewards//mean": 0.80316162109375, + "rewards//std": 0.021560249850153923, + "step": 3115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.6232, + "grad_norm": 1.6445626020431519, + "kl": 0.3142421245574951, + "learning_rate": 3.1711850903806276e-07, + "loss": -0.0124, + "num_tokens": 22689385.0, + "reward": 0.85113525390625, + "reward_std": 0.0202502254396677, + "rewards//mean": 0.85113525390625, + "rewards//std": 0.02399023063480854, + "step": 3116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6234, + "grad_norm": 1.1780983209609985, + "kl": 0.2935120966285467, + "learning_rate": 3.168232018986613e-07, + "loss": 0.0117, + "num_tokens": 22696777.0, + "reward": 0.8681640625, + "reward_std": 0.016113419085741043, + "rewards//mean": 0.8681640625, + "rewards//std": 0.026199301704764366, + "step": 3117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6236, + "grad_norm": 1.4381086826324463, + "kl": 0.37381079606711864, + "learning_rate": 3.165279685428521e-07, + "loss": 0.015, + "num_tokens": 22704009.0, + "reward": 0.80255126953125, + "reward_std": 0.012404076755046844, + "rewards//mean": 0.80255126953125, + "rewards//std": 0.01623215340077877, + "step": 3118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6238, + "grad_norm": 2.0590176582336426, + "kl": 0.3233977500349283, + "learning_rate": 3.1623280908955536e-07, + "loss": 0.0129, + "num_tokens": 22711361.0, + "reward": 0.86761474609375, + "reward_std": 0.02443903498351574, + "rewards//mean": 0.86761474609375, + "rewards//std": 0.03162073343992233, + "step": 3119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.624, + "grad_norm": 1.7992092370986938, + "kl": 0.34338131733238697, + "learning_rate": 3.15937723657661e-07, + "loss": 0.0137, + "num_tokens": 22718593.0, + "reward": 0.8359375, + "reward_std": 0.017430659383535385, + "rewards//mean": 0.8359375, + "rewards//std": 0.023114927113056183, + "step": 3120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6242, + "grad_norm": 1.5112687349319458, + "kl": 0.24965571612119675, + "learning_rate": 3.156427123660297e-07, + "loss": 0.01, + "num_tokens": 22725841.0, + "reward": 0.85760498046875, + "reward_std": 0.01595635712146759, + "rewards//mean": 0.85760498046875, + "rewards//std": 0.022956473752856255, + "step": 3121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6244, + "grad_norm": 1.4342536926269531, + "kl": 0.33566318452358246, + "learning_rate": 3.1534777533349175e-07, + "loss": 0.0134, + "num_tokens": 22733105.0, + "reward": 0.81634521484375, + "reward_std": 0.011946736834943295, + "rewards//mean": 0.81634521484375, + "rewards//std": 0.01899043284356594, + "step": 3122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6246, + "grad_norm": 1.549133062362671, + "kl": 0.40487901866436005, + "learning_rate": 3.150529126788477e-07, + "loss": 0.0162, + "num_tokens": 22740417.0, + "reward": 0.83251953125, + "reward_std": 0.011340814642608166, + "rewards//mean": 0.83251953125, + "rewards//std": 0.018235808238387108, + "step": 3123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6248, + "grad_norm": 1.5583606958389282, + "kl": 0.3517281450331211, + "learning_rate": 3.147581245208685e-07, + "loss": 0.0141, + "num_tokens": 22747649.0, + "reward": 0.83953857421875, + "reward_std": 0.016519978642463684, + "rewards//mean": 0.83953857421875, + "rewards//std": 0.019958190619945526, + "step": 3124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.625, + "grad_norm": 1.5583714246749878, + "kl": 0.37058785557746887, + "learning_rate": 3.144634109782944e-07, + "loss": 0.0148, + "num_tokens": 22754945.0, + "reward": 0.780517578125, + "reward_std": 0.014314139261841774, + "rewards//mean": 0.780517578125, + "rewards//std": 0.017536740750074387, + "step": 3125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6252, + "grad_norm": 1.7651355266571045, + "kl": 0.38273508474230766, + "learning_rate": 3.141687721698363e-07, + "loss": 0.0153, + "num_tokens": 22762153.0, + "reward": 0.8291015625, + "reward_std": 0.021486781537532806, + "rewards//mean": 0.8291015625, + "rewards//std": 0.027418937534093857, + "step": 3126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.6254, + "grad_norm": 1.407523512840271, + "kl": 0.3832579143345356, + "learning_rate": 3.138742082141744e-07, + "loss": -0.0083, + "num_tokens": 22769445.0, + "reward": 0.81536865234375, + "reward_std": 0.01543636154383421, + "rewards//mean": 0.81536865234375, + "rewards//std": 0.02312793955206871, + "step": 3127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6256, + "grad_norm": 1.6025193929672241, + "kl": 0.2924391068518162, + "learning_rate": 3.1357971922995935e-07, + "loss": 0.0117, + "num_tokens": 22776637.0, + "reward": 0.84576416015625, + "reward_std": 0.017705902457237244, + "rewards//mean": 0.84576416015625, + "rewards//std": 0.029446274042129517, + "step": 3128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.6258, + "grad_norm": 1.961737871170044, + "kl": 0.4077335558831692, + "learning_rate": 3.13285305335811e-07, + "loss": -0.0016, + "num_tokens": 22783981.0, + "reward": 0.84173583984375, + "reward_std": 0.023548107594251633, + "rewards//mean": 0.84173583984375, + "rewards//std": 0.028271295130252838, + "step": 3129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.626, + "grad_norm": 1.3897284269332886, + "kl": 0.35886807180941105, + "learning_rate": 3.129909666503194e-07, + "loss": 0.0144, + "num_tokens": 22791229.0, + "reward": 0.86297607421875, + "reward_std": 0.01593632623553276, + "rewards//mean": 0.86297607421875, + "rewards//std": 0.027518615126609802, + "step": 3130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6262, + "grad_norm": 1.787519931793213, + "kl": 0.3351271264255047, + "learning_rate": 3.1269670329204393e-07, + "loss": 0.0134, + "num_tokens": 22798509.0, + "reward": 0.818603515625, + "reward_std": 0.013312483206391335, + "rewards//mean": 0.818603515625, + "rewards//std": 0.022237760946154594, + "step": 3131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6264, + "grad_norm": 1.4779441356658936, + "kl": 0.3678131904453039, + "learning_rate": 3.124025153795141e-07, + "loss": 0.0147, + "num_tokens": 22805797.0, + "reward": 0.836181640625, + "reward_std": 0.018674425780773163, + "rewards//mean": 0.836181640625, + "rewards//std": 0.0237544197589159, + "step": 3132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6266, + "grad_norm": 1.707353115081787, + "kl": 0.4009977765381336, + "learning_rate": 3.121084030312286e-07, + "loss": 0.016, + "num_tokens": 22813221.0, + "reward": 0.83465576171875, + "reward_std": 0.016101829707622528, + "rewards//mean": 0.83465576171875, + "rewards//std": 0.022174278274178505, + "step": 3133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6268, + "grad_norm": 1.8379064798355103, + "kl": 0.3880986236035824, + "learning_rate": 3.1181436636565596e-07, + "loss": 0.0155, + "num_tokens": 22820517.0, + "reward": 0.85443115234375, + "reward_std": 0.02194252610206604, + "rewards//mean": 0.85443115234375, + "rewards//std": 0.029117483645677567, + "step": 3134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.627, + "grad_norm": 1.4253939390182495, + "kl": 0.3302721846848726, + "learning_rate": 3.1152040550123393e-07, + "loss": 0.0132, + "num_tokens": 22827733.0, + "reward": 0.869873046875, + "reward_std": 0.017472457140684128, + "rewards//mean": 0.869873046875, + "rewards//std": 0.023113617673516273, + "step": 3135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6272, + "grad_norm": 1.6395606994628906, + "kl": 0.3479885905981064, + "learning_rate": 3.112265205563701e-07, + "loss": 0.0139, + "num_tokens": 22835013.0, + "reward": 0.85614013671875, + "reward_std": 0.018575124442577362, + "rewards//mean": 0.85614013671875, + "rewards//std": 0.02234293892979622, + "step": 3136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.6274, + "grad_norm": 1.8772224187850952, + "kl": 0.3560584895312786, + "learning_rate": 3.109327116494411e-07, + "loss": 0.0148, + "num_tokens": 22842353.0, + "reward": 0.81195068359375, + "reward_std": 0.015624482184648514, + "rewards//mean": 0.81195068359375, + "rewards//std": 0.017173785716295242, + "step": 3137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6276, + "grad_norm": 1.2588249444961548, + "kl": 0.30923967994749546, + "learning_rate": 3.106389788987934e-07, + "loss": 0.0124, + "num_tokens": 22849697.0, + "reward": 0.85894775390625, + "reward_std": 0.011540912091732025, + "rewards//mean": 0.85894775390625, + "rewards//std": 0.020352492108941078, + "step": 3138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6278, + "grad_norm": 1.3913357257843018, + "kl": 0.3067501373589039, + "learning_rate": 3.103453224227424e-07, + "loss": 0.0123, + "num_tokens": 22856937.0, + "reward": 0.88409423828125, + "reward_std": 0.01906052976846695, + "rewards//mean": 0.88409423828125, + "rewards//std": 0.030017023906111717, + "step": 3139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.628, + "grad_norm": 1.4202970266342163, + "kl": 0.3376442827284336, + "learning_rate": 3.1005174233957267e-07, + "loss": 0.0135, + "num_tokens": 22864329.0, + "reward": 0.870361328125, + "reward_std": 0.019834628328680992, + "rewards//mean": 0.870361328125, + "rewards//std": 0.028636319562792778, + "step": 3140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6282, + "grad_norm": 2.2007744312286377, + "kl": 0.3021048679947853, + "learning_rate": 3.097582387675385e-07, + "loss": 0.0121, + "num_tokens": 22871593.0, + "reward": 0.847900390625, + "reward_std": 0.012758981436491013, + "rewards//mean": 0.847900390625, + "rewards//std": 0.016096480190753937, + "step": 3141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6284, + "grad_norm": 1.4824976921081543, + "kl": 0.38184410333633423, + "learning_rate": 3.0946481182486297e-07, + "loss": 0.0153, + "num_tokens": 22878905.0, + "reward": 0.8316650390625, + "reward_std": 0.01637118123471737, + "rewards//mean": 0.8316650390625, + "rewards//std": 0.022816665470600128, + "step": 3142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.6286, + "grad_norm": 1.8042196035385132, + "kl": 0.3848433382809162, + "learning_rate": 3.0917146162973846e-07, + "loss": 0.0165, + "num_tokens": 22886120.0, + "reward": 0.82440185546875, + "reward_std": 0.016795866191387177, + "rewards//mean": 0.82440185546875, + "rewards//std": 0.0234522707760334, + "step": 3143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6288, + "grad_norm": 1.6377575397491455, + "kl": 0.3457224480807781, + "learning_rate": 3.088781883003263e-07, + "loss": 0.0138, + "num_tokens": 22893480.0, + "reward": 0.87298583984375, + "reward_std": 0.018683183938264847, + "rewards//mean": 0.87298583984375, + "rewards//std": 0.027778679504990578, + "step": 3144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.629, + "grad_norm": 1.9297622442245483, + "kl": 0.3169064540416002, + "learning_rate": 3.085849919547572e-07, + "loss": 0.0127, + "num_tokens": 22900832.0, + "reward": 0.86236572265625, + "reward_std": 0.01918129250407219, + "rewards//mean": 0.86236572265625, + "rewards//std": 0.027854865416884422, + "step": 3145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6292, + "grad_norm": 1.2856322526931763, + "kl": 0.3214341979473829, + "learning_rate": 3.0829187271113035e-07, + "loss": 0.0129, + "num_tokens": 22908184.0, + "reward": 0.84576416015625, + "reward_std": 0.015068748965859413, + "rewards//mean": 0.84576416015625, + "rewards//std": 0.022358516231179237, + "step": 3146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6294, + "grad_norm": 1.4850983619689941, + "kl": 0.2831522636115551, + "learning_rate": 3.079988306875143e-07, + "loss": 0.0113, + "num_tokens": 22915432.0, + "reward": 0.8411865234375, + "reward_std": 0.015627741813659668, + "rewards//mean": 0.8411865234375, + "rewards//std": 0.023419085890054703, + "step": 3147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6296, + "grad_norm": 1.4157795906066895, + "kl": 0.31068912521004677, + "learning_rate": 3.0770586600194614e-07, + "loss": 0.0124, + "num_tokens": 22922696.0, + "reward": 0.83660888671875, + "reward_std": 0.009321187622845173, + "rewards//mean": 0.83660888671875, + "rewards//std": 0.018183333799242973, + "step": 3148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.6298, + "grad_norm": 1.4907511472702026, + "kl": 0.34321579337120056, + "learning_rate": 3.0741297877243235e-07, + "loss": 0.0125, + "num_tokens": 22929906.0, + "reward": 0.83135986328125, + "reward_std": 0.01703726127743721, + "rewards//mean": 0.83135986328125, + "rewards//std": 0.023638714104890823, + "step": 3149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.63, + "grad_norm": 2.6384639739990234, + "kl": 0.5136867612600327, + "learning_rate": 3.0712016911694755e-07, + "loss": 0.0205, + "num_tokens": 22937170.0, + "reward": 0.877685546875, + "reward_std": 0.014074728824198246, + "rewards//mean": 0.877685546875, + "rewards//std": 0.019473811611533165, + "step": 3150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.6302, + "grad_norm": 2.063405990600586, + "kl": 0.40555898658931255, + "learning_rate": 3.068274371534356e-07, + "loss": 0.0192, + "num_tokens": 22944436.0, + "reward": 0.82659912109375, + "reward_std": 0.01224506925791502, + "rewards//mean": 0.82659912109375, + "rewards//std": 0.019182365387678146, + "step": 3151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.6304, + "grad_norm": 1.371252417564392, + "kl": 0.33558294363319874, + "learning_rate": 3.065347829998089e-07, + "loss": 0.0114, + "num_tokens": 22951753.0, + "reward": 0.865966796875, + "reward_std": 0.02071446180343628, + "rewards//mean": 0.865966796875, + "rewards//std": 0.03124128095805645, + "step": 3152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6306, + "grad_norm": 1.7529140710830688, + "kl": 0.3163985498249531, + "learning_rate": 3.0624220677394854e-07, + "loss": 0.0127, + "num_tokens": 22959025.0, + "reward": 0.8563232421875, + "reward_std": 0.017983827739953995, + "rewards//mean": 0.8563232421875, + "rewards//std": 0.027765393257141113, + "step": 3153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6308, + "grad_norm": 1.6905866861343384, + "kl": 0.3572031054645777, + "learning_rate": 3.0594970859370404e-07, + "loss": 0.0143, + "num_tokens": 22966241.0, + "reward": 0.7872314453125, + "reward_std": 0.01469154842197895, + "rewards//mean": 0.7872314453125, + "rewards//std": 0.019697565585374832, + "step": 3154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.631, + "grad_norm": 1.5905507802963257, + "kl": 0.2933832313865423, + "learning_rate": 3.0565728857689366e-07, + "loss": 0.0117, + "num_tokens": 22973553.0, + "reward": 0.8817138671875, + "reward_std": 0.01398781780153513, + "rewards//mean": 0.8817138671875, + "rewards//std": 0.021704141050577164, + "step": 3155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6312, + "grad_norm": 1.5047317743301392, + "kl": 0.3611921928822994, + "learning_rate": 3.053649468413043e-07, + "loss": 0.0144, + "num_tokens": 22980809.0, + "reward": 0.80987548828125, + "reward_std": 0.009424582123756409, + "rewards//mean": 0.80987548828125, + "rewards//std": 0.01928860694169998, + "step": 3156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6314, + "grad_norm": 1.9049789905548096, + "kl": 0.33036781661212444, + "learning_rate": 3.0507268350469093e-07, + "loss": 0.0132, + "num_tokens": 22988049.0, + "reward": 0.85894775390625, + "reward_std": 0.02343941107392311, + "rewards//mean": 0.85894775390625, + "rewards//std": 0.026061350479722023, + "step": 3157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.6316, + "grad_norm": 1.6242278814315796, + "kl": 0.37002889439463615, + "learning_rate": 3.0478049868477745e-07, + "loss": 0.0064, + "num_tokens": 22995297.0, + "reward": 0.83392333984375, + "reward_std": 0.012737032026052475, + "rewards//mean": 0.83392333984375, + "rewards//std": 0.016307517886161804, + "step": 3158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6318, + "grad_norm": 1.7234224081039429, + "kl": 0.32780937664210796, + "learning_rate": 3.0448839249925566e-07, + "loss": 0.0131, + "num_tokens": 23002545.0, + "reward": 0.88775634765625, + "reward_std": 0.01758638583123684, + "rewards//mean": 0.88775634765625, + "rewards//std": 0.02835897170007229, + "step": 3159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.632, + "grad_norm": 1.4150937795639038, + "kl": 0.3516644537448883, + "learning_rate": 3.0419636506578617e-07, + "loss": 0.0166, + "num_tokens": 23009790.0, + "reward": 0.7603759765625, + "reward_std": 0.013475606217980385, + "rewards//mean": 0.7603759765625, + "rewards//std": 0.024589523673057556, + "step": 3160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6322, + "grad_norm": 2.1967263221740723, + "kl": 0.3484107032418251, + "learning_rate": 3.039044165019972e-07, + "loss": 0.0139, + "num_tokens": 23017038.0, + "reward": 0.82745361328125, + "reward_std": 0.015022937208414078, + "rewards//mean": 0.82745361328125, + "rewards//std": 0.023577159270644188, + "step": 3161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.6324, + "grad_norm": 1.4919676780700684, + "kl": 0.38224294409155846, + "learning_rate": 3.03612546925486e-07, + "loss": 0.0059, + "num_tokens": 23024240.0, + "reward": 0.84295654296875, + "reward_std": 0.018340451642870903, + "rewards//mean": 0.84295654296875, + "rewards//std": 0.02693033404648304, + "step": 3162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6326, + "grad_norm": 1.5574548244476318, + "kl": 0.3634252995252609, + "learning_rate": 3.0332075645381726e-07, + "loss": 0.0145, + "num_tokens": 23031544.0, + "reward": 0.8875732421875, + "reward_std": 0.016011685132980347, + "rewards//mean": 0.8875732421875, + "rewards//std": 0.02706748992204666, + "step": 3163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6328, + "grad_norm": 1.408185362815857, + "kl": 0.2959322798997164, + "learning_rate": 3.0302904520452443e-07, + "loss": 0.0118, + "num_tokens": 23038800.0, + "reward": 0.88604736328125, + "reward_std": 0.014246379025280476, + "rewards//mean": 0.88604736328125, + "rewards//std": 0.026717012748122215, + "step": 3164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.633, + "grad_norm": 1.5759481191635132, + "kl": 0.3804610036313534, + "learning_rate": 3.027374132951085e-07, + "loss": 0.0166, + "num_tokens": 23046163.0, + "reward": 0.85992431640625, + "reward_std": 0.014826002530753613, + "rewards//mean": 0.85992431640625, + "rewards//std": 0.023631669580936432, + "step": 3165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6332, + "grad_norm": 1.7958446741104126, + "kl": 0.37891457602381706, + "learning_rate": 3.02445860843039e-07, + "loss": 0.0152, + "num_tokens": 23053411.0, + "reward": 0.8436279296875, + "reward_std": 0.019484881311655045, + "rewards//mean": 0.8436279296875, + "rewards//std": 0.025873422622680664, + "step": 3166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6334, + "grad_norm": 1.5441462993621826, + "kl": 0.3588989991694689, + "learning_rate": 3.0215438796575327e-07, + "loss": 0.0144, + "num_tokens": 23060731.0, + "reward": 0.867919921875, + "reward_std": 0.015474203042685986, + "rewards//mean": 0.867919921875, + "rewards//std": 0.02566576935350895, + "step": 3167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6336, + "grad_norm": 1.7702486515045166, + "kl": 0.34698519855737686, + "learning_rate": 3.018629947806563e-07, + "loss": 0.0139, + "num_tokens": 23068083.0, + "reward": 0.837646484375, + "reward_std": 0.015625936910510063, + "rewards//mean": 0.837646484375, + "rewards//std": 0.021143468096852303, + "step": 3168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6338, + "grad_norm": 1.4720838069915771, + "kl": 0.3465990349650383, + "learning_rate": 3.015716814051212e-07, + "loss": 0.0139, + "num_tokens": 23075387.0, + "reward": 0.78564453125, + "reward_std": 0.013341475278139114, + "rewards//mean": 0.78564453125, + "rewards//std": 0.030125316232442856, + "step": 3169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.634, + "grad_norm": 1.4340590238571167, + "kl": 0.366794902831316, + "learning_rate": 3.0128044795648923e-07, + "loss": -0.0023, + "num_tokens": 23082605.0, + "reward": 0.866455078125, + "reward_std": 0.013935955241322517, + "rewards//mean": 0.866455078125, + "rewards//std": 0.023446541279554367, + "step": 3170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6342, + "grad_norm": 1.4519299268722534, + "kl": 0.31185861118137836, + "learning_rate": 3.00989294552069e-07, + "loss": 0.0125, + "num_tokens": 23089917.0, + "reward": 0.85601806640625, + "reward_std": 0.012614225968718529, + "rewards//mean": 0.85601806640625, + "rewards//std": 0.01956058293581009, + "step": 3171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6344, + "grad_norm": 1.566766381263733, + "kl": 0.3471913728863001, + "learning_rate": 3.0069822130913716e-07, + "loss": 0.0139, + "num_tokens": 23097229.0, + "reward": 0.87823486328125, + "reward_std": 0.020885266363620758, + "rewards//mean": 0.87823486328125, + "rewards//std": 0.025444746017456055, + "step": 3172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6346, + "grad_norm": 1.3770325183868408, + "kl": 0.37868792563676834, + "learning_rate": 3.004072283449379e-07, + "loss": 0.0151, + "num_tokens": 23104549.0, + "reward": 0.83062744140625, + "reward_std": 0.013950597494840622, + "rewards//mean": 0.83062744140625, + "rewards//std": 0.021591119468212128, + "step": 3173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6348, + "grad_norm": 1.648139476776123, + "kl": 0.32961129397153854, + "learning_rate": 3.0011631577668325e-07, + "loss": 0.0132, + "num_tokens": 23111837.0, + "reward": 0.82904052734375, + "reward_std": 0.015440746210515499, + "rewards//mean": 0.82904052734375, + "rewards//std": 0.01873684860765934, + "step": 3174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.635, + "grad_norm": 1.5842496156692505, + "kl": 0.356418177485466, + "learning_rate": 2.9982548372155256e-07, + "loss": 0.0143, + "num_tokens": 23119061.0, + "reward": 0.84527587890625, + "reward_std": 0.015407810918986797, + "rewards//mean": 0.84527587890625, + "rewards//std": 0.018263908103108406, + "step": 3175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.6352, + "grad_norm": 1.3251134157180786, + "kl": 0.3124542199075222, + "learning_rate": 2.9953473229669324e-07, + "loss": 0.0095, + "num_tokens": 23126397.0, + "reward": 0.77252197265625, + "reward_std": 0.009838620200753212, + "rewards//mean": 0.77252197265625, + "rewards//std": 0.015402404591441154, + "step": 3176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6354, + "grad_norm": 1.5392932891845703, + "kl": 0.3358469642698765, + "learning_rate": 2.9924406161921966e-07, + "loss": 0.0134, + "num_tokens": 23133685.0, + "reward": 0.8609619140625, + "reward_std": 0.021017972379922867, + "rewards//mean": 0.8609619140625, + "rewards//std": 0.0321870818734169, + "step": 3177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6356, + "grad_norm": 1.417223572731018, + "kl": 0.37468111515045166, + "learning_rate": 2.989534718062142e-07, + "loss": 0.015, + "num_tokens": 23140917.0, + "reward": 0.81365966796875, + "reward_std": 0.018034441396594048, + "rewards//mean": 0.81365966796875, + "rewards//std": 0.021928520873188972, + "step": 3178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6358, + "grad_norm": 1.4577959775924683, + "kl": 0.3490968719124794, + "learning_rate": 2.9866296297472613e-07, + "loss": 0.014, + "num_tokens": 23148253.0, + "reward": 0.87176513671875, + "reward_std": 0.016011210158467293, + "rewards//mean": 0.87176513671875, + "rewards//std": 0.01902944967150688, + "step": 3179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.636, + "grad_norm": 1.5827510356903076, + "kl": 0.32549019157886505, + "learning_rate": 2.9837253524177256e-07, + "loss": 0.009, + "num_tokens": 23155572.0, + "reward": 0.85247802734375, + "reward_std": 0.018269143998622894, + "rewards//mean": 0.85247802734375, + "rewards//std": 0.022912250831723213, + "step": 3180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6362, + "grad_norm": 1.9909205436706543, + "kl": 0.4386014863848686, + "learning_rate": 2.9808218872433766e-07, + "loss": 0.0175, + "num_tokens": 23162868.0, + "reward": 0.84857177734375, + "reward_std": 0.01695546694099903, + "rewards//mean": 0.84857177734375, + "rewards//std": 0.021829580888152122, + "step": 3181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6364, + "grad_norm": 1.3751627206802368, + "kl": 0.3071772065013647, + "learning_rate": 2.97791923539373e-07, + "loss": 0.0123, + "num_tokens": 23170132.0, + "reward": 0.87017822265625, + "reward_std": 0.017861083149909973, + "rewards//mean": 0.87017822265625, + "rewards//std": 0.023300092667341232, + "step": 3182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6366, + "grad_norm": 1.8414865732192993, + "kl": 0.3515696842223406, + "learning_rate": 2.9750173980379733e-07, + "loss": 0.0141, + "num_tokens": 23177420.0, + "reward": 0.84765625, + "reward_std": 0.019786875694990158, + "rewards//mean": 0.84765625, + "rewards//std": 0.02755112014710903, + "step": 3183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6368, + "grad_norm": 1.8939553499221802, + "kl": 0.4000442326068878, + "learning_rate": 2.9721163763449677e-07, + "loss": 0.016, + "num_tokens": 23184708.0, + "reward": 0.77593994140625, + "reward_std": 0.01367166917771101, + "rewards//mean": 0.77593994140625, + "rewards//std": 0.019861631095409393, + "step": 3184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.637, + "grad_norm": 1.4013423919677734, + "kl": 0.37704984098672867, + "learning_rate": 2.969216171483242e-07, + "loss": 0.0151, + "num_tokens": 23191948.0, + "reward": 0.82696533203125, + "reward_std": 0.013924174010753632, + "rewards//mean": 0.82696533203125, + "rewards//std": 0.01661193184554577, + "step": 3185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6372, + "grad_norm": 1.3562370538711548, + "kl": 0.2837059684097767, + "learning_rate": 2.9663167846209996e-07, + "loss": 0.0113, + "num_tokens": 23199164.0, + "reward": 0.861328125, + "reward_std": 0.014143168926239014, + "rewards//mean": 0.861328125, + "rewards//std": 0.018414240330457687, + "step": 3186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6374, + "grad_norm": 1.5289700031280518, + "kl": 0.3525548353791237, + "learning_rate": 2.9634182169261133e-07, + "loss": 0.0141, + "num_tokens": 23206428.0, + "reward": 0.850830078125, + "reward_std": 0.012524724006652832, + "rewards//mean": 0.850830078125, + "rewards//std": 0.01786513440310955, + "step": 3187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6376, + "grad_norm": 1.5391966104507446, + "kl": 0.33661325462162495, + "learning_rate": 2.9605204695661256e-07, + "loss": 0.0135, + "num_tokens": 23213724.0, + "reward": 0.85589599609375, + "reward_std": 0.01265730895102024, + "rewards//mean": 0.85589599609375, + "rewards//std": 0.019773023203015327, + "step": 3188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6378, + "grad_norm": 1.7820857763290405, + "kl": 0.40656666830182076, + "learning_rate": 2.9576235437082495e-07, + "loss": 0.0163, + "num_tokens": 23220948.0, + "reward": 0.8394775390625, + "reward_std": 0.012953723780810833, + "rewards//mean": 0.8394775390625, + "rewards//std": 0.019817089661955833, + "step": 3189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.638, + "grad_norm": 1.6550642251968384, + "kl": 0.3629530742764473, + "learning_rate": 2.9547274405193645e-07, + "loss": 0.0145, + "num_tokens": 23228244.0, + "reward": 0.79571533203125, + "reward_std": 0.010437123477458954, + "rewards//mean": 0.79571533203125, + "rewards//std": 0.027548305690288544, + "step": 3190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6382, + "grad_norm": 1.4378700256347656, + "kl": 0.3125884626060724, + "learning_rate": 2.9518321611660234e-07, + "loss": 0.0125, + "num_tokens": 23235516.0, + "reward": 0.82586669921875, + "reward_std": 0.011608447879552841, + "rewards//mean": 0.82586669921875, + "rewards//std": 0.016304733231663704, + "step": 3191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6384, + "grad_norm": 1.4908965826034546, + "kl": 0.35364826023578644, + "learning_rate": 2.948937706814442e-07, + "loss": 0.0141, + "num_tokens": 23242876.0, + "reward": 0.898193359375, + "reward_std": 0.014499031938612461, + "rewards//mean": 0.898193359375, + "rewards//std": 0.022966446354985237, + "step": 3192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6386, + "grad_norm": 1.4933010339736938, + "kl": 0.31049202382564545, + "learning_rate": 2.9460440786305077e-07, + "loss": 0.0124, + "num_tokens": 23250244.0, + "reward": 0.82598876953125, + "reward_std": 0.015313982963562012, + "rewards//mean": 0.82598876953125, + "rewards//std": 0.024507742375135422, + "step": 3193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6388, + "grad_norm": 1.3734924793243408, + "kl": 0.287609301507473, + "learning_rate": 2.943151277779771e-07, + "loss": 0.0115, + "num_tokens": 23257388.0, + "reward": 0.83258056640625, + "reward_std": 0.012561575509607792, + "rewards//mean": 0.83258056640625, + "rewards//std": 0.019708609208464622, + "step": 3194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.639, + "grad_norm": 1.4267908334732056, + "kl": 0.34979110211133957, + "learning_rate": 2.9402593054274557e-07, + "loss": 0.014, + "num_tokens": 23264788.0, + "reward": 0.8388671875, + "reward_std": 0.01423579826951027, + "rewards//mean": 0.8388671875, + "rewards//std": 0.016689321026206017, + "step": 3195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.6392, + "grad_norm": 1.4025732278823853, + "kl": 0.3051482867449522, + "learning_rate": 2.9373681627384445e-07, + "loss": 0.0136, + "num_tokens": 23271988.0, + "reward": 0.85693359375, + "reward_std": 0.017524590715765953, + "rewards//mean": 0.85693359375, + "rewards//std": 0.028827045112848282, + "step": 3196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.6394, + "grad_norm": 1.5403567552566528, + "kl": 0.3545091524720192, + "learning_rate": 2.9344778508772914e-07, + "loss": 0.0081, + "num_tokens": 23279194.0, + "reward": 0.840576171875, + "reward_std": 0.018018502742052078, + "rewards//mean": 0.840576171875, + "rewards//std": 0.022987527772784233, + "step": 3197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6396, + "grad_norm": 1.5977017879486084, + "kl": 0.38293692097067833, + "learning_rate": 2.9315883710082125e-07, + "loss": 0.0153, + "num_tokens": 23286602.0, + "reward": 0.83697509765625, + "reward_std": 0.017834432423114777, + "rewards//mean": 0.83697509765625, + "rewards//std": 0.021330133080482483, + "step": 3198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6398, + "grad_norm": 1.685302734375, + "kl": 0.3281424194574356, + "learning_rate": 2.9286997242950913e-07, + "loss": 0.0131, + "num_tokens": 23293842.0, + "reward": 0.8746337890625, + "reward_std": 0.014988718554377556, + "rewards//mean": 0.8746337890625, + "rewards//std": 0.026582177728414536, + "step": 3199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.64, + "grad_norm": 1.8558733463287354, + "kl": 0.4127119295299053, + "learning_rate": 2.925811911901473e-07, + "loss": 0.0165, + "num_tokens": 23301250.0, + "reward": 0.82403564453125, + "reward_std": 0.015019257552921772, + "rewards//mean": 0.82403564453125, + "rewards//std": 0.016579093411564827, + "step": 3200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6402, + "grad_norm": 1.3239718675613403, + "kl": 0.2718811724334955, + "learning_rate": 2.922924934990568e-07, + "loss": 0.0109, + "num_tokens": 23308522.0, + "reward": 0.848388671875, + "reward_std": 0.015907302498817444, + "rewards//mean": 0.848388671875, + "rewards//std": 0.0193365179002285, + "step": 3201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6404, + "grad_norm": 1.3232172727584839, + "kl": 0.28981511760503054, + "learning_rate": 2.920038794725252e-07, + "loss": 0.0116, + "num_tokens": 23315786.0, + "reward": 0.83880615234375, + "reward_std": 0.012585677206516266, + "rewards//mean": 0.83880615234375, + "rewards//std": 0.0220118910074234, + "step": 3202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.6406, + "grad_norm": 1.7153736352920532, + "kl": 0.357038713991642, + "learning_rate": 2.9171534922680597e-07, + "loss": 0.0142, + "num_tokens": 23323054.0, + "reward": 0.82720947265625, + "reward_std": 0.013523450121283531, + "rewards//mean": 0.82720947265625, + "rewards//std": 0.017242401838302612, + "step": 3203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6408, + "grad_norm": 1.5834444761276245, + "kl": 0.351661404594779, + "learning_rate": 2.914269028781191e-07, + "loss": 0.0141, + "num_tokens": 23330294.0, + "reward": 0.85888671875, + "reward_std": 0.014030545949935913, + "rewards//mean": 0.85888671875, + "rewards//std": 0.01866896077990532, + "step": 3204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.641, + "grad_norm": 1.529121994972229, + "kl": 0.3504072818905115, + "learning_rate": 2.9113854054265107e-07, + "loss": 0.014, + "num_tokens": 23337614.0, + "reward": 0.84552001953125, + "reward_std": 0.01752515882253647, + "rewards//mean": 0.84552001953125, + "rewards//std": 0.02241193875670433, + "step": 3205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6412, + "grad_norm": 1.7276415824890137, + "kl": 0.38724651373922825, + "learning_rate": 2.9085026233655365e-07, + "loss": 0.0155, + "num_tokens": 23344910.0, + "reward": 0.8253173828125, + "reward_std": 0.012745387852191925, + "rewards//mean": 0.8253173828125, + "rewards//std": 0.019418932497501373, + "step": 3206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6414, + "grad_norm": 1.7907130718231201, + "kl": 0.44490499421954155, + "learning_rate": 2.9056206837594563e-07, + "loss": 0.0178, + "num_tokens": 23352310.0, + "reward": 0.8505859375, + "reward_std": 0.01897665485739708, + "rewards//mean": 0.8505859375, + "rewards//std": 0.022195516154170036, + "step": 3207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.6416, + "grad_norm": 1.8479266166687012, + "kl": 0.41301146149635315, + "learning_rate": 2.902739587769114e-07, + "loss": 0.0004, + "num_tokens": 23359501.0, + "reward": 0.83416748046875, + "reward_std": 0.015127388760447502, + "rewards//mean": 0.83416748046875, + "rewards//std": 0.017852313816547394, + "step": 3208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6418, + "grad_norm": 2.145207166671753, + "kl": 0.3788825683295727, + "learning_rate": 2.8998593365550173e-07, + "loss": 0.0152, + "num_tokens": 23366901.0, + "reward": 0.84442138671875, + "reward_std": 0.018802037462592125, + "rewards//mean": 0.84442138671875, + "rewards//std": 0.027643751353025436, + "step": 3209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.642, + "grad_norm": 1.7972291707992554, + "kl": 0.3589158486574888, + "learning_rate": 2.896979931277326e-07, + "loss": 0.0144, + "num_tokens": 23374213.0, + "reward": 0.85736083984375, + "reward_std": 0.015691854059696198, + "rewards//mean": 0.85736083984375, + "rewards//std": 0.028637312352657318, + "step": 3210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6422, + "grad_norm": 1.3454302549362183, + "kl": 0.32919833436608315, + "learning_rate": 2.894101373095867e-07, + "loss": 0.0132, + "num_tokens": 23381485.0, + "reward": 0.8619384765625, + "reward_std": 0.012911442667245865, + "rewards//mean": 0.8619384765625, + "rewards//std": 0.020460518077015877, + "step": 3211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6424, + "grad_norm": 1.432276964187622, + "kl": 0.3712534233927727, + "learning_rate": 2.891223663170123e-07, + "loss": 0.0149, + "num_tokens": 23388885.0, + "reward": 0.8636474609375, + "reward_std": 0.014764955267310143, + "rewards//mean": 0.8636474609375, + "rewards//std": 0.017944185063242912, + "step": 3212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6426, + "grad_norm": 1.3959543704986572, + "kl": 0.37166935950517654, + "learning_rate": 2.888346802659238e-07, + "loss": 0.0149, + "num_tokens": 23396189.0, + "reward": 0.83172607421875, + "reward_std": 0.012827124446630478, + "rewards//mean": 0.83172607421875, + "rewards//std": 0.01774856261909008, + "step": 3213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.6428, + "grad_norm": 1.5541032552719116, + "kl": 0.4501974508166313, + "learning_rate": 2.8854707927220057e-07, + "loss": 0.0207, + "num_tokens": 23403448.0, + "reward": 0.839599609375, + "reward_std": 0.01912233978509903, + "rewards//mean": 0.839599609375, + "rewards//std": 0.028560098260641098, + "step": 3214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.643, + "grad_norm": 1.4837197065353394, + "kl": 0.3207739647477865, + "learning_rate": 2.8825956345168854e-07, + "loss": 0.0128, + "num_tokens": 23410760.0, + "reward": 0.8399658203125, + "reward_std": 0.01733158901333809, + "rewards//mean": 0.8399658203125, + "rewards//std": 0.022595327347517014, + "step": 3215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6432, + "grad_norm": 2.2620866298675537, + "kl": 0.5769629143178463, + "learning_rate": 2.8797213292019924e-07, + "loss": 0.0231, + "num_tokens": 23418104.0, + "reward": 0.7860107421875, + "reward_std": 0.015302512794733047, + "rewards//mean": 0.7860107421875, + "rewards//std": 0.025291163474321365, + "step": 3216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6434, + "grad_norm": 2.1367809772491455, + "kl": 0.4984273184090853, + "learning_rate": 2.8768478779350925e-07, + "loss": 0.0199, + "num_tokens": 23425416.0, + "reward": 0.8258056640625, + "reward_std": 0.012931449338793755, + "rewards//mean": 0.8258056640625, + "rewards//std": 0.017101023346185684, + "step": 3217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6436, + "grad_norm": 1.8674585819244385, + "kl": 0.34130046889185905, + "learning_rate": 2.873975281873613e-07, + "loss": 0.0137, + "num_tokens": 23432688.0, + "reward": 0.83050537109375, + "reward_std": 0.008391935378313065, + "rewards//mean": 0.83050537109375, + "rewards//std": 0.014068995602428913, + "step": 3218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6438, + "grad_norm": 1.7990095615386963, + "kl": 0.3521752152591944, + "learning_rate": 2.8711035421746363e-07, + "loss": 0.0141, + "num_tokens": 23440016.0, + "reward": 0.8375244140625, + "reward_std": 0.01510024257004261, + "rewards//mean": 0.8375244140625, + "rewards//std": 0.020560890436172485, + "step": 3219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.644, + "grad_norm": 1.618828535079956, + "kl": 0.3419591896235943, + "learning_rate": 2.8682326599949e-07, + "loss": 0.0137, + "num_tokens": 23447302.0, + "reward": 0.84796142578125, + "reward_std": 0.015637435019016266, + "rewards//mean": 0.84796142578125, + "rewards//std": 0.025932665914297104, + "step": 3220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6442, + "grad_norm": 1.6292742490768433, + "kl": 0.3021085225045681, + "learning_rate": 2.8653626364907914e-07, + "loss": 0.0121, + "num_tokens": 23454510.0, + "reward": 0.8704833984375, + "reward_std": 0.014692764729261398, + "rewards//mean": 0.8704833984375, + "rewards//std": 0.021318508312106133, + "step": 3221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6444, + "grad_norm": 1.5110474824905396, + "kl": 0.3235462252050638, + "learning_rate": 2.862493472818357e-07, + "loss": 0.0129, + "num_tokens": 23461790.0, + "reward": 0.79632568359375, + "reward_std": 0.013949892483651638, + "rewards//mean": 0.79632568359375, + "rewards//std": 0.016831910237669945, + "step": 3222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.6446, + "grad_norm": 1.551602840423584, + "kl": 0.3713921830058098, + "learning_rate": 2.859625170133297e-07, + "loss": 0.0149, + "num_tokens": 23469045.0, + "reward": 0.8734130859375, + "reward_std": 0.019910575821995735, + "rewards//mean": 0.8734130859375, + "rewards//std": 0.02478085085749626, + "step": 3223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6448, + "grad_norm": 1.4989864826202393, + "kl": 0.3348625358194113, + "learning_rate": 2.856757729590964e-07, + "loss": 0.0134, + "num_tokens": 23476229.0, + "reward": 0.8023681640625, + "reward_std": 0.01728568598628044, + "rewards//mean": 0.8023681640625, + "rewards//std": 0.026709433645009995, + "step": 3224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.645, + "grad_norm": 1.6240209341049194, + "kl": 0.4213840030133724, + "learning_rate": 2.853891152346359e-07, + "loss": 0.0169, + "num_tokens": 23483509.0, + "reward": 0.86279296875, + "reward_std": 0.018279697746038437, + "rewards//mean": 0.86279296875, + "rewards//std": 0.02456272952258587, + "step": 3225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6452, + "grad_norm": 1.727774739265442, + "kl": 0.3388099744915962, + "learning_rate": 2.8510254395541414e-07, + "loss": 0.0136, + "num_tokens": 23490797.0, + "reward": 0.7088623046875, + "reward_std": 0.012574683874845505, + "rewards//mean": 0.7088623046875, + "rewards//std": 0.02260604314506054, + "step": 3226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6454, + "grad_norm": 1.342637062072754, + "kl": 0.41208086535334587, + "learning_rate": 2.8481605923686205e-07, + "loss": 0.0165, + "num_tokens": 23498021.0, + "reward": 0.861083984375, + "reward_std": 0.01501612551510334, + "rewards//mean": 0.861083984375, + "rewards//std": 0.018957022577524185, + "step": 3227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6456, + "grad_norm": 1.828676462173462, + "kl": 0.3542724046856165, + "learning_rate": 2.845296611943756e-07, + "loss": 0.0142, + "num_tokens": 23505301.0, + "reward": 0.8507080078125, + "reward_std": 0.015115661546587944, + "rewards//mean": 0.8507080078125, + "rewards//std": 0.03140053153038025, + "step": 3228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6458, + "grad_norm": 1.4678163528442383, + "kl": 0.3251927327364683, + "learning_rate": 2.842433499433158e-07, + "loss": 0.013, + "num_tokens": 23512637.0, + "reward": 0.80401611328125, + "reward_std": 0.011460874229669571, + "rewards//mean": 0.80401611328125, + "rewards//std": 0.017098698765039444, + "step": 3229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.646, + "grad_norm": 1.5990607738494873, + "kl": 0.32736145332455635, + "learning_rate": 2.8395712559900874e-07, + "loss": 0.0131, + "num_tokens": 23519941.0, + "reward": 0.8626708984375, + "reward_std": 0.01622965931892395, + "rewards//mean": 0.8626708984375, + "rewards//std": 0.02581954002380371, + "step": 3230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6462, + "grad_norm": 1.6447540521621704, + "kl": 0.38012631610035896, + "learning_rate": 2.8367098827674576e-07, + "loss": 0.0152, + "num_tokens": 23527277.0, + "reward": 0.835693359375, + "reward_std": 0.014615194872021675, + "rewards//mean": 0.835693359375, + "rewards//std": 0.01760566234588623, + "step": 3231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6464, + "grad_norm": 1.529192328453064, + "kl": 0.3378209974616766, + "learning_rate": 2.83384938091783e-07, + "loss": 0.0135, + "num_tokens": 23534565.0, + "reward": 0.87335205078125, + "reward_std": 0.016888078302145004, + "rewards//mean": 0.87335205078125, + "rewards//std": 0.023723093792796135, + "step": 3232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.6466, + "grad_norm": 2.6328604221343994, + "kl": 0.30548748187720776, + "learning_rate": 2.83098975159341e-07, + "loss": 0.0131, + "num_tokens": 23541764.0, + "reward": 0.8424072265625, + "reward_std": 0.013799145817756653, + "rewards//mean": 0.8424072265625, + "rewards//std": 0.023033250123262405, + "step": 3233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.6468, + "grad_norm": 1.6591453552246094, + "kl": 0.4104588069021702, + "learning_rate": 2.8281309959460595e-07, + "loss": 0.0155, + "num_tokens": 23549041.0, + "reward": 0.8291015625, + "reward_std": 0.014974788762629032, + "rewards//mean": 0.8291015625, + "rewards//std": 0.021987205371260643, + "step": 3234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.647, + "grad_norm": 1.685150146484375, + "kl": 0.4088112339377403, + "learning_rate": 2.825273115127286e-07, + "loss": 0.0164, + "num_tokens": 23556313.0, + "reward": 0.8515625, + "reward_std": 0.017365001142024994, + "rewards//mean": 0.8515625, + "rewards//std": 0.02153082937002182, + "step": 3235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.6472, + "grad_norm": 1.4098267555236816, + "kl": 0.34105829149484634, + "learning_rate": 2.8224161102882393e-07, + "loss": 0.0208, + "num_tokens": 23563615.0, + "reward": 0.83734130859375, + "reward_std": 0.014848697930574417, + "rewards//mean": 0.83734130859375, + "rewards//std": 0.0325213186442852, + "step": 3236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6474, + "grad_norm": 1.6392267942428589, + "kl": 0.35015639290213585, + "learning_rate": 2.819559982579723e-07, + "loss": 0.014, + "num_tokens": 23570863.0, + "reward": 0.882080078125, + "reward_std": 0.017176616936922073, + "rewards//mean": 0.882080078125, + "rewards//std": 0.031908635050058365, + "step": 3237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.6476, + "grad_norm": 1.4338420629501343, + "kl": 0.32816527411341667, + "learning_rate": 2.8167047331521847e-07, + "loss": 0.0112, + "num_tokens": 23578205.0, + "reward": 0.82733154296875, + "reward_std": 0.013199372217059135, + "rewards//mean": 0.82733154296875, + "rewards//std": 0.023106331005692482, + "step": 3238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6478, + "grad_norm": 1.425241231918335, + "kl": 0.3317929469048977, + "learning_rate": 2.8138503631557213e-07, + "loss": 0.0133, + "num_tokens": 23585405.0, + "reward": 0.86444091796875, + "reward_std": 0.01281156949698925, + "rewards//mean": 0.86444091796875, + "rewards//std": 0.022985469549894333, + "step": 3239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.648, + "grad_norm": 1.4308520555496216, + "kl": 0.38337094709277153, + "learning_rate": 2.810996873740068e-07, + "loss": 0.0178, + "num_tokens": 23592772.0, + "reward": 0.862548828125, + "reward_std": 0.01643565483391285, + "rewards//mean": 0.862548828125, + "rewards//std": 0.024158822372555733, + "step": 3240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6482, + "grad_norm": 1.6392834186553955, + "kl": 0.3113619480282068, + "learning_rate": 2.808144266054612e-07, + "loss": 0.0125, + "num_tokens": 23599964.0, + "reward": 0.8509521484375, + "reward_std": 0.01404594723135233, + "rewards//mean": 0.8509521484375, + "rewards//std": 0.024282259866595268, + "step": 3241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.6484, + "grad_norm": 1.568252682685852, + "kl": 0.323579890653491, + "learning_rate": 2.805292541248384e-07, + "loss": 0.0119, + "num_tokens": 23607166.0, + "reward": 0.89202880859375, + "reward_std": 0.01817196235060692, + "rewards//mean": 0.89202880859375, + "rewards//std": 0.026119371876120567, + "step": 3242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6486, + "grad_norm": 1.5406919717788696, + "kl": 0.3003047127276659, + "learning_rate": 2.8024417004700595e-07, + "loss": 0.012, + "num_tokens": 23614470.0, + "reward": 0.84027099609375, + "reward_std": 0.014685023576021194, + "rewards//mean": 0.84027099609375, + "rewards//std": 0.02028096280992031, + "step": 3243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.6488, + "grad_norm": 1.636613130569458, + "kl": 0.4249405525624752, + "learning_rate": 2.7995917448679534e-07, + "loss": 0.0173, + "num_tokens": 23621771.0, + "reward": 0.86090087890625, + "reward_std": 0.021573014557361603, + "rewards//mean": 0.86090087890625, + "rewards//std": 0.03797224536538124, + "step": 3244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.649, + "grad_norm": 1.3787639141082764, + "kl": 0.39350005611777306, + "learning_rate": 2.796742675590029e-07, + "loss": 0.0157, + "num_tokens": 23628955.0, + "reward": 0.86749267578125, + "reward_std": 0.015330797992646694, + "rewards//mean": 0.86749267578125, + "rewards//std": 0.024322979152202606, + "step": 3245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6492, + "grad_norm": 1.4400824308395386, + "kl": 0.37189460545778275, + "learning_rate": 2.7938944937838923e-07, + "loss": 0.0149, + "num_tokens": 23636307.0, + "reward": 0.86590576171875, + "reward_std": 0.01182318665087223, + "rewards//mean": 0.86590576171875, + "rewards//std": 0.025171998888254166, + "step": 3246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6494, + "grad_norm": 1.5071840286254883, + "kl": 0.32231942750513554, + "learning_rate": 2.791047200596791e-07, + "loss": 0.0129, + "num_tokens": 23643523.0, + "reward": 0.842529296875, + "reward_std": 0.015142660588026047, + "rewards//mean": 0.842529296875, + "rewards//std": 0.025141386315226555, + "step": 3247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6496, + "grad_norm": 1.541374683380127, + "kl": 0.2982425335794687, + "learning_rate": 2.7882007971756113e-07, + "loss": 0.0119, + "num_tokens": 23650795.0, + "reward": 0.782958984375, + "reward_std": 0.01250227726995945, + "rewards//mean": 0.782958984375, + "rewards//std": 0.019198240712285042, + "step": 3248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6498, + "grad_norm": 1.391676425933838, + "kl": 0.3447785787284374, + "learning_rate": 2.785355284666886e-07, + "loss": 0.0138, + "num_tokens": 23658043.0, + "reward": 0.8282470703125, + "reward_std": 0.01883913204073906, + "rewards//mean": 0.8282470703125, + "rewards//std": 0.02941315993666649, + "step": 3249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.65, + "grad_norm": 1.7583370208740234, + "kl": 0.3249739073216915, + "learning_rate": 2.782510664216789e-07, + "loss": 0.013, + "num_tokens": 23665355.0, + "reward": 0.85888671875, + "reward_std": 0.010834109038114548, + "rewards//mean": 0.85888671875, + "rewards//std": 0.02456272952258587, + "step": 3250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6502, + "grad_norm": 1.7199934720993042, + "kl": 0.3364192545413971, + "learning_rate": 2.779666936971129e-07, + "loss": 0.0135, + "num_tokens": 23672723.0, + "reward": 0.86676025390625, + "reward_std": 0.011647268198430538, + "rewards//mean": 0.86676025390625, + "rewards//std": 0.01808399334549904, + "step": 3251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6504, + "grad_norm": 1.5484925508499146, + "kl": 0.3195311762392521, + "learning_rate": 2.776824104075364e-07, + "loss": 0.0128, + "num_tokens": 23679971.0, + "reward": 0.88275146484375, + "reward_std": 0.014257600530982018, + "rewards//mean": 0.88275146484375, + "rewards//std": 0.0220476221293211, + "step": 3252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6506, + "grad_norm": 1.4908736944198608, + "kl": 0.3268216960132122, + "learning_rate": 2.7739821666745817e-07, + "loss": 0.0131, + "num_tokens": 23687267.0, + "reward": 0.87408447265625, + "reward_std": 0.011599849909543991, + "rewards//mean": 0.87408447265625, + "rewards//std": 0.01583658531308174, + "step": 3253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6508, + "grad_norm": 1.596375584602356, + "kl": 0.39068603329360485, + "learning_rate": 2.7711411259135167e-07, + "loss": 0.0156, + "num_tokens": 23694483.0, + "reward": 0.83477783203125, + "reward_std": 0.013331563211977482, + "rewards//mean": 0.83477783203125, + "rewards//std": 0.018941745162010193, + "step": 3254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.651, + "grad_norm": 1.7606192827224731, + "kl": 0.3887399435043335, + "learning_rate": 2.768300982936541e-07, + "loss": 0.0155, + "num_tokens": 23701779.0, + "reward": 0.85662841796875, + "reward_std": 0.02124284952878952, + "rewards//mean": 0.85662841796875, + "rewards//std": 0.0260572861880064, + "step": 3255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6512, + "grad_norm": 1.3907020092010498, + "kl": 0.3858175612986088, + "learning_rate": 2.765461738887661e-07, + "loss": 0.0154, + "num_tokens": 23709051.0, + "reward": 0.83074951171875, + "reward_std": 0.0145072927698493, + "rewards//mean": 0.83074951171875, + "rewards//std": 0.01717643067240715, + "step": 3256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.6514, + "grad_norm": 1.6069021224975586, + "kl": 0.3433171361684799, + "learning_rate": 2.762623394910525e-07, + "loss": 0.0135, + "num_tokens": 23716297.0, + "reward": 0.87896728515625, + "reward_std": 0.016130657866597176, + "rewards//mean": 0.87896728515625, + "rewards//std": 0.026725510135293007, + "step": 3257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6516, + "grad_norm": 1.774588942527771, + "kl": 0.35832060873508453, + "learning_rate": 2.759785952148418e-07, + "loss": 0.0143, + "num_tokens": 23723561.0, + "reward": 0.8592529296875, + "reward_std": 0.012454641051590443, + "rewards//mean": 0.8592529296875, + "rewards//std": 0.01632201485335827, + "step": 3258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6518, + "grad_norm": 1.7108495235443115, + "kl": 0.4475961737334728, + "learning_rate": 2.7569494117442635e-07, + "loss": 0.0179, + "num_tokens": 23730857.0, + "reward": 0.812744140625, + "reward_std": 0.00977312307804823, + "rewards//mean": 0.812744140625, + "rewards//std": 0.016111519187688828, + "step": 3259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.652, + "grad_norm": 1.7231965065002441, + "kl": 0.36120876483619213, + "learning_rate": 2.754113774840616e-07, + "loss": 0.0165, + "num_tokens": 23738188.0, + "reward": 0.8509521484375, + "reward_std": 0.021950755268335342, + "rewards//mean": 0.8509521484375, + "rewards//std": 0.02590382844209671, + "step": 3260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6522, + "grad_norm": 1.4529807567596436, + "kl": 0.3860611878335476, + "learning_rate": 2.751279042579672e-07, + "loss": 0.0154, + "num_tokens": 23745436.0, + "reward": 0.852294921875, + "reward_std": 0.01431745383888483, + "rewards//mean": 0.852294921875, + "rewards//std": 0.023145031183958054, + "step": 3261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6524, + "grad_norm": 2.0496556758880615, + "kl": 0.3809702806174755, + "learning_rate": 2.748445216103262e-07, + "loss": 0.0152, + "num_tokens": 23752580.0, + "reward": 0.84893798828125, + "reward_std": 0.016536902636289597, + "rewards//mean": 0.84893798828125, + "rewards//std": 0.02649397775530815, + "step": 3262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6526, + "grad_norm": 1.5714433193206787, + "kl": 0.29923280514776707, + "learning_rate": 2.745612296552847e-07, + "loss": 0.012, + "num_tokens": 23759852.0, + "reward": 0.85748291015625, + "reward_std": 0.016516737639904022, + "rewards//mean": 0.85748291015625, + "rewards//std": 0.018453553318977356, + "step": 3263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6528, + "grad_norm": 1.4314836263656616, + "kl": 0.3362332619726658, + "learning_rate": 2.74278028506953e-07, + "loss": 0.0134, + "num_tokens": 23767108.0, + "reward": 0.86822509765625, + "reward_std": 0.01770276203751564, + "rewards//mean": 0.86822509765625, + "rewards//std": 0.026586946099996567, + "step": 3264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.653, + "grad_norm": 1.7789795398712158, + "kl": 0.31917675770819187, + "learning_rate": 2.7399491827940444e-07, + "loss": 0.0108, + "num_tokens": 23774326.0, + "reward": 0.8756103515625, + "reward_std": 0.012494891881942749, + "rewards//mean": 0.8756103515625, + "rewards//std": 0.021790454164147377, + "step": 3265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6532, + "grad_norm": 1.4324506521224976, + "kl": 0.35615429654717445, + "learning_rate": 2.73711899086676e-07, + "loss": 0.0142, + "num_tokens": 23781638.0, + "reward": 0.80682373046875, + "reward_std": 0.016862787306308746, + "rewards//mean": 0.80682373046875, + "rewards//std": 0.022850725799798965, + "step": 3266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.6534, + "grad_norm": 1.4352600574493408, + "kl": 0.40073976293206215, + "learning_rate": 2.734289710427673e-07, + "loss": 0.0157, + "num_tokens": 23789013.0, + "reward": 0.8704833984375, + "reward_std": 0.016528954729437828, + "rewards//mean": 0.8704833984375, + "rewards//std": 0.021261626854538918, + "step": 3267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6536, + "grad_norm": 1.541959285736084, + "kl": 0.3632401376962662, + "learning_rate": 2.73146134261642e-07, + "loss": 0.0145, + "num_tokens": 23796397.0, + "reward": 0.87420654296875, + "reward_std": 0.016985777765512466, + "rewards//mean": 0.87420654296875, + "rewards//std": 0.028384050354361534, + "step": 3268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6538, + "grad_norm": 1.4268574714660645, + "kl": 0.36108406633138657, + "learning_rate": 2.728633888572267e-07, + "loss": 0.0144, + "num_tokens": 23803805.0, + "reward": 0.8450927734375, + "reward_std": 0.013706987723708153, + "rewards//mean": 0.8450927734375, + "rewards//std": 0.025479601696133614, + "step": 3269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.654, + "grad_norm": 1.732445240020752, + "kl": 0.33519509620964527, + "learning_rate": 2.7258073494341136e-07, + "loss": 0.0134, + "num_tokens": 23811141.0, + "reward": 0.89691162109375, + "reward_std": 0.01940813474357128, + "rewards//mean": 0.89691162109375, + "rewards//std": 0.029908401891589165, + "step": 3270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6542, + "grad_norm": 1.4682350158691406, + "kl": 0.3551382143050432, + "learning_rate": 2.7229817263404864e-07, + "loss": 0.0142, + "num_tokens": 23818405.0, + "reward": 0.85565185546875, + "reward_std": 0.017807168886065483, + "rewards//mean": 0.85565185546875, + "rewards//std": 0.026715878397226334, + "step": 3271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6544, + "grad_norm": 1.4753267765045166, + "kl": 0.3795026559382677, + "learning_rate": 2.720157020429547e-07, + "loss": 0.0152, + "num_tokens": 23825621.0, + "reward": 0.864501953125, + "reward_std": 0.016758274286985397, + "rewards//mean": 0.864501953125, + "rewards//std": 0.019597791135311127, + "step": 3272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6546, + "grad_norm": 1.5701171159744263, + "kl": 0.33546966686844826, + "learning_rate": 2.7173332328390876e-07, + "loss": 0.0134, + "num_tokens": 23832853.0, + "reward": 0.8592529296875, + "reward_std": 0.02185131050646305, + "rewards//mean": 0.8592529296875, + "rewards//std": 0.027161281555891037, + "step": 3273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.6548, + "grad_norm": 1.2482693195343018, + "kl": 0.2974862679839134, + "learning_rate": 2.71451036470653e-07, + "loss": 0.0138, + "num_tokens": 23840182.0, + "reward": 0.8145751953125, + "reward_std": 0.013914760202169418, + "rewards//mean": 0.8145751953125, + "rewards//std": 0.022734249010682106, + "step": 3274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.655, + "grad_norm": 1.5228420495986938, + "kl": 0.2887256480753422, + "learning_rate": 2.7116884171689236e-07, + "loss": 0.0115, + "num_tokens": 23847446.0, + "reward": 0.8470458984375, + "reward_std": 0.014568864367902279, + "rewards//mean": 0.8470458984375, + "rewards//std": 0.02182377316057682, + "step": 3275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6552, + "grad_norm": 1.5586457252502441, + "kl": 0.3535265773534775, + "learning_rate": 2.708867391362948e-07, + "loss": 0.0141, + "num_tokens": 23854702.0, + "reward": 0.8232421875, + "reward_std": 0.0160976629704237, + "rewards//mean": 0.8232421875, + "rewards//std": 0.017839696258306503, + "step": 3276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6554, + "grad_norm": 1.623681902885437, + "kl": 0.438395481556654, + "learning_rate": 2.706047288424914e-07, + "loss": 0.0175, + "num_tokens": 23862006.0, + "reward": 0.88720703125, + "reward_std": 0.019931325688958168, + "rewards//mean": 0.88720703125, + "rewards//std": 0.023224687203764915, + "step": 3277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6556, + "grad_norm": 1.6344504356384277, + "kl": 0.4286835491657257, + "learning_rate": 2.7032281094907594e-07, + "loss": 0.0171, + "num_tokens": 23869278.0, + "reward": 0.81585693359375, + "reward_std": 0.010761636309325695, + "rewards//mean": 0.81585693359375, + "rewards//std": 0.01518265251070261, + "step": 3278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6558, + "grad_norm": 1.7509342432022095, + "kl": 0.36083610728383064, + "learning_rate": 2.7004098556960454e-07, + "loss": 0.0144, + "num_tokens": 23876550.0, + "reward": 0.84765625, + "reward_std": 0.017832735553383827, + "rewards//mean": 0.84765625, + "rewards//std": 0.028047733008861542, + "step": 3279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.656, + "grad_norm": 1.4956879615783691, + "kl": 0.3352545537054539, + "learning_rate": 2.697592528175967e-07, + "loss": -0.0023, + "num_tokens": 23883895.0, + "reward": 0.8558349609375, + "reward_std": 0.014781628735363483, + "rewards//mean": 0.8558349609375, + "rewards//std": 0.021784896031022072, + "step": 3280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.6562, + "grad_norm": 1.4858734607696533, + "kl": 0.30648326128721237, + "learning_rate": 2.6947761280653447e-07, + "loss": 0.0127, + "num_tokens": 23891225.0, + "reward": 0.8624267578125, + "reward_std": 0.02634930983185768, + "rewards//mean": 0.8624267578125, + "rewards//std": 0.039741549640893936, + "step": 3281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6564, + "grad_norm": 1.410209059715271, + "kl": 0.3304777890443802, + "learning_rate": 2.6919606564986207e-07, + "loss": 0.0132, + "num_tokens": 23898465.0, + "reward": 0.82318115234375, + "reward_std": 0.012761096470057964, + "rewards//mean": 0.82318115234375, + "rewards//std": 0.024282492697238922, + "step": 3282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6566, + "grad_norm": 1.546723484992981, + "kl": 0.3296445216983557, + "learning_rate": 2.6891461146098676e-07, + "loss": 0.0132, + "num_tokens": 23905681.0, + "reward": 0.8201904296875, + "reward_std": 0.01347258035093546, + "rewards//mean": 0.8201904296875, + "rewards//std": 0.01596190594136715, + "step": 3283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6568, + "grad_norm": 1.3165310621261597, + "kl": 0.32658106833696365, + "learning_rate": 2.686332503532783e-07, + "loss": 0.0131, + "num_tokens": 23912921.0, + "reward": 0.822265625, + "reward_std": 0.015037231147289276, + "rewards//mean": 0.822265625, + "rewards//std": 0.02717047557234764, + "step": 3284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.657, + "grad_norm": 1.5928430557250977, + "kl": 0.32467809692025185, + "learning_rate": 2.683519824400692e-07, + "loss": 0.013, + "num_tokens": 23920233.0, + "reward": 0.812744140625, + "reward_std": 0.01183292455971241, + "rewards//mean": 0.812744140625, + "rewards//std": 0.020290011540055275, + "step": 3285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6572, + "grad_norm": 1.8457159996032715, + "kl": 0.36578440479934216, + "learning_rate": 2.680708078346537e-07, + "loss": 0.0146, + "num_tokens": 23927617.0, + "reward": 0.85675048828125, + "reward_std": 0.017955228686332703, + "rewards//mean": 0.85675048828125, + "rewards//std": 0.02500789240002632, + "step": 3286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.6574, + "grad_norm": 2.037289619445801, + "kl": 0.5394141785800457, + "learning_rate": 2.6778972665028906e-07, + "loss": 0.0213, + "num_tokens": 23934847.0, + "reward": 0.848876953125, + "reward_std": 0.0166579931974411, + "rewards//mean": 0.848876953125, + "rewards//std": 0.018174277618527412, + "step": 3287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6576, + "grad_norm": 1.433321237564087, + "kl": 0.3419906906783581, + "learning_rate": 2.675087390001947e-07, + "loss": 0.0137, + "num_tokens": 23942183.0, + "reward": 0.820068359375, + "reward_std": 0.01730189099907875, + "rewards//mean": 0.820068359375, + "rewards//std": 0.018969794735312462, + "step": 3288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6578, + "grad_norm": 1.5026910305023193, + "kl": 0.36757970601320267, + "learning_rate": 2.6722784499755267e-07, + "loss": 0.0147, + "num_tokens": 23949535.0, + "reward": 0.8597412109375, + "reward_std": 0.02084757387638092, + "rewards//mean": 0.8597412109375, + "rewards//std": 0.024297216907143593, + "step": 3289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.658, + "grad_norm": 1.5202157497406006, + "kl": 0.31721998006105423, + "learning_rate": 2.6694704475550666e-07, + "loss": 0.0127, + "num_tokens": 23956823.0, + "reward": 0.8909912109375, + "reward_std": 0.02084258757531643, + "rewards//mean": 0.8909912109375, + "rewards//std": 0.02690819278359413, + "step": 3290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6582, + "grad_norm": 1.9891754388809204, + "kl": 0.33490715734660625, + "learning_rate": 2.6666633838716314e-07, + "loss": 0.0134, + "num_tokens": 23964175.0, + "reward": 0.83587646484375, + "reward_std": 0.018713276833295822, + "rewards//mean": 0.83587646484375, + "rewards//std": 0.01953115314245224, + "step": 3291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6584, + "grad_norm": 1.5174336433410645, + "kl": 0.37292713671922684, + "learning_rate": 2.6638572600559063e-07, + "loss": 0.0149, + "num_tokens": 23971479.0, + "reward": 0.7841796875, + "reward_std": 0.01936545968055725, + "rewards//mean": 0.7841796875, + "rewards//std": 0.024190131574869156, + "step": 3292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6586, + "grad_norm": 1.431727647781372, + "kl": 0.35583953000605106, + "learning_rate": 2.6610520772381996e-07, + "loss": 0.0142, + "num_tokens": 23978855.0, + "reward": 0.87384033203125, + "reward_std": 0.014691718854010105, + "rewards//mean": 0.87384033203125, + "rewards//std": 0.022752469405531883, + "step": 3293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6588, + "grad_norm": 1.498555064201355, + "kl": 0.33854080736637115, + "learning_rate": 2.658247836548434e-07, + "loss": 0.0135, + "num_tokens": 23986135.0, + "reward": 0.88018798828125, + "reward_std": 0.013221238739788532, + "rewards//mean": 0.88018798828125, + "rewards//std": 0.023520590737462044, + "step": 3294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.659, + "grad_norm": 1.490562081336975, + "kl": 0.3616829328238964, + "learning_rate": 2.65544453911616e-07, + "loss": 0.0145, + "num_tokens": 23993383.0, + "reward": 0.85693359375, + "reward_std": 0.017168808728456497, + "rewards//mean": 0.85693359375, + "rewards//std": 0.020908765494823456, + "step": 3295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6592, + "grad_norm": 1.5192492008209229, + "kl": 0.3446528986096382, + "learning_rate": 2.6526421860705473e-07, + "loss": 0.0138, + "num_tokens": 24000703.0, + "reward": 0.88140869140625, + "reward_std": 0.01334074605256319, + "rewards//mean": 0.88140869140625, + "rewards//std": 0.016882197931408882, + "step": 3296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6594, + "grad_norm": 1.5117218494415283, + "kl": 0.3957766927778721, + "learning_rate": 2.649840778540379e-07, + "loss": 0.0158, + "num_tokens": 24007991.0, + "reward": 0.86334228515625, + "reward_std": 0.015143884345889091, + "rewards//mean": 0.86334228515625, + "rewards//std": 0.02194853127002716, + "step": 3297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.6596, + "grad_norm": 1.2763527631759644, + "kl": 0.35383828543126583, + "learning_rate": 2.6470403176540644e-07, + "loss": 0.0185, + "num_tokens": 24015279.0, + "reward": 0.85552978515625, + "reward_std": 0.013505592942237854, + "rewards//mean": 0.85552978515625, + "rewards//std": 0.02430492453277111, + "step": 3298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6598, + "grad_norm": 1.5773605108261108, + "kl": 0.3573454227298498, + "learning_rate": 2.644240804539629e-07, + "loss": 0.0143, + "num_tokens": 24022559.0, + "reward": 0.86163330078125, + "reward_std": 0.016598522663116455, + "rewards//mean": 0.86163330078125, + "rewards//std": 0.022753801196813583, + "step": 3299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.66, + "grad_norm": 1.4094247817993164, + "kl": 0.34009447135031223, + "learning_rate": 2.641442240324717e-07, + "loss": 0.0136, + "num_tokens": 24029823.0, + "reward": 0.8065185546875, + "reward_std": 0.01686146855354309, + "rewards//mean": 0.8065185546875, + "rewards//std": 0.024079428985714912, + "step": 3300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6602, + "grad_norm": 1.4822490215301514, + "kl": 0.34379140101373196, + "learning_rate": 2.638644626136587e-07, + "loss": 0.0138, + "num_tokens": 24037071.0, + "reward": 0.811767578125, + "reward_std": 0.019148170948028564, + "rewards//mean": 0.811767578125, + "rewards//std": 0.030937448143959045, + "step": 3301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6604, + "grad_norm": 1.436228632926941, + "kl": 0.2683802414685488, + "learning_rate": 2.635847963102119e-07, + "loss": 0.0107, + "num_tokens": 24044295.0, + "reward": 0.86907958984375, + "reward_std": 0.015574770979583263, + "rewards//mean": 0.86907958984375, + "rewards//std": 0.028322651982307434, + "step": 3302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6606, + "grad_norm": 1.3221724033355713, + "kl": 0.3696925602853298, + "learning_rate": 2.6330522523478084e-07, + "loss": 0.0148, + "num_tokens": 24051631.0, + "reward": 0.8719482421875, + "reward_std": 0.017404189333319664, + "rewards//mean": 0.8719482421875, + "rewards//std": 0.024104561656713486, + "step": 3303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6608, + "grad_norm": 1.5342563390731812, + "kl": 0.33139050751924515, + "learning_rate": 2.63025749499977e-07, + "loss": 0.0133, + "num_tokens": 24058967.0, + "reward": 0.87255859375, + "reward_std": 0.016991307958960533, + "rewards//mean": 0.87255859375, + "rewards//std": 0.0272994264960289, + "step": 3304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.661, + "grad_norm": 1.6071046590805054, + "kl": 0.3840428553521633, + "learning_rate": 2.6274636921837267e-07, + "loss": 0.0154, + "num_tokens": 24066183.0, + "reward": 0.8359375, + "reward_std": 0.013429427519440651, + "rewards//mean": 0.8359375, + "rewards//std": 0.015422170981764793, + "step": 3305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.6612, + "grad_norm": 1.4686659574508667, + "kl": 0.3119164202362299, + "learning_rate": 2.6246708450250256e-07, + "loss": 0.0116, + "num_tokens": 24073469.0, + "reward": 0.87042236328125, + "reward_std": 0.01976657658815384, + "rewards//mean": 0.87042236328125, + "rewards//std": 0.030989699065685272, + "step": 3306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6614, + "grad_norm": 1.5480382442474365, + "kl": 0.4583230596035719, + "learning_rate": 2.621878954648623e-07, + "loss": 0.0183, + "num_tokens": 24080685.0, + "reward": 0.80938720703125, + "reward_std": 0.010198621079325676, + "rewards//mean": 0.80938720703125, + "rewards//std": 0.012098663486540318, + "step": 3307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6616, + "grad_norm": 1.3690388202667236, + "kl": 0.34573978930711746, + "learning_rate": 2.6190880221790954e-07, + "loss": 0.0138, + "num_tokens": 24087941.0, + "reward": 0.88330078125, + "reward_std": 0.015217477455735207, + "rewards//mean": 0.88330078125, + "rewards//std": 0.026652982458472252, + "step": 3308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6618, + "grad_norm": 1.7464511394500732, + "kl": 0.4312780536711216, + "learning_rate": 2.6162980487406253e-07, + "loss": 0.0173, + "num_tokens": 24095173.0, + "reward": 0.8262939453125, + "reward_std": 0.013985109515488148, + "rewards//mean": 0.8262939453125, + "rewards//std": 0.02000260539352894, + "step": 3309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.662, + "grad_norm": 1.325600504875183, + "kl": 0.3542402368038893, + "learning_rate": 2.6135090354570165e-07, + "loss": 0.0142, + "num_tokens": 24102389.0, + "reward": 0.863037109375, + "reward_std": 0.01532808132469654, + "rewards//mean": 0.863037109375, + "rewards//std": 0.020206280052661896, + "step": 3310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6622, + "grad_norm": 1.6897263526916504, + "kl": 0.3754114657640457, + "learning_rate": 2.610720983451685e-07, + "loss": 0.015, + "num_tokens": 24109685.0, + "reward": 0.8277587890625, + "reward_std": 0.014303512871265411, + "rewards//mean": 0.8277587890625, + "rewards//std": 0.023660877719521523, + "step": 3311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6624, + "grad_norm": 1.2763423919677734, + "kl": 0.27829344384372234, + "learning_rate": 2.6079338938476536e-07, + "loss": 0.0111, + "num_tokens": 24116949.0, + "reward": 0.84820556640625, + "reward_std": 0.012137096375226974, + "rewards//mean": 0.84820556640625, + "rewards//std": 0.023374037817120552, + "step": 3312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6626, + "grad_norm": 1.57493257522583, + "kl": 0.3600759916007519, + "learning_rate": 2.605147767767564e-07, + "loss": 0.0144, + "num_tokens": 24124189.0, + "reward": 0.869384765625, + "reward_std": 0.01887807622551918, + "rewards//mean": 0.869384765625, + "rewards//std": 0.021675176918506622, + "step": 3313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6628, + "grad_norm": 1.570204734802246, + "kl": 0.3051733020693064, + "learning_rate": 2.6023626063336665e-07, + "loss": 0.0122, + "num_tokens": 24131461.0, + "reward": 0.8494873046875, + "reward_std": 0.019801510497927666, + "rewards//mean": 0.8494873046875, + "rewards//std": 0.029927439987659454, + "step": 3314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.663, + "grad_norm": 1.5862704515457153, + "kl": 0.36812201514840126, + "learning_rate": 2.5995784106678263e-07, + "loss": 0.0147, + "num_tokens": 24138773.0, + "reward": 0.82330322265625, + "reward_std": 0.017632128670811653, + "rewards//mean": 0.82330322265625, + "rewards//std": 0.021324453875422478, + "step": 3315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6632, + "grad_norm": 1.5315563678741455, + "kl": 0.35809750854969025, + "learning_rate": 2.5967951818915136e-07, + "loss": 0.0143, + "num_tokens": 24146045.0, + "reward": 0.8006591796875, + "reward_std": 0.011417614296078682, + "rewards//mean": 0.8006591796875, + "rewards//std": 0.020087188109755516, + "step": 3316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6634, + "grad_norm": 1.4596165418624878, + "kl": 0.29938820749521255, + "learning_rate": 2.5940129211258146e-07, + "loss": 0.012, + "num_tokens": 24153389.0, + "reward": 0.82452392578125, + "reward_std": 0.017225507646799088, + "rewards//mean": 0.82452392578125, + "rewards//std": 0.03357618674635887, + "step": 3317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6636, + "grad_norm": 1.519888997077942, + "kl": 0.369255006313324, + "learning_rate": 2.591231629491423e-07, + "loss": 0.0148, + "num_tokens": 24160645.0, + "reward": 0.83367919921875, + "reward_std": 0.0165071003139019, + "rewards//mean": 0.83367919921875, + "rewards//std": 0.024516388773918152, + "step": 3318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.6638, + "grad_norm": 2.0054590702056885, + "kl": 0.4680979326367378, + "learning_rate": 2.5884513081086446e-07, + "loss": 0.0121, + "num_tokens": 24167836.0, + "reward": 0.8294677734375, + "reward_std": 0.012414091266691685, + "rewards//mean": 0.8294677734375, + "rewards//std": 0.01857425831258297, + "step": 3319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.664, + "grad_norm": 1.7503153085708618, + "kl": 0.4815998449921608, + "learning_rate": 2.585671958097389e-07, + "loss": 0.0192, + "num_tokens": 24175259.0, + "reward": 0.85546875, + "reward_std": 0.015581423416733742, + "rewards//mean": 0.85546875, + "rewards//std": 0.022423502057790756, + "step": 3320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6642, + "grad_norm": 1.6426572799682617, + "kl": 0.3175412565469742, + "learning_rate": 2.58289358057718e-07, + "loss": 0.0127, + "num_tokens": 24182555.0, + "reward": 0.84735107421875, + "reward_std": 0.012283123098313808, + "rewards//mean": 0.84735107421875, + "rewards//std": 0.01302650012075901, + "step": 3321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6644, + "grad_norm": 1.7141634225845337, + "kl": 0.3149235639721155, + "learning_rate": 2.5801161766671483e-07, + "loss": 0.0126, + "num_tokens": 24189803.0, + "reward": 0.847412109375, + "reward_std": 0.012439507991075516, + "rewards//mean": 0.847412109375, + "rewards//std": 0.014447091147303581, + "step": 3322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6646, + "grad_norm": 1.396909475326538, + "kl": 0.39039043337106705, + "learning_rate": 2.5773397474860325e-07, + "loss": 0.0156, + "num_tokens": 24197227.0, + "reward": 0.85015869140625, + "reward_std": 0.017944440245628357, + "rewards//mean": 0.85015869140625, + "rewards//std": 0.02033165469765663, + "step": 3323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6648, + "grad_norm": 1.8658056259155273, + "kl": 0.39718329906463623, + "learning_rate": 2.574564294152175e-07, + "loss": 0.0159, + "num_tokens": 24204467.0, + "reward": 0.842041015625, + "reward_std": 0.018129831179976463, + "rewards//mean": 0.842041015625, + "rewards//std": 0.020361509174108505, + "step": 3324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.665, + "grad_norm": 1.60467529296875, + "kl": 0.33686857111752033, + "learning_rate": 2.5717898177835296e-07, + "loss": 0.0135, + "num_tokens": 24211747.0, + "reward": 0.865234375, + "reward_std": 0.011318055912852287, + "rewards//mean": 0.865234375, + "rewards//std": 0.026374364271759987, + "step": 3325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6652, + "grad_norm": 2.0756189823150635, + "kl": 0.35783231630921364, + "learning_rate": 2.5690163194976573e-07, + "loss": 0.0143, + "num_tokens": 24219027.0, + "reward": 0.85009765625, + "reward_std": 0.01640629954636097, + "rewards//mean": 0.85009765625, + "rewards//std": 0.02384219877421856, + "step": 3326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6654, + "grad_norm": 1.5301231145858765, + "kl": 0.33117429353296757, + "learning_rate": 2.566243800411719e-07, + "loss": 0.0132, + "num_tokens": 24226291.0, + "reward": 0.81903076171875, + "reward_std": 0.014165002852678299, + "rewards//mean": 0.81903076171875, + "rewards//std": 0.022488806396722794, + "step": 3327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6656, + "grad_norm": 1.7750376462936401, + "kl": 0.34090059250593185, + "learning_rate": 2.563472261642486e-07, + "loss": 0.0136, + "num_tokens": 24233547.0, + "reward": 0.7879638671875, + "reward_std": 0.014189823530614376, + "rewards//mean": 0.7879638671875, + "rewards//std": 0.022525545209646225, + "step": 3328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6658, + "grad_norm": 2.032827138900757, + "kl": 0.30238004960119724, + "learning_rate": 2.5607017043063353e-07, + "loss": 0.0121, + "num_tokens": 24240891.0, + "reward": 0.79010009765625, + "reward_std": 0.014257235452532768, + "rewards//mean": 0.79010009765625, + "rewards//std": 0.021690448746085167, + "step": 3329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.666, + "grad_norm": 2.271120071411133, + "kl": 0.44035715609788895, + "learning_rate": 2.557932129519249e-07, + "loss": 0.0176, + "num_tokens": 24248179.0, + "reward": 0.8380126953125, + "reward_std": 0.015354428440332413, + "rewards//mean": 0.8380126953125, + "rewards//std": 0.01949983462691307, + "step": 3330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6662, + "grad_norm": 1.312538743019104, + "kl": 0.3278988040983677, + "learning_rate": 2.555163538396806e-07, + "loss": 0.0131, + "num_tokens": 24255459.0, + "reward": 0.8648681640625, + "reward_std": 0.01365148276090622, + "rewards//mean": 0.8648681640625, + "rewards//std": 0.022279584780335426, + "step": 3331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6664, + "grad_norm": 1.8481677770614624, + "kl": 0.40672912262380123, + "learning_rate": 2.552395932054198e-07, + "loss": 0.0163, + "num_tokens": 24262795.0, + "reward": 0.8612060546875, + "reward_std": 0.01710190437734127, + "rewards//mean": 0.8612060546875, + "rewards//std": 0.028056098148226738, + "step": 3332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6666, + "grad_norm": 2.5307178497314453, + "kl": 0.5367534086108208, + "learning_rate": 2.5496293116062153e-07, + "loss": 0.0215, + "num_tokens": 24270003.0, + "reward": 0.8663330078125, + "reward_std": 0.01664252020418644, + "rewards//mean": 0.8663330078125, + "rewards//std": 0.029301784932613373, + "step": 3333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6668, + "grad_norm": 1.6115626096725464, + "kl": 0.32246312499046326, + "learning_rate": 2.546863678167255e-07, + "loss": 0.0129, + "num_tokens": 24277323.0, + "reward": 0.8548583984375, + "reward_std": 0.01452319510281086, + "rewards//mean": 0.8548583984375, + "rewards//std": 0.021135948598384857, + "step": 3334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.667, + "grad_norm": 1.5646710395812988, + "kl": 0.3074373509734869, + "learning_rate": 2.5440990328513096e-07, + "loss": 0.0123, + "num_tokens": 24284507.0, + "reward": 0.783447265625, + "reward_std": 0.013565394096076488, + "rewards//mean": 0.783447265625, + "rewards//std": 0.01698956824839115, + "step": 3335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.6672, + "grad_norm": 1.6330097913742065, + "kl": 0.34860416129231453, + "learning_rate": 2.54133537677198e-07, + "loss": 0.0029, + "num_tokens": 24291756.0, + "reward": 0.82220458984375, + "reward_std": 0.01538397092372179, + "rewards//mean": 0.82220458984375, + "rewards//std": 0.02476397156715393, + "step": 3336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6674, + "grad_norm": 1.4099200963974, + "kl": 0.3086399082094431, + "learning_rate": 2.538572711042469e-07, + "loss": 0.0123, + "num_tokens": 24299004.0, + "reward": 0.87225341796875, + "reward_std": 0.018212981522083282, + "rewards//mean": 0.87225341796875, + "rewards//std": 0.024176908656954765, + "step": 3337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6676, + "grad_norm": 1.9577527046203613, + "kl": 0.44737326353788376, + "learning_rate": 2.535811036775574e-07, + "loss": 0.0179, + "num_tokens": 24306364.0, + "reward": 0.81298828125, + "reward_std": 0.013801174238324165, + "rewards//mean": 0.81298828125, + "rewards//std": 0.01884971745312214, + "step": 3338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6678, + "grad_norm": 1.629288911819458, + "kl": 0.3738246839493513, + "learning_rate": 2.5330503550837004e-07, + "loss": 0.015, + "num_tokens": 24313756.0, + "reward": 0.8670654296875, + "reward_std": 0.013373639434576035, + "rewards//mean": 0.8670654296875, + "rewards//std": 0.019425168633461, + "step": 3339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.668, + "grad_norm": 1.7904175519943237, + "kl": 0.3002842515707016, + "learning_rate": 2.530290667078846e-07, + "loss": 0.012, + "num_tokens": 24321180.0, + "reward": 0.83746337890625, + "reward_std": 0.01428382471203804, + "rewards//mean": 0.83746337890625, + "rewards//std": 0.019458161666989326, + "step": 3340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.6682, + "grad_norm": 1.6716864109039307, + "kl": 0.32853954285383224, + "learning_rate": 2.5275319738726165e-07, + "loss": 0.012, + "num_tokens": 24328398.0, + "reward": 0.8699951171875, + "reward_std": 0.01618291437625885, + "rewards//mean": 0.8699951171875, + "rewards//std": 0.019243525341153145, + "step": 3341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6684, + "grad_norm": 1.6014416217803955, + "kl": 0.37281153723597527, + "learning_rate": 2.524774276576214e-07, + "loss": 0.0149, + "num_tokens": 24335590.0, + "reward": 0.82879638671875, + "reward_std": 0.015488408505916595, + "rewards//mean": 0.82879638671875, + "rewards//std": 0.022256048396229744, + "step": 3342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6686, + "grad_norm": 1.4782586097717285, + "kl": 0.31855726800858974, + "learning_rate": 2.522017576300434e-07, + "loss": 0.0127, + "num_tokens": 24342902.0, + "reward": 0.84393310546875, + "reward_std": 0.019129393622279167, + "rewards//mean": 0.84393310546875, + "rewards//std": 0.027334269136190414, + "step": 3343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6688, + "grad_norm": 1.3596433401107788, + "kl": 0.31058645993471146, + "learning_rate": 2.519261874155679e-07, + "loss": 0.0124, + "num_tokens": 24350094.0, + "reward": 0.8414306640625, + "reward_std": 0.012397321872413158, + "rewards//mean": 0.8414306640625, + "rewards//std": 0.0200358759611845, + "step": 3344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.669, + "grad_norm": 1.5828421115875244, + "kl": 0.335276510566473, + "learning_rate": 2.5165071712519445e-07, + "loss": 0.0134, + "num_tokens": 24357326.0, + "reward": 0.8094482421875, + "reward_std": 0.015217301435768604, + "rewards//mean": 0.8094482421875, + "rewards//std": 0.0222768671810627, + "step": 3345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.6692, + "grad_norm": 1.5786091089248657, + "kl": 0.32618022337555885, + "learning_rate": 2.513753468698826e-07, + "loss": 0.0106, + "num_tokens": 24364785.0, + "reward": 0.83740234375, + "reward_std": 0.021744105964899063, + "rewards//mean": 0.83740234375, + "rewards//std": 0.025224339216947556, + "step": 3346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6694, + "grad_norm": 1.3627231121063232, + "kl": 0.3574681803584099, + "learning_rate": 2.5110007676055107e-07, + "loss": 0.0143, + "num_tokens": 24372129.0, + "reward": 0.8858642578125, + "reward_std": 0.01688794419169426, + "rewards//mean": 0.8858642578125, + "rewards//std": 0.02898598089814186, + "step": 3347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6696, + "grad_norm": 1.7497133016586304, + "kl": 0.2957992795854807, + "learning_rate": 2.508249069080789e-07, + "loss": 0.0118, + "num_tokens": 24379441.0, + "reward": 0.8199462890625, + "reward_std": 0.012227912433445454, + "rewards//mean": 0.8199462890625, + "rewards//std": 0.0180115457624197, + "step": 3348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6698, + "grad_norm": 1.658457636833191, + "kl": 0.3617372587323189, + "learning_rate": 2.5054983742330437e-07, + "loss": 0.0145, + "num_tokens": 24386729.0, + "reward": 0.8382568359375, + "reward_std": 0.014954591169953346, + "rewards//mean": 0.8382568359375, + "rewards//std": 0.0246632881462574, + "step": 3349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.67, + "grad_norm": 1.8366063833236694, + "kl": 0.3536721020936966, + "learning_rate": 2.5027486841702577e-07, + "loss": 0.0141, + "num_tokens": 24394041.0, + "reward": 0.7984619140625, + "reward_std": 0.015994684770703316, + "rewards//mean": 0.7984619140625, + "rewards//std": 0.01939709298312664, + "step": 3350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.6702, + "grad_norm": 1.7233126163482666, + "kl": 0.39778055250644684, + "learning_rate": 2.500000000000001e-07, + "loss": 0.0066, + "num_tokens": 24401281.0, + "reward": 0.83563232421875, + "reward_std": 0.013538007624447346, + "rewards//mean": 0.83563232421875, + "rewards//std": 0.01912783645093441, + "step": 3351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6704, + "grad_norm": 1.6127021312713623, + "kl": 0.3701545037329197, + "learning_rate": 2.497252322829445e-07, + "loss": 0.0148, + "num_tokens": 24408641.0, + "reward": 0.87451171875, + "reward_std": 0.009570627473294735, + "rewards//mean": 0.87451171875, + "rewards//std": 0.015477040782570839, + "step": 3352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6706, + "grad_norm": 1.467481255531311, + "kl": 0.32170594669878483, + "learning_rate": 2.494505653765354e-07, + "loss": 0.0129, + "num_tokens": 24415857.0, + "reward": 0.84765625, + "reward_std": 0.016957517713308334, + "rewards//mean": 0.84765625, + "rewards//std": 0.032204244285821915, + "step": 3353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6708, + "grad_norm": 1.4506394863128662, + "kl": 0.3965573310852051, + "learning_rate": 2.491759993914088e-07, + "loss": 0.0159, + "num_tokens": 24423193.0, + "reward": 0.818603515625, + "reward_std": 0.014395851641893387, + "rewards//mean": 0.818603515625, + "rewards//std": 0.016231337562203407, + "step": 3354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.671, + "grad_norm": 1.6084351539611816, + "kl": 0.312028331682086, + "learning_rate": 2.489015344381595e-07, + "loss": 0.0146, + "num_tokens": 24430467.0, + "reward": 0.86614990234375, + "reward_std": 0.016898423433303833, + "rewards//mean": 0.86614990234375, + "rewards//std": 0.022800985723733902, + "step": 3355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6712, + "grad_norm": 1.6359128952026367, + "kl": 0.32125888764858246, + "learning_rate": 2.4862717062734206e-07, + "loss": 0.0129, + "num_tokens": 24437691.0, + "reward": 0.85504150390625, + "reward_std": 0.012858632951974869, + "rewards//mean": 0.85504150390625, + "rewards//std": 0.019858581945300102, + "step": 3356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6714, + "grad_norm": 1.8337141275405884, + "kl": 0.3575122207403183, + "learning_rate": 2.4835290806947045e-07, + "loss": 0.0143, + "num_tokens": 24444963.0, + "reward": 0.85028076171875, + "reward_std": 0.017461087554693222, + "rewards//mean": 0.85028076171875, + "rewards//std": 0.0208688173443079, + "step": 3357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6716, + "grad_norm": 1.5951262712478638, + "kl": 0.3430583272129297, + "learning_rate": 2.4807874687501715e-07, + "loss": 0.0137, + "num_tokens": 24452235.0, + "reward": 0.86151123046875, + "reward_std": 0.01650039106607437, + "rewards//mean": 0.86151123046875, + "rewards//std": 0.020278723910450935, + "step": 3358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.6718, + "grad_norm": 1.4185596704483032, + "kl": 0.28047232888638973, + "learning_rate": 2.4780468715441457e-07, + "loss": 0.0116, + "num_tokens": 24459522.0, + "reward": 0.806884765625, + "reward_std": 0.012675801292061806, + "rewards//mean": 0.806884765625, + "rewards//std": 0.018000196665525436, + "step": 3359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.672, + "grad_norm": 1.3527525663375854, + "kl": 0.32800909504294395, + "learning_rate": 2.4753072901805376e-07, + "loss": 0.0131, + "num_tokens": 24466746.0, + "reward": 0.8485107421875, + "reward_std": 0.013112178072333336, + "rewards//mean": 0.8485107421875, + "rewards//std": 0.015144342556595802, + "step": 3360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6722, + "grad_norm": 1.5179802179336548, + "kl": 0.2806025594472885, + "learning_rate": 2.472568725762853e-07, + "loss": 0.0112, + "num_tokens": 24473946.0, + "reward": 0.847412109375, + "reward_std": 0.014167840592563152, + "rewards//mean": 0.847412109375, + "rewards//std": 0.018893033266067505, + "step": 3361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6724, + "grad_norm": 1.5144383907318115, + "kl": 0.34916589222848415, + "learning_rate": 2.469831179394182e-07, + "loss": 0.014, + "num_tokens": 24481226.0, + "reward": 0.8450927734375, + "reward_std": 0.014165692962706089, + "rewards//mean": 0.8450927734375, + "rewards//std": 0.014963340014219284, + "step": 3362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6726, + "grad_norm": 1.7372723817825317, + "kl": 0.3634151890873909, + "learning_rate": 2.467094652177209e-07, + "loss": 0.0145, + "num_tokens": 24488594.0, + "reward": 0.86126708984375, + "reward_std": 0.021439485251903534, + "rewards//mean": 0.86126708984375, + "rewards//std": 0.02536311186850071, + "step": 3363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6728, + "grad_norm": 1.6381868124008179, + "kl": 0.34993308037519455, + "learning_rate": 2.464359145214207e-07, + "loss": 0.014, + "num_tokens": 24495938.0, + "reward": 0.83026123046875, + "reward_std": 0.01388377882540226, + "rewards//mean": 0.83026123046875, + "rewards//std": 0.02519003488123417, + "step": 3364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.673, + "grad_norm": 1.8315589427947998, + "kl": 0.3253883942961693, + "learning_rate": 2.46162465960704e-07, + "loss": 0.013, + "num_tokens": 24503130.0, + "reward": 0.85491943359375, + "reward_std": 0.015423109754920006, + "rewards//mean": 0.85491943359375, + "rewards//std": 0.02254057675600052, + "step": 3365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6732, + "grad_norm": 1.5834486484527588, + "kl": 0.36262669041752815, + "learning_rate": 2.458891196457155e-07, + "loss": 0.0145, + "num_tokens": 24510362.0, + "reward": 0.845947265625, + "reward_std": 0.015622757375240326, + "rewards//mean": 0.845947265625, + "rewards//std": 0.018267326056957245, + "step": 3366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6734, + "grad_norm": 2.0493967533111572, + "kl": 0.38101140037178993, + "learning_rate": 2.4561587568655924e-07, + "loss": 0.0152, + "num_tokens": 24517722.0, + "reward": 0.81939697265625, + "reward_std": 0.01581387221813202, + "rewards//mean": 0.81939697265625, + "rewards//std": 0.019472159445285797, + "step": 3367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.6736, + "grad_norm": 1.5080883502960205, + "kl": 0.31202734261751175, + "learning_rate": 2.4534273419329775e-07, + "loss": -0.0108, + "num_tokens": 24524967.0, + "reward": 0.853515625, + "reward_std": 0.017900947481393814, + "rewards//mean": 0.853515625, + "rewards//std": 0.022649193182587624, + "step": 3368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6738, + "grad_norm": 2.2261621952056885, + "kl": 0.37484827637672424, + "learning_rate": 2.450696952759527e-07, + "loss": 0.015, + "num_tokens": 24532183.0, + "reward": 0.82879638671875, + "reward_std": 0.02799832820892334, + "rewards//mean": 0.82879638671875, + "rewards//std": 0.028734935447573662, + "step": 3369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.674, + "grad_norm": 1.4618104696273804, + "kl": 0.3214741386473179, + "learning_rate": 2.4479675904450376e-07, + "loss": 0.0046, + "num_tokens": 24539519.0, + "reward": 0.822021484375, + "reward_std": 0.01510897371917963, + "rewards//mean": 0.822021484375, + "rewards//std": 0.02083188109099865, + "step": 3370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.6742, + "grad_norm": 1.4792965650558472, + "kl": 0.3238056246191263, + "learning_rate": 2.4452392560888976e-07, + "loss": -0.0197, + "num_tokens": 24546765.0, + "reward": 0.8497314453125, + "reward_std": 0.019215725362300873, + "rewards//mean": 0.8497314453125, + "rewards//std": 0.03573344647884369, + "step": 3371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6744, + "grad_norm": 1.9247157573699951, + "kl": 0.42641132324934006, + "learning_rate": 2.442511950790081e-07, + "loss": 0.0171, + "num_tokens": 24554005.0, + "reward": 0.82598876953125, + "reward_std": 0.016987111419439316, + "rewards//mean": 0.82598876953125, + "rewards//std": 0.02123553492128849, + "step": 3372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6746, + "grad_norm": 1.5843431949615479, + "kl": 0.2748568691313267, + "learning_rate": 2.439785675647143e-07, + "loss": 0.011, + "num_tokens": 24561221.0, + "reward": 0.8193359375, + "reward_std": 0.01403682678937912, + "rewards//mean": 0.8193359375, + "rewards//std": 0.02046976424753666, + "step": 3373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6748, + "grad_norm": 1.4587664604187012, + "kl": 0.3280709907412529, + "learning_rate": 2.4370604317582286e-07, + "loss": 0.0131, + "num_tokens": 24568541.0, + "reward": 0.8458251953125, + "reward_std": 0.020944498479366302, + "rewards//mean": 0.8458251953125, + "rewards//std": 0.03138124197721481, + "step": 3374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.675, + "grad_norm": 1.6227076053619385, + "kl": 0.354659516364336, + "learning_rate": 2.4343362202210667e-07, + "loss": 0.0142, + "num_tokens": 24575749.0, + "reward": 0.869873046875, + "reward_std": 0.01588056981563568, + "rewards//mean": 0.869873046875, + "rewards//std": 0.023662477731704712, + "step": 3375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6752, + "grad_norm": 1.5619595050811768, + "kl": 0.340689517557621, + "learning_rate": 2.4316130421329696e-07, + "loss": 0.0136, + "num_tokens": 24582965.0, + "reward": 0.82452392578125, + "reward_std": 0.01519658975303173, + "rewards//mean": 0.82452392578125, + "rewards//std": 0.02331632934510708, + "step": 3376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.6754, + "grad_norm": 1.463582992553711, + "kl": 0.3575294353067875, + "learning_rate": 2.42889089859083e-07, + "loss": 0.0127, + "num_tokens": 24590387.0, + "reward": 0.87213134765625, + "reward_std": 0.01465057022869587, + "rewards//mean": 0.87213134765625, + "rewards//std": 0.020928936079144478, + "step": 3377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6756, + "grad_norm": 1.461673378944397, + "kl": 0.3891821000725031, + "learning_rate": 2.426169790691129e-07, + "loss": 0.0156, + "num_tokens": 24597651.0, + "reward": 0.8702392578125, + "reward_std": 0.011750398203730583, + "rewards//mean": 0.8702392578125, + "rewards//std": 0.015287602320313454, + "step": 3378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6758, + "grad_norm": 1.5639781951904297, + "kl": 0.3538023214787245, + "learning_rate": 2.4234497195299287e-07, + "loss": 0.0142, + "num_tokens": 24604955.0, + "reward": 0.78955078125, + "reward_std": 0.01672176644206047, + "rewards//mean": 0.78955078125, + "rewards//std": 0.027272796258330345, + "step": 3379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.676, + "grad_norm": 1.287794589996338, + "kl": 0.3413005117326975, + "learning_rate": 2.4207306862028753e-07, + "loss": 0.0137, + "num_tokens": 24612147.0, + "reward": 0.84881591796875, + "reward_std": 0.013586641289293766, + "rewards//mean": 0.84881591796875, + "rewards//std": 0.01889052800834179, + "step": 3380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.6762, + "grad_norm": 1.6363178491592407, + "kl": 0.3311314135789871, + "learning_rate": 2.418012691805191e-07, + "loss": 0.0126, + "num_tokens": 24619479.0, + "reward": 0.84527587890625, + "reward_std": 0.0165560320019722, + "rewards//mean": 0.84527587890625, + "rewards//std": 0.018722299486398697, + "step": 3381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6764, + "grad_norm": 1.388262391090393, + "kl": 0.3780644666403532, + "learning_rate": 2.4152957374316856e-07, + "loss": 0.0151, + "num_tokens": 24626799.0, + "reward": 0.84716796875, + "reward_std": 0.014772150665521622, + "rewards//mean": 0.84716796875, + "rewards//std": 0.021253438666462898, + "step": 3382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6766, + "grad_norm": 1.6545251607894897, + "kl": 0.30594930797815323, + "learning_rate": 2.412579824176748e-07, + "loss": 0.0122, + "num_tokens": 24634159.0, + "reward": 0.79522705078125, + "reward_std": 0.010048076510429382, + "rewards//mean": 0.79522705078125, + "rewards//std": 0.012504740618169308, + "step": 3383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6768, + "grad_norm": 1.6979551315307617, + "kl": 0.3786165751516819, + "learning_rate": 2.4098649531343494e-07, + "loss": 0.0151, + "num_tokens": 24641447.0, + "reward": 0.832275390625, + "reward_std": 0.01309158280491829, + "rewards//mean": 0.832275390625, + "rewards//std": 0.018000196665525436, + "step": 3384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.677, + "grad_norm": 1.8692289590835571, + "kl": 0.39148247614502907, + "learning_rate": 2.407151125398036e-07, + "loss": -0.0049, + "num_tokens": 24648636.0, + "reward": 0.8546142578125, + "reward_std": 0.01837162673473358, + "rewards//mean": 0.8546142578125, + "rewards//std": 0.029376082122325897, + "step": 3385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.6772, + "grad_norm": 2.2884111404418945, + "kl": 0.35730985924601555, + "learning_rate": 2.4044383420609406e-07, + "loss": 0.0003, + "num_tokens": 24656013.0, + "reward": 0.87799072265625, + "reward_std": 0.016604339703917503, + "rewards//mean": 0.87799072265625, + "rewards//std": 0.02452564798295498, + "step": 3386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6774, + "grad_norm": 1.6314008235931396, + "kl": 0.30153824388980865, + "learning_rate": 2.4017266042157695e-07, + "loss": 0.0121, + "num_tokens": 24663341.0, + "reward": 0.868408203125, + "reward_std": 0.017524391412734985, + "rewards//mean": 0.868408203125, + "rewards//std": 0.026674555614590645, + "step": 3387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6776, + "grad_norm": 2.8215317726135254, + "kl": 0.5455790087580681, + "learning_rate": 2.3990159129548133e-07, + "loss": 0.0218, + "num_tokens": 24670669.0, + "reward": 0.85626220703125, + "reward_std": 0.014430465176701546, + "rewards//mean": 0.85626220703125, + "rewards//std": 0.015919528901576996, + "step": 3388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6778, + "grad_norm": 1.5200897455215454, + "kl": 0.3141790460795164, + "learning_rate": 2.396306269369935e-07, + "loss": 0.0126, + "num_tokens": 24677877.0, + "reward": 0.79302978515625, + "reward_std": 0.01313895732164383, + "rewards//mean": 0.79302978515625, + "rewards//std": 0.019431693479418755, + "step": 3389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.678, + "grad_norm": 1.5584663152694702, + "kl": 0.33207246847450733, + "learning_rate": 2.393597674552579e-07, + "loss": 0.0133, + "num_tokens": 24685101.0, + "reward": 0.876708984375, + "reward_std": 0.017342913895845413, + "rewards//mean": 0.876708984375, + "rewards//std": 0.024408170953392982, + "step": 3390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.6782, + "grad_norm": 1.532261610031128, + "kl": 0.37302470207214355, + "learning_rate": 2.390890129593771e-07, + "loss": 0.0104, + "num_tokens": 24692402.0, + "reward": 0.8341064453125, + "reward_std": 0.017801471054553986, + "rewards//mean": 0.8341064453125, + "rewards//std": 0.023632710799574852, + "step": 3391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6784, + "grad_norm": 1.3377432823181152, + "kl": 0.3936217576265335, + "learning_rate": 2.3881836355841045e-07, + "loss": 0.0157, + "num_tokens": 24699674.0, + "reward": 0.83355712890625, + "reward_std": 0.014761611819267273, + "rewards//mean": 0.83355712890625, + "rewards//std": 0.019889049232006073, + "step": 3392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.6786, + "grad_norm": 1.6774464845657349, + "kl": 0.35206116549670696, + "learning_rate": 2.3854781936137576e-07, + "loss": 0.0091, + "num_tokens": 24706954.0, + "reward": 0.7911376953125, + "reward_std": 0.01684820093214512, + "rewards//mean": 0.7911376953125, + "rewards//std": 0.0243096724152565, + "step": 3393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6788, + "grad_norm": 1.8284640312194824, + "kl": 0.3688094951212406, + "learning_rate": 2.382773804772481e-07, + "loss": 0.0148, + "num_tokens": 24714210.0, + "reward": 0.83367919921875, + "reward_std": 0.013937513343989849, + "rewards//mean": 0.83367919921875, + "rewards//std": 0.01826639473438263, + "step": 3394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.679, + "grad_norm": 1.8849399089813232, + "kl": 0.4213009998202324, + "learning_rate": 2.380070470149605e-07, + "loss": 0.0169, + "num_tokens": 24721466.0, + "reward": 0.8817138671875, + "reward_std": 0.018475892022252083, + "rewards//mean": 0.8817138671875, + "rewards//std": 0.029252147302031517, + "step": 3395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.6792, + "grad_norm": 1.958656907081604, + "kl": 0.31500000320374966, + "learning_rate": 2.3773681908340282e-07, + "loss": 0.0136, + "num_tokens": 24728779.0, + "reward": 0.8570556640625, + "reward_std": 0.012463918887078762, + "rewards//mean": 0.8570556640625, + "rewards//std": 0.01982930861413479, + "step": 3396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6794, + "grad_norm": 1.43436598777771, + "kl": 0.2812935169786215, + "learning_rate": 2.3746669679142312e-07, + "loss": 0.0113, + "num_tokens": 24736107.0, + "reward": 0.8193359375, + "reward_std": 0.010599279776215553, + "rewards//mean": 0.8193359375, + "rewards//std": 0.020303435623645782, + "step": 3397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6796, + "grad_norm": 1.350469708442688, + "kl": 0.317795192822814, + "learning_rate": 2.3719668024782647e-07, + "loss": 0.0127, + "num_tokens": 24743419.0, + "reward": 0.8316650390625, + "reward_std": 0.009332253597676754, + "rewards//mean": 0.8316650390625, + "rewards//std": 0.019780389964580536, + "step": 3398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6798, + "grad_norm": 1.7785154581069946, + "kl": 0.3752956632524729, + "learning_rate": 2.369267695613758e-07, + "loss": 0.015, + "num_tokens": 24750691.0, + "reward": 0.7325439453125, + "reward_std": 0.014996150508522987, + "rewards//mean": 0.7325439453125, + "rewards//std": 0.017287665978074074, + "step": 3399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.68, + "grad_norm": 1.6544713973999023, + "kl": 0.3499097041785717, + "learning_rate": 2.3665696484079074e-07, + "loss": 0.014, + "num_tokens": 24757963.0, + "reward": 0.77069091796875, + "reward_std": 0.01270270999521017, + "rewards//mean": 0.77069091796875, + "rewards//std": 0.026754381135106087, + "step": 3400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6802, + "grad_norm": 1.7278589010238647, + "kl": 0.3455641232430935, + "learning_rate": 2.3638726619474875e-07, + "loss": 0.0138, + "num_tokens": 24765147.0, + "reward": 0.86376953125, + "reward_std": 0.01902509294450283, + "rewards//mean": 0.86376953125, + "rewards//std": 0.027878835797309875, + "step": 3401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6804, + "grad_norm": 1.3494011163711548, + "kl": 0.36950724199414253, + "learning_rate": 2.361176737318844e-07, + "loss": 0.0148, + "num_tokens": 24772387.0, + "reward": 0.871337890625, + "reward_std": 0.014583177864551544, + "rewards//mean": 0.871337890625, + "rewards//std": 0.021484375, + "step": 3402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6806, + "grad_norm": 1.8784067630767822, + "kl": 0.38696929067373276, + "learning_rate": 2.3584818756078968e-07, + "loss": 0.0155, + "num_tokens": 24779643.0, + "reward": 0.83917236328125, + "reward_std": 0.011990130878984928, + "rewards//mean": 0.83917236328125, + "rewards//std": 0.017995886504650116, + "step": 3403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6808, + "grad_norm": 1.8709981441497803, + "kl": 0.4126734547317028, + "learning_rate": 2.355788077900132e-07, + "loss": 0.0165, + "num_tokens": 24786995.0, + "reward": 0.8243408203125, + "reward_std": 0.012466974556446075, + "rewards//mean": 0.8243408203125, + "rewards//std": 0.01840067096054554, + "step": 3404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.681, + "grad_norm": 1.5004527568817139, + "kl": 0.3033923674374819, + "learning_rate": 2.353095345280614e-07, + "loss": 0.0072, + "num_tokens": 24794271.0, + "reward": 0.74359130859375, + "reward_std": 0.011104302480816841, + "rewards//mean": 0.74359130859375, + "rewards//std": 0.02241666615009308, + "step": 3405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6812, + "grad_norm": 1.3706387281417847, + "kl": 0.3576857000589371, + "learning_rate": 2.350403678833976e-07, + "loss": 0.0143, + "num_tokens": 24801551.0, + "reward": 0.87060546875, + "reward_std": 0.01077396608889103, + "rewards//mean": 0.87060546875, + "rewards//std": 0.012748263776302338, + "step": 3406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6814, + "grad_norm": 1.4488413333892822, + "kl": 0.35112638399004936, + "learning_rate": 2.3477130796444173e-07, + "loss": 0.014, + "num_tokens": 24808951.0, + "reward": 0.81988525390625, + "reward_std": 0.013610748574137688, + "rewards//mean": 0.81988525390625, + "rewards//std": 0.017608992755413055, + "step": 3407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6816, + "grad_norm": 1.992266058921814, + "kl": 0.357315506786108, + "learning_rate": 2.3450235487957133e-07, + "loss": 0.0143, + "num_tokens": 24816143.0, + "reward": 0.842529296875, + "reward_std": 0.01763848215341568, + "rewards//mean": 0.842529296875, + "rewards//std": 0.021797746419906616, + "step": 3408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6818, + "grad_norm": 1.3869050741195679, + "kl": 0.3581321854144335, + "learning_rate": 2.3423350873712054e-07, + "loss": 0.0143, + "num_tokens": 24823439.0, + "reward": 0.83013916015625, + "reward_std": 0.01724432222545147, + "rewards//mean": 0.83013916015625, + "rewards//std": 0.022777073085308075, + "step": 3409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.682, + "grad_norm": 1.5114471912384033, + "kl": 0.32229516468942165, + "learning_rate": 2.3396476964538093e-07, + "loss": 0.0107, + "num_tokens": 24830723.0, + "reward": 0.8504638671875, + "reward_std": 0.01840103045105934, + "rewards//mean": 0.8504638671875, + "rewards//std": 0.030389223247766495, + "step": 3410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6822, + "grad_norm": 1.474536657333374, + "kl": 0.3266499936580658, + "learning_rate": 2.3369613771260005e-07, + "loss": 0.0131, + "num_tokens": 24837987.0, + "reward": 0.8590087890625, + "reward_std": 0.01390366442501545, + "rewards//mean": 0.8590087890625, + "rewards//std": 0.018172195181250572, + "step": 3411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6824, + "grad_norm": 1.4181731939315796, + "kl": 0.2977987118065357, + "learning_rate": 2.334276130469831e-07, + "loss": 0.0119, + "num_tokens": 24845235.0, + "reward": 0.859375, + "reward_std": 0.013834409415721893, + "rewards//mean": 0.859375, + "rewards//std": 0.02232607826590538, + "step": 3412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6826, + "grad_norm": 1.5122581720352173, + "kl": 0.3866444565355778, + "learning_rate": 2.331591957566917e-07, + "loss": 0.0155, + "num_tokens": 24852483.0, + "reward": 0.84228515625, + "reward_std": 0.018405165523290634, + "rewards//mean": 0.84228515625, + "rewards//std": 0.02384219877421856, + "step": 3413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6828, + "grad_norm": 1.5532768964767456, + "kl": 0.3509260378777981, + "learning_rate": 2.328908859498445e-07, + "loss": 0.014, + "num_tokens": 24859675.0, + "reward": 0.8653564453125, + "reward_std": 0.016006819903850555, + "rewards//mean": 0.8653564453125, + "rewards//std": 0.02303062006831169, + "step": 3414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.683, + "grad_norm": 1.555235743522644, + "kl": 0.3570924624800682, + "learning_rate": 2.3262268373451637e-07, + "loss": 0.0143, + "num_tokens": 24866963.0, + "reward": 0.87139892578125, + "reward_std": 0.019061213359236717, + "rewards//mean": 0.87139892578125, + "rewards//std": 0.023149527609348297, + "step": 3415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6832, + "grad_norm": 1.535149097442627, + "kl": 0.34803637489676476, + "learning_rate": 2.3235458921873923e-07, + "loss": 0.0139, + "num_tokens": 24874227.0, + "reward": 0.88189697265625, + "reward_std": 0.018231231719255447, + "rewards//mean": 0.88189697265625, + "rewards//std": 0.027715394273400307, + "step": 3416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6834, + "grad_norm": 1.7085022926330566, + "kl": 0.40615370497107506, + "learning_rate": 2.3208660251050156e-07, + "loss": 0.0162, + "num_tokens": 24881563.0, + "reward": 0.87335205078125, + "reward_std": 0.01354163233190775, + "rewards//mean": 0.87335205078125, + "rewards//std": 0.01821327768266201, + "step": 3417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6836, + "grad_norm": 1.7772457599639893, + "kl": 0.33620760217309, + "learning_rate": 2.3181872371774853e-07, + "loss": 0.0134, + "num_tokens": 24888811.0, + "reward": 0.85577392578125, + "reward_std": 0.018870726227760315, + "rewards//mean": 0.85577392578125, + "rewards//std": 0.02327474020421505, + "step": 3418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6838, + "grad_norm": 1.6104331016540527, + "kl": 0.33204309083521366, + "learning_rate": 2.3155095294838133e-07, + "loss": 0.0133, + "num_tokens": 24896083.0, + "reward": 0.768310546875, + "reward_std": 0.01458577997982502, + "rewards//mean": 0.768310546875, + "rewards//std": 0.0184913519769907, + "step": 3419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.684, + "grad_norm": 1.5389004945755005, + "kl": 0.371082603931427, + "learning_rate": 2.3128329031025818e-07, + "loss": 0.0148, + "num_tokens": 24903463.0, + "reward": 0.84588623046875, + "reward_std": 0.017719076946377754, + "rewards//mean": 0.84588623046875, + "rewards//std": 0.0254958588629961, + "step": 3420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6842, + "grad_norm": 1.5601516962051392, + "kl": 0.3913302980363369, + "learning_rate": 2.310157359111938e-07, + "loss": 0.0157, + "num_tokens": 24910775.0, + "reward": 0.86029052734375, + "reward_std": 0.015278054401278496, + "rewards//mean": 0.86029052734375, + "rewards//std": 0.026224639266729355, + "step": 3421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6844, + "grad_norm": 1.884227991104126, + "kl": 0.27482255920767784, + "learning_rate": 2.3074828985895855e-07, + "loss": 0.011, + "num_tokens": 24918191.0, + "reward": 0.86663818359375, + "reward_std": 0.014936678111553192, + "rewards//mean": 0.86663818359375, + "rewards//std": 0.02403813786804676, + "step": 3422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6846, + "grad_norm": 1.357208490371704, + "kl": 0.3662238083779812, + "learning_rate": 2.3048095226128017e-07, + "loss": 0.0146, + "num_tokens": 24925479.0, + "reward": 0.80889892578125, + "reward_std": 0.01282556727528572, + "rewards//mean": 0.80889892578125, + "rewards//std": 0.016838204115629196, + "step": 3423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.6848, + "grad_norm": 1.7165613174438477, + "kl": 0.3657785393297672, + "learning_rate": 2.3021372322584183e-07, + "loss": 0.0141, + "num_tokens": 24932731.0, + "reward": 0.85748291015625, + "reward_std": 0.014774312265217304, + "rewards//mean": 0.85748291015625, + "rewards//std": 0.019035018980503082, + "step": 3424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.685, + "grad_norm": 1.561250925064087, + "kl": 0.37255205400288105, + "learning_rate": 2.2994660286028345e-07, + "loss": 0.0149, + "num_tokens": 24940003.0, + "reward": 0.86767578125, + "reward_std": 0.013757719658315182, + "rewards//mean": 0.86767578125, + "rewards//std": 0.016015399247407913, + "step": 3425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6852, + "grad_norm": 1.3843857049942017, + "kl": 0.30326351523399353, + "learning_rate": 2.2967959127220137e-07, + "loss": 0.0121, + "num_tokens": 24947371.0, + "reward": 0.82489013671875, + "reward_std": 0.014273213222622871, + "rewards//mean": 0.82489013671875, + "rewards//std": 0.016682857647538185, + "step": 3426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.6854, + "grad_norm": 1.5046318769454956, + "kl": 0.35221109725534916, + "learning_rate": 2.2941268856914743e-07, + "loss": 0.0139, + "num_tokens": 24954719.0, + "reward": 0.79248046875, + "reward_std": 0.011699959635734558, + "rewards//mean": 0.79248046875, + "rewards//std": 0.018977774307131767, + "step": 3427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.6856, + "grad_norm": 1.5441726446151733, + "kl": 0.3687676526606083, + "learning_rate": 2.2914589485863012e-07, + "loss": 0.0081, + "num_tokens": 24962049.0, + "reward": 0.81427001953125, + "reward_std": 0.014187732711434364, + "rewards//mean": 0.81427001953125, + "rewards//std": 0.02056337520480156, + "step": 3428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6858, + "grad_norm": 1.6546146869659424, + "kl": 0.3425767831504345, + "learning_rate": 2.2887921024811402e-07, + "loss": 0.0137, + "num_tokens": 24969273.0, + "reward": 0.87188720703125, + "reward_std": 0.019593972712755203, + "rewards//mean": 0.87188720703125, + "rewards//std": 0.023366913199424744, + "step": 3429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.686, + "grad_norm": 1.8542805910110474, + "kl": 0.33133431524038315, + "learning_rate": 2.2861263484501974e-07, + "loss": 0.0133, + "num_tokens": 24976513.0, + "reward": 0.88372802734375, + "reward_std": 0.015349366702139378, + "rewards//mean": 0.88372802734375, + "rewards//std": 0.02546556107699871, + "step": 3430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6862, + "grad_norm": 1.6956470012664795, + "kl": 0.39695439115166664, + "learning_rate": 2.283461687567236e-07, + "loss": 0.0159, + "num_tokens": 24983801.0, + "reward": 0.8582763671875, + "reward_std": 0.01838843524456024, + "rewards//mean": 0.8582763671875, + "rewards//std": 0.030785147100687027, + "step": 3431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.6864, + "grad_norm": 1.480758547782898, + "kl": 0.3533215634524822, + "learning_rate": 2.280798120905581e-07, + "loss": 0.0098, + "num_tokens": 24991039.0, + "reward": 0.88043212890625, + "reward_std": 0.021412014961242676, + "rewards//mean": 0.88043212890625, + "rewards//std": 0.028760738670825958, + "step": 3432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6866, + "grad_norm": 1.6740379333496094, + "kl": 0.4212347362190485, + "learning_rate": 2.278135649538118e-07, + "loss": 0.0168, + "num_tokens": 24998335.0, + "reward": 0.794189453125, + "reward_std": 0.01683362014591694, + "rewards//mean": 0.794189453125, + "rewards//std": 0.019977238029241562, + "step": 3433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.6868, + "grad_norm": 1.9179232120513916, + "kl": 0.4075439302250743, + "learning_rate": 2.275474274537292e-07, + "loss": 0.0115, + "num_tokens": 25005688.0, + "reward": 0.84637451171875, + "reward_std": 0.020247239619493484, + "rewards//mean": 0.84637451171875, + "rewards//std": 0.023847833275794983, + "step": 3434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.687, + "grad_norm": 1.4945164918899536, + "kl": 0.39327606186270714, + "learning_rate": 2.2728139969751003e-07, + "loss": 0.0157, + "num_tokens": 25013000.0, + "reward": 0.8388671875, + "reward_std": 0.017578523606061935, + "rewards//mean": 0.8388671875, + "rewards//std": 0.02265988476574421, + "step": 3435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6872, + "grad_norm": 1.835587978363037, + "kl": 0.41731430403888226, + "learning_rate": 2.2701548179231046e-07, + "loss": 0.0167, + "num_tokens": 25020304.0, + "reward": 0.80438232421875, + "reward_std": 0.011394859291613102, + "rewards//mean": 0.80438232421875, + "rewards//std": 0.014801996760070324, + "step": 3436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.625, + "epoch": 0.6874, + "grad_norm": 1.5287927389144897, + "kl": 0.3366098292171955, + "learning_rate": 2.2674967384524234e-07, + "loss": 0.0079, + "num_tokens": 25027472.0, + "reward": 0.8416748046875, + "reward_std": 0.011501381173729897, + "rewards//mean": 0.8416748046875, + "rewards//std": 0.01626254990696907, + "step": 3437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6876, + "grad_norm": 1.5034418106079102, + "kl": 0.347249923273921, + "learning_rate": 2.2648397596337276e-07, + "loss": 0.0139, + "num_tokens": 25034728.0, + "reward": 0.87030029296875, + "reward_std": 0.015574520453810692, + "rewards//mean": 0.87030029296875, + "rewards//std": 0.020928213372826576, + "step": 3438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.6878, + "grad_norm": 1.363628625869751, + "kl": 0.3053191378712654, + "learning_rate": 2.262183882537249e-07, + "loss": 0.0072, + "num_tokens": 25042095.0, + "reward": 0.84210205078125, + "reward_std": 0.01557998452335596, + "rewards//mean": 0.84210205078125, + "rewards//std": 0.020970122888684273, + "step": 3439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.688, + "grad_norm": 1.6492390632629395, + "kl": 0.3424539603292942, + "learning_rate": 2.2595291082327762e-07, + "loss": 0.0137, + "num_tokens": 25049359.0, + "reward": 0.84637451171875, + "reward_std": 0.01565014198422432, + "rewards//mean": 0.84637451171875, + "rewards//std": 0.021766386926174164, + "step": 3440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6882, + "grad_norm": 1.484063744544983, + "kl": 0.34932128712534904, + "learning_rate": 2.2568754377896515e-07, + "loss": 0.014, + "num_tokens": 25056551.0, + "reward": 0.82427978515625, + "reward_std": 0.01403588242828846, + "rewards//mean": 0.82427978515625, + "rewards//std": 0.017420578747987747, + "step": 3441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6884, + "grad_norm": 1.2333842515945435, + "kl": 0.3453822210431099, + "learning_rate": 2.2542228722767714e-07, + "loss": 0.0138, + "num_tokens": 25063879.0, + "reward": 0.7861328125, + "reward_std": 0.01629617251455784, + "rewards//mean": 0.7861328125, + "rewards//std": 0.02463657222688198, + "step": 3442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6886, + "grad_norm": 1.2210693359375, + "kl": 0.30534128099679947, + "learning_rate": 2.2515714127625897e-07, + "loss": 0.0122, + "num_tokens": 25071127.0, + "reward": 0.86859130859375, + "reward_std": 0.014572561718523502, + "rewards//mean": 0.86859130859375, + "rewards//std": 0.023492898792028427, + "step": 3443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6888, + "grad_norm": 1.8086919784545898, + "kl": 0.40979509241878986, + "learning_rate": 2.2489210603151144e-07, + "loss": 0.0164, + "num_tokens": 25078447.0, + "reward": 0.880126953125, + "reward_std": 0.01568550243973732, + "rewards//mean": 0.880126953125, + "rewards//std": 0.024447832256555557, + "step": 3444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.689, + "grad_norm": 1.5911990404129028, + "kl": 0.37847867608070374, + "learning_rate": 2.2462718160019083e-07, + "loss": 0.0146, + "num_tokens": 25085776.0, + "reward": 0.889404296875, + "reward_std": 0.017549503594636917, + "rewards//mean": 0.889404296875, + "rewards//std": 0.029584812000393867, + "step": 3445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6892, + "grad_norm": 1.6356014013290405, + "kl": 0.3234760910272598, + "learning_rate": 2.2436236808900844e-07, + "loss": 0.0129, + "num_tokens": 25093096.0, + "reward": 0.84686279296875, + "reward_std": 0.013338619843125343, + "rewards//mean": 0.84686279296875, + "rewards//std": 0.021911947056651115, + "step": 3446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6894, + "grad_norm": 1.5529032945632935, + "kl": 0.31044621020555496, + "learning_rate": 2.2409766560463118e-07, + "loss": 0.0124, + "num_tokens": 25100448.0, + "reward": 0.8880615234375, + "reward_std": 0.01218743622303009, + "rewards//mean": 0.8880615234375, + "rewards//std": 0.017950933426618576, + "step": 3447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6896, + "grad_norm": 1.6559174060821533, + "kl": 0.36546747013926506, + "learning_rate": 2.238330742536812e-07, + "loss": 0.0146, + "num_tokens": 25107848.0, + "reward": 0.83331298828125, + "reward_std": 0.016464602202177048, + "rewards//mean": 0.83331298828125, + "rewards//std": 0.020764101296663284, + "step": 3448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6898, + "grad_norm": 1.6358357667922974, + "kl": 0.36697230860590935, + "learning_rate": 2.235685941427361e-07, + "loss": 0.0147, + "num_tokens": 25115104.0, + "reward": 0.83636474609375, + "reward_std": 0.017136581242084503, + "rewards//mean": 0.83636474609375, + "rewards//std": 0.0202211644500494, + "step": 3449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.69, + "grad_norm": 1.5534558296203613, + "kl": 0.278915099799633, + "learning_rate": 2.23304225378328e-07, + "loss": 0.0112, + "num_tokens": 25122416.0, + "reward": 0.81854248046875, + "reward_std": 0.011081382632255554, + "rewards//mean": 0.81854248046875, + "rewards//std": 0.019437145441770554, + "step": 3450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6902, + "grad_norm": 1.7992950677871704, + "kl": 0.348477054387331, + "learning_rate": 2.2303996806694486e-07, + "loss": 0.0139, + "num_tokens": 25129640.0, + "reward": 0.82965087890625, + "reward_std": 0.02125668153166771, + "rewards//mean": 0.82965087890625, + "rewards//std": 0.027530165389180183, + "step": 3451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6904, + "grad_norm": 1.2661195993423462, + "kl": 0.3300876934081316, + "learning_rate": 2.227758223150296e-07, + "loss": 0.0132, + "num_tokens": 25136912.0, + "reward": 0.85394287109375, + "reward_std": 0.02243421971797943, + "rewards//mean": 0.85394287109375, + "rewards//std": 0.02510998211801052, + "step": 3452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6906, + "grad_norm": 1.4975945949554443, + "kl": 0.3546902984380722, + "learning_rate": 2.2251178822897987e-07, + "loss": 0.0142, + "num_tokens": 25144144.0, + "reward": 0.8758544921875, + "reward_std": 0.010821288451552391, + "rewards//mean": 0.8758544921875, + "rewards//std": 0.020908402279019356, + "step": 3453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.6908, + "grad_norm": 1.9828728437423706, + "kl": 0.3777780532836914, + "learning_rate": 2.222478659151486e-07, + "loss": 0.0164, + "num_tokens": 25151439.0, + "reward": 0.8702392578125, + "reward_std": 0.015088662505149841, + "rewards//mean": 0.8702392578125, + "rewards//std": 0.02111588604748249, + "step": 3454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.691, + "grad_norm": 1.467873215675354, + "kl": 0.3748072274029255, + "learning_rate": 2.2198405547984371e-07, + "loss": 0.015, + "num_tokens": 25158671.0, + "reward": 0.90185546875, + "reward_std": 0.01606157422065735, + "rewards//mean": 0.90185546875, + "rewards//std": 0.02157016471028328, + "step": 3455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6912, + "grad_norm": 1.554078221321106, + "kl": 0.33443619683384895, + "learning_rate": 2.2172035702932823e-07, + "loss": 0.0134, + "num_tokens": 25165951.0, + "reward": 0.84271240234375, + "reward_std": 0.015253824181854725, + "rewards//mean": 0.84271240234375, + "rewards//std": 0.02177959680557251, + "step": 3456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6914, + "grad_norm": 1.4028970003128052, + "kl": 0.39325525239109993, + "learning_rate": 2.2145677066981945e-07, + "loss": 0.0157, + "num_tokens": 25173095.0, + "reward": 0.82330322265625, + "reward_std": 0.01677590236067772, + "rewards//mean": 0.82330322265625, + "rewards//std": 0.02480061911046505, + "step": 3457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6916, + "grad_norm": 1.4573920965194702, + "kl": 0.2943352907896042, + "learning_rate": 2.2119329650749018e-07, + "loss": 0.0118, + "num_tokens": 25180375.0, + "reward": 0.8323974609375, + "reward_std": 0.015458826906979084, + "rewards//mean": 0.8323974609375, + "rewards//std": 0.016714246943593025, + "step": 3458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.6918, + "grad_norm": 1.6016204357147217, + "kl": 0.3772869408130646, + "learning_rate": 2.209299346484677e-07, + "loss": 0.0156, + "num_tokens": 25187682.0, + "reward": 0.8848876953125, + "reward_std": 0.018125997856259346, + "rewards//mean": 0.8848876953125, + "rewards//std": 0.023560861125588417, + "step": 3459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.692, + "grad_norm": 1.3570523262023926, + "kl": 0.2944999821484089, + "learning_rate": 2.2066668519883436e-07, + "loss": 0.0118, + "num_tokens": 25195074.0, + "reward": 0.84466552734375, + "reward_std": 0.011828730814158916, + "rewards//mean": 0.84466552734375, + "rewards//std": 0.017101354897022247, + "step": 3460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6922, + "grad_norm": 1.4208682775497437, + "kl": 0.31906696408987045, + "learning_rate": 2.2040354826462664e-07, + "loss": 0.0128, + "num_tokens": 25202330.0, + "reward": 0.827392578125, + "reward_std": 0.012884523719549179, + "rewards//mean": 0.827392578125, + "rewards//std": 0.017173897475004196, + "step": 3461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6924, + "grad_norm": 1.5002505779266357, + "kl": 0.32600685581564903, + "learning_rate": 2.2014052395183623e-07, + "loss": 0.013, + "num_tokens": 25209586.0, + "reward": 0.85589599609375, + "reward_std": 0.013070065528154373, + "rewards//mean": 0.85589599609375, + "rewards//std": 0.03043420985341072, + "step": 3462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6926, + "grad_norm": 3.838951349258423, + "kl": 0.5269115604460239, + "learning_rate": 2.1987761236640933e-07, + "loss": 0.0211, + "num_tokens": 25216922.0, + "reward": 0.8455810546875, + "reward_std": 0.013903395272791386, + "rewards//mean": 0.8455810546875, + "rewards//std": 0.017146991565823555, + "step": 3463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6928, + "grad_norm": 1.7123945951461792, + "kl": 0.32215647026896477, + "learning_rate": 2.1961481361424683e-07, + "loss": 0.0129, + "num_tokens": 25224194.0, + "reward": 0.760498046875, + "reward_std": 0.010403743013739586, + "rewards//mean": 0.760498046875, + "rewards//std": 0.017032282426953316, + "step": 3464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.693, + "grad_norm": 1.4475750923156738, + "kl": 0.2978465761989355, + "learning_rate": 2.1935212780120365e-07, + "loss": 0.0119, + "num_tokens": 25231490.0, + "reward": 0.86956787109375, + "reward_std": 0.01638307236135006, + "rewards//mean": 0.86956787109375, + "rewards//std": 0.021379753947257996, + "step": 3465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6932, + "grad_norm": 1.689426064491272, + "kl": 0.3654649555683136, + "learning_rate": 2.190895550330899e-07, + "loss": 0.0146, + "num_tokens": 25238770.0, + "reward": 0.86590576171875, + "reward_std": 0.015347320586442947, + "rewards//mean": 0.86590576171875, + "rewards//std": 0.023756252601742744, + "step": 3466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6934, + "grad_norm": 1.8840895891189575, + "kl": 0.37762103602290154, + "learning_rate": 2.1882709541566996e-07, + "loss": 0.0151, + "num_tokens": 25246058.0, + "reward": 0.83563232421875, + "reward_std": 0.023698166012763977, + "rewards//mean": 0.83563232421875, + "rewards//std": 0.0351480171084404, + "step": 3467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6936, + "grad_norm": 1.2689526081085205, + "kl": 0.28077370673418045, + "learning_rate": 2.1856474905466215e-07, + "loss": 0.0112, + "num_tokens": 25253298.0, + "reward": 0.863037109375, + "reward_std": 0.017999624833464622, + "rewards//mean": 0.863037109375, + "rewards//std": 0.026801373809576035, + "step": 3468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6938, + "grad_norm": 1.647396206855774, + "kl": 0.4570025075227022, + "learning_rate": 2.1830251605573978e-07, + "loss": 0.0183, + "num_tokens": 25260642.0, + "reward": 0.871826171875, + "reward_std": 0.017803125083446503, + "rewards//mean": 0.871826171875, + "rewards//std": 0.021963784471154213, + "step": 3469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.694, + "grad_norm": 1.4995328187942505, + "kl": 0.27994328178465366, + "learning_rate": 2.1804039652453028e-07, + "loss": 0.0122, + "num_tokens": 25267957.0, + "reward": 0.83551025390625, + "reward_std": 0.014663366600871086, + "rewards//mean": 0.83551025390625, + "rewards//std": 0.018310263752937317, + "step": 3470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6942, + "grad_norm": 1.6787819862365723, + "kl": 0.4226686954498291, + "learning_rate": 2.177783905666155e-07, + "loss": 0.0169, + "num_tokens": 25275261.0, + "reward": 0.86279296875, + "reward_std": 0.017063170671463013, + "rewards//mean": 0.86279296875, + "rewards//std": 0.02954077534377575, + "step": 3471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6944, + "grad_norm": 1.4054216146469116, + "kl": 0.3033318482339382, + "learning_rate": 2.1751649828753106e-07, + "loss": 0.0121, + "num_tokens": 25282509.0, + "reward": 0.84027099609375, + "reward_std": 0.014069569297134876, + "rewards//mean": 0.84027099609375, + "rewards//std": 0.02010102942585945, + "step": 3472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6946, + "grad_norm": 1.3412929773330688, + "kl": 0.2862650789320469, + "learning_rate": 2.1725471979276734e-07, + "loss": 0.0115, + "num_tokens": 25289781.0, + "reward": 0.823486328125, + "reward_std": 0.015006620436906815, + "rewards//mean": 0.823486328125, + "rewards//std": 0.031634196639060974, + "step": 3473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6948, + "grad_norm": 1.8939235210418701, + "kl": 0.3449276275932789, + "learning_rate": 2.1699305518776868e-07, + "loss": 0.0138, + "num_tokens": 25297117.0, + "reward": 0.859619140625, + "reward_std": 0.01374067272990942, + "rewards//mean": 0.859619140625, + "rewards//std": 0.016453644260764122, + "step": 3474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.695, + "grad_norm": 1.5907984972000122, + "kl": 0.2753186244517565, + "learning_rate": 2.1673150457793372e-07, + "loss": 0.0045, + "num_tokens": 25304362.0, + "reward": 0.865478515625, + "reward_std": 0.014406929723918438, + "rewards//mean": 0.865478515625, + "rewards//std": 0.01872563548386097, + "step": 3475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6952, + "grad_norm": 1.7227190732955933, + "kl": 0.46692075952887535, + "learning_rate": 2.1647006806861469e-07, + "loss": 0.0187, + "num_tokens": 25311634.0, + "reward": 0.888671875, + "reward_std": 0.015305186621844769, + "rewards//mean": 0.888671875, + "rewards//std": 0.02481290139257908, + "step": 3476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6954, + "grad_norm": 1.6155227422714233, + "kl": 0.32377086393535137, + "learning_rate": 2.1620874576511827e-07, + "loss": 0.013, + "num_tokens": 25318994.0, + "reward": 0.86663818359375, + "reward_std": 0.016484931111335754, + "rewards//mean": 0.86663818359375, + "rewards//std": 0.028441058471798897, + "step": 3477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6956, + "grad_norm": 1.6595011949539185, + "kl": 0.3066395912319422, + "learning_rate": 2.1594753777270513e-07, + "loss": 0.0123, + "num_tokens": 25326226.0, + "reward": 0.83807373046875, + "reward_std": 0.012160157784819603, + "rewards//mean": 0.83807373046875, + "rewards//std": 0.018294548615813255, + "step": 3478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6958, + "grad_norm": 1.664064884185791, + "kl": 0.4273916855454445, + "learning_rate": 2.1568644419659003e-07, + "loss": 0.0171, + "num_tokens": 25333666.0, + "reward": 0.8245849609375, + "reward_std": 0.013808042742311954, + "rewards//mean": 0.8245849609375, + "rewards//std": 0.021003752946853638, + "step": 3479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.696, + "grad_norm": 1.4307516813278198, + "kl": 0.35115180909633636, + "learning_rate": 2.15425465141941e-07, + "loss": 0.014, + "num_tokens": 25341066.0, + "reward": 0.836181640625, + "reward_std": 0.011872978881001472, + "rewards//mean": 0.836181640625, + "rewards//std": 0.022018853574991226, + "step": 3480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6962, + "grad_norm": 2.223552942276001, + "kl": 0.46504972875118256, + "learning_rate": 2.151646007138806e-07, + "loss": 0.0186, + "num_tokens": 25348330.0, + "reward": 0.85028076171875, + "reward_std": 0.019216172397136688, + "rewards//mean": 0.85028076171875, + "rewards//std": 0.03115728124976158, + "step": 3481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.6964, + "grad_norm": 1.6955410242080688, + "kl": 0.4373819213360548, + "learning_rate": 2.1490385101748516e-07, + "loss": 0.0107, + "num_tokens": 25355632.0, + "reward": 0.8826904296875, + "reward_std": 0.019906047731637955, + "rewards//mean": 0.8826904296875, + "rewards//std": 0.028973445296287537, + "step": 3482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6966, + "grad_norm": 1.497357726097107, + "kl": 0.3524985574185848, + "learning_rate": 2.146432161577842e-07, + "loss": 0.0141, + "num_tokens": 25362864.0, + "reward": 0.860107421875, + "reward_std": 0.01664668135344982, + "rewards//mean": 0.860107421875, + "rewards//std": 0.0224437452852726, + "step": 3483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6968, + "grad_norm": 1.5295076370239258, + "kl": 0.3544151224195957, + "learning_rate": 2.1438269623976168e-07, + "loss": 0.0142, + "num_tokens": 25370152.0, + "reward": 0.87371826171875, + "reward_std": 0.014827076345682144, + "rewards//mean": 0.87371826171875, + "rewards//std": 0.024597756564617157, + "step": 3484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.697, + "grad_norm": 1.373666763305664, + "kl": 0.2954385206103325, + "learning_rate": 2.1412229136835497e-07, + "loss": 0.0152, + "num_tokens": 25377495.0, + "reward": 0.85357666015625, + "reward_std": 0.019049890339374542, + "rewards//mean": 0.85357666015625, + "rewards//std": 0.03361719101667404, + "step": 3485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6972, + "grad_norm": 1.8279057741165161, + "kl": 0.40619461983442307, + "learning_rate": 2.1386200164845525e-07, + "loss": 0.0162, + "num_tokens": 25384871.0, + "reward": 0.83709716796875, + "reward_std": 0.013334471732378006, + "rewards//mean": 0.83709716796875, + "rewards//std": 0.021884296089410782, + "step": 3486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6974, + "grad_norm": 1.3652057647705078, + "kl": 0.2927756533026695, + "learning_rate": 2.1360182718490689e-07, + "loss": 0.0117, + "num_tokens": 25392119.0, + "reward": 0.85845947265625, + "reward_std": 0.013650862500071526, + "rewards//mean": 0.85845947265625, + "rewards//std": 0.0188013706356287, + "step": 3487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6976, + "grad_norm": 1.4591360092163086, + "kl": 0.35051097348332405, + "learning_rate": 2.133417680825083e-07, + "loss": 0.014, + "num_tokens": 25399391.0, + "reward": 0.8162841796875, + "reward_std": 0.011776547878980637, + "rewards//mean": 0.8162841796875, + "rewards//std": 0.021395059302449226, + "step": 3488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.6978, + "grad_norm": 1.6361935138702393, + "kl": 0.31782214529812336, + "learning_rate": 2.1308182444601126e-07, + "loss": -0.0174, + "num_tokens": 25406661.0, + "reward": 0.8758544921875, + "reward_std": 0.021250221878290176, + "rewards//mean": 0.8758544921875, + "rewards//std": 0.03476808965206146, + "step": 3489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.698, + "grad_norm": 1.2915393114089966, + "kl": 0.3113931976258755, + "learning_rate": 2.1282199638012116e-07, + "loss": 0.0125, + "num_tokens": 25413973.0, + "reward": 0.8804931640625, + "reward_std": 0.016246095299720764, + "rewards//mean": 0.8804931640625, + "rewards//std": 0.03243445232510567, + "step": 3490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.6982, + "grad_norm": 1.4036041498184204, + "kl": 0.33051642775535583, + "learning_rate": 2.125622839894964e-07, + "loss": 0.0128, + "num_tokens": 25421154.0, + "reward": 0.857177734375, + "reward_std": 0.015030345879495144, + "rewards//mean": 0.857177734375, + "rewards//std": 0.022797085344791412, + "step": 3491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6984, + "grad_norm": 1.330737829208374, + "kl": 0.3895503766834736, + "learning_rate": 2.123026873787493e-07, + "loss": 0.0156, + "num_tokens": 25428450.0, + "reward": 0.86822509765625, + "reward_std": 0.015756618231534958, + "rewards//mean": 0.86822509765625, + "rewards//std": 0.017555613070726395, + "step": 3492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.6986, + "grad_norm": 1.4928103685379028, + "kl": 0.408745463937521, + "learning_rate": 2.120432066524453e-07, + "loss": 0.0182, + "num_tokens": 25435836.0, + "reward": 0.8800048828125, + "reward_std": 0.01639120653271675, + "rewards//mean": 0.8800048828125, + "rewards//std": 0.0249343141913414, + "step": 3493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6988, + "grad_norm": 1.3046942949295044, + "kl": 0.2591760642826557, + "learning_rate": 2.117838419151034e-07, + "loss": 0.0104, + "num_tokens": 25443316.0, + "reward": 0.85052490234375, + "reward_std": 0.013196304440498352, + "rewards//mean": 0.85052490234375, + "rewards//std": 0.022630389779806137, + "step": 3494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.699, + "grad_norm": 1.386967420578003, + "kl": 0.3141542486846447, + "learning_rate": 2.1152459327119537e-07, + "loss": 0.0126, + "num_tokens": 25450540.0, + "reward": 0.8155517578125, + "reward_std": 0.015368666499853134, + "rewards//mean": 0.8155517578125, + "rewards//std": 0.01832813210785389, + "step": 3495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.6992, + "grad_norm": 2.1350271701812744, + "kl": 0.364634670317173, + "learning_rate": 2.1126546082514663e-07, + "loss": 0.0146, + "num_tokens": 25457805.0, + "reward": 0.85986328125, + "reward_std": 0.017131030559539795, + "rewards//mean": 0.85986328125, + "rewards//std": 0.02756430394947529, + "step": 3496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6994, + "grad_norm": 1.3594439029693604, + "kl": 0.3220415059477091, + "learning_rate": 2.1100644468133573e-07, + "loss": 0.0129, + "num_tokens": 25465085.0, + "reward": 0.87408447265625, + "reward_std": 0.01491469144821167, + "rewards//mean": 0.87408447265625, + "rewards//std": 0.022116174921393394, + "step": 3497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6996, + "grad_norm": 1.9447460174560547, + "kl": 0.29755022563040257, + "learning_rate": 2.1074754494409457e-07, + "loss": 0.0119, + "num_tokens": 25472405.0, + "reward": 0.84210205078125, + "reward_std": 0.01863975077867508, + "rewards//mean": 0.84210205078125, + "rewards//std": 0.02891349419951439, + "step": 3498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.6998, + "grad_norm": 1.7109776735305786, + "kl": 0.3565874621272087, + "learning_rate": 2.104887617177075e-07, + "loss": 0.0143, + "num_tokens": 25479645.0, + "reward": 0.81646728515625, + "reward_std": 0.01642678678035736, + "rewards//mean": 0.81646728515625, + "rewards//std": 0.023390870541334152, + "step": 3499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7, + "grad_norm": 1.9282430410385132, + "kl": 0.32415904849767685, + "learning_rate": 2.1023009510641264e-07, + "loss": 0.013, + "num_tokens": 25487005.0, + "reward": 0.86077880859375, + "reward_std": 0.012176615186035633, + "rewards//mean": 0.86077880859375, + "rewards//std": 0.016824712976813316, + "step": 3500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7002, + "grad_norm": 1.7385996580123901, + "kl": 0.39113158360123634, + "learning_rate": 2.0997154521440097e-07, + "loss": 0.0156, + "num_tokens": 25494221.0, + "reward": 0.85931396484375, + "reward_std": 0.021437935531139374, + "rewards//mean": 0.85931396484375, + "rewards//std": 0.027529064565896988, + "step": 3501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.7004, + "grad_norm": 1.3363488912582397, + "kl": 0.314713791012764, + "learning_rate": 2.0971311214581598e-07, + "loss": 0.0138, + "num_tokens": 25501538.0, + "reward": 0.86676025390625, + "reward_std": 0.014768724329769611, + "rewards//mean": 0.86676025390625, + "rewards//std": 0.030829554423689842, + "step": 3502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7006, + "grad_norm": 1.3600276708602905, + "kl": 0.35015102475881577, + "learning_rate": 2.0945479600475479e-07, + "loss": 0.014, + "num_tokens": 25508842.0, + "reward": 0.86309814453125, + "reward_std": 0.014113593846559525, + "rewards//mean": 0.86309814453125, + "rewards//std": 0.020236877724528313, + "step": 3503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7008, + "grad_norm": 1.5078121423721313, + "kl": 0.36434195935726166, + "learning_rate": 2.0919659689526698e-07, + "loss": 0.0146, + "num_tokens": 25516082.0, + "reward": 0.867431640625, + "reward_std": 0.01603258028626442, + "rewards//mean": 0.867431640625, + "rewards//std": 0.01903352700173855, + "step": 3504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.701, + "grad_norm": 1.618444561958313, + "kl": 0.3702322021126747, + "learning_rate": 2.0893851492135532e-07, + "loss": 0.0148, + "num_tokens": 25523354.0, + "reward": 0.87408447265625, + "reward_std": 0.01902962289750576, + "rewards//mean": 0.87408447265625, + "rewards//std": 0.02535117231309414, + "step": 3505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7012, + "grad_norm": 1.894781231880188, + "kl": 0.3531286157667637, + "learning_rate": 2.086805501869749e-07, + "loss": 0.0141, + "num_tokens": 25530642.0, + "reward": 0.85845947265625, + "reward_std": 0.011035188101232052, + "rewards//mean": 0.85845947265625, + "rewards//std": 0.017101354897022247, + "step": 3506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7014, + "grad_norm": 1.4874300956726074, + "kl": 0.32489325664937496, + "learning_rate": 2.08422702796034e-07, + "loss": 0.013, + "num_tokens": 25537994.0, + "reward": 0.84417724609375, + "reward_std": 0.014374200254678726, + "rewards//mean": 0.84417724609375, + "rewards//std": 0.020149169489741325, + "step": 3507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7016, + "grad_norm": 1.5168285369873047, + "kl": 0.36601807177066803, + "learning_rate": 2.081649728523937e-07, + "loss": 0.0146, + "num_tokens": 25545298.0, + "reward": 0.847412109375, + "reward_std": 0.014129644259810448, + "rewards//mean": 0.847412109375, + "rewards//std": 0.019782302901148796, + "step": 3508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7018, + "grad_norm": 1.6080809831619263, + "kl": 0.3767637312412262, + "learning_rate": 2.0790736045986734e-07, + "loss": 0.0151, + "num_tokens": 25552562.0, + "reward": 0.869873046875, + "reward_std": 0.014489080756902695, + "rewards//mean": 0.869873046875, + "rewards//std": 0.023259857669472694, + "step": 3509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.702, + "grad_norm": 1.6154088973999023, + "kl": 0.34495330415666103, + "learning_rate": 2.0764986572222137e-07, + "loss": 0.0138, + "num_tokens": 25559770.0, + "reward": 0.83197021484375, + "reward_std": 0.01625736802816391, + "rewards//mean": 0.83197021484375, + "rewards//std": 0.0215925220400095, + "step": 3510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.671875, + "epoch": 0.7022, + "grad_norm": 1.6747941970825195, + "kl": 0.29671815782785416, + "learning_rate": 2.0739248874317438e-07, + "loss": -0.0089, + "num_tokens": 25567133.0, + "reward": 0.837890625, + "reward_std": 0.017658058553934097, + "rewards//mean": 0.837890625, + "rewards//std": 0.025094378739595413, + "step": 3511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7024, + "grad_norm": 1.5040956735610962, + "kl": 0.35007466562092304, + "learning_rate": 2.071352296263979e-07, + "loss": 0.014, + "num_tokens": 25574549.0, + "reward": 0.83612060546875, + "reward_std": 0.015948954969644547, + "rewards//mean": 0.83612060546875, + "rewards//std": 0.022123701870441437, + "step": 3512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7026, + "grad_norm": 1.729435682296753, + "kl": 0.29788273200392723, + "learning_rate": 2.0687808847551607e-07, + "loss": 0.0119, + "num_tokens": 25581757.0, + "reward": 0.7362060546875, + "reward_std": 0.01203384529799223, + "rewards//mean": 0.7362060546875, + "rewards//std": 0.01469795685261488, + "step": 3513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7028, + "grad_norm": 1.6649450063705444, + "kl": 0.3869988042861223, + "learning_rate": 2.06621065394105e-07, + "loss": 0.0155, + "num_tokens": 25589069.0, + "reward": 0.86529541015625, + "reward_std": 0.012198066338896751, + "rewards//mean": 0.86529541015625, + "rewards//std": 0.015295892022550106, + "step": 3514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.703, + "grad_norm": 1.555250644683838, + "kl": 0.36893932335078716, + "learning_rate": 2.0636416048569373e-07, + "loss": 0.0148, + "num_tokens": 25596349.0, + "reward": 0.832275390625, + "reward_std": 0.011042951606214046, + "rewards//mean": 0.832275390625, + "rewards//std": 0.018893033266067505, + "step": 3515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.7032, + "grad_norm": 1.5081363916397095, + "kl": 0.4348040819168091, + "learning_rate": 2.0610737385376348e-07, + "loss": 0.0152, + "num_tokens": 25603623.0, + "reward": 0.84027099609375, + "reward_std": 0.015947096049785614, + "rewards//mean": 0.84027099609375, + "rewards//std": 0.02578103542327881, + "step": 3516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7034, + "grad_norm": 1.9583978652954102, + "kl": 0.41141573525965214, + "learning_rate": 2.0585070560174806e-07, + "loss": 0.0165, + "num_tokens": 25610935.0, + "reward": 0.87060546875, + "reward_std": 0.013957453891634941, + "rewards//mean": 0.87060546875, + "rewards//std": 0.02411492168903351, + "step": 3517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7036, + "grad_norm": 1.4510321617126465, + "kl": 0.328509034588933, + "learning_rate": 2.0559415583303307e-07, + "loss": 0.0131, + "num_tokens": 25618151.0, + "reward": 0.85955810546875, + "reward_std": 0.014499290846288204, + "rewards//mean": 0.85955810546875, + "rewards//std": 0.016983218491077423, + "step": 3518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.7038, + "grad_norm": 1.5150318145751953, + "kl": 0.3508900683373213, + "learning_rate": 2.0533772465095688e-07, + "loss": 0.014, + "num_tokens": 25625462.0, + "reward": 0.8548583984375, + "reward_std": 0.016376443207263947, + "rewards//mean": 0.8548583984375, + "rewards//std": 0.024815035983920097, + "step": 3519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.704, + "grad_norm": 1.4058582782745361, + "kl": 0.3417102135717869, + "learning_rate": 2.0508141215881004e-07, + "loss": 0.0137, + "num_tokens": 25632646.0, + "reward": 0.80322265625, + "reward_std": 0.013075088150799274, + "rewards//mean": 0.80322265625, + "rewards//std": 0.023318355903029442, + "step": 3520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7042, + "grad_norm": 1.4910887479782104, + "kl": 0.3950191270560026, + "learning_rate": 2.048252184598352e-07, + "loss": 0.0158, + "num_tokens": 25639902.0, + "reward": 0.855712890625, + "reward_std": 0.014949807897210121, + "rewards//mean": 0.855712890625, + "rewards//std": 0.02233557030558586, + "step": 3521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5625, + "epoch": 0.7044, + "grad_norm": 2.0695862770080566, + "kl": 0.42832231894135475, + "learning_rate": 2.0456914365722695e-07, + "loss": -0.0179, + "num_tokens": 25647242.0, + "reward": 0.8333740234375, + "reward_std": 0.02016107738018036, + "rewards//mean": 0.8333740234375, + "rewards//std": 0.02906733751296997, + "step": 3522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7046, + "grad_norm": 1.5028767585754395, + "kl": 0.36796408891677856, + "learning_rate": 2.0431318785413228e-07, + "loss": 0.0147, + "num_tokens": 25654554.0, + "reward": 0.80560302734375, + "reward_std": 0.015891633927822113, + "rewards//mean": 0.80560302734375, + "rewards//std": 0.019805150106549263, + "step": 3523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7048, + "grad_norm": 1.4776513576507568, + "kl": 0.3178371973335743, + "learning_rate": 2.040573511536502e-07, + "loss": 0.0127, + "num_tokens": 25661826.0, + "reward": 0.86370849609375, + "reward_std": 0.019718091934919357, + "rewards//mean": 0.86370849609375, + "rewards//std": 0.03052162379026413, + "step": 3524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.705, + "grad_norm": 2.3061389923095703, + "kl": 0.4605582244694233, + "learning_rate": 2.0380163365883184e-07, + "loss": 0.0184, + "num_tokens": 25669210.0, + "reward": 0.80084228515625, + "reward_std": 0.012910491786897182, + "rewards//mean": 0.80084228515625, + "rewards//std": 0.020710812881588936, + "step": 3525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7052, + "grad_norm": 1.5120556354522705, + "kl": 0.3155284598469734, + "learning_rate": 2.0354603547267984e-07, + "loss": 0.0126, + "num_tokens": 25676490.0, + "reward": 0.88031005859375, + "reward_std": 0.01720198430120945, + "rewards//mean": 0.88031005859375, + "rewards//std": 0.021034270524978638, + "step": 3526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7054, + "grad_norm": 1.5317391157150269, + "kl": 0.34224128164350986, + "learning_rate": 2.0329055669814933e-07, + "loss": 0.0137, + "num_tokens": 25683754.0, + "reward": 0.85760498046875, + "reward_std": 0.01950272172689438, + "rewards//mean": 0.85760498046875, + "rewards//std": 0.03035302646458149, + "step": 3527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.7056, + "grad_norm": 1.391193151473999, + "kl": 0.3078838251531124, + "learning_rate": 2.0303519743814724e-07, + "loss": 0.009, + "num_tokens": 25690951.0, + "reward": 0.88079833984375, + "reward_std": 0.015328248962759972, + "rewards//mean": 0.88079833984375, + "rewards//std": 0.023918189108371735, + "step": 3528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7058, + "grad_norm": 1.612862229347229, + "kl": 0.34106552973389626, + "learning_rate": 2.027799577955319e-07, + "loss": 0.0136, + "num_tokens": 25698231.0, + "reward": 0.80242919921875, + "reward_std": 0.013187151402235031, + "rewards//mean": 0.80242919921875, + "rewards//std": 0.021154114976525307, + "step": 3529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.706, + "grad_norm": 1.5734165906906128, + "kl": 0.2668897584080696, + "learning_rate": 2.0252483787311408e-07, + "loss": 0.0107, + "num_tokens": 25705439.0, + "reward": 0.76678466796875, + "reward_std": 0.013099439442157745, + "rewards//mean": 0.76678466796875, + "rewards//std": 0.01798410527408123, + "step": 3530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7062, + "grad_norm": 1.5615158081054688, + "kl": 0.3616899475455284, + "learning_rate": 2.0226983777365603e-07, + "loss": 0.0145, + "num_tokens": 25712703.0, + "reward": 0.8511962890625, + "reward_std": 0.01428550761193037, + "rewards//mean": 0.8511962890625, + "rewards//std": 0.016930213198065758, + "step": 3531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7064, + "grad_norm": 1.716569423675537, + "kl": 0.32597823068499565, + "learning_rate": 2.020149575998718e-07, + "loss": 0.013, + "num_tokens": 25720055.0, + "reward": 0.84429931640625, + "reward_std": 0.014118468388915062, + "rewards//mean": 0.84429931640625, + "rewards//std": 0.01983112096786499, + "step": 3532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.7066, + "grad_norm": 1.5633653402328491, + "kl": 0.35580847039818764, + "learning_rate": 2.017601974544269e-07, + "loss": 0.0125, + "num_tokens": 25727289.0, + "reward": 0.84332275390625, + "reward_std": 0.017118442803621292, + "rewards//mean": 0.84332275390625, + "rewards//std": 0.021866995841264725, + "step": 3533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.7068, + "grad_norm": 1.4717506170272827, + "kl": 0.35373276472091675, + "learning_rate": 2.0150555743993873e-07, + "loss": 0.013, + "num_tokens": 25734551.0, + "reward": 0.85821533203125, + "reward_std": 0.015818897634744644, + "rewards//mean": 0.85821533203125, + "rewards//std": 0.019157886505126953, + "step": 3534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.707, + "grad_norm": 1.5732359886169434, + "kl": 0.35079388692975044, + "learning_rate": 2.012510376589764e-07, + "loss": 0.014, + "num_tokens": 25741887.0, + "reward": 0.8167724609375, + "reward_std": 0.013816945254802704, + "rewards//mean": 0.8167724609375, + "rewards//std": 0.014627755619585514, + "step": 3535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7072, + "grad_norm": 1.3865379095077515, + "kl": 0.39528998732566833, + "learning_rate": 2.0099663821406055e-07, + "loss": 0.0158, + "num_tokens": 25749199.0, + "reward": 0.86260986328125, + "reward_std": 0.013507549650967121, + "rewards//mean": 0.86260986328125, + "rewards//std": 0.019961223006248474, + "step": 3536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7074, + "grad_norm": 1.4498615264892578, + "kl": 0.42332247644662857, + "learning_rate": 2.0074235920766285e-07, + "loss": 0.0169, + "num_tokens": 25756455.0, + "reward": 0.78863525390625, + "reward_std": 0.014224477112293243, + "rewards//mean": 0.78863525390625, + "rewards//std": 0.01980973593890667, + "step": 3537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7076, + "grad_norm": 1.7067114114761353, + "kl": 0.44512369483709335, + "learning_rate": 2.0048820074220711e-07, + "loss": 0.0178, + "num_tokens": 25763759.0, + "reward": 0.863525390625, + "reward_std": 0.015587056055665016, + "rewards//mean": 0.863525390625, + "rewards//std": 0.018699748441576958, + "step": 3538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7078, + "grad_norm": 1.693124771118164, + "kl": 0.40838194638490677, + "learning_rate": 2.0023416292006828e-07, + "loss": 0.0163, + "num_tokens": 25770983.0, + "reward": 0.804443359375, + "reward_std": 0.013427993282675743, + "rewards//mean": 0.804443359375, + "rewards//std": 0.020444603636860847, + "step": 3539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.708, + "grad_norm": 1.8452354669570923, + "kl": 0.3888675905764103, + "learning_rate": 1.9998024584357293e-07, + "loss": 0.0156, + "num_tokens": 25778327.0, + "reward": 0.8360595703125, + "reward_std": 0.014685200527310371, + "rewards//mean": 0.8360595703125, + "rewards//std": 0.027000294998288155, + "step": 3540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7082, + "grad_norm": 1.5298101902008057, + "kl": 0.3180558644235134, + "learning_rate": 1.9972644961499853e-07, + "loss": 0.0127, + "num_tokens": 25785639.0, + "reward": 0.8609619140625, + "reward_std": 0.014094685204327106, + "rewards//mean": 0.8609619140625, + "rewards//std": 0.02629820443689823, + "step": 3541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.671875, + "epoch": 0.7084, + "grad_norm": 1.406690239906311, + "kl": 0.34157948940992355, + "learning_rate": 1.994727743365743e-07, + "loss": 0.0017, + "num_tokens": 25792858.0, + "reward": 0.83416748046875, + "reward_std": 0.013222547248005867, + "rewards//mean": 0.83416748046875, + "rewards//std": 0.015055496245622635, + "step": 3542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7086, + "grad_norm": 1.5863200426101685, + "kl": 0.3951261639595032, + "learning_rate": 1.9921922011048063e-07, + "loss": 0.0158, + "num_tokens": 25800090.0, + "reward": 0.86553955078125, + "reward_std": 0.02046407386660576, + "rewards//mean": 0.86553955078125, + "rewards//std": 0.03869372606277466, + "step": 3543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7088, + "grad_norm": 1.4031811952590942, + "kl": 0.41860687360167503, + "learning_rate": 1.989657870388493e-07, + "loss": 0.0167, + "num_tokens": 25807434.0, + "reward": 0.7633056640625, + "reward_std": 0.01583530753850937, + "rewards//mean": 0.7633056640625, + "rewards//std": 0.025322269648313522, + "step": 3544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.709, + "grad_norm": 1.3404074907302856, + "kl": 0.31409390456974506, + "learning_rate": 1.9871247522376277e-07, + "loss": 0.0126, + "num_tokens": 25814786.0, + "reward": 0.8211669921875, + "reward_std": 0.01749458536505699, + "rewards//mean": 0.8211669921875, + "rewards//std": 0.0251278355717659, + "step": 3545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7092, + "grad_norm": 1.7200692892074585, + "kl": 0.30681315809488297, + "learning_rate": 1.9845928476725522e-07, + "loss": 0.0123, + "num_tokens": 25822122.0, + "reward": 0.8697509765625, + "reward_std": 0.01762383244931698, + "rewards//mean": 0.8697509765625, + "rewards//std": 0.02874895930290222, + "step": 3546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7094, + "grad_norm": 1.5724273920059204, + "kl": 0.3824903890490532, + "learning_rate": 1.9820621577131186e-07, + "loss": 0.0153, + "num_tokens": 25829386.0, + "reward": 0.85601806640625, + "reward_std": 0.015613894909620285, + "rewards//mean": 0.85601806640625, + "rewards//std": 0.022335484623908997, + "step": 3547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7096, + "grad_norm": 1.519003987312317, + "kl": 0.3220995292067528, + "learning_rate": 1.9795326833786852e-07, + "loss": 0.0129, + "num_tokens": 25836650.0, + "reward": 0.849609375, + "reward_std": 0.014279458671808243, + "rewards//mean": 0.849609375, + "rewards//std": 0.02166539616882801, + "step": 3548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7098, + "grad_norm": 1.7438234090805054, + "kl": 0.36518455296754837, + "learning_rate": 1.9770044256881258e-07, + "loss": 0.0146, + "num_tokens": 25843930.0, + "reward": 0.85955810546875, + "reward_std": 0.013119017705321312, + "rewards//mean": 0.85955810546875, + "rewards//std": 0.017280111089348793, + "step": 3549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.71, + "grad_norm": 1.245445728302002, + "kl": 0.36394327506422997, + "learning_rate": 1.9744773856598224e-07, + "loss": 0.0169, + "num_tokens": 25851153.0, + "reward": 0.8148193359375, + "reward_std": 0.012059137225151062, + "rewards//mean": 0.8148193359375, + "rewards//std": 0.016772110015153885, + "step": 3550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7102, + "grad_norm": 1.5758867263793945, + "kl": 0.3615476079285145, + "learning_rate": 1.9719515643116674e-07, + "loss": 0.0145, + "num_tokens": 25858441.0, + "reward": 0.8558349609375, + "reward_std": 0.01963835582137108, + "rewards//mean": 0.8558349609375, + "rewards//std": 0.025503354147076607, + "step": 3551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7104, + "grad_norm": 1.462764024734497, + "kl": 0.38160916045308113, + "learning_rate": 1.9694269626610588e-07, + "loss": 0.0153, + "num_tokens": 25865825.0, + "reward": 0.8692626953125, + "reward_std": 0.01904468610882759, + "rewards//mean": 0.8692626953125, + "rewards//std": 0.02789159305393696, + "step": 3552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7106, + "grad_norm": 1.7085474729537964, + "kl": 0.3723207078874111, + "learning_rate": 1.9669035817249074e-07, + "loss": 0.0149, + "num_tokens": 25873177.0, + "reward": 0.8338623046875, + "reward_std": 0.017029326409101486, + "rewards//mean": 0.8338623046875, + "rewards//std": 0.024379314854741096, + "step": 3553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.7108, + "grad_norm": 1.9595309495925903, + "kl": 0.5278106369078159, + "learning_rate": 1.9643814225196304e-07, + "loss": 0.0171, + "num_tokens": 25880465.0, + "reward": 0.8516845703125, + "reward_std": 0.019045304507017136, + "rewards//mean": 0.8516845703125, + "rewards//std": 0.02128155343234539, + "step": 3554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.711, + "grad_norm": 1.4568346738815308, + "kl": 0.2768012098968029, + "learning_rate": 1.9618604860611554e-07, + "loss": 0.0111, + "num_tokens": 25887729.0, + "reward": 0.87890625, + "reward_std": 0.01567373424768448, + "rewards//mean": 0.87890625, + "rewards//std": 0.019954491406679153, + "step": 3555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.7112, + "grad_norm": 1.994946002960205, + "kl": 0.39712638780474663, + "learning_rate": 1.959340773364911e-07, + "loss": 0.0154, + "num_tokens": 25895039.0, + "reward": 0.8408203125, + "reward_std": 0.00959266908466816, + "rewards//mean": 0.8408203125, + "rewards//std": 0.016381725668907166, + "step": 3556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7114, + "grad_norm": 1.5845106840133667, + "kl": 0.3629378229379654, + "learning_rate": 1.95682228544584e-07, + "loss": 0.0145, + "num_tokens": 25902335.0, + "reward": 0.7891845703125, + "reward_std": 0.013233684003353119, + "rewards//mean": 0.7891845703125, + "rewards//std": 0.01661614701151848, + "step": 3557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7116, + "grad_norm": 3.9444596767425537, + "kl": 0.7373294681310654, + "learning_rate": 1.9543050233183878e-07, + "loss": 0.0295, + "num_tokens": 25909631.0, + "reward": 0.8004150390625, + "reward_std": 0.014312426559627056, + "rewards//mean": 0.8004150390625, + "rewards//std": 0.024066852405667305, + "step": 3558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7118, + "grad_norm": 1.695119857788086, + "kl": 0.34852019138634205, + "learning_rate": 1.9517889879965104e-07, + "loss": 0.0139, + "num_tokens": 25916943.0, + "reward": 0.88031005859375, + "reward_std": 0.020063769072294235, + "rewards//mean": 0.88031005859375, + "rewards//std": 0.031942714005708694, + "step": 3559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.712, + "grad_norm": 1.645024061203003, + "kl": 0.40017327293753624, + "learning_rate": 1.9492741804936618e-07, + "loss": 0.0097, + "num_tokens": 25924241.0, + "reward": 0.83734130859375, + "reward_std": 0.01893220841884613, + "rewards//mean": 0.83734130859375, + "rewards//std": 0.020685946568846703, + "step": 3560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7122, + "grad_norm": 1.7910245656967163, + "kl": 0.3103675339370966, + "learning_rate": 1.9467606018228088e-07, + "loss": 0.0124, + "num_tokens": 25931553.0, + "reward": 0.88153076171875, + "reward_std": 0.017157018184661865, + "rewards//mean": 0.88153076171875, + "rewards//std": 0.028814898803830147, + "step": 3561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7124, + "grad_norm": 1.5224825143814087, + "kl": 0.37963966093957424, + "learning_rate": 1.9442482529964222e-07, + "loss": 0.0152, + "num_tokens": 25938841.0, + "reward": 0.84808349609375, + "reward_std": 0.015568557195365429, + "rewards//mean": 0.84808349609375, + "rewards//std": 0.022229505702853203, + "step": 3562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7126, + "grad_norm": 1.5899243354797363, + "kl": 0.33244922012090683, + "learning_rate": 1.9417371350264716e-07, + "loss": 0.0133, + "num_tokens": 25946249.0, + "reward": 0.8126220703125, + "reward_std": 0.014819161966443062, + "rewards//mean": 0.8126220703125, + "rewards//std": 0.01742720417678356, + "step": 3563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7128, + "grad_norm": 1.3888005018234253, + "kl": 0.374047813937068, + "learning_rate": 1.9392272489244377e-07, + "loss": 0.015, + "num_tokens": 25953665.0, + "reward": 0.84332275390625, + "reward_std": 0.014518186450004578, + "rewards//mean": 0.84332275390625, + "rewards//std": 0.020962180569767952, + "step": 3564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.713, + "grad_norm": 1.4614031314849854, + "kl": 0.31999805197119713, + "learning_rate": 1.936718595701302e-07, + "loss": 0.0128, + "num_tokens": 25961017.0, + "reward": 0.76953125, + "reward_std": 0.015325796790421009, + "rewards//mean": 0.76953125, + "rewards//std": 0.025161849334836006, + "step": 3565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7132, + "grad_norm": 2.5689265727996826, + "kl": 0.3682952430099249, + "learning_rate": 1.934211176367551e-07, + "loss": 0.0147, + "num_tokens": 25968305.0, + "reward": 0.78460693359375, + "reward_std": 0.014543507248163223, + "rewards//mean": 0.78460693359375, + "rewards//std": 0.025495264679193497, + "step": 3566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7134, + "grad_norm": 1.530213475227356, + "kl": 0.34533482044935226, + "learning_rate": 1.9317049919331702e-07, + "loss": 0.0138, + "num_tokens": 25975641.0, + "reward": 0.81427001953125, + "reward_std": 0.019384805113077164, + "rewards//mean": 0.81427001953125, + "rewards//std": 0.02123553492128849, + "step": 3567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7136, + "grad_norm": 1.4253959655761719, + "kl": 0.4072861559689045, + "learning_rate": 1.929200043407651e-07, + "loss": 0.0163, + "num_tokens": 25982977.0, + "reward": 0.78912353515625, + "reward_std": 0.015193028375506401, + "rewards//mean": 0.78912353515625, + "rewards//std": 0.018182501196861267, + "step": 3568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7138, + "grad_norm": 1.621188759803772, + "kl": 0.32962492667138577, + "learning_rate": 1.926696331799988e-07, + "loss": 0.0132, + "num_tokens": 25990281.0, + "reward": 0.75152587890625, + "reward_std": 0.010799866169691086, + "rewards//mean": 0.75152587890625, + "rewards//std": 0.021048659458756447, + "step": 3569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.714, + "grad_norm": 1.646147608757019, + "kl": 0.33180346712470055, + "learning_rate": 1.9241938581186762e-07, + "loss": 0.0042, + "num_tokens": 25997637.0, + "reward": 0.8580322265625, + "reward_std": 0.015998225659132004, + "rewards//mean": 0.8580322265625, + "rewards//std": 0.02227414958178997, + "step": 3570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7142, + "grad_norm": 1.7790864706039429, + "kl": 0.49845779687166214, + "learning_rate": 1.9216926233717084e-07, + "loss": 0.0199, + "num_tokens": 26004949.0, + "reward": 0.8775634765625, + "reward_std": 0.018962649628520012, + "rewards//mean": 0.8775634765625, + "rewards//std": 0.02123883180320263, + "step": 3571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.7144, + "grad_norm": 1.878191590309143, + "kl": 0.43111108988523483, + "learning_rate": 1.9191926285665843e-07, + "loss": 0.0171, + "num_tokens": 26012360.0, + "reward": 0.81103515625, + "reward_std": 0.011729174293577671, + "rewards//mean": 0.81103515625, + "rewards//std": 0.019568417221307755, + "step": 3572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7146, + "grad_norm": 1.7141658067703247, + "kl": 0.31267889589071274, + "learning_rate": 1.9166938747103012e-07, + "loss": 0.0125, + "num_tokens": 26019736.0, + "reward": 0.8726806640625, + "reward_std": 0.014864839613437653, + "rewards//mean": 0.8726806640625, + "rewards//std": 0.023589113727211952, + "step": 3573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7148, + "grad_norm": 1.43106210231781, + "kl": 0.3674626611173153, + "learning_rate": 1.9141963628093582e-07, + "loss": 0.0147, + "num_tokens": 26027032.0, + "reward": 0.85382080078125, + "reward_std": 0.020764270797371864, + "rewards//mean": 0.85382080078125, + "rewards//std": 0.02926320768892765, + "step": 3574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.715, + "grad_norm": 1.8736664056777954, + "kl": 0.348224975168705, + "learning_rate": 1.911700093869749e-07, + "loss": 0.005, + "num_tokens": 26034179.0, + "reward": 0.88079833984375, + "reward_std": 0.014838800765573978, + "rewards//mean": 0.88079833984375, + "rewards//std": 0.027063224464654922, + "step": 3575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7152, + "grad_norm": 1.9492803812026978, + "kl": 0.3798196576535702, + "learning_rate": 1.9092050688969736e-07, + "loss": 0.0152, + "num_tokens": 26041419.0, + "reward": 0.88555908203125, + "reward_std": 0.02344873733818531, + "rewards//mean": 0.88555908203125, + "rewards//std": 0.029435476288199425, + "step": 3576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7154, + "grad_norm": 1.3726681470870972, + "kl": 0.3585554212331772, + "learning_rate": 1.906711288896028e-07, + "loss": 0.0143, + "num_tokens": 26048683.0, + "reward": 0.85382080078125, + "reward_std": 0.015069572255015373, + "rewards//mean": 0.85382080078125, + "rewards//std": 0.019572189077734947, + "step": 3577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7156, + "grad_norm": 1.350428581237793, + "kl": 0.29213553108274937, + "learning_rate": 1.9042187548714033e-07, + "loss": 0.0117, + "num_tokens": 26055979.0, + "reward": 0.85723876953125, + "reward_std": 0.01015671156346798, + "rewards//mean": 0.85723876953125, + "rewards//std": 0.016082249581813812, + "step": 3578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7158, + "grad_norm": 1.4754395484924316, + "kl": 0.39053872786462307, + "learning_rate": 1.9017274678270945e-07, + "loss": 0.0156, + "num_tokens": 26063259.0, + "reward": 0.878662109375, + "reward_std": 0.015128009021282196, + "rewards//mean": 0.878662109375, + "rewards//std": 0.02198582887649536, + "step": 3579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.716, + "grad_norm": 1.4651559591293335, + "kl": 0.3160886839032173, + "learning_rate": 1.8992374287665908e-07, + "loss": 0.0126, + "num_tokens": 26070571.0, + "reward": 0.78216552734375, + "reward_std": 0.01796436309814453, + "rewards//mean": 0.78216552734375, + "rewards//std": 0.026002036407589912, + "step": 3580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7162, + "grad_norm": 1.267460584640503, + "kl": 0.28564462810754776, + "learning_rate": 1.8967486386928817e-07, + "loss": 0.0114, + "num_tokens": 26077835.0, + "reward": 0.82696533203125, + "reward_std": 0.013411343097686768, + "rewards//mean": 0.82696533203125, + "rewards//std": 0.019094569608569145, + "step": 3581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7164, + "grad_norm": 1.6256532669067383, + "kl": 0.3628530763089657, + "learning_rate": 1.8942610986084484e-07, + "loss": 0.0145, + "num_tokens": 26085107.0, + "reward": 0.84527587890625, + "reward_std": 0.02591124176979065, + "rewards//mean": 0.84527587890625, + "rewards//std": 0.03133990988135338, + "step": 3582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7166, + "grad_norm": 2.1321916580200195, + "kl": 0.32743961922824383, + "learning_rate": 1.891774809515273e-07, + "loss": 0.0131, + "num_tokens": 26092523.0, + "reward": 0.83868408203125, + "reward_std": 0.01784897781908512, + "rewards//mean": 0.83868408203125, + "rewards//std": 0.023752428591251373, + "step": 3583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7168, + "grad_norm": 1.638575553894043, + "kl": 0.37264627404510975, + "learning_rate": 1.8892897724148322e-07, + "loss": 0.0149, + "num_tokens": 26099747.0, + "reward": 0.83880615234375, + "reward_std": 0.015058271586894989, + "rewards//mean": 0.83880615234375, + "rewards//std": 0.023584218695759773, + "step": 3584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.717, + "grad_norm": 2.043013572692871, + "kl": 0.3339947909116745, + "learning_rate": 1.8868059883081011e-07, + "loss": 0.0134, + "num_tokens": 26107155.0, + "reward": 0.85089111328125, + "reward_std": 0.01617608219385147, + "rewards//mean": 0.85089111328125, + "rewards//std": 0.021231258288025856, + "step": 3585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.7172, + "grad_norm": 1.416330099105835, + "kl": 0.3556871674954891, + "learning_rate": 1.8843234581955441e-07, + "loss": 0.0139, + "num_tokens": 26114411.0, + "reward": 0.8516845703125, + "reward_std": 0.015406379476189613, + "rewards//mean": 0.8516845703125, + "rewards//std": 0.022530920803546906, + "step": 3586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7174, + "grad_norm": 1.3857840299606323, + "kl": 0.3235942181199789, + "learning_rate": 1.8818421830771252e-07, + "loss": 0.0129, + "num_tokens": 26121923.0, + "reward": 0.8787841796875, + "reward_std": 0.01597491465508938, + "rewards//mean": 0.8787841796875, + "rewards//std": 0.02942962571978569, + "step": 3587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7176, + "grad_norm": 1.9038946628570557, + "kl": 0.33787885680794716, + "learning_rate": 1.8793621639523027e-07, + "loss": 0.0135, + "num_tokens": 26129227.0, + "reward": 0.84429931640625, + "reward_std": 0.011730100028216839, + "rewards//mean": 0.84429931640625, + "rewards//std": 0.01327627431601286, + "step": 3588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7178, + "grad_norm": 1.5572682619094849, + "kl": 0.3700452335178852, + "learning_rate": 1.8768834018200286e-07, + "loss": 0.0148, + "num_tokens": 26136499.0, + "reward": 0.8489990234375, + "reward_std": 0.01886763423681259, + "rewards//mean": 0.8489990234375, + "rewards//std": 0.022490572184324265, + "step": 3589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.718, + "grad_norm": 1.5849477052688599, + "kl": 0.3056187219917774, + "learning_rate": 1.8744058976787452e-07, + "loss": 0.0122, + "num_tokens": 26143723.0, + "reward": 0.85064697265625, + "reward_std": 0.013482838869094849, + "rewards//mean": 0.85064697265625, + "rewards//std": 0.01712965779006481, + "step": 3590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7182, + "grad_norm": 1.7510254383087158, + "kl": 0.3271700572222471, + "learning_rate": 1.8719296525263923e-07, + "loss": 0.0131, + "num_tokens": 26150883.0, + "reward": 0.8525390625, + "reward_std": 0.01454403717070818, + "rewards//mean": 0.8525390625, + "rewards//std": 0.018817566335201263, + "step": 3591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7184, + "grad_norm": 1.744616150856018, + "kl": 0.36557142436504364, + "learning_rate": 1.869454667360401e-07, + "loss": 0.0146, + "num_tokens": 26158171.0, + "reward": 0.77203369140625, + "reward_std": 0.013816069811582565, + "rewards//mean": 0.77203369140625, + "rewards//std": 0.019727034494280815, + "step": 3592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7186, + "grad_norm": 1.4874846935272217, + "kl": 0.3410100843757391, + "learning_rate": 1.8669809431776988e-07, + "loss": 0.0136, + "num_tokens": 26165411.0, + "reward": 0.78997802734375, + "reward_std": 0.015667367726564407, + "rewards//mean": 0.78997802734375, + "rewards//std": 0.02684532105922699, + "step": 3593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7188, + "grad_norm": 1.6865477561950684, + "kl": 0.4405151456594467, + "learning_rate": 1.8645084809746952e-07, + "loss": 0.0176, + "num_tokens": 26172739.0, + "reward": 0.8765869140625, + "reward_std": 0.017607953399419785, + "rewards//mean": 0.8765869140625, + "rewards//std": 0.019018808379769325, + "step": 3594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.719, + "grad_norm": 1.4547631740570068, + "kl": 0.28564935736358166, + "learning_rate": 1.8620372817473002e-07, + "loss": 0.0114, + "num_tokens": 26179971.0, + "reward": 0.86376953125, + "reward_std": 0.0175769105553627, + "rewards//mean": 0.86376953125, + "rewards//std": 0.02810380794107914, + "step": 3595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7192, + "grad_norm": 1.6055450439453125, + "kl": 0.3605962451547384, + "learning_rate": 1.859567346490913e-07, + "loss": 0.0144, + "num_tokens": 26187195.0, + "reward": 0.84814453125, + "reward_std": 0.018745552748441696, + "rewards//mean": 0.84814453125, + "rewards//std": 0.023882798850536346, + "step": 3596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7194, + "grad_norm": 1.3787403106689453, + "kl": 0.3767162822186947, + "learning_rate": 1.8570986762004242e-07, + "loss": 0.0151, + "num_tokens": 26194451.0, + "reward": 0.8465576171875, + "reward_std": 0.01882527768611908, + "rewards//mean": 0.8465576171875, + "rewards//std": 0.02468046545982361, + "step": 3597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.7196, + "grad_norm": 1.7985568046569824, + "kl": 0.37756847590208054, + "learning_rate": 1.8546312718702118e-07, + "loss": 0.0042, + "num_tokens": 26201736.0, + "reward": 0.8310546875, + "reward_std": 0.01503712683916092, + "rewards//mean": 0.8310546875, + "rewards//std": 0.02133873663842678, + "step": 3598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7198, + "grad_norm": 1.4826195240020752, + "kl": 0.37354349717497826, + "learning_rate": 1.8521651344941463e-07, + "loss": 0.0149, + "num_tokens": 26209080.0, + "reward": 0.84039306640625, + "reward_std": 0.014439534395933151, + "rewards//mean": 0.84039306640625, + "rewards//std": 0.01808064617216587, + "step": 3599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.72, + "grad_norm": 1.4745047092437744, + "kl": 0.35968497954308987, + "learning_rate": 1.8497002650655885e-07, + "loss": 0.0144, + "num_tokens": 26216296.0, + "reward": 0.818603515625, + "reward_std": 0.010267464444041252, + "rewards//mean": 0.818603515625, + "rewards//std": 0.016261154785752296, + "step": 3600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7202, + "grad_norm": 1.4194506406784058, + "kl": 0.3741169720888138, + "learning_rate": 1.847236664577389e-07, + "loss": 0.015, + "num_tokens": 26223576.0, + "reward": 0.8135986328125, + "reward_std": 0.011539454571902752, + "rewards//mean": 0.8135986328125, + "rewards//std": 0.017586303874850273, + "step": 3601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7204, + "grad_norm": 1.607305884361267, + "kl": 0.4260091669857502, + "learning_rate": 1.8447743340218818e-07, + "loss": 0.017, + "num_tokens": 26231224.0, + "reward": 0.88671875, + "reward_std": 0.01848304457962513, + "rewards//mean": 0.88671875, + "rewards//std": 0.029405182227492332, + "step": 3602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7206, + "grad_norm": 1.3560892343521118, + "kl": 0.37297552824020386, + "learning_rate": 1.842313274390896e-07, + "loss": 0.0149, + "num_tokens": 26238504.0, + "reward": 0.79046630859375, + "reward_std": 0.01663506031036377, + "rewards//mean": 0.79046630859375, + "rewards//std": 0.023677745833992958, + "step": 3603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7208, + "grad_norm": 1.6525427103042603, + "kl": 0.4325292520225048, + "learning_rate": 1.8398534866757455e-07, + "loss": 0.0173, + "num_tokens": 26245808.0, + "reward": 0.833984375, + "reward_std": 0.01633346825838089, + "rewards//mean": 0.833984375, + "rewards//std": 0.02790054678916931, + "step": 3604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.721, + "grad_norm": 2.2652575969696045, + "kl": 0.36364229023456573, + "learning_rate": 1.8373949718672344e-07, + "loss": 0.0145, + "num_tokens": 26253192.0, + "reward": 0.78228759765625, + "reward_std": 0.011938963085412979, + "rewards//mean": 0.78228759765625, + "rewards//std": 0.01823238469660282, + "step": 3605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7212, + "grad_norm": 1.3252626657485962, + "kl": 0.32776103541255, + "learning_rate": 1.8349377309556486e-07, + "loss": 0.0131, + "num_tokens": 26260408.0, + "reward": 0.87066650390625, + "reward_std": 0.014179853722453117, + "rewards//mean": 0.87066650390625, + "rewards//std": 0.0201851986348629, + "step": 3606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.7214, + "grad_norm": 1.9494481086730957, + "kl": 0.3395882248878479, + "learning_rate": 1.8324817649307668e-07, + "loss": 0.0147, + "num_tokens": 26267759.0, + "reward": 0.888671875, + "reward_std": 0.021367192268371582, + "rewards//mean": 0.888671875, + "rewards//std": 0.03302117437124252, + "step": 3607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7216, + "grad_norm": 1.3029371500015259, + "kl": 0.34502475522458553, + "learning_rate": 1.8300270747818526e-07, + "loss": 0.0138, + "num_tokens": 26275127.0, + "reward": 0.83331298828125, + "reward_std": 0.014781298115849495, + "rewards//mean": 0.83331298828125, + "rewards//std": 0.028306078165769577, + "step": 3608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7218, + "grad_norm": 1.8862369060516357, + "kl": 0.3351361844688654, + "learning_rate": 1.8275736614976517e-07, + "loss": 0.0134, + "num_tokens": 26282391.0, + "reward": 0.82208251953125, + "reward_std": 0.01671532541513443, + "rewards//mean": 0.82208251953125, + "rewards//std": 0.020062586292624474, + "step": 3609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.722, + "grad_norm": 1.5254453420639038, + "kl": 0.35054950788617134, + "learning_rate": 1.8251215260664006e-07, + "loss": 0.014, + "num_tokens": 26289655.0, + "reward": 0.812744140625, + "reward_std": 0.014608010649681091, + "rewards//mean": 0.812744140625, + "rewards//std": 0.02094782330095768, + "step": 3610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7222, + "grad_norm": 1.7524245977401733, + "kl": 0.33575810492038727, + "learning_rate": 1.8226706694758193e-07, + "loss": 0.0134, + "num_tokens": 26296983.0, + "reward": 0.84832763671875, + "reward_std": 0.01800059713423252, + "rewards//mean": 0.84832763671875, + "rewards//std": 0.01942000538110733, + "step": 3611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7224, + "grad_norm": 1.6190190315246582, + "kl": 0.3016854953020811, + "learning_rate": 1.820221092713114e-07, + "loss": 0.0121, + "num_tokens": 26304263.0, + "reward": 0.82000732421875, + "reward_std": 0.015701137483119965, + "rewards//mean": 0.82000732421875, + "rewards//std": 0.02099897898733616, + "step": 3612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7226, + "grad_norm": 1.6253387928009033, + "kl": 0.3560047186911106, + "learning_rate": 1.8177727967649703e-07, + "loss": 0.0142, + "num_tokens": 26311607.0, + "reward": 0.8349609375, + "reward_std": 0.014214426279067993, + "rewards//mean": 0.8349609375, + "rewards//std": 0.020937703549861908, + "step": 3613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7228, + "grad_norm": 1.6497435569763184, + "kl": 0.3155143726617098, + "learning_rate": 1.815325782617564e-07, + "loss": 0.0126, + "num_tokens": 26318855.0, + "reward": 0.8770751953125, + "reward_std": 0.013530464842915535, + "rewards//mean": 0.8770751953125, + "rewards//std": 0.02127016894519329, + "step": 3614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.723, + "grad_norm": 1.678423285484314, + "kl": 0.3044101558625698, + "learning_rate": 1.812880051256551e-07, + "loss": 0.0122, + "num_tokens": 26326167.0, + "reward": 0.88140869140625, + "reward_std": 0.01460905559360981, + "rewards//mean": 0.88140869140625, + "rewards//std": 0.02578338421881199, + "step": 3615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7232, + "grad_norm": 1.5299314260482788, + "kl": 0.3662657402455807, + "learning_rate": 1.810435603667075e-07, + "loss": 0.0147, + "num_tokens": 26333471.0, + "reward": 0.84429931640625, + "reward_std": 0.01752036064863205, + "rewards//mean": 0.84429931640625, + "rewards//std": 0.028132811188697815, + "step": 3616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7234, + "grad_norm": 1.5369936227798462, + "kl": 0.4142065867781639, + "learning_rate": 1.8079924408337537e-07, + "loss": 0.0166, + "num_tokens": 26340751.0, + "reward": 0.8067626953125, + "reward_std": 0.013695267960429192, + "rewards//mean": 0.8067626953125, + "rewards//std": 0.018387503921985626, + "step": 3617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.7236, + "grad_norm": 1.7110651731491089, + "kl": 0.36598580330610275, + "learning_rate": 1.8055505637406958e-07, + "loss": 0.0026, + "num_tokens": 26348053.0, + "reward": 0.84661865234375, + "reward_std": 0.016701268032193184, + "rewards//mean": 0.84661865234375, + "rewards//std": 0.02913411520421505, + "step": 3618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7238, + "grad_norm": 1.82253897190094, + "kl": 0.4018046669661999, + "learning_rate": 1.8031099733714889e-07, + "loss": 0.0161, + "num_tokens": 26355301.0, + "reward": 0.83673095703125, + "reward_std": 0.01966256834566593, + "rewards//mean": 0.83673095703125, + "rewards//std": 0.028630439192056656, + "step": 3619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.724, + "grad_norm": 1.4990404844284058, + "kl": 0.34307914413511753, + "learning_rate": 1.800670670709204e-07, + "loss": 0.0002, + "num_tokens": 26362581.0, + "reward": 0.81768798828125, + "reward_std": 0.011824719607830048, + "rewards//mean": 0.81768798828125, + "rewards//std": 0.017358774319291115, + "step": 3620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.7242, + "grad_norm": 2.226217269897461, + "kl": 0.48793578520417213, + "learning_rate": 1.7982326567363886e-07, + "loss": 0.0214, + "num_tokens": 26369861.0, + "reward": 0.8209228515625, + "reward_std": 0.01807752437889576, + "rewards//mean": 0.8209228515625, + "rewards//std": 0.031057341024279594, + "step": 3621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7244, + "grad_norm": 1.487886905670166, + "kl": 0.2994050942361355, + "learning_rate": 1.7957959324350763e-07, + "loss": 0.012, + "num_tokens": 26377125.0, + "reward": 0.84521484375, + "reward_std": 0.010315487161278725, + "rewards//mean": 0.84521484375, + "rewards//std": 0.01768287643790245, + "step": 3622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7246, + "grad_norm": 1.503193974494934, + "kl": 0.3037459049373865, + "learning_rate": 1.7933604987867813e-07, + "loss": 0.0121, + "num_tokens": 26384365.0, + "reward": 0.82464599609375, + "reward_std": 0.01281458605080843, + "rewards//mean": 0.82464599609375, + "rewards//std": 0.015361071564257145, + "step": 3623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7248, + "grad_norm": 1.655444860458374, + "kl": 0.3687603622674942, + "learning_rate": 1.7909263567724914e-07, + "loss": 0.0148, + "num_tokens": 26391637.0, + "reward": 0.82232666015625, + "reward_std": 0.019313763827085495, + "rewards//mean": 0.82232666015625, + "rewards//std": 0.025429869070649147, + "step": 3624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.725, + "grad_norm": 1.458184838294983, + "kl": 0.35645170137286186, + "learning_rate": 1.788493507372682e-07, + "loss": 0.0143, + "num_tokens": 26398965.0, + "reward": 0.8525390625, + "reward_std": 0.012081017717719078, + "rewards//mean": 0.8525390625, + "rewards//std": 0.019512640312314034, + "step": 3625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7252, + "grad_norm": 1.5752336978912354, + "kl": 0.3516084458678961, + "learning_rate": 1.7860619515673032e-07, + "loss": 0.0141, + "num_tokens": 26406213.0, + "reward": 0.8448486328125, + "reward_std": 0.014999196864664555, + "rewards//mean": 0.8448486328125, + "rewards//std": 0.023973582312464714, + "step": 3626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7254, + "grad_norm": 1.38524329662323, + "kl": 0.3179172445088625, + "learning_rate": 1.783631690335788e-07, + "loss": 0.0127, + "num_tokens": 26413597.0, + "reward": 0.885498046875, + "reward_std": 0.022730987519025803, + "rewards//mean": 0.885498046875, + "rewards//std": 0.03085906244814396, + "step": 3627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7256, + "grad_norm": 2.339545965194702, + "kl": 0.3962135277688503, + "learning_rate": 1.7812027246570416e-07, + "loss": 0.0158, + "num_tokens": 26420805.0, + "reward": 0.8681640625, + "reward_std": 0.01880965381860733, + "rewards//mean": 0.8681640625, + "rewards//std": 0.030536562204360962, + "step": 3628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7258, + "grad_norm": 1.5129536390304565, + "kl": 0.3543066084384918, + "learning_rate": 1.7787750555094528e-07, + "loss": 0.0142, + "num_tokens": 26428125.0, + "reward": 0.84930419921875, + "reward_std": 0.016286389902234077, + "rewards//mean": 0.84930419921875, + "rewards//std": 0.020585447549819946, + "step": 3629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.726, + "grad_norm": 1.471679449081421, + "kl": 0.2948204427957535, + "learning_rate": 1.7763486838708856e-07, + "loss": 0.0118, + "num_tokens": 26435357.0, + "reward": 0.77130126953125, + "reward_std": 0.012778420001268387, + "rewards//mean": 0.77130126953125, + "rewards//std": 0.015342336148023605, + "step": 3630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7262, + "grad_norm": 1.4650238752365112, + "kl": 0.2748573515564203, + "learning_rate": 1.7739236107186857e-07, + "loss": 0.011, + "num_tokens": 26442645.0, + "reward": 0.8096923828125, + "reward_std": 0.011493895202875137, + "rewards//mean": 0.8096923828125, + "rewards//std": 0.04093046486377716, + "step": 3631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7264, + "grad_norm": 1.4923917055130005, + "kl": 0.3481820188462734, + "learning_rate": 1.7714998370296674e-07, + "loss": 0.0139, + "num_tokens": 26449965.0, + "reward": 0.78240966796875, + "reward_std": 0.013379736803472042, + "rewards//mean": 0.78240966796875, + "rewards//std": 0.030849678441882133, + "step": 3632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7266, + "grad_norm": 1.4628416299819946, + "kl": 0.3689672816544771, + "learning_rate": 1.7690773637801292e-07, + "loss": 0.0148, + "num_tokens": 26457205.0, + "reward": 0.7462158203125, + "reward_std": 0.008116859942674637, + "rewards//mean": 0.7462158203125, + "rewards//std": 0.012352034449577332, + "step": 3633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7268, + "grad_norm": 1.622602105140686, + "kl": 0.36199190467596054, + "learning_rate": 1.7666561919458422e-07, + "loss": 0.0145, + "num_tokens": 26464405.0, + "reward": 0.826416015625, + "reward_std": 0.011357948184013367, + "rewards//mean": 0.826416015625, + "rewards//std": 0.014430316165089607, + "step": 3634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.727, + "grad_norm": 1.5472248792648315, + "kl": 0.3700005169957876, + "learning_rate": 1.7642363225020557e-07, + "loss": 0.0148, + "num_tokens": 26471685.0, + "reward": 0.86090087890625, + "reward_std": 0.012170043773949146, + "rewards//mean": 0.86090087890625, + "rewards//std": 0.016122672706842422, + "step": 3635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7272, + "grad_norm": 1.4277857542037964, + "kl": 0.3728603273630142, + "learning_rate": 1.7618177564234904e-07, + "loss": 0.0149, + "num_tokens": 26478981.0, + "reward": 0.82672119140625, + "reward_std": 0.014909476973116398, + "rewards//mean": 0.82672119140625, + "rewards//std": 0.019279971718788147, + "step": 3636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7274, + "grad_norm": 1.2868595123291016, + "kl": 0.3208488039672375, + "learning_rate": 1.7594004946843454e-07, + "loss": 0.0128, + "num_tokens": 26486325.0, + "reward": 0.813720703125, + "reward_std": 0.012291112914681435, + "rewards//mean": 0.813720703125, + "rewards//std": 0.02442800998687744, + "step": 3637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7276, + "grad_norm": 1.3587466478347778, + "kl": 0.3602490946650505, + "learning_rate": 1.7569845382582937e-07, + "loss": 0.0144, + "num_tokens": 26493685.0, + "reward": 0.87127685546875, + "reward_std": 0.0157973263412714, + "rewards//mean": 0.87127685546875, + "rewards//std": 0.021501895040273666, + "step": 3638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7278, + "grad_norm": 1.3375228643417358, + "kl": 0.2880217656493187, + "learning_rate": 1.7545698881184833e-07, + "loss": 0.0115, + "num_tokens": 26500997.0, + "reward": 0.8740234375, + "reward_std": 0.015392803587019444, + "rewards//mean": 0.8740234375, + "rewards//std": 0.027321597561240196, + "step": 3639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.728, + "grad_norm": 1.9795621633529663, + "kl": 0.4357558283954859, + "learning_rate": 1.752156545237533e-07, + "loss": 0.0174, + "num_tokens": 26508213.0, + "reward": 0.82965087890625, + "reward_std": 0.01606103777885437, + "rewards//mean": 0.82965087890625, + "rewards//std": 0.022408561781048775, + "step": 3640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7282, + "grad_norm": 1.4311857223510742, + "kl": 0.34626638516783714, + "learning_rate": 1.7497445105875374e-07, + "loss": 0.0139, + "num_tokens": 26515453.0, + "reward": 0.82244873046875, + "reward_std": 0.016355503350496292, + "rewards//mean": 0.82244873046875, + "rewards//std": 0.02239031344652176, + "step": 3641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.7284, + "grad_norm": 1.3884080648422241, + "kl": 0.4013837091624737, + "learning_rate": 1.747333785140066e-07, + "loss": 0.0169, + "num_tokens": 26522691.0, + "reward": 0.8369140625, + "reward_std": 0.01737726852297783, + "rewards//mean": 0.8369140625, + "rewards//std": 0.022097086533904076, + "step": 3642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7286, + "grad_norm": 1.6522533893585205, + "kl": 0.3609035313129425, + "learning_rate": 1.7449243698661552e-07, + "loss": 0.0144, + "num_tokens": 26529995.0, + "reward": 0.8736572265625, + "reward_std": 0.010825083591043949, + "rewards//mean": 0.8736572265625, + "rewards//std": 0.01761038973927498, + "step": 3643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7288, + "grad_norm": 1.3092845678329468, + "kl": 0.3359241560101509, + "learning_rate": 1.742516265736319e-07, + "loss": 0.0134, + "num_tokens": 26537331.0, + "reward": 0.8223876953125, + "reward_std": 0.013270031660795212, + "rewards//mean": 0.8223876953125, + "rewards//std": 0.02164267748594284, + "step": 3644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.729, + "grad_norm": 1.9386601448059082, + "kl": 0.5217039808630943, + "learning_rate": 1.7401094737205414e-07, + "loss": 0.0209, + "num_tokens": 26544635.0, + "reward": 0.8203125, + "reward_std": 0.01557648554444313, + "rewards//mean": 0.8203125, + "rewards//std": 0.02108754962682724, + "step": 3645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7292, + "grad_norm": 1.7510744333267212, + "kl": 0.4103828016668558, + "learning_rate": 1.7377039947882798e-07, + "loss": 0.0164, + "num_tokens": 26551883.0, + "reward": 0.8623046875, + "reward_std": 0.015216724947094917, + "rewards//mean": 0.8623046875, + "rewards//std": 0.02018379233777523, + "step": 3646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7294, + "grad_norm": 1.6336127519607544, + "kl": 0.40999022871255875, + "learning_rate": 1.735299829908457e-07, + "loss": 0.0164, + "num_tokens": 26559315.0, + "reward": 0.84088134765625, + "reward_std": 0.019953155890107155, + "rewards//mean": 0.84088134765625, + "rewards//std": 0.02825629897415638, + "step": 3647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7296, + "grad_norm": 1.3820234537124634, + "kl": 0.32463833317160606, + "learning_rate": 1.7328969800494726e-07, + "loss": 0.013, + "num_tokens": 26566539.0, + "reward": 0.78143310546875, + "reward_std": 0.014458201825618744, + "rewards//mean": 0.78143310546875, + "rewards//std": 0.020780133083462715, + "step": 3648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.7298, + "grad_norm": 1.5244157314300537, + "kl": 0.3753705583512783, + "learning_rate": 1.7304954461791937e-07, + "loss": 0.011, + "num_tokens": 26573861.0, + "reward": 0.822509765625, + "reward_std": 0.01659592241048813, + "rewards//mean": 0.822509765625, + "rewards//std": 0.023559898138046265, + "step": 3649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.73, + "grad_norm": 1.9388447999954224, + "kl": 0.378364484757185, + "learning_rate": 1.7280952292649598e-07, + "loss": 0.0151, + "num_tokens": 26581125.0, + "reward": 0.89117431640625, + "reward_std": 0.013496290892362595, + "rewards//mean": 0.89117431640625, + "rewards//std": 0.03164179250597954, + "step": 3650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7302, + "grad_norm": 4.868616104125977, + "kl": 0.757957398891449, + "learning_rate": 1.725696330273575e-07, + "loss": 0.0303, + "num_tokens": 26588365.0, + "reward": 0.84515380859375, + "reward_std": 0.016084372997283936, + "rewards//mean": 0.84515380859375, + "rewards//std": 0.026396093890070915, + "step": 3651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7304, + "grad_norm": 1.8101695775985718, + "kl": 0.35769845359027386, + "learning_rate": 1.7232987501713164e-07, + "loss": 0.0143, + "num_tokens": 26595541.0, + "reward": 0.841064453125, + "reward_std": 0.020339269191026688, + "rewards//mean": 0.841064453125, + "rewards//std": 0.02478238008916378, + "step": 3652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7306, + "grad_norm": 1.7108570337295532, + "kl": 0.320750180631876, + "learning_rate": 1.7209024899239293e-07, + "loss": 0.0128, + "num_tokens": 26602925.0, + "reward": 0.83935546875, + "reward_std": 0.00984141044318676, + "rewards//mean": 0.83935546875, + "rewards//std": 0.013963288627564907, + "step": 3653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7308, + "grad_norm": 1.4338098764419556, + "kl": 0.33138224855065346, + "learning_rate": 1.718507550496629e-07, + "loss": 0.0133, + "num_tokens": 26610213.0, + "reward": 0.84918212890625, + "reward_std": 0.015222115442156792, + "rewards//mean": 0.84918212890625, + "rewards//std": 0.021288931369781494, + "step": 3654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.731, + "grad_norm": 1.4615604877471924, + "kl": 0.3040441572666168, + "learning_rate": 1.716113932854093e-07, + "loss": 0.0122, + "num_tokens": 26617501.0, + "reward": 0.84307861328125, + "reward_std": 0.01428612507879734, + "rewards//mean": 0.84307861328125, + "rewards//std": 0.020428957417607307, + "step": 3655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7312, + "grad_norm": 1.4261109828948975, + "kl": 0.3614938519895077, + "learning_rate": 1.7137216379604724e-07, + "loss": 0.0145, + "num_tokens": 26624917.0, + "reward": 0.83782958984375, + "reward_std": 0.014743460342288017, + "rewards//mean": 0.83782958984375, + "rewards//std": 0.01899680867791176, + "step": 3656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7314, + "grad_norm": 1.4347169399261475, + "kl": 0.411367978900671, + "learning_rate": 1.7113306667793847e-07, + "loss": 0.0165, + "num_tokens": 26632197.0, + "reward": 0.86810302734375, + "reward_std": 0.02054944634437561, + "rewards//mean": 0.86810302734375, + "rewards//std": 0.025341615080833435, + "step": 3657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7316, + "grad_norm": 1.632647156715393, + "kl": 0.4326598346233368, + "learning_rate": 1.708941020273909e-07, + "loss": 0.0173, + "num_tokens": 26639589.0, + "reward": 0.84381103515625, + "reward_std": 0.018337523564696312, + "rewards//mean": 0.84381103515625, + "rewards//std": 0.028437864035367966, + "step": 3658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7318, + "grad_norm": 1.941162109375, + "kl": 0.4327334500849247, + "learning_rate": 1.7065526994065972e-07, + "loss": 0.0173, + "num_tokens": 26646925.0, + "reward": 0.85931396484375, + "reward_std": 0.015658926218748093, + "rewards//mean": 0.85931396484375, + "rewards//std": 0.020640525966882706, + "step": 3659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.732, + "grad_norm": 1.4912924766540527, + "kl": 0.41530710831284523, + "learning_rate": 1.704165705139464e-07, + "loss": 0.0166, + "num_tokens": 26654189.0, + "reward": 0.84912109375, + "reward_std": 0.014999853447079659, + "rewards//mean": 0.84912109375, + "rewards//std": 0.02437466010451317, + "step": 3660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7322, + "grad_norm": 1.4447816610336304, + "kl": 0.3329259864985943, + "learning_rate": 1.7017800384339924e-07, + "loss": 0.0133, + "num_tokens": 26661533.0, + "reward": 0.86224365234375, + "reward_std": 0.019855324178934097, + "rewards//mean": 0.86224365234375, + "rewards//std": 0.025640910491347313, + "step": 3661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7324, + "grad_norm": 1.6571725606918335, + "kl": 0.32174958288669586, + "learning_rate": 1.6993957002511257e-07, + "loss": 0.0129, + "num_tokens": 26668821.0, + "reward": 0.85552978515625, + "reward_std": 0.015786033123731613, + "rewards//mean": 0.85552978515625, + "rewards//std": 0.021748993545770645, + "step": 3662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7326, + "grad_norm": 1.584105134010315, + "kl": 0.3372591361403465, + "learning_rate": 1.6970126915512756e-07, + "loss": 0.0135, + "num_tokens": 26676029.0, + "reward": 0.8695068359375, + "reward_std": 0.017411937937140465, + "rewards//mean": 0.8695068359375, + "rewards//std": 0.024771075695753098, + "step": 3663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7328, + "grad_norm": 1.468538522720337, + "kl": 0.2885014433413744, + "learning_rate": 1.6946310132943187e-07, + "loss": 0.0115, + "num_tokens": 26683221.0, + "reward": 0.8421630859375, + "reward_std": 0.019432978704571724, + "rewards//mean": 0.8421630859375, + "rewards//std": 0.02415725588798523, + "step": 3664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.733, + "grad_norm": 1.6548460721969604, + "kl": 0.37433145195245743, + "learning_rate": 1.692250666439596e-07, + "loss": 0.015, + "num_tokens": 26690605.0, + "reward": 0.78570556640625, + "reward_std": 0.016140703111886978, + "rewards//mean": 0.78570556640625, + "rewards//std": 0.021439146250486374, + "step": 3665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7332, + "grad_norm": 1.8959635496139526, + "kl": 0.4085095450282097, + "learning_rate": 1.6898716519459072e-07, + "loss": 0.0163, + "num_tokens": 26697853.0, + "reward": 0.85107421875, + "reward_std": 0.014312086626887321, + "rewards//mean": 0.85107421875, + "rewards//std": 0.021412387490272522, + "step": 3666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7334, + "grad_norm": 1.7710775136947632, + "kl": 0.3100202865898609, + "learning_rate": 1.6874939707715214e-07, + "loss": 0.0124, + "num_tokens": 26705101.0, + "reward": 0.8436279296875, + "reward_std": 0.018715888261795044, + "rewards//mean": 0.8436279296875, + "rewards//std": 0.023836802691221237, + "step": 3667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7336, + "grad_norm": 1.4712646007537842, + "kl": 0.3486510179936886, + "learning_rate": 1.6851176238741683e-07, + "loss": 0.0139, + "num_tokens": 26712485.0, + "reward": 0.82684326171875, + "reward_std": 0.010238519869744778, + "rewards//mean": 0.82684326171875, + "rewards//std": 0.01469937339425087, + "step": 3668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7338, + "grad_norm": 1.7292897701263428, + "kl": 0.327215775847435, + "learning_rate": 1.6827426122110412e-07, + "loss": 0.0131, + "num_tokens": 26719853.0, + "reward": 0.82147216796875, + "reward_std": 0.017304979264736176, + "rewards//mean": 0.82147216796875, + "rewards//std": 0.025767529383301735, + "step": 3669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.734, + "grad_norm": 1.7399089336395264, + "kl": 0.33788590505719185, + "learning_rate": 1.6803689367387918e-07, + "loss": 0.0135, + "num_tokens": 26727101.0, + "reward": 0.83184814453125, + "reward_std": 0.014797092415392399, + "rewards//mean": 0.83184814453125, + "rewards//std": 0.01965400017797947, + "step": 3670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7342, + "grad_norm": 1.65183424949646, + "kl": 0.367828905582428, + "learning_rate": 1.6779965984135374e-07, + "loss": 0.0147, + "num_tokens": 26734517.0, + "reward": 0.8419189453125, + "reward_std": 0.014663739129900932, + "rewards//mean": 0.8419189453125, + "rewards//std": 0.020786413922905922, + "step": 3671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7344, + "grad_norm": 1.505300521850586, + "kl": 0.29657211899757385, + "learning_rate": 1.675625598190858e-07, + "loss": 0.0119, + "num_tokens": 26741861.0, + "reward": 0.82598876953125, + "reward_std": 0.015665167942643166, + "rewards//mean": 0.82598876953125, + "rewards//std": 0.030329078435897827, + "step": 3672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.7346, + "grad_norm": 1.5345280170440674, + "kl": 0.3321081195026636, + "learning_rate": 1.6732559370257882e-07, + "loss": 0.0133, + "num_tokens": 26749105.0, + "reward": 0.8656005859375, + "reward_std": 0.015253851190209389, + "rewards//mean": 0.8656005859375, + "rewards//std": 0.02294369414448738, + "step": 3673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7348, + "grad_norm": 1.5814050436019897, + "kl": 0.37148287519812584, + "learning_rate": 1.670887615872829e-07, + "loss": 0.0149, + "num_tokens": 26756417.0, + "reward": 0.82537841796875, + "reward_std": 0.017743811011314392, + "rewards//mean": 0.82537841796875, + "rewards//std": 0.02132374420762062, + "step": 3674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.735, + "grad_norm": 2.113293170928955, + "kl": 0.3682316541671753, + "learning_rate": 1.6685206356859398e-07, + "loss": 0.0147, + "num_tokens": 26763657.0, + "reward": 0.86212158203125, + "reward_std": 0.019374262541532516, + "rewards//mean": 0.86212158203125, + "rewards//std": 0.026481404900550842, + "step": 3675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7352, + "grad_norm": 1.392877459526062, + "kl": 0.35097840428352356, + "learning_rate": 1.6661549974185424e-07, + "loss": 0.014, + "num_tokens": 26770993.0, + "reward": 0.82568359375, + "reward_std": 0.013746250420808792, + "rewards//mean": 0.82568359375, + "rewards//std": 0.017696566879749298, + "step": 3676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7354, + "grad_norm": 1.5965516567230225, + "kl": 0.43253443762660027, + "learning_rate": 1.6637907020235114e-07, + "loss": 0.0173, + "num_tokens": 26778185.0, + "reward": 0.79034423828125, + "reward_std": 0.015374465845525265, + "rewards//mean": 0.79034423828125, + "rewards//std": 0.021463844925165176, + "step": 3677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7356, + "grad_norm": 1.7746877670288086, + "kl": 0.35913511738181114, + "learning_rate": 1.6614277504531866e-07, + "loss": 0.0144, + "num_tokens": 26785497.0, + "reward": 0.835693359375, + "reward_std": 0.0193351898342371, + "rewards//mean": 0.835693359375, + "rewards//std": 0.023957474157214165, + "step": 3678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.7358, + "grad_norm": 1.5256750583648682, + "kl": 0.28532625176012516, + "learning_rate": 1.659066143659366e-07, + "loss": 0.0082, + "num_tokens": 26792819.0, + "reward": 0.8311767578125, + "reward_std": 0.011447515338659286, + "rewards//mean": 0.8311767578125, + "rewards//std": 0.025906166061758995, + "step": 3679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.736, + "grad_norm": 1.7871698141098022, + "kl": 0.3412812501192093, + "learning_rate": 1.6567058825933022e-07, + "loss": 0.0137, + "num_tokens": 26800027.0, + "reward": 0.86279296875, + "reward_std": 0.01920982636511326, + "rewards//mean": 0.86279296875, + "rewards//std": 0.02455286681652069, + "step": 3680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7362, + "grad_norm": 1.5548595190048218, + "kl": 0.31919459626078606, + "learning_rate": 1.6543469682057104e-07, + "loss": 0.0128, + "num_tokens": 26807339.0, + "reward": 0.82940673828125, + "reward_std": 0.012858926318585873, + "rewards//mean": 0.82940673828125, + "rewards//std": 0.023756252601742744, + "step": 3681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7364, + "grad_norm": 1.3108623027801514, + "kl": 0.3414698615670204, + "learning_rate": 1.6519894014467578e-07, + "loss": 0.0137, + "num_tokens": 26814547.0, + "reward": 0.7977294921875, + "reward_std": 0.013914378359913826, + "rewards//mean": 0.7977294921875, + "rewards//std": 0.03534156084060669, + "step": 3682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7366, + "grad_norm": 1.399489164352417, + "kl": 0.37002574279904366, + "learning_rate": 1.6496331832660742e-07, + "loss": 0.0148, + "num_tokens": 26821755.0, + "reward": 0.85498046875, + "reward_std": 0.012749665416777134, + "rewards//mean": 0.85498046875, + "rewards//std": 0.016075776889920235, + "step": 3683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7368, + "grad_norm": 1.626212239265442, + "kl": 0.3616410195827484, + "learning_rate": 1.6472783146127438e-07, + "loss": 0.0145, + "num_tokens": 26828987.0, + "reward": 0.87158203125, + "reward_std": 0.016622336581349373, + "rewards//mean": 0.87158203125, + "rewards//std": 0.022515127435326576, + "step": 3684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.737, + "grad_norm": 1.6072026491165161, + "kl": 0.373788021504879, + "learning_rate": 1.644924796435309e-07, + "loss": 0.0151, + "num_tokens": 26836290.0, + "reward": 0.87298583984375, + "reward_std": 0.01816319115459919, + "rewards//mean": 0.87298583984375, + "rewards//std": 0.03120340220630169, + "step": 3685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.7372, + "grad_norm": 1.3139750957489014, + "kl": 0.38009658083319664, + "learning_rate": 1.6425726296817632e-07, + "loss": -0.0163, + "num_tokens": 26843552.0, + "reward": 0.8662109375, + "reward_std": 0.01790613681077957, + "rewards//mean": 0.8662109375, + "rewards//std": 0.02856963686645031, + "step": 3686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7374, + "grad_norm": 1.5343278646469116, + "kl": 0.33731942996382713, + "learning_rate": 1.6402218152995607e-07, + "loss": 0.0135, + "num_tokens": 26850896.0, + "reward": 0.851806640625, + "reward_std": 0.015130789019167423, + "rewards//mean": 0.851806640625, + "rewards//std": 0.021952755749225616, + "step": 3687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7376, + "grad_norm": 1.5607585906982422, + "kl": 0.34852674044668674, + "learning_rate": 1.637872354235611e-07, + "loss": 0.0139, + "num_tokens": 26858144.0, + "reward": 0.81085205078125, + "reward_std": 0.013762000948190689, + "rewards//mean": 0.81085205078125, + "rewards//std": 0.022291388362646103, + "step": 3688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.7378, + "grad_norm": 1.740885853767395, + "kl": 0.3925721328705549, + "learning_rate": 1.6355242474362728e-07, + "loss": 0.0026, + "num_tokens": 26865541.0, + "reward": 0.8487548828125, + "reward_std": 0.01970614679157734, + "rewards//mean": 0.8487548828125, + "rewards//std": 0.027263637632131577, + "step": 3689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.738, + "grad_norm": 1.4029209613800049, + "kl": 0.36101965233683586, + "learning_rate": 1.633177495847366e-07, + "loss": 0.0144, + "num_tokens": 26872773.0, + "reward": 0.7584228515625, + "reward_std": 0.01713269203901291, + "rewards//mean": 0.7584228515625, + "rewards//std": 0.02426230162382126, + "step": 3690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7382, + "grad_norm": 1.5506341457366943, + "kl": 0.30662606097757816, + "learning_rate": 1.6308321004141607e-07, + "loss": 0.0123, + "num_tokens": 26880093.0, + "reward": 0.7861328125, + "reward_std": 0.012805100530385971, + "rewards//mean": 0.7861328125, + "rewards//std": 0.020564204081892967, + "step": 3691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7384, + "grad_norm": 1.5576786994934082, + "kl": 0.38475129194557667, + "learning_rate": 1.6284880620813846e-07, + "loss": 0.0154, + "num_tokens": 26887309.0, + "reward": 0.88262939453125, + "reward_std": 0.021335389465093613, + "rewards//mean": 0.88262939453125, + "rewards//std": 0.02866636961698532, + "step": 3692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7386, + "grad_norm": 1.484133005142212, + "kl": 0.38870586454868317, + "learning_rate": 1.6261453817932119e-07, + "loss": 0.0155, + "num_tokens": 26894573.0, + "reward": 0.87567138671875, + "reward_std": 0.014880452305078506, + "rewards//mean": 0.87567138671875, + "rewards//std": 0.026726076379418373, + "step": 3693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7388, + "grad_norm": 1.7185310125350952, + "kl": 0.46201655082404613, + "learning_rate": 1.6238040604932757e-07, + "loss": 0.0185, + "num_tokens": 26901813.0, + "reward": 0.787353515625, + "reward_std": 0.011759113520383835, + "rewards//mean": 0.787353515625, + "rewards//std": 0.022215967997908592, + "step": 3694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.739, + "grad_norm": 1.4447064399719238, + "kl": 0.35592967830598354, + "learning_rate": 1.6214640991246609e-07, + "loss": 0.0142, + "num_tokens": 26909221.0, + "reward": 0.83905029296875, + "reward_std": 0.021479856222867966, + "rewards//mean": 0.83905029296875, + "rewards//std": 0.027243293821811676, + "step": 3695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7392, + "grad_norm": 1.897171974182129, + "kl": 0.3054013680666685, + "learning_rate": 1.6191254986299042e-07, + "loss": 0.0122, + "num_tokens": 26916501.0, + "reward": 0.834716796875, + "reward_std": 0.013516775332391262, + "rewards//mean": 0.834716796875, + "rewards//std": 0.017783604562282562, + "step": 3696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7394, + "grad_norm": 1.4652496576309204, + "kl": 0.357112318277359, + "learning_rate": 1.6167882599509902e-07, + "loss": 0.0143, + "num_tokens": 26923717.0, + "reward": 0.84124755859375, + "reward_std": 0.015227225609123707, + "rewards//mean": 0.84124755859375, + "rewards//std": 0.017279235646128654, + "step": 3697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7396, + "grad_norm": 1.4330055713653564, + "kl": 0.2867995109409094, + "learning_rate": 1.614452384029361e-07, + "loss": 0.0115, + "num_tokens": 26931037.0, + "reward": 0.7728271484375, + "reward_std": 0.012593141756951809, + "rewards//mean": 0.7728271484375, + "rewards//std": 0.018871787935495377, + "step": 3698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7398, + "grad_norm": 1.4640034437179565, + "kl": 0.3031481634825468, + "learning_rate": 1.612117871805907e-07, + "loss": 0.0121, + "num_tokens": 26938413.0, + "reward": 0.86566162109375, + "reward_std": 0.012290554121136665, + "rewards//mean": 0.86566162109375, + "rewards//std": 0.018618520349264145, + "step": 3699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.74, + "grad_norm": 1.745713472366333, + "kl": 0.35725632309913635, + "learning_rate": 1.60978472422097e-07, + "loss": 0.0143, + "num_tokens": 26945765.0, + "reward": 0.84429931640625, + "reward_std": 0.013674410060048103, + "rewards//mean": 0.84429931640625, + "rewards//std": 0.01805383339524269, + "step": 3700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.7402, + "grad_norm": 1.3026739358901978, + "kl": 0.36818815022706985, + "learning_rate": 1.6074529422143396e-07, + "loss": 0.0156, + "num_tokens": 26953128.0, + "reward": 0.8369140625, + "reward_std": 0.010351849719882011, + "rewards//mean": 0.8369140625, + "rewards//std": 0.013937246054410934, + "step": 3701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7404, + "grad_norm": 1.5480259656906128, + "kl": 0.3102179206907749, + "learning_rate": 1.6051225267252583e-07, + "loss": 0.0124, + "num_tokens": 26960680.0, + "reward": 0.838134765625, + "reward_std": 0.015309866517782211, + "rewards//mean": 0.838134765625, + "rewards//std": 0.018893033266067505, + "step": 3702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7406, + "grad_norm": 1.709018588066101, + "kl": 0.3671904616057873, + "learning_rate": 1.6027934786924185e-07, + "loss": 0.0147, + "num_tokens": 26967872.0, + "reward": 0.76397705078125, + "reward_std": 0.009261132217943668, + "rewards//mean": 0.76397705078125, + "rewards//std": 0.011474485509097576, + "step": 3703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7408, + "grad_norm": 1.5202245712280273, + "kl": 0.3500564284622669, + "learning_rate": 1.6004657990539578e-07, + "loss": 0.014, + "num_tokens": 26975176.0, + "reward": 0.80255126953125, + "reward_std": 0.01440637931227684, + "rewards//mean": 0.80255126953125, + "rewards//std": 0.020062586292624474, + "step": 3704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.741, + "grad_norm": 1.5603185892105103, + "kl": 0.3553269077092409, + "learning_rate": 1.598139488747467e-07, + "loss": 0.0145, + "num_tokens": 26982438.0, + "reward": 0.854736328125, + "reward_std": 0.013463614508509636, + "rewards//mean": 0.854736328125, + "rewards//std": 0.021730976179242134, + "step": 3705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7412, + "grad_norm": 1.4450641870498657, + "kl": 0.36907501332461834, + "learning_rate": 1.5958145487099827e-07, + "loss": 0.0148, + "num_tokens": 26989646.0, + "reward": 0.81451416015625, + "reward_std": 0.01633821800351143, + "rewards//mean": 0.81451416015625, + "rewards//std": 0.021190578117966652, + "step": 3706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.7414, + "grad_norm": 1.5790107250213623, + "kl": 0.31796362809836864, + "learning_rate": 1.5934909798779933e-07, + "loss": 0.014, + "num_tokens": 26996889.0, + "reward": 0.8546142578125, + "reward_std": 0.016839761286973953, + "rewards//mean": 0.8546142578125, + "rewards//std": 0.024084458127617836, + "step": 3707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7416, + "grad_norm": 1.8377505540847778, + "kl": 0.3174891509115696, + "learning_rate": 1.5911687831874278e-07, + "loss": 0.0127, + "num_tokens": 27004153.0, + "reward": 0.8543701171875, + "reward_std": 0.016055282205343246, + "rewards//mean": 0.8543701171875, + "rewards//std": 0.022760868072509766, + "step": 3708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7418, + "grad_norm": 1.4867969751358032, + "kl": 0.3994241151958704, + "learning_rate": 1.5888479595736694e-07, + "loss": 0.016, + "num_tokens": 27011393.0, + "reward": 0.83721923828125, + "reward_std": 0.016237890347838402, + "rewards//mean": 0.83721923828125, + "rewards//std": 0.024676403030753136, + "step": 3709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.742, + "grad_norm": 1.6354479789733887, + "kl": 0.31687403842806816, + "learning_rate": 1.5865285099715442e-07, + "loss": 0.0127, + "num_tokens": 27018705.0, + "reward": 0.8658447265625, + "reward_std": 0.017948977649211884, + "rewards//mean": 0.8658447265625, + "rewards//std": 0.024302199482917786, + "step": 3710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.7422, + "grad_norm": 1.5582345724105835, + "kl": 0.3610922731459141, + "learning_rate": 1.5842104353153285e-07, + "loss": 0.0146, + "num_tokens": 27025915.0, + "reward": 0.8475341796875, + "reward_std": 0.016308635473251343, + "rewards//mean": 0.8475341796875, + "rewards//std": 0.019287526607513428, + "step": 3711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7424, + "grad_norm": 1.469092845916748, + "kl": 0.35663570277392864, + "learning_rate": 1.5818937365387396e-07, + "loss": 0.0143, + "num_tokens": 27033275.0, + "reward": 0.830322265625, + "reward_std": 0.012396440841257572, + "rewards//mean": 0.830322265625, + "rewards//std": 0.017173897475004196, + "step": 3712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7426, + "grad_norm": 1.5834498405456543, + "kl": 0.3376987688243389, + "learning_rate": 1.5795784145749453e-07, + "loss": 0.0135, + "num_tokens": 27040547.0, + "reward": 0.84796142578125, + "reward_std": 0.01229649968445301, + "rewards//mean": 0.84796142578125, + "rewards//std": 0.016606463119387627, + "step": 3713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.7428, + "grad_norm": 1.4485492706298828, + "kl": 0.37253426760435104, + "learning_rate": 1.5772644703565564e-07, + "loss": 0.0082, + "num_tokens": 27047924.0, + "reward": 0.84271240234375, + "reward_std": 0.01683724671602249, + "rewards//mean": 0.84271240234375, + "rewards//std": 0.028664786368608475, + "step": 3714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.743, + "grad_norm": 1.4291249513626099, + "kl": 0.32466999627649784, + "learning_rate": 1.5749519048156306e-07, + "loss": 0.0126, + "num_tokens": 27055307.0, + "reward": 0.86944580078125, + "reward_std": 0.013677094131708145, + "rewards//mean": 0.86944580078125, + "rewards//std": 0.02268984355032444, + "step": 3715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7432, + "grad_norm": 1.4668854475021362, + "kl": 0.3539946712553501, + "learning_rate": 1.5726407188836672e-07, + "loss": 0.0142, + "num_tokens": 27062531.0, + "reward": 0.85858154296875, + "reward_std": 0.016753925010561943, + "rewards//mean": 0.85858154296875, + "rewards//std": 0.024440942332148552, + "step": 3716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7434, + "grad_norm": 1.5078831911087036, + "kl": 0.34238792583346367, + "learning_rate": 1.5703309134916116e-07, + "loss": 0.0137, + "num_tokens": 27069827.0, + "reward": 0.82952880859375, + "reward_std": 0.017803287133574486, + "rewards//mean": 0.82952880859375, + "rewards//std": 0.02374924160540104, + "step": 3717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.7436, + "grad_norm": 2.053051233291626, + "kl": 0.45394292101264, + "learning_rate": 1.5680224895698558e-07, + "loss": 0.0093, + "num_tokens": 27077097.0, + "reward": 0.8316650390625, + "reward_std": 0.02093282714486122, + "rewards//mean": 0.8316650390625, + "rewards//std": 0.03480464220046997, + "step": 3718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7438, + "grad_norm": 1.3877729177474976, + "kl": 0.3699510544538498, + "learning_rate": 1.5657154480482293e-07, + "loss": 0.0148, + "num_tokens": 27084481.0, + "reward": 0.86981201171875, + "reward_std": 0.014658521860837936, + "rewards//mean": 0.86981201171875, + "rewards//std": 0.020327933132648468, + "step": 3719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.744, + "grad_norm": 1.3710906505584717, + "kl": 0.3250546585768461, + "learning_rate": 1.5634097898560096e-07, + "loss": 0.013, + "num_tokens": 27091785.0, + "reward": 0.85186767578125, + "reward_std": 0.012356719002127647, + "rewards//mean": 0.85186767578125, + "rewards//std": 0.015583164058625698, + "step": 3720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7442, + "grad_norm": 1.539773941040039, + "kl": 0.356097798794508, + "learning_rate": 1.561105515921915e-07, + "loss": 0.0142, + "num_tokens": 27099025.0, + "reward": 0.83331298828125, + "reward_std": 0.02180497720837593, + "rewards//mean": 0.83331298828125, + "rewards//std": 0.027463551610708237, + "step": 3721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7444, + "grad_norm": 1.6177692413330078, + "kl": 0.3585927411913872, + "learning_rate": 1.5588026271741095e-07, + "loss": 0.0143, + "num_tokens": 27106273.0, + "reward": 0.8336181640625, + "reward_std": 0.014464965090155602, + "rewards//mean": 0.8336181640625, + "rewards//std": 0.019755885004997253, + "step": 3722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7446, + "grad_norm": 1.441702961921692, + "kl": 0.41841160878539085, + "learning_rate": 1.5565011245401927e-07, + "loss": 0.0167, + "num_tokens": 27113577.0, + "reward": 0.83062744140625, + "reward_std": 0.022336812689900398, + "rewards//mean": 0.83062744140625, + "rewards//std": 0.02952328510582447, + "step": 3723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7448, + "grad_norm": 1.3727631568908691, + "kl": 0.3368542902171612, + "learning_rate": 1.5542010089472108e-07, + "loss": 0.0135, + "num_tokens": 27120905.0, + "reward": 0.86676025390625, + "reward_std": 0.0147252744063735, + "rewards//mean": 0.86676025390625, + "rewards//std": 0.02267182245850563, + "step": 3724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.745, + "grad_norm": 1.9394042491912842, + "kl": 0.3069725073873997, + "learning_rate": 1.551902281321651e-07, + "loss": 0.0123, + "num_tokens": 27128185.0, + "reward": 0.86590576171875, + "reward_std": 0.018100224435329437, + "rewards//mean": 0.86590576171875, + "rewards//std": 0.02538280002772808, + "step": 3725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7452, + "grad_norm": 1.548791766166687, + "kl": 0.43148312345147133, + "learning_rate": 1.5496049425894408e-07, + "loss": 0.0173, + "num_tokens": 27135601.0, + "reward": 0.85333251953125, + "reward_std": 0.013954656198620796, + "rewards//mean": 0.85333251953125, + "rewards//std": 0.023795727640390396, + "step": 3726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7454, + "grad_norm": 1.900331974029541, + "kl": 0.3383782897144556, + "learning_rate": 1.5473089936759458e-07, + "loss": 0.0135, + "num_tokens": 27142865.0, + "reward": 0.8878173828125, + "reward_std": 0.015953969210386276, + "rewards//mean": 0.8878173828125, + "rewards//std": 0.02657078579068184, + "step": 3727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.7456, + "grad_norm": 1.7579281330108643, + "kl": 0.3556124307215214, + "learning_rate": 1.5450144355059752e-07, + "loss": 0.015, + "num_tokens": 27150194.0, + "reward": 0.8653564453125, + "reward_std": 0.020929737016558647, + "rewards//mean": 0.8653564453125, + "rewards//std": 0.028382249176502228, + "step": 3728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7458, + "grad_norm": 1.7151658535003662, + "kl": 0.3592696636915207, + "learning_rate": 1.542721269003777e-07, + "loss": 0.0144, + "num_tokens": 27157426.0, + "reward": 0.85540771484375, + "reward_std": 0.019111067056655884, + "rewards//mean": 0.85540771484375, + "rewards//std": 0.029019581153988838, + "step": 3729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.746, + "grad_norm": 1.7979549169540405, + "kl": 0.29385059140622616, + "learning_rate": 1.5404294950930397e-07, + "loss": 0.0118, + "num_tokens": 27164706.0, + "reward": 0.81768798828125, + "reward_std": 0.012170161120593548, + "rewards//mean": 0.81768798828125, + "rewards//std": 0.02416062355041504, + "step": 3730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7462, + "grad_norm": 1.4569014310836792, + "kl": 0.3743790127336979, + "learning_rate": 1.5381391146968863e-07, + "loss": 0.015, + "num_tokens": 27172002.0, + "reward": 0.81134033203125, + "reward_std": 0.017414648085832596, + "rewards//mean": 0.81134033203125, + "rewards//std": 0.020047489553689957, + "step": 3731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7464, + "grad_norm": 1.3471242189407349, + "kl": 0.3434038981795311, + "learning_rate": 1.535850128737884e-07, + "loss": 0.0137, + "num_tokens": 27179266.0, + "reward": 0.83709716796875, + "reward_std": 0.01179030817002058, + "rewards//mean": 0.83709716796875, + "rewards//std": 0.01459395419806242, + "step": 3732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7466, + "grad_norm": 1.4681308269500732, + "kl": 0.3350066374987364, + "learning_rate": 1.5335625381380364e-07, + "loss": 0.0134, + "num_tokens": 27186554.0, + "reward": 0.88311767578125, + "reward_std": 0.0207204632461071, + "rewards//mean": 0.88311767578125, + "rewards//std": 0.029059719294309616, + "step": 3733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7468, + "grad_norm": 1.5643095970153809, + "kl": 0.39070853032171726, + "learning_rate": 1.5312763438187826e-07, + "loss": 0.0156, + "num_tokens": 27193834.0, + "reward": 0.86517333984375, + "reward_std": 0.0227559432387352, + "rewards//mean": 0.86517333984375, + "rewards//std": 0.02684757672250271, + "step": 3734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.747, + "grad_norm": 1.5060105323791504, + "kl": 0.29444178007543087, + "learning_rate": 1.5289915467010029e-07, + "loss": 0.0118, + "num_tokens": 27201106.0, + "reward": 0.8785400390625, + "reward_std": 0.015073679387569427, + "rewards//mean": 0.8785400390625, + "rewards//std": 0.02738109417259693, + "step": 3735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7472, + "grad_norm": 1.6604704856872559, + "kl": 0.33306053653359413, + "learning_rate": 1.5267081477050131e-07, + "loss": 0.0133, + "num_tokens": 27208490.0, + "reward": 0.85992431640625, + "reward_std": 0.017463499680161476, + "rewards//mean": 0.85992431640625, + "rewards//std": 0.023144949227571487, + "step": 3736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7474, + "grad_norm": 1.2932424545288086, + "kl": 0.34051316045224667, + "learning_rate": 1.5244261477505676e-07, + "loss": 0.0136, + "num_tokens": 27215826.0, + "reward": 0.85003662109375, + "reward_std": 0.011100435629487038, + "rewards//mean": 0.85003662109375, + "rewards//std": 0.014358674176037312, + "step": 3737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7476, + "grad_norm": 1.6536931991577148, + "kl": 0.29030792228877544, + "learning_rate": 1.5221455477568523e-07, + "loss": 0.0116, + "num_tokens": 27223026.0, + "reward": 0.82806396484375, + "reward_std": 0.012163961306214333, + "rewards//mean": 0.82806396484375, + "rewards//std": 0.017954621464014053, + "step": 3738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.7478, + "grad_norm": 1.5904635190963745, + "kl": 0.3378488067537546, + "learning_rate": 1.5198663486424944e-07, + "loss": -0.0017, + "num_tokens": 27230250.0, + "reward": 0.8603515625, + "reward_std": 0.011392946355044842, + "rewards//mean": 0.8603515625, + "rewards//std": 0.020575977861881256, + "step": 3739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.748, + "grad_norm": 1.3162994384765625, + "kl": 0.3093662466853857, + "learning_rate": 1.517588551325556e-07, + "loss": 0.0129, + "num_tokens": 27237496.0, + "reward": 0.8828125, + "reward_std": 0.013862095773220062, + "rewards//mean": 0.8828125, + "rewards//std": 0.02380661852657795, + "step": 3740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7482, + "grad_norm": 1.476346731185913, + "kl": 0.3501651082187891, + "learning_rate": 1.5153121567235333e-07, + "loss": 0.014, + "num_tokens": 27244832.0, + "reward": 0.8472900390625, + "reward_std": 0.015352088958024979, + "rewards//mean": 0.8472900390625, + "rewards//std": 0.023517129942774773, + "step": 3741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7484, + "grad_norm": 1.5412137508392334, + "kl": 0.37575818225741386, + "learning_rate": 1.5130371657533558e-07, + "loss": 0.015, + "num_tokens": 27252152.0, + "reward": 0.8787841796875, + "reward_std": 0.018495114520192146, + "rewards//mean": 0.8787841796875, + "rewards//std": 0.021462874487042427, + "step": 3742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7486, + "grad_norm": 1.504488229751587, + "kl": 0.3547091968357563, + "learning_rate": 1.510763579331391e-07, + "loss": 0.0142, + "num_tokens": 27259416.0, + "reward": 0.8763427734375, + "reward_std": 0.014285877346992493, + "rewards//mean": 0.8763427734375, + "rewards//std": 0.029587626457214355, + "step": 3743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7488, + "grad_norm": 1.645622730255127, + "kl": 0.31953502632677555, + "learning_rate": 1.5084913983734393e-07, + "loss": 0.0128, + "num_tokens": 27266704.0, + "reward": 0.85089111328125, + "reward_std": 0.012216972187161446, + "rewards//mean": 0.85089111328125, + "rewards//std": 0.018669672310352325, + "step": 3744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.749, + "grad_norm": 1.7326377630233765, + "kl": 0.36385431699454784, + "learning_rate": 1.5062206237947362e-07, + "loss": 0.0146, + "num_tokens": 27273984.0, + "reward": 0.83154296875, + "reward_std": 0.01789218559861183, + "rewards//mean": 0.83154296875, + "rewards//std": 0.03105173446238041, + "step": 3745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.7492, + "grad_norm": 1.6169629096984863, + "kl": 0.29580502212047577, + "learning_rate": 1.5039512565099466e-07, + "loss": 0.0093, + "num_tokens": 27281177.0, + "reward": 0.7708740234375, + "reward_std": 0.019717693328857422, + "rewards//mean": 0.7708740234375, + "rewards//std": 0.02483454905450344, + "step": 3746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7494, + "grad_norm": 1.4195120334625244, + "kl": 0.29979259334504604, + "learning_rate": 1.5016832974331723e-07, + "loss": 0.012, + "num_tokens": 27288473.0, + "reward": 0.83221435546875, + "reward_std": 0.01969393901526928, + "rewards//mean": 0.83221435546875, + "rewards//std": 0.027156474068760872, + "step": 3747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7496, + "grad_norm": 1.3627992868423462, + "kl": 0.33997878804802895, + "learning_rate": 1.499416747477948e-07, + "loss": 0.0136, + "num_tokens": 27295681.0, + "reward": 0.8748779296875, + "reward_std": 0.016217529773712158, + "rewards//mean": 0.8748779296875, + "rewards//std": 0.024547627195715904, + "step": 3748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7498, + "grad_norm": 1.8662359714508057, + "kl": 0.37329722568392754, + "learning_rate": 1.4971516075572405e-07, + "loss": 0.0149, + "num_tokens": 27302849.0, + "reward": 0.8135986328125, + "reward_std": 0.014788862317800522, + "rewards//mean": 0.8135986328125, + "rewards//std": 0.01713992841541767, + "step": 3749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.75, + "grad_norm": 1.2537710666656494, + "kl": 0.3075369708240032, + "learning_rate": 1.494887878583445e-07, + "loss": 0.0123, + "num_tokens": 27310153.0, + "reward": 0.815185546875, + "reward_std": 0.012844700366258621, + "rewards//mean": 0.815185546875, + "rewards//std": 0.0152146490290761, + "step": 3750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7502, + "grad_norm": 1.6508680582046509, + "kl": 0.3813861422240734, + "learning_rate": 1.492625561468393e-07, + "loss": 0.0153, + "num_tokens": 27317385.0, + "reward": 0.830322265625, + "reward_std": 0.00850522331893444, + "rewards//mean": 0.830322265625, + "rewards//std": 0.013018508441746235, + "step": 3751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7504, + "grad_norm": 1.9154179096221924, + "kl": 0.3453150577843189, + "learning_rate": 1.490364657123347e-07, + "loss": 0.0138, + "num_tokens": 27324657.0, + "reward": 0.8465576171875, + "reward_std": 0.014650201424956322, + "rewards//mean": 0.8465576171875, + "rewards//std": 0.022428564727306366, + "step": 3752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7506, + "grad_norm": 1.9168437719345093, + "kl": 0.4044802449643612, + "learning_rate": 1.4881051664589956e-07, + "loss": 0.0162, + "num_tokens": 27331977.0, + "reward": 0.833740234375, + "reward_std": 0.0162859708070755, + "rewards//mean": 0.833740234375, + "rewards//std": 0.020773665979504585, + "step": 3753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7508, + "grad_norm": 1.2283875942230225, + "kl": 0.34315093606710434, + "learning_rate": 1.485847090385463e-07, + "loss": 0.0137, + "num_tokens": 27339257.0, + "reward": 0.87603759765625, + "reward_std": 0.017468804493546486, + "rewards//mean": 0.87603759765625, + "rewards//std": 0.030838390812277794, + "step": 3754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.751, + "grad_norm": 1.7549593448638916, + "kl": 0.33696187660098076, + "learning_rate": 1.4835904298123026e-07, + "loss": 0.0135, + "num_tokens": 27346473.0, + "reward": 0.849365234375, + "reward_std": 0.018845191225409508, + "rewards//mean": 0.849365234375, + "rewards//std": 0.02198582887649536, + "step": 3755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7512, + "grad_norm": 1.9296648502349854, + "kl": 0.4415874779224396, + "learning_rate": 1.481335185648498e-07, + "loss": 0.0177, + "num_tokens": 27353729.0, + "reward": 0.87652587890625, + "reward_std": 0.01797986775636673, + "rewards//mean": 0.87652587890625, + "rewards//std": 0.03163222223520279, + "step": 3756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7514, + "grad_norm": 1.4922016859054565, + "kl": 0.33292729407548904, + "learning_rate": 1.4790813588024581e-07, + "loss": 0.0133, + "num_tokens": 27361033.0, + "reward": 0.826416015625, + "reward_std": 0.011844111606478691, + "rewards//mean": 0.826416015625, + "rewards//std": 0.02410864271223545, + "step": 3757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7516, + "grad_norm": 1.5018061399459839, + "kl": 0.3494676314294338, + "learning_rate": 1.4768289501820263e-07, + "loss": 0.014, + "num_tokens": 27368257.0, + "reward": 0.78070068359375, + "reward_std": 0.010817988775670528, + "rewards//mean": 0.78070068359375, + "rewards//std": 0.018892932683229446, + "step": 3758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7518, + "grad_norm": 1.7470507621765137, + "kl": 0.2940207589417696, + "learning_rate": 1.4745779606944714e-07, + "loss": 0.0118, + "num_tokens": 27375529.0, + "reward": 0.8548583984375, + "reward_std": 0.01204718928784132, + "rewards//mean": 0.8548583984375, + "rewards//std": 0.018768833950161934, + "step": 3759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.752, + "grad_norm": 1.3859783411026, + "kl": 0.37545763328671455, + "learning_rate": 1.472328391246494e-07, + "loss": 0.015, + "num_tokens": 27382897.0, + "reward": 0.852783203125, + "reward_std": 0.014170534908771515, + "rewards//mean": 0.852783203125, + "rewards//std": 0.017467549070715904, + "step": 3760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7522, + "grad_norm": 1.731363296508789, + "kl": 0.35839908570051193, + "learning_rate": 1.4700802427442178e-07, + "loss": 0.0143, + "num_tokens": 27390169.0, + "reward": 0.8250732421875, + "reward_std": 0.013435110449790955, + "rewards//mean": 0.8250732421875, + "rewards//std": 0.020966242998838425, + "step": 3761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7524, + "grad_norm": 1.3870042562484741, + "kl": 0.3253307230770588, + "learning_rate": 1.4678335160931972e-07, + "loss": 0.013, + "num_tokens": 27397433.0, + "reward": 0.8389892578125, + "reward_std": 0.014033574610948563, + "rewards//mean": 0.8389892578125, + "rewards//std": 0.03503180667757988, + "step": 3762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7526, + "grad_norm": 1.5057367086410522, + "kl": 0.3359672874212265, + "learning_rate": 1.4655882121984136e-07, + "loss": 0.0134, + "num_tokens": 27404681.0, + "reward": 0.82427978515625, + "reward_std": 0.012730813585221767, + "rewards//mean": 0.82427978515625, + "rewards//std": 0.02041635662317276, + "step": 3763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7528, + "grad_norm": 1.590192198753357, + "kl": 0.3151465244591236, + "learning_rate": 1.4633443319642792e-07, + "loss": 0.0126, + "num_tokens": 27411929.0, + "reward": 0.8475341796875, + "reward_std": 0.012997681275010109, + "rewards//mean": 0.8475341796875, + "rewards//std": 0.021349729970097542, + "step": 3764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.753, + "grad_norm": 1.8978360891342163, + "kl": 0.37904094718396664, + "learning_rate": 1.4611018762946215e-07, + "loss": 0.0152, + "num_tokens": 27419193.0, + "reward": 0.83526611328125, + "reward_std": 0.01745738834142685, + "rewards//mean": 0.83526611328125, + "rewards//std": 0.025157563388347626, + "step": 3765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7532, + "grad_norm": 1.6982450485229492, + "kl": 0.29219866171479225, + "learning_rate": 1.4588608460927048e-07, + "loss": 0.0117, + "num_tokens": 27426401.0, + "reward": 0.85980224609375, + "reward_std": 0.02075999230146408, + "rewards//mean": 0.85980224609375, + "rewards//std": 0.024271268397569656, + "step": 3766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.7534, + "grad_norm": 1.5570319890975952, + "kl": 0.40574811398983, + "learning_rate": 1.4566212422612156e-07, + "loss": 0.016, + "num_tokens": 27433736.0, + "reward": 0.8623046875, + "reward_std": 0.021105807274580002, + "rewards//mean": 0.8623046875, + "rewards//std": 0.03124224953353405, + "step": 3767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7536, + "grad_norm": 1.6021190881729126, + "kl": 0.31161259673535824, + "learning_rate": 1.4543830657022682e-07, + "loss": 0.0125, + "num_tokens": 27441136.0, + "reward": 0.82958984375, + "reward_std": 0.019122466444969177, + "rewards//mean": 0.82958984375, + "rewards//std": 0.02083914540708065, + "step": 3768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7538, + "grad_norm": 2.179276466369629, + "kl": 0.45578479021787643, + "learning_rate": 1.4521463173173965e-07, + "loss": 0.0182, + "num_tokens": 27448488.0, + "reward": 0.81182861328125, + "reward_std": 0.01327560842037201, + "rewards//mean": 0.81182861328125, + "rewards//std": 0.026083994656801224, + "step": 3769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.754, + "grad_norm": 1.931821584701538, + "kl": 0.40310216322541237, + "learning_rate": 1.4499109980075635e-07, + "loss": 0.0161, + "num_tokens": 27455776.0, + "reward": 0.8585205078125, + "reward_std": 0.02293521910905838, + "rewards//mean": 0.8585205078125, + "rewards//std": 0.03198894485831261, + "step": 3770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7542, + "grad_norm": 1.5176637172698975, + "kl": 0.3044288903474808, + "learning_rate": 1.4476771086731565e-07, + "loss": 0.0122, + "num_tokens": 27463048.0, + "reward": 0.786865234375, + "reward_std": 0.015059889294207096, + "rewards//mean": 0.786865234375, + "rewards//std": 0.01832028478384018, + "step": 3771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7544, + "grad_norm": 1.534124732017517, + "kl": 0.38677918910980225, + "learning_rate": 1.445444650213986e-07, + "loss": 0.0155, + "num_tokens": 27470224.0, + "reward": 0.8702392578125, + "reward_std": 0.01581503264605999, + "rewards//mean": 0.8702392578125, + "rewards//std": 0.023524854332208633, + "step": 3772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7546, + "grad_norm": 1.4843653440475464, + "kl": 0.35733568854629993, + "learning_rate": 1.4432136235292846e-07, + "loss": 0.0143, + "num_tokens": 27477480.0, + "reward": 0.797607421875, + "reward_std": 0.013597934506833553, + "rewards//mean": 0.797607421875, + "rewards//std": 0.016918139532208443, + "step": 3773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7548, + "grad_norm": 1.5389056205749512, + "kl": 0.38294949755072594, + "learning_rate": 1.44098402951771e-07, + "loss": 0.0153, + "num_tokens": 27484720.0, + "reward": 0.8857421875, + "reward_std": 0.016536226496100426, + "rewards//mean": 0.8857421875, + "rewards//std": 0.021821346133947372, + "step": 3774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.755, + "grad_norm": 1.8205980062484741, + "kl": 0.389840304851532, + "learning_rate": 1.4387558690773426e-07, + "loss": 0.0129, + "num_tokens": 27491953.0, + "reward": 0.864990234375, + "reward_std": 0.026912156492471695, + "rewards//mean": 0.864990234375, + "rewards//std": 0.03169538825750351, + "step": 3775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7552, + "grad_norm": 1.5565848350524902, + "kl": 0.3381954189389944, + "learning_rate": 1.436529143105687e-07, + "loss": 0.0135, + "num_tokens": 27499217.0, + "reward": 0.81024169921875, + "reward_std": 0.012782832607626915, + "rewards//mean": 0.81024169921875, + "rewards//std": 0.022062037140130997, + "step": 3776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.7554, + "grad_norm": 1.8241872787475586, + "kl": 0.4339912887662649, + "learning_rate": 1.434303852499664e-07, + "loss": 0.0178, + "num_tokens": 27506479.0, + "reward": 0.8507080078125, + "reward_std": 0.019500266760587692, + "rewards//mean": 0.8507080078125, + "rewards//std": 0.03244751691818237, + "step": 3777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7556, + "grad_norm": 1.7718755006790161, + "kl": 0.4086769111454487, + "learning_rate": 1.432079998155624e-07, + "loss": 0.0163, + "num_tokens": 27513759.0, + "reward": 0.8514404296875, + "reward_std": 0.01952206715941429, + "rewards//mean": 0.8514404296875, + "rewards//std": 0.02704511024057865, + "step": 3778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7558, + "grad_norm": 1.4244626760482788, + "kl": 0.2952350862324238, + "learning_rate": 1.4298575809693353e-07, + "loss": 0.0118, + "num_tokens": 27520999.0, + "reward": 0.82293701171875, + "reward_std": 0.0110394898802042, + "rewards//mean": 0.82293701171875, + "rewards//std": 0.017908191308379173, + "step": 3779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.756, + "grad_norm": 1.4912235736846924, + "kl": 0.3068411499261856, + "learning_rate": 1.4276366018359842e-07, + "loss": 0.0123, + "num_tokens": 27528319.0, + "reward": 0.89398193359375, + "reward_std": 0.010466120205819607, + "rewards//mean": 0.89398193359375, + "rewards//std": 0.02372373268008232, + "step": 3780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7562, + "grad_norm": 1.5371350049972534, + "kl": 0.3349467143416405, + "learning_rate": 1.4254170616501827e-07, + "loss": 0.0134, + "num_tokens": 27535695.0, + "reward": 0.85369873046875, + "reward_std": 0.010932877659797668, + "rewards//mean": 0.85369873046875, + "rewards//std": 0.015701230615377426, + "step": 3781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7564, + "grad_norm": 1.5158377885818481, + "kl": 0.2787307258695364, + "learning_rate": 1.4231989613059614e-07, + "loss": 0.0111, + "num_tokens": 27542975.0, + "reward": 0.84686279296875, + "reward_std": 0.014824917539954185, + "rewards//mean": 0.84686279296875, + "rewards//std": 0.025163577869534492, + "step": 3782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.7566, + "grad_norm": 1.2629040479660034, + "kl": 0.34763046726584435, + "learning_rate": 1.420982301696772e-07, + "loss": 0.0145, + "num_tokens": 27550342.0, + "reward": 0.85638427734375, + "reward_std": 0.01058176625519991, + "rewards//mean": 0.85638427734375, + "rewards//std": 0.012367191724479198, + "step": 3783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7568, + "grad_norm": 1.5861629247665405, + "kl": 0.359266746789217, + "learning_rate": 1.4187670837154824e-07, + "loss": 0.0144, + "num_tokens": 27557630.0, + "reward": 0.8057861328125, + "reward_std": 0.01635625585913658, + "rewards//mean": 0.8057861328125, + "rewards//std": 0.02858208492398262, + "step": 3784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.757, + "grad_norm": 1.7155495882034302, + "kl": 0.36479271203279495, + "learning_rate": 1.4165533082543828e-07, + "loss": 0.0146, + "num_tokens": 27564878.0, + "reward": 0.87762451171875, + "reward_std": 0.014548178762197495, + "rewards//mean": 0.87762451171875, + "rewards//std": 0.0247352235019207, + "step": 3785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7572, + "grad_norm": 1.4808894395828247, + "kl": 0.3382764421403408, + "learning_rate": 1.414340976205183e-07, + "loss": 0.0135, + "num_tokens": 27572150.0, + "reward": 0.86212158203125, + "reward_std": 0.02405422553420067, + "rewards//mean": 0.86212158203125, + "rewards//std": 0.028929198160767555, + "step": 3786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.7574, + "grad_norm": 1.828429937362671, + "kl": 0.30456006340682507, + "learning_rate": 1.4121300884590098e-07, + "loss": -0.0018, + "num_tokens": 27579448.0, + "reward": 0.83258056640625, + "reward_std": 0.015649672597646713, + "rewards//mean": 0.83258056640625, + "rewards//std": 0.02464509755373001, + "step": 3787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7576, + "grad_norm": 1.5465214252471924, + "kl": 0.3410634547472, + "learning_rate": 1.4099206459064062e-07, + "loss": 0.0136, + "num_tokens": 27586696.0, + "reward": 0.80804443359375, + "reward_std": 0.014067983254790306, + "rewards//mean": 0.80804443359375, + "rewards//std": 0.01913495734333992, + "step": 3788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.7578, + "grad_norm": 1.3403202295303345, + "kl": 0.349367655813694, + "learning_rate": 1.4077126494373376e-07, + "loss": 0.0125, + "num_tokens": 27594093.0, + "reward": 0.846923828125, + "reward_std": 0.017244812101125717, + "rewards//mean": 0.846923828125, + "rewards//std": 0.025965990498661995, + "step": 3789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.758, + "grad_norm": 1.538459062576294, + "kl": 0.3386253733187914, + "learning_rate": 1.4055060999411838e-07, + "loss": 0.0135, + "num_tokens": 27601461.0, + "reward": 0.76214599609375, + "reward_std": 0.010209104977548122, + "rewards//mean": 0.76214599609375, + "rewards//std": 0.012374533340334892, + "step": 3790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7582, + "grad_norm": 1.5162514448165894, + "kl": 0.4041747897863388, + "learning_rate": 1.4033009983067452e-07, + "loss": 0.0162, + "num_tokens": 27608725.0, + "reward": 0.8602294921875, + "reward_std": 0.021945606917142868, + "rewards//mean": 0.8602294921875, + "rewards//std": 0.029614219442009926, + "step": 3791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7584, + "grad_norm": 1.3984348773956299, + "kl": 0.3723468668758869, + "learning_rate": 1.4010973454222323e-07, + "loss": 0.0149, + "num_tokens": 27615949.0, + "reward": 0.85797119140625, + "reward_std": 0.014698336832225323, + "rewards//mean": 0.85797119140625, + "rewards//std": 0.02784290723502636, + "step": 3792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7586, + "grad_norm": 1.7284834384918213, + "kl": 0.403568297624588, + "learning_rate": 1.3988951421752788e-07, + "loss": 0.0161, + "num_tokens": 27623293.0, + "reward": 0.8575439453125, + "reward_std": 0.015237477608025074, + "rewards//mean": 0.8575439453125, + "rewards//std": 0.02187642641365528, + "step": 3793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7588, + "grad_norm": 1.4289926290512085, + "kl": 0.32863837480545044, + "learning_rate": 1.396694389452931e-07, + "loss": 0.0131, + "num_tokens": 27630621.0, + "reward": 0.8221435546875, + "reward_std": 0.014220182783901691, + "rewards//mean": 0.8221435546875, + "rewards//std": 0.034329939633607864, + "step": 3794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.759, + "grad_norm": 1.3899377584457397, + "kl": 0.388690460473299, + "learning_rate": 1.394495088141654e-07, + "loss": 0.0045, + "num_tokens": 27637862.0, + "reward": 0.88031005859375, + "reward_std": 0.017135513946413994, + "rewards//mean": 0.88031005859375, + "rewards//std": 0.02885112538933754, + "step": 3795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7592, + "grad_norm": 1.66519296169281, + "kl": 0.3022379595786333, + "learning_rate": 1.3922972391273225e-07, + "loss": 0.0121, + "num_tokens": 27645438.0, + "reward": 0.867431640625, + "reward_std": 0.025731965899467468, + "rewards//mean": 0.867431640625, + "rewards//std": 0.03776923567056656, + "step": 3796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.7594, + "grad_norm": 1.4755865335464478, + "kl": 0.32402994111180305, + "learning_rate": 1.3901008432952322e-07, + "loss": 0.0084, + "num_tokens": 27652697.0, + "reward": 0.82611083984375, + "reward_std": 0.012788424268364906, + "rewards//mean": 0.82611083984375, + "rewards//std": 0.02021966688334942, + "step": 3797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7596, + "grad_norm": 1.6348873376846313, + "kl": 0.31761827506124973, + "learning_rate": 1.3879059015300915e-07, + "loss": 0.0127, + "num_tokens": 27659977.0, + "reward": 0.85540771484375, + "reward_std": 0.016679581254720688, + "rewards//mean": 0.85540771484375, + "rewards//std": 0.026525383815169334, + "step": 3798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.7598, + "grad_norm": 1.544692039489746, + "kl": 0.34861492179334164, + "learning_rate": 1.3857124147160204e-07, + "loss": 0.0157, + "num_tokens": 27667270.0, + "reward": 0.85888671875, + "reward_std": 0.013130750507116318, + "rewards//mean": 0.85888671875, + "rewards//std": 0.022146357223391533, + "step": 3799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.76, + "grad_norm": 1.5070686340332031, + "kl": 0.33456263691186905, + "learning_rate": 1.3835203837365561e-07, + "loss": 0.0068, + "num_tokens": 27674628.0, + "reward": 0.78448486328125, + "reward_std": 0.017010238021612167, + "rewards//mean": 0.78448486328125, + "rewards//std": 0.024275634437799454, + "step": 3800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7602, + "grad_norm": 2.0353124141693115, + "kl": 0.4755947720259428, + "learning_rate": 1.381329809474649e-07, + "loss": 0.019, + "num_tokens": 27681916.0, + "reward": 0.8353271484375, + "reward_std": 0.0159614160656929, + "rewards//mean": 0.8353271484375, + "rewards//std": 0.02122742496430874, + "step": 3801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7604, + "grad_norm": 1.4171980619430542, + "kl": 0.3274041414260864, + "learning_rate": 1.3791406928126635e-07, + "loss": 0.0131, + "num_tokens": 27689212.0, + "reward": 0.86175537109375, + "reward_std": 0.016553420573472977, + "rewards//mean": 0.86175537109375, + "rewards//std": 0.023089947178959846, + "step": 3802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.7606, + "grad_norm": 1.2335703372955322, + "kl": 0.3233592305332422, + "learning_rate": 1.3769530346323721e-07, + "loss": 0.0133, + "num_tokens": 27696416.0, + "reward": 0.84552001953125, + "reward_std": 0.01513120997697115, + "rewards//mean": 0.84552001953125, + "rewards//std": 0.0209137424826622, + "step": 3803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.7608, + "grad_norm": 1.4546520709991455, + "kl": 0.3669327311217785, + "learning_rate": 1.3747668358149656e-07, + "loss": 0.0013, + "num_tokens": 27703682.0, + "reward": 0.852783203125, + "reward_std": 0.015802182257175446, + "rewards//mean": 0.852783203125, + "rewards//std": 0.027338214218616486, + "step": 3804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.761, + "grad_norm": 1.8105212450027466, + "kl": 0.2781612742692232, + "learning_rate": 1.3725820972410434e-07, + "loss": 0.0111, + "num_tokens": 27710954.0, + "reward": 0.86676025390625, + "reward_std": 0.019871406257152557, + "rewards//mean": 0.86676025390625, + "rewards//std": 0.02306370809674263, + "step": 3805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7612, + "grad_norm": 1.8544732332229614, + "kl": 0.3214030973613262, + "learning_rate": 1.3703988197906207e-07, + "loss": 0.0129, + "num_tokens": 27718122.0, + "reward": 0.8526611328125, + "reward_std": 0.010221302509307861, + "rewards//mean": 0.8526611328125, + "rewards//std": 0.020309027284383774, + "step": 3806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7614, + "grad_norm": 1.3811182975769043, + "kl": 0.317978173494339, + "learning_rate": 1.3682170043431173e-07, + "loss": 0.0127, + "num_tokens": 27725458.0, + "reward": 0.81298828125, + "reward_std": 0.014464622363448143, + "rewards//mean": 0.81298828125, + "rewards//std": 0.023963792249560356, + "step": 3807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7616, + "grad_norm": 1.371882438659668, + "kl": 0.32713240943849087, + "learning_rate": 1.3660366517773708e-07, + "loss": 0.0131, + "num_tokens": 27732866.0, + "reward": 0.873046875, + "reward_std": 0.015573517419397831, + "rewards//mean": 0.873046875, + "rewards//std": 0.020856572315096855, + "step": 3808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7618, + "grad_norm": 1.7020536661148071, + "kl": 0.3673033006489277, + "learning_rate": 1.3638577629716263e-07, + "loss": 0.0147, + "num_tokens": 27740138.0, + "reward": 0.7982177734375, + "reward_std": 0.0151829170063138, + "rewards//mean": 0.7982177734375, + "rewards//std": 0.01825862191617489, + "step": 3809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.762, + "grad_norm": 1.4402297735214233, + "kl": 0.3886822573840618, + "learning_rate": 1.3616803388035413e-07, + "loss": 0.0155, + "num_tokens": 27747370.0, + "reward": 0.814697265625, + "reward_std": 0.01937181130051613, + "rewards//mean": 0.814697265625, + "rewards//std": 0.033587511628866196, + "step": 3810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.7622, + "grad_norm": 1.333398699760437, + "kl": 0.2958492860198021, + "learning_rate": 1.3595043801501794e-07, + "loss": -0.0039, + "num_tokens": 27754529.0, + "reward": 0.85467529296875, + "reward_std": 0.01166423037648201, + "rewards//mean": 0.85467529296875, + "rewards//std": 0.022436240687966347, + "step": 3811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.7624, + "grad_norm": 1.7529330253601074, + "kl": 0.3395610935986042, + "learning_rate": 1.3573298878880179e-07, + "loss": 0.004, + "num_tokens": 27761968.0, + "reward": 0.8709716796875, + "reward_std": 0.016647392883896828, + "rewards//mean": 0.8709716796875, + "rewards//std": 0.02656622789800167, + "step": 3812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7626, + "grad_norm": 1.4593547582626343, + "kl": 0.3803436979651451, + "learning_rate": 1.3551568628929432e-07, + "loss": 0.0152, + "num_tokens": 27769296.0, + "reward": 0.86669921875, + "reward_std": 0.0200380589812994, + "rewards//mean": 0.86669921875, + "rewards//std": 0.03226810693740845, + "step": 3813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7628, + "grad_norm": 1.6699892282485962, + "kl": 0.3404366821050644, + "learning_rate": 1.352985306040247e-07, + "loss": 0.0136, + "num_tokens": 27776584.0, + "reward": 0.851318359375, + "reward_std": 0.017974048852920532, + "rewards//mean": 0.851318359375, + "rewards//std": 0.021051626652479172, + "step": 3814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.763, + "grad_norm": 1.5154399871826172, + "kl": 0.4019438475370407, + "learning_rate": 1.3508152182046335e-07, + "loss": 0.0161, + "num_tokens": 27783848.0, + "reward": 0.86419677734375, + "reward_std": 0.01817484200000763, + "rewards//mean": 0.86419677734375, + "rewards//std": 0.025072576478123665, + "step": 3815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7632, + "grad_norm": 1.3035176992416382, + "kl": 0.3015513550490141, + "learning_rate": 1.3486466002602132e-07, + "loss": 0.0121, + "num_tokens": 27791080.0, + "reward": 0.84075927734375, + "reward_std": 0.015107310377061367, + "rewards//mean": 0.84075927734375, + "rewards//std": 0.03272455930709839, + "step": 3816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7634, + "grad_norm": 1.4368853569030762, + "kl": 0.3666229210793972, + "learning_rate": 1.3464794530805073e-07, + "loss": 0.0147, + "num_tokens": 27798408.0, + "reward": 0.8558349609375, + "reward_std": 0.014455081894993782, + "rewards//mean": 0.8558349609375, + "rewards//std": 0.01795767806470394, + "step": 3817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7636, + "grad_norm": 1.502479910850525, + "kl": 0.3372741509228945, + "learning_rate": 1.3443137775384396e-07, + "loss": 0.0135, + "num_tokens": 27805688.0, + "reward": 0.84027099609375, + "reward_std": 0.015184210613369942, + "rewards//mean": 0.84027099609375, + "rewards//std": 0.02394981123507023, + "step": 3818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7638, + "grad_norm": 1.6265783309936523, + "kl": 0.31241401471197605, + "learning_rate": 1.342149574506345e-07, + "loss": 0.0125, + "num_tokens": 27813024.0, + "reward": 0.83636474609375, + "reward_std": 0.013368009589612484, + "rewards//mean": 0.83636474609375, + "rewards//std": 0.02219679392874241, + "step": 3819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.764, + "grad_norm": 1.474609375, + "kl": 0.34200121462345123, + "learning_rate": 1.3399868448559636e-07, + "loss": 0.0137, + "num_tokens": 27820352.0, + "reward": 0.81524658203125, + "reward_std": 0.00997670367360115, + "rewards//mean": 0.81524658203125, + "rewards//std": 0.015131717547774315, + "step": 3820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7642, + "grad_norm": 1.8242197036743164, + "kl": 0.3957312572747469, + "learning_rate": 1.3378255894584462e-07, + "loss": 0.0158, + "num_tokens": 27827640.0, + "reward": 0.8399658203125, + "reward_std": 0.017237218096852303, + "rewards//mean": 0.8399658203125, + "rewards//std": 0.033313047140836716, + "step": 3821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7644, + "grad_norm": 1.9819238185882568, + "kl": 0.36862190812826157, + "learning_rate": 1.335665809184341e-07, + "loss": 0.0147, + "num_tokens": 27835072.0, + "reward": 0.7718505859375, + "reward_std": 0.011016419157385826, + "rewards//mean": 0.7718505859375, + "rewards//std": 0.016641635447740555, + "step": 3822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7646, + "grad_norm": 1.56498122215271, + "kl": 0.37366747856140137, + "learning_rate": 1.3335075049036099e-07, + "loss": 0.0149, + "num_tokens": 27842256.0, + "reward": 0.85650634765625, + "reward_std": 0.01374916173517704, + "rewards//mean": 0.85650634765625, + "rewards//std": 0.018743310123682022, + "step": 3823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7648, + "grad_norm": 1.431947946548462, + "kl": 0.31176597252488136, + "learning_rate": 1.3313506774856175e-07, + "loss": 0.0125, + "num_tokens": 27849608.0, + "reward": 0.837890625, + "reward_std": 0.010298988781869411, + "rewards//mean": 0.837890625, + "rewards//std": 0.016440758481621742, + "step": 3824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.765, + "grad_norm": 1.4950900077819824, + "kl": 0.3030197788029909, + "learning_rate": 1.3291953277991347e-07, + "loss": 0.0121, + "num_tokens": 27856896.0, + "reward": 0.8533935546875, + "reward_std": 0.012722769752144814, + "rewards//mean": 0.8533935546875, + "rewards//std": 0.02023136056959629, + "step": 3825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7652, + "grad_norm": 1.602307915687561, + "kl": 0.3396289423108101, + "learning_rate": 1.327041456712334e-07, + "loss": 0.0136, + "num_tokens": 27864224.0, + "reward": 0.85223388671875, + "reward_std": 0.0183239858597517, + "rewards//mean": 0.85223388671875, + "rewards//std": 0.023883353918790817, + "step": 3826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7654, + "grad_norm": 1.2938390970230103, + "kl": 0.34118177369236946, + "learning_rate": 1.3248890650927945e-07, + "loss": 0.0136, + "num_tokens": 27871544.0, + "reward": 0.821533203125, + "reward_std": 0.015141545794904232, + "rewards//mean": 0.821533203125, + "rewards//std": 0.027213897556066513, + "step": 3827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.7656, + "grad_norm": 1.7664557695388794, + "kl": 0.3695742655545473, + "learning_rate": 1.3227381538075023e-07, + "loss": 0.0002, + "num_tokens": 27878839.0, + "reward": 0.85955810546875, + "reward_std": 0.014063207432627678, + "rewards//mean": 0.85955810546875, + "rewards//std": 0.025604872032999992, + "step": 3828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7658, + "grad_norm": 1.476556420326233, + "kl": 0.361316567286849, + "learning_rate": 1.3205887237228397e-07, + "loss": 0.0145, + "num_tokens": 27886079.0, + "reward": 0.86566162109375, + "reward_std": 0.011100002564489841, + "rewards//mean": 0.86566162109375, + "rewards//std": 0.022224057465791702, + "step": 3829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.766, + "grad_norm": 1.5482349395751953, + "kl": 0.36304468661546707, + "learning_rate": 1.3184407757045995e-07, + "loss": 0.0145, + "num_tokens": 27893303.0, + "reward": 0.83233642578125, + "reward_std": 0.012822691351175308, + "rewards//mean": 0.83233642578125, + "rewards//std": 0.018847206607460976, + "step": 3830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7662, + "grad_norm": 1.6228060722351074, + "kl": 0.37283582985401154, + "learning_rate": 1.3162943106179748e-07, + "loss": 0.0149, + "num_tokens": 27900591.0, + "reward": 0.832275390625, + "reward_std": 0.01239863969385624, + "rewards//mean": 0.832275390625, + "rewards//std": 0.019185619428753853, + "step": 3831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7664, + "grad_norm": 2.020531177520752, + "kl": 0.3056914061307907, + "learning_rate": 1.314149329327563e-07, + "loss": 0.0122, + "num_tokens": 27907807.0, + "reward": 0.85211181640625, + "reward_std": 0.013578057289123535, + "rewards//mean": 0.85211181640625, + "rewards//std": 0.020432662218809128, + "step": 3832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7666, + "grad_norm": 1.4634027481079102, + "kl": 0.308849710971117, + "learning_rate": 1.3120058326973582e-07, + "loss": 0.0124, + "num_tokens": 27915143.0, + "reward": 0.88836669921875, + "reward_std": 0.01515498012304306, + "rewards//mean": 0.88836669921875, + "rewards//std": 0.022464560344815254, + "step": 3833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7668, + "grad_norm": 1.552201509475708, + "kl": 0.4395564552396536, + "learning_rate": 1.3098638215907638e-07, + "loss": 0.0176, + "num_tokens": 27922471.0, + "reward": 0.791748046875, + "reward_std": 0.015972988680005074, + "rewards//mean": 0.791748046875, + "rewards//std": 0.01992868259549141, + "step": 3834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.767, + "grad_norm": 1.499538540840149, + "kl": 0.30121989734470844, + "learning_rate": 1.3077232968705805e-07, + "loss": 0.012, + "num_tokens": 27929671.0, + "reward": 0.8817138671875, + "reward_std": 0.017440946772694588, + "rewards//mean": 0.8817138671875, + "rewards//std": 0.03262617066502571, + "step": 3835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7672, + "grad_norm": 1.412003517150879, + "kl": 0.35298968479037285, + "learning_rate": 1.305584259399013e-07, + "loss": 0.0141, + "num_tokens": 27937079.0, + "reward": 0.845703125, + "reward_std": 0.012399401515722275, + "rewards//mean": 0.845703125, + "rewards//std": 0.02641107141971588, + "step": 3836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7674, + "grad_norm": 1.695290207862854, + "kl": 0.45288775488734245, + "learning_rate": 1.3034467100376622e-07, + "loss": 0.0181, + "num_tokens": 27944343.0, + "reward": 0.79827880859375, + "reward_std": 0.015585361048579216, + "rewards//mean": 0.79827880859375, + "rewards//std": 0.01762617751955986, + "step": 3837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7676, + "grad_norm": 1.653372049331665, + "kl": 0.44928814470767975, + "learning_rate": 1.3013106496475352e-07, + "loss": 0.018, + "num_tokens": 27951679.0, + "reward": 0.828857421875, + "reward_std": 0.014923029579222202, + "rewards//mean": 0.828857421875, + "rewards//std": 0.02262645959854126, + "step": 3838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5625, + "epoch": 0.7678, + "grad_norm": 1.6042916774749756, + "kl": 0.47150181606411934, + "learning_rate": 1.299176079089036e-07, + "loss": -0.0122, + "num_tokens": 27958899.0, + "reward": 0.86029052734375, + "reward_std": 0.013816501945257187, + "rewards//mean": 0.86029052734375, + "rewards//std": 0.026344429701566696, + "step": 3839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.768, + "grad_norm": 1.5490025281906128, + "kl": 0.3069368749856949, + "learning_rate": 1.2970429992219712e-07, + "loss": 0.0123, + "num_tokens": 27966259.0, + "reward": 0.8682861328125, + "reward_std": 0.01350223645567894, + "rewards//mean": 0.8682861328125, + "rewards//std": 0.014440278522670269, + "step": 3840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7682, + "grad_norm": 2.1584739685058594, + "kl": 0.6118620093911886, + "learning_rate": 1.2949114109055414e-07, + "loss": 0.0245, + "num_tokens": 27973499.0, + "reward": 0.8699951171875, + "reward_std": 0.011657914146780968, + "rewards//mean": 0.8699951171875, + "rewards//std": 0.020105265080928802, + "step": 3841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.7684, + "grad_norm": 1.4663320779800415, + "kl": 0.32944056391716003, + "learning_rate": 1.2927813149983525e-07, + "loss": 0.0127, + "num_tokens": 27980794.0, + "reward": 0.88104248046875, + "reward_std": 0.016376448795199394, + "rewards//mean": 0.88104248046875, + "rewards//std": 0.024119237437844276, + "step": 3842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7686, + "grad_norm": 1.607919454574585, + "kl": 0.37169088795781136, + "learning_rate": 1.2906527123584081e-07, + "loss": 0.0149, + "num_tokens": 27988170.0, + "reward": 0.803466796875, + "reward_std": 0.01774621196091175, + "rewards//mean": 0.803466796875, + "rewards//std": 0.025835072621703148, + "step": 3843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7688, + "grad_norm": 2.0036017894744873, + "kl": 0.34822767600417137, + "learning_rate": 1.2885256038431064e-07, + "loss": 0.0139, + "num_tokens": 27995466.0, + "reward": 0.86444091796875, + "reward_std": 0.020651932805776596, + "rewards//mean": 0.86444091796875, + "rewards//std": 0.030794672667980194, + "step": 3844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.769, + "grad_norm": 1.283760905265808, + "kl": 0.280034439638257, + "learning_rate": 1.286399990309247e-07, + "loss": 0.0112, + "num_tokens": 28002858.0, + "reward": 0.87786865234375, + "reward_std": 0.017228003591299057, + "rewards//mean": 0.87786865234375, + "rewards//std": 0.02559363655745983, + "step": 3845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7692, + "grad_norm": 1.5756878852844238, + "kl": 0.35500802658498287, + "learning_rate": 1.284275872613028e-07, + "loss": 0.0142, + "num_tokens": 28010138.0, + "reward": 0.88702392578125, + "reward_std": 0.020262587815523148, + "rewards//mean": 0.88702392578125, + "rewards//std": 0.03301607444882393, + "step": 3846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7694, + "grad_norm": 1.546988844871521, + "kl": 0.3423579875379801, + "learning_rate": 1.2821532516100447e-07, + "loss": 0.0137, + "num_tokens": 28017530.0, + "reward": 0.8599853515625, + "reward_std": 0.017007336020469666, + "rewards//mean": 0.8599853515625, + "rewards//std": 0.021113017573952675, + "step": 3847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7696, + "grad_norm": 1.6860098838806152, + "kl": 0.31238401867449284, + "learning_rate": 1.280032128155285e-07, + "loss": 0.0125, + "num_tokens": 28024826.0, + "reward": 0.89306640625, + "reward_std": 0.012590162456035614, + "rewards//mean": 0.89306640625, + "rewards//std": 0.01945670321583748, + "step": 3848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7698, + "grad_norm": 3.1324081420898438, + "kl": 0.5217940174043179, + "learning_rate": 1.2779125031031414e-07, + "loss": 0.0209, + "num_tokens": 28032122.0, + "reward": 0.84942626953125, + "reward_std": 0.016640959307551384, + "rewards//mean": 0.84942626953125, + "rewards//std": 0.029181359335780144, + "step": 3849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.77, + "grad_norm": 1.4397517442703247, + "kl": 0.4042212478816509, + "learning_rate": 1.2757943773073943e-07, + "loss": 0.0162, + "num_tokens": 28039490.0, + "reward": 0.73779296875, + "reward_std": 0.010709239169955254, + "rewards//mean": 0.73779296875, + "rewards//std": 0.013011530041694641, + "step": 3850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7702, + "grad_norm": 1.6017603874206543, + "kl": 0.3490196131169796, + "learning_rate": 1.2736777516212267e-07, + "loss": 0.014, + "num_tokens": 28046826.0, + "reward": 0.8087158203125, + "reward_std": 0.01562093012034893, + "rewards//mean": 0.8087158203125, + "rewards//std": 0.018122145906090736, + "step": 3851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7704, + "grad_norm": 1.5856043100357056, + "kl": 0.35427340492606163, + "learning_rate": 1.2715626268972167e-07, + "loss": 0.0142, + "num_tokens": 28054050.0, + "reward": 0.857177734375, + "reward_std": 0.01673751510679722, + "rewards//mean": 0.857177734375, + "rewards//std": 0.025102822110056877, + "step": 3852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7706, + "grad_norm": 1.9694403409957886, + "kl": 0.3341976944357157, + "learning_rate": 1.2694490039873333e-07, + "loss": 0.0134, + "num_tokens": 28061434.0, + "reward": 0.8399658203125, + "reward_std": 0.015431800857186317, + "rewards//mean": 0.8399658203125, + "rewards//std": 0.021349729970097542, + "step": 3853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7708, + "grad_norm": 1.527319312095642, + "kl": 0.38241853564977646, + "learning_rate": 1.267336883742945e-07, + "loss": 0.0153, + "num_tokens": 28068650.0, + "reward": 0.8231201171875, + "reward_std": 0.013774987310171127, + "rewards//mean": 0.8231201171875, + "rewards//std": 0.01847292296588421, + "step": 3854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.771, + "grad_norm": 1.4887408018112183, + "kl": 0.3368044290691614, + "learning_rate": 1.2652262670148134e-07, + "loss": 0.0135, + "num_tokens": 28075946.0, + "reward": 0.8402099609375, + "reward_std": 0.013409238308668137, + "rewards//mean": 0.8402099609375, + "rewards//std": 0.023568568751215935, + "step": 3855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7712, + "grad_norm": 1.6293429136276245, + "kl": 0.355529198423028, + "learning_rate": 1.2631171546530966e-07, + "loss": 0.0142, + "num_tokens": 28083130.0, + "reward": 0.86376953125, + "reward_std": 0.01644853688776493, + "rewards//mean": 0.86376953125, + "rewards//std": 0.025558197870850563, + "step": 3856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7714, + "grad_norm": 1.4912395477294922, + "kl": 0.34848423674702644, + "learning_rate": 1.2610095475073413e-07, + "loss": 0.0139, + "num_tokens": 28090490.0, + "reward": 0.865234375, + "reward_std": 0.015017244964838028, + "rewards//mean": 0.865234375, + "rewards//std": 0.01977158710360527, + "step": 3857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7716, + "grad_norm": 1.5268787145614624, + "kl": 0.3420391343533993, + "learning_rate": 1.258903446426493e-07, + "loss": 0.0137, + "num_tokens": 28097810.0, + "reward": 0.8399658203125, + "reward_std": 0.015670426189899445, + "rewards//mean": 0.8399658203125, + "rewards//std": 0.021743163466453552, + "step": 3858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7718, + "grad_norm": 1.2756505012512207, + "kl": 0.3732524383813143, + "learning_rate": 1.2567988522588908e-07, + "loss": 0.0149, + "num_tokens": 28105162.0, + "reward": 0.857421875, + "reward_std": 0.013980794697999954, + "rewards//mean": 0.857421875, + "rewards//std": 0.017510825768113136, + "step": 3859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.772, + "grad_norm": 1.9357774257659912, + "kl": 0.33670302759855986, + "learning_rate": 1.2546957658522618e-07, + "loss": 0.0135, + "num_tokens": 28112410.0, + "reward": 0.8797607421875, + "reward_std": 0.014879605732858181, + "rewards//mean": 0.8797607421875, + "rewards//std": 0.017489634454250336, + "step": 3860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7722, + "grad_norm": 1.5081214904785156, + "kl": 0.3929796349257231, + "learning_rate": 1.2525941880537304e-07, + "loss": 0.0157, + "num_tokens": 28119738.0, + "reward": 0.79168701171875, + "reward_std": 0.01432194747030735, + "rewards//mean": 0.79168701171875, + "rewards//std": 0.016906389966607094, + "step": 3861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7724, + "grad_norm": 1.4507250785827637, + "kl": 0.2843135315924883, + "learning_rate": 1.250494119709812e-07, + "loss": 0.0114, + "num_tokens": 28127010.0, + "reward": 0.83935546875, + "reward_std": 0.014903994277119637, + "rewards//mean": 0.83935546875, + "rewards//std": 0.02489573322236538, + "step": 3862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7726, + "grad_norm": 1.7000818252563477, + "kl": 0.30034296587109566, + "learning_rate": 1.2483955616664148e-07, + "loss": 0.012, + "num_tokens": 28134306.0, + "reward": 0.80841064453125, + "reward_std": 0.018049422651529312, + "rewards//mean": 0.80841064453125, + "rewards//std": 0.022600267082452774, + "step": 3863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7728, + "grad_norm": 1.6091015338897705, + "kl": 0.3810528628528118, + "learning_rate": 1.2462985147688359e-07, + "loss": 0.0152, + "num_tokens": 28141578.0, + "reward": 0.85382080078125, + "reward_std": 0.013706508092582226, + "rewards//mean": 0.85382080078125, + "rewards//std": 0.018012702465057373, + "step": 3864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.773, + "grad_norm": 1.3280606269836426, + "kl": 0.3301904574036598, + "learning_rate": 1.244202979861766e-07, + "loss": 0.0132, + "num_tokens": 28148938.0, + "reward": 0.83740234375, + "reward_std": 0.014394648373126984, + "rewards//mean": 0.83740234375, + "rewards//std": 0.017154494300484657, + "step": 3865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7732, + "grad_norm": 3.1178605556488037, + "kl": 0.6510063111782074, + "learning_rate": 1.2421089577892868e-07, + "loss": 0.026, + "num_tokens": 28156266.0, + "reward": 0.819091796875, + "reward_std": 0.022230342030525208, + "rewards//mean": 0.819091796875, + "rewards//std": 0.025323763489723206, + "step": 3866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7734, + "grad_norm": 1.7767990827560425, + "kl": 0.4083384573459625, + "learning_rate": 1.240016449394871e-07, + "loss": 0.0163, + "num_tokens": 28163578.0, + "reward": 0.84088134765625, + "reward_std": 0.013036160729825497, + "rewards//mean": 0.84088134765625, + "rewards//std": 0.018961714580655098, + "step": 3867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7736, + "grad_norm": 1.5541582107543945, + "kl": 0.34885505214333534, + "learning_rate": 1.2379254555213786e-07, + "loss": 0.014, + "num_tokens": 28170954.0, + "reward": 0.857177734375, + "reward_std": 0.014068367891013622, + "rewards//mean": 0.857177734375, + "rewards//std": 0.018969794735312462, + "step": 3868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7738, + "grad_norm": 1.7181564569473267, + "kl": 0.3699973449110985, + "learning_rate": 1.2358359770110632e-07, + "loss": 0.0148, + "num_tokens": 28178274.0, + "reward": 0.85601806640625, + "reward_std": 0.017079127952456474, + "rewards//mean": 0.85601806640625, + "rewards//std": 0.024258792400360107, + "step": 3869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.774, + "grad_norm": 1.5691840648651123, + "kl": 0.34272802621126175, + "learning_rate": 1.2337480147055658e-07, + "loss": 0.0114, + "num_tokens": 28185510.0, + "reward": 0.8118896484375, + "reward_std": 0.014022035524249077, + "rewards//mean": 0.8118896484375, + "rewards//std": 0.0204131118953228, + "step": 3870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7742, + "grad_norm": 1.7482823133468628, + "kl": 0.3392204847186804, + "learning_rate": 1.2316615694459186e-07, + "loss": 0.0136, + "num_tokens": 28192702.0, + "reward": 0.80389404296875, + "reward_std": 0.013483593240380287, + "rewards//mean": 0.80389404296875, + "rewards//std": 0.015782006084918976, + "step": 3871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7744, + "grad_norm": 1.5632070302963257, + "kl": 0.3762446381151676, + "learning_rate": 1.2295766420725401e-07, + "loss": 0.015, + "num_tokens": 28199966.0, + "reward": 0.83868408203125, + "reward_std": 0.009479718282818794, + "rewards//mean": 0.83868408203125, + "rewards//std": 0.013070584274828434, + "step": 3872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.7746, + "grad_norm": 2.0077781677246094, + "kl": 0.4358008038252592, + "learning_rate": 1.2274932334252386e-07, + "loss": 0.0153, + "num_tokens": 28207236.0, + "reward": 0.879150390625, + "reward_std": 0.016028275713324547, + "rewards//mean": 0.879150390625, + "rewards//std": 0.024831196293234825, + "step": 3873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7748, + "grad_norm": 1.714311122894287, + "kl": 0.3903156854212284, + "learning_rate": 1.225411344343213e-07, + "loss": 0.0156, + "num_tokens": 28214516.0, + "reward": 0.88104248046875, + "reward_std": 0.016094915568828583, + "rewards//mean": 0.88104248046875, + "rewards//std": 0.03385366499423981, + "step": 3874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.775, + "grad_norm": 2.3525166511535645, + "kl": 0.42438020557165146, + "learning_rate": 1.2233309756650455e-07, + "loss": 0.017, + "num_tokens": 28221724.0, + "reward": 0.882080078125, + "reward_std": 0.014632070437073708, + "rewards//mean": 0.882080078125, + "rewards//std": 0.029272064566612244, + "step": 3875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7752, + "grad_norm": 1.2372016906738281, + "kl": 0.2973152585327625, + "learning_rate": 1.2212521282287093e-07, + "loss": 0.0119, + "num_tokens": 28229012.0, + "reward": 0.83245849609375, + "reward_std": 0.01593616046011448, + "rewards//mean": 0.83245849609375, + "rewards//std": 0.023593204095959663, + "step": 3876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.7754, + "grad_norm": 1.5841152667999268, + "kl": 0.4943915419280529, + "learning_rate": 1.219174802871563e-07, + "loss": 0.0196, + "num_tokens": 28236291.0, + "reward": 0.85601806640625, + "reward_std": 0.01202801801264286, + "rewards//mean": 0.85601806640625, + "rewards//std": 0.019084259867668152, + "step": 3877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7756, + "grad_norm": 1.3153761625289917, + "kl": 0.30923181399703026, + "learning_rate": 1.2170990004303566e-07, + "loss": 0.0124, + "num_tokens": 28243523.0, + "reward": 0.8560791015625, + "reward_std": 0.012538122944533825, + "rewards//mean": 0.8560791015625, + "rewards//std": 0.017416777089238167, + "step": 3878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7758, + "grad_norm": 1.8103406429290771, + "kl": 0.41266536340117455, + "learning_rate": 1.2150247217412185e-07, + "loss": 0.0165, + "num_tokens": 28250803.0, + "reward": 0.8409423828125, + "reward_std": 0.012795167043805122, + "rewards//mean": 0.8409423828125, + "rewards//std": 0.015031974762678146, + "step": 3879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.776, + "grad_norm": 1.7092196941375732, + "kl": 0.3383233994245529, + "learning_rate": 1.21295196763967e-07, + "loss": 0.0135, + "num_tokens": 28258139.0, + "reward": 0.86138916015625, + "reward_std": 0.021094590425491333, + "rewards//mean": 0.86138916015625, + "rewards//std": 0.02351994626224041, + "step": 3880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7762, + "grad_norm": 1.5880988836288452, + "kl": 0.3384570386260748, + "learning_rate": 1.2108807389606158e-07, + "loss": 0.0135, + "num_tokens": 28265363.0, + "reward": 0.8868408203125, + "reward_std": 0.023092858493328094, + "rewards//mean": 0.8868408203125, + "rewards//std": 0.03133489936590195, + "step": 3881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7764, + "grad_norm": 1.5284792184829712, + "kl": 0.32035353034734726, + "learning_rate": 1.2088110365383486e-07, + "loss": 0.0128, + "num_tokens": 28272611.0, + "reward": 0.79754638671875, + "reward_std": 0.012915674597024918, + "rewards//mean": 0.79754638671875, + "rewards//std": 0.022793682292103767, + "step": 3882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7766, + "grad_norm": 1.8995261192321777, + "kl": 0.3708248920738697, + "learning_rate": 1.2067428612065406e-07, + "loss": 0.0148, + "num_tokens": 28280019.0, + "reward": 0.799560546875, + "reward_std": 0.010867023840546608, + "rewards//mean": 0.799560546875, + "rewards//std": 0.013900267891585827, + "step": 3883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7768, + "grad_norm": 1.3833818435668945, + "kl": 0.3224822822958231, + "learning_rate": 1.2046762137982547e-07, + "loss": 0.0129, + "num_tokens": 28287315.0, + "reward": 0.77783203125, + "reward_std": 0.01102649699896574, + "rewards//mean": 0.77783203125, + "rewards//std": 0.01884971745312214, + "step": 3884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.777, + "grad_norm": 1.3541109561920166, + "kl": 0.35611770302057266, + "learning_rate": 1.202611095145936e-07, + "loss": 0.0142, + "num_tokens": 28294627.0, + "reward": 0.826416015625, + "reward_std": 0.01748783141374588, + "rewards//mean": 0.826416015625, + "rewards//std": 0.025637442246079445, + "step": 3885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7772, + "grad_norm": 1.3315099477767944, + "kl": 0.3274829015135765, + "learning_rate": 1.2005475060814156e-07, + "loss": 0.0131, + "num_tokens": 28301939.0, + "reward": 0.88885498046875, + "reward_std": 0.016495613381266594, + "rewards//mean": 0.88885498046875, + "rewards//std": 0.024675175547599792, + "step": 3886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7774, + "grad_norm": 1.304465889930725, + "kl": 0.3159266971051693, + "learning_rate": 1.1984854474359042e-07, + "loss": 0.0126, + "num_tokens": 28309227.0, + "reward": 0.859619140625, + "reward_std": 0.012405653484165668, + "rewards//mean": 0.859619140625, + "rewards//std": 0.02346719242632389, + "step": 3887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7776, + "grad_norm": 1.2378754615783691, + "kl": 0.3452496714890003, + "learning_rate": 1.1964249200399995e-07, + "loss": 0.0138, + "num_tokens": 28316547.0, + "reward": 0.85791015625, + "reward_std": 0.01875508949160576, + "rewards//mean": 0.85791015625, + "rewards//std": 0.028446445241570473, + "step": 3888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7778, + "grad_norm": 1.5237433910369873, + "kl": 0.42648518458008766, + "learning_rate": 1.1943659247236837e-07, + "loss": 0.0171, + "num_tokens": 28323835.0, + "reward": 0.74859619140625, + "reward_std": 0.015107712708413601, + "rewards//mean": 0.74859619140625, + "rewards//std": 0.02915177494287491, + "step": 3889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.778, + "grad_norm": 1.6299704313278198, + "kl": 0.3455343618988991, + "learning_rate": 1.192308462316317e-07, + "loss": 0.0138, + "num_tokens": 28331131.0, + "reward": 0.849365234375, + "reward_std": 0.014596818014979362, + "rewards//mean": 0.849365234375, + "rewards//std": 0.023611243814229965, + "step": 3890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.7782, + "grad_norm": 1.6812278032302856, + "kl": 0.36709266901016235, + "learning_rate": 1.1902525336466462e-07, + "loss": 0.0124, + "num_tokens": 28338372.0, + "reward": 0.81243896484375, + "reward_std": 0.015085402876138687, + "rewards//mean": 0.81243896484375, + "rewards//std": 0.01586332730948925, + "step": 3891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7784, + "grad_norm": 1.493705153465271, + "kl": 0.2906009927392006, + "learning_rate": 1.1881981395427993e-07, + "loss": 0.0116, + "num_tokens": 28345636.0, + "reward": 0.841552734375, + "reward_std": 0.010239910334348679, + "rewards//mean": 0.841552734375, + "rewards//std": 0.01741199754178524, + "step": 3892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7786, + "grad_norm": 1.5730774402618408, + "kl": 0.33500207774341106, + "learning_rate": 1.1861452808322874e-07, + "loss": 0.0134, + "num_tokens": 28352988.0, + "reward": 0.85626220703125, + "reward_std": 0.014740861020982265, + "rewards//mean": 0.85626220703125, + "rewards//std": 0.025173204019665718, + "step": 3893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7788, + "grad_norm": 1.6082795858383179, + "kl": 0.37697562761604786, + "learning_rate": 1.1840939583419984e-07, + "loss": 0.0151, + "num_tokens": 28360324.0, + "reward": 0.79254150390625, + "reward_std": 0.018088875338435173, + "rewards//mean": 0.79254150390625, + "rewards//std": 0.02086954191327095, + "step": 3894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.779, + "grad_norm": 1.548880934715271, + "kl": 0.30227937176823616, + "learning_rate": 1.1820441728982072e-07, + "loss": 0.0121, + "num_tokens": 28367580.0, + "reward": 0.8218994140625, + "reward_std": 0.015258073806762695, + "rewards//mean": 0.8218994140625, + "rewards//std": 0.019322030246257782, + "step": 3895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7792, + "grad_norm": 1.7999802827835083, + "kl": 0.3630468286573887, + "learning_rate": 1.1799959253265668e-07, + "loss": 0.0145, + "num_tokens": 28374860.0, + "reward": 0.84466552734375, + "reward_std": 0.014650240540504456, + "rewards//mean": 0.84466552734375, + "rewards//std": 0.01900636777281761, + "step": 3896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7794, + "grad_norm": 1.4511488676071167, + "kl": 0.362642265856266, + "learning_rate": 1.1779492164521116e-07, + "loss": 0.0145, + "num_tokens": 28382084.0, + "reward": 0.8311767578125, + "reward_std": 0.012570744380354881, + "rewards//mean": 0.8311767578125, + "rewards//std": 0.022771505638957024, + "step": 3897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7796, + "grad_norm": 1.5012391805648804, + "kl": 0.35726112499833107, + "learning_rate": 1.1759040470992537e-07, + "loss": 0.0143, + "num_tokens": 28389412.0, + "reward": 0.87921142578125, + "reward_std": 0.017710883170366287, + "rewards//mean": 0.87921142578125, + "rewards//std": 0.02659662440419197, + "step": 3898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7798, + "grad_norm": 1.431710124015808, + "kl": 0.33837465941905975, + "learning_rate": 1.1738604180917888e-07, + "loss": 0.0135, + "num_tokens": 28396740.0, + "reward": 0.85577392578125, + "reward_std": 0.02002131938934326, + "rewards//mean": 0.85577392578125, + "rewards//std": 0.02609095722436905, + "step": 3899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.78, + "grad_norm": 1.795378565788269, + "kl": 0.45122355967760086, + "learning_rate": 1.1718183302528895e-07, + "loss": 0.018, + "num_tokens": 28403972.0, + "reward": 0.86114501953125, + "reward_std": 0.01969079300761223, + "rewards//mean": 0.86114501953125, + "rewards//std": 0.024487968534231186, + "step": 3900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7802, + "grad_norm": 1.565657377243042, + "kl": 0.3220665641129017, + "learning_rate": 1.1697777844051104e-07, + "loss": 0.0129, + "num_tokens": 28411196.0, + "reward": 0.836181640625, + "reward_std": 0.013168256729841232, + "rewards//mean": 0.836181640625, + "rewards//std": 0.017919281497597694, + "step": 3901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7804, + "grad_norm": 1.3970204591751099, + "kl": 0.3444474879652262, + "learning_rate": 1.1677387813703804e-07, + "loss": 0.0138, + "num_tokens": 28418524.0, + "reward": 0.8680419921875, + "reward_std": 0.017073627561330795, + "rewards//mean": 0.8680419921875, + "rewards//std": 0.0302133746445179, + "step": 3902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7806, + "grad_norm": 1.5328527688980103, + "kl": 0.3477213568985462, + "learning_rate": 1.1657013219700106e-07, + "loss": 0.0139, + "num_tokens": 28425844.0, + "reward": 0.81622314453125, + "reward_std": 0.01336820051074028, + "rewards//mean": 0.81622314453125, + "rewards//std": 0.02264309488236904, + "step": 3903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7808, + "grad_norm": 1.7030155658721924, + "kl": 0.37496267817914486, + "learning_rate": 1.1636654070246904e-07, + "loss": 0.015, + "num_tokens": 28433132.0, + "reward": 0.84466552734375, + "reward_std": 0.010446232743561268, + "rewards//mean": 0.84466552734375, + "rewards//std": 0.019183943048119545, + "step": 3904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.781, + "grad_norm": 1.471824049949646, + "kl": 0.3002325091511011, + "learning_rate": 1.1616310373544863e-07, + "loss": 0.012, + "num_tokens": 28440404.0, + "reward": 0.880615234375, + "reward_std": 0.013872135430574417, + "rewards//mean": 0.880615234375, + "rewards//std": 0.020785322412848473, + "step": 3905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7812, + "grad_norm": 1.8775482177734375, + "kl": 0.3841847591102123, + "learning_rate": 1.1595982137788402e-07, + "loss": 0.0154, + "num_tokens": 28447636.0, + "reward": 0.8890380859375, + "reward_std": 0.015123292803764343, + "rewards//mean": 0.8890380859375, + "rewards//std": 0.023517129942774773, + "step": 3906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7814, + "grad_norm": 1.873696208000183, + "kl": 0.29325911588966846, + "learning_rate": 1.1575669371165748e-07, + "loss": 0.0117, + "num_tokens": 28454956.0, + "reward": 0.86663818359375, + "reward_std": 0.01614212989807129, + "rewards//mean": 0.86663818359375, + "rewards//std": 0.03003164567053318, + "step": 3907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7816, + "grad_norm": 1.5961592197418213, + "kl": 0.3809940181672573, + "learning_rate": 1.1555372081858883e-07, + "loss": 0.0152, + "num_tokens": 28462188.0, + "reward": 0.854248046875, + "reward_std": 0.021259326487779617, + "rewards//mean": 0.854248046875, + "rewards//std": 0.03437862917780876, + "step": 3908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7818, + "grad_norm": 1.3719744682312012, + "kl": 0.31420871801674366, + "learning_rate": 1.1535090278043535e-07, + "loss": 0.0126, + "num_tokens": 28469540.0, + "reward": 0.82440185546875, + "reward_std": 0.015637483447790146, + "rewards//mean": 0.82440185546875, + "rewards//std": 0.021434202790260315, + "step": 3909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.782, + "grad_norm": 1.3367769718170166, + "kl": 0.31528887897729874, + "learning_rate": 1.151482396788922e-07, + "loss": 0.0162, + "num_tokens": 28476789.0, + "reward": 0.82843017578125, + "reward_std": 0.01374303363263607, + "rewards//mean": 0.82843017578125, + "rewards//std": 0.017052602022886276, + "step": 3910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7822, + "grad_norm": 1.6278046369552612, + "kl": 0.3096625432372093, + "learning_rate": 1.1494573159559212e-07, + "loss": 0.0124, + "num_tokens": 28484061.0, + "reward": 0.83551025390625, + "reward_std": 0.01269583124667406, + "rewards//mean": 0.83551025390625, + "rewards//std": 0.017732350155711174, + "step": 3911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7824, + "grad_norm": 1.3292229175567627, + "kl": 0.3348894603550434, + "learning_rate": 1.1474337861210543e-07, + "loss": 0.0134, + "num_tokens": 28491269.0, + "reward": 0.8565673828125, + "reward_std": 0.014441858977079391, + "rewards//mean": 0.8565673828125, + "rewards//std": 0.019666800275444984, + "step": 3912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7826, + "grad_norm": 1.367451786994934, + "kl": 0.3608581367880106, + "learning_rate": 1.1454118080993963e-07, + "loss": 0.0144, + "num_tokens": 28498613.0, + "reward": 0.8226318359375, + "reward_std": 0.01306538563221693, + "rewards//mean": 0.8226318359375, + "rewards//std": 0.01649545319378376, + "step": 3913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.7828, + "grad_norm": 1.7805023193359375, + "kl": 0.3344572093337774, + "learning_rate": 1.1433913827054009e-07, + "loss": 0.015, + "num_tokens": 28505926.0, + "reward": 0.86358642578125, + "reward_std": 0.01774214580655098, + "rewards//mean": 0.86358642578125, + "rewards//std": 0.024560803547501564, + "step": 3914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.783, + "grad_norm": 1.4679522514343262, + "kl": 0.37877923250198364, + "learning_rate": 1.1413725107528954e-07, + "loss": 0.0167, + "num_tokens": 28513304.0, + "reward": 0.86724853515625, + "reward_std": 0.02081015706062317, + "rewards//mean": 0.86724853515625, + "rewards//std": 0.02744094282388687, + "step": 3915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7832, + "grad_norm": 1.7114410400390625, + "kl": 0.4462903533130884, + "learning_rate": 1.1393551930550826e-07, + "loss": 0.0179, + "num_tokens": 28520568.0, + "reward": 0.85089111328125, + "reward_std": 0.02000109851360321, + "rewards//mean": 0.85089111328125, + "rewards//std": 0.02359769493341446, + "step": 3916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7834, + "grad_norm": 1.4895734786987305, + "kl": 0.3728749752044678, + "learning_rate": 1.1373394304245349e-07, + "loss": 0.0149, + "num_tokens": 28527912.0, + "reward": 0.85662841796875, + "reward_std": 0.013412190601229668, + "rewards//mean": 0.85662841796875, + "rewards//std": 0.0212668776512146, + "step": 3917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7836, + "grad_norm": 1.622498869895935, + "kl": 0.47164369747042656, + "learning_rate": 1.135325223673203e-07, + "loss": 0.0189, + "num_tokens": 28535256.0, + "reward": 0.82135009765625, + "reward_std": 0.0166726466268301, + "rewards//mean": 0.82135009765625, + "rewards//std": 0.02506774663925171, + "step": 3918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.7838, + "grad_norm": 4.516197681427002, + "kl": 0.7635924052447081, + "learning_rate": 1.1333125736124083e-07, + "loss": 0.0309, + "num_tokens": 28542533.0, + "reward": 0.82220458984375, + "reward_std": 0.014755021780729294, + "rewards//mean": 0.82220458984375, + "rewards//std": 0.026392651721835136, + "step": 3919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.784, + "grad_norm": 1.6328767538070679, + "kl": 0.3497535791248083, + "learning_rate": 1.1313014810528482e-07, + "loss": 0.0145, + "num_tokens": 28549779.0, + "reward": 0.86639404296875, + "reward_std": 0.01506724115461111, + "rewards//mean": 0.86639404296875, + "rewards//std": 0.029815636575222015, + "step": 3920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.7842, + "grad_norm": 1.5177490711212158, + "kl": 0.29300336353480816, + "learning_rate": 1.1292919468045875e-07, + "loss": -0.0204, + "num_tokens": 28557010.0, + "reward": 0.767578125, + "reward_std": 0.013311760500073433, + "rewards//mean": 0.767578125, + "rewards//std": 0.016022957861423492, + "step": 3921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7844, + "grad_norm": 1.5761748552322388, + "kl": 0.33036018908023834, + "learning_rate": 1.1272839716770677e-07, + "loss": 0.0132, + "num_tokens": 28564234.0, + "reward": 0.83770751953125, + "reward_std": 0.016764629632234573, + "rewards//mean": 0.83770751953125, + "rewards//std": 0.02660231478512287, + "step": 3922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7846, + "grad_norm": 1.4436147212982178, + "kl": 0.3550559002906084, + "learning_rate": 1.1252775564791023e-07, + "loss": 0.0142, + "num_tokens": 28571554.0, + "reward": 0.8382568359375, + "reward_std": 0.01719702035188675, + "rewards//mean": 0.8382568359375, + "rewards//std": 0.02168460376560688, + "step": 3923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.125, + "epoch": 0.7848, + "grad_norm": 1.6382489204406738, + "kl": 0.47706497088074684, + "learning_rate": 1.1232727020188726e-07, + "loss": -0.0448, + "num_tokens": 28578834.0, + "reward": 0.8526611328125, + "reward_std": 0.023041460663080215, + "rewards//mean": 0.8526611328125, + "rewards//std": 0.026138851419091225, + "step": 3924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.785, + "grad_norm": 1.512474775314331, + "kl": 0.29883842915296555, + "learning_rate": 1.1212694091039349e-07, + "loss": 0.012, + "num_tokens": 28586146.0, + "reward": 0.83660888671875, + "reward_std": 0.012748703360557556, + "rewards//mean": 0.83660888671875, + "rewards//std": 0.01863071136176586, + "step": 3925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7852, + "grad_norm": 1.2753653526306152, + "kl": 0.40217312052845955, + "learning_rate": 1.1192676785412152e-07, + "loss": 0.0161, + "num_tokens": 28593378.0, + "reward": 0.8529052734375, + "reward_std": 0.013087304309010506, + "rewards//mean": 0.8529052734375, + "rewards//std": 0.019552553072571754, + "step": 3926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7854, + "grad_norm": 1.8730885982513428, + "kl": 0.3600899111479521, + "learning_rate": 1.1172675111370122e-07, + "loss": 0.0144, + "num_tokens": 28600602.0, + "reward": 0.87872314453125, + "reward_std": 0.0229827668517828, + "rewards//mean": 0.87872314453125, + "rewards//std": 0.02812904492020607, + "step": 3927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7856, + "grad_norm": 1.6371207237243652, + "kl": 0.3201641980558634, + "learning_rate": 1.1152689076969896e-07, + "loss": 0.0128, + "num_tokens": 28607818.0, + "reward": 0.8463134765625, + "reward_std": 0.01285461150109768, + "rewards//mean": 0.8463134765625, + "rewards//std": 0.018935849890112877, + "step": 3928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7858, + "grad_norm": 1.2706996202468872, + "kl": 0.38385503366589546, + "learning_rate": 1.1132718690261867e-07, + "loss": 0.0154, + "num_tokens": 28615106.0, + "reward": 0.8046875, + "reward_std": 0.011089330539107323, + "rewards//mean": 0.8046875, + "rewards//std": 0.019174572080373764, + "step": 3929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.786, + "grad_norm": 1.5044848918914795, + "kl": 0.33782844990491867, + "learning_rate": 1.11127639592901e-07, + "loss": 0.0135, + "num_tokens": 28622642.0, + "reward": 0.80029296875, + "reward_std": 0.01675265096127987, + "rewards//mean": 0.80029296875, + "rewards//std": 0.021196382120251656, + "step": 3930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7862, + "grad_norm": 1.7896243333816528, + "kl": 0.40375499427318573, + "learning_rate": 1.1092824892092373e-07, + "loss": 0.0162, + "num_tokens": 28629906.0, + "reward": 0.83984375, + "reward_std": 0.015330873429775238, + "rewards//mean": 0.83984375, + "rewards//std": 0.02273458242416382, + "step": 3931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7864, + "grad_norm": 1.4734973907470703, + "kl": 0.3399321660399437, + "learning_rate": 1.107290149670011e-07, + "loss": 0.0136, + "num_tokens": 28637106.0, + "reward": 0.82037353515625, + "reward_std": 0.014772121794521809, + "rewards//mean": 0.82037353515625, + "rewards//std": 0.022058606147766113, + "step": 3932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.7866, + "grad_norm": 1.4374114274978638, + "kl": 0.40189846977591515, + "learning_rate": 1.1052993781138475e-07, + "loss": 0.015, + "num_tokens": 28644341.0, + "reward": 0.84344482421875, + "reward_std": 0.01913130283355713, + "rewards//mean": 0.84344482421875, + "rewards//std": 0.027763962745666504, + "step": 3933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7868, + "grad_norm": 1.3960051536560059, + "kl": 0.3304452132433653, + "learning_rate": 1.1033101753426282e-07, + "loss": 0.0132, + "num_tokens": 28651749.0, + "reward": 0.86578369140625, + "reward_std": 0.01991131901741028, + "rewards//mean": 0.86578369140625, + "rewards//std": 0.025660976767539978, + "step": 3934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.787, + "grad_norm": 1.451078176498413, + "kl": 0.4389198515564203, + "learning_rate": 1.1013225421576078e-07, + "loss": 0.0176, + "num_tokens": 28659005.0, + "reward": 0.840087890625, + "reward_std": 0.01606581173837185, + "rewards//mean": 0.840087890625, + "rewards//std": 0.026629116386175156, + "step": 3935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.7872, + "grad_norm": 1.390854835510254, + "kl": 0.36727774888277054, + "learning_rate": 1.0993364793593979e-07, + "loss": 0.0162, + "num_tokens": 28666304.0, + "reward": 0.7664794921875, + "reward_std": 0.011568665504455566, + "rewards//mean": 0.7664794921875, + "rewards//std": 0.021737592294812202, + "step": 3936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7874, + "grad_norm": 1.7276718616485596, + "kl": 0.3280388005077839, + "learning_rate": 1.0973519877479876e-07, + "loss": 0.0131, + "num_tokens": 28673624.0, + "reward": 0.8480224609375, + "reward_std": 0.019174907356500626, + "rewards//mean": 0.8480224609375, + "rewards//std": 0.032419513911008835, + "step": 3937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7876, + "grad_norm": 1.8900779485702515, + "kl": 0.3609713576734066, + "learning_rate": 1.09536906812273e-07, + "loss": 0.0144, + "num_tokens": 28680888.0, + "reward": 0.85467529296875, + "reward_std": 0.018681148067116737, + "rewards//mean": 0.85467529296875, + "rewards//std": 0.02492178976535797, + "step": 3938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7878, + "grad_norm": 1.6882387399673462, + "kl": 0.3558784816414118, + "learning_rate": 1.0933877212823461e-07, + "loss": 0.0142, + "num_tokens": 28688104.0, + "reward": 0.8331298828125, + "reward_std": 0.014666084200143814, + "rewards//mean": 0.8331298828125, + "rewards//std": 0.02367878518998623, + "step": 3939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.788, + "grad_norm": 1.5064516067504883, + "kl": 0.37648032791912556, + "learning_rate": 1.0914079480249194e-07, + "loss": 0.0161, + "num_tokens": 28695437.0, + "reward": 0.88531494140625, + "reward_std": 0.019291246309876442, + "rewards//mean": 0.88531494140625, + "rewards//std": 0.02809404395520687, + "step": 3940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7882, + "grad_norm": 1.7747290134429932, + "kl": 0.358098566532135, + "learning_rate": 1.0894297491479043e-07, + "loss": 0.0143, + "num_tokens": 28702637.0, + "reward": 0.8592529296875, + "reward_std": 0.01263901125639677, + "rewards//mean": 0.8592529296875, + "rewards//std": 0.024929456412792206, + "step": 3941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7884, + "grad_norm": 1.525171160697937, + "kl": 0.4246518537402153, + "learning_rate": 1.0874531254481184e-07, + "loss": 0.017, + "num_tokens": 28709909.0, + "reward": 0.83514404296875, + "reward_std": 0.015736792236566544, + "rewards//mean": 0.83514404296875, + "rewards//std": 0.02823861502110958, + "step": 3942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7886, + "grad_norm": 1.5703661441802979, + "kl": 0.3223246894776821, + "learning_rate": 1.0854780777217465e-07, + "loss": 0.0129, + "num_tokens": 28717213.0, + "reward": 0.89276123046875, + "reward_std": 0.016525311395525932, + "rewards//mean": 0.89276123046875, + "rewards//std": 0.01888091117143631, + "step": 3943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7888, + "grad_norm": 2.445601463317871, + "kl": 0.26582788676023483, + "learning_rate": 1.083504606764336e-07, + "loss": 0.0106, + "num_tokens": 28724509.0, + "reward": 0.8302001953125, + "reward_std": 0.017063727602362633, + "rewards//mean": 0.8302001953125, + "rewards//std": 0.022977977991104126, + "step": 3944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.789, + "grad_norm": 1.5662479400634766, + "kl": 0.2991022542119026, + "learning_rate": 1.0815327133708013e-07, + "loss": -0.0107, + "num_tokens": 28731737.0, + "reward": 0.84466552734375, + "reward_std": 0.014866153709590435, + "rewards//mean": 0.84466552734375, + "rewards//std": 0.02470276691019535, + "step": 3945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7892, + "grad_norm": 1.4606257677078247, + "kl": 0.41082843765616417, + "learning_rate": 1.0795623983354213e-07, + "loss": 0.0164, + "num_tokens": 28739049.0, + "reward": 0.88525390625, + "reward_std": 0.022544097155332565, + "rewards//mean": 0.88525390625, + "rewards//std": 0.02470039390027523, + "step": 3946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7894, + "grad_norm": 1.3977965116500854, + "kl": 0.29287050291895866, + "learning_rate": 1.0775936624518395e-07, + "loss": 0.0117, + "num_tokens": 28746377.0, + "reward": 0.73223876953125, + "reward_std": 0.011085880920290947, + "rewards//mean": 0.73223876953125, + "rewards//std": 0.016424983739852905, + "step": 3947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7896, + "grad_norm": 4.509192943572998, + "kl": 0.6362180113792419, + "learning_rate": 1.0756265065130604e-07, + "loss": 0.0254, + "num_tokens": 28753705.0, + "reward": 0.85748291015625, + "reward_std": 0.017061473801732063, + "rewards//mean": 0.85748291015625, + "rewards//std": 0.023887790739536285, + "step": 3948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7898, + "grad_norm": 1.3097034692764282, + "kl": 0.32055095955729485, + "learning_rate": 1.0736609313114548e-07, + "loss": 0.0128, + "num_tokens": 28760897.0, + "reward": 0.85565185546875, + "reward_std": 0.011423088610172272, + "rewards//mean": 0.85565185546875, + "rewards//std": 0.014573194086551666, + "step": 3949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.79, + "grad_norm": 1.5704087018966675, + "kl": 0.2945827301591635, + "learning_rate": 1.0716969376387563e-07, + "loss": 0.0118, + "num_tokens": 28768257.0, + "reward": 0.863525390625, + "reward_std": 0.012670079246163368, + "rewards//mean": 0.863525390625, + "rewards//std": 0.0177426990121603, + "step": 3950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.7902, + "grad_norm": 1.651602029800415, + "kl": 0.3765365816652775, + "learning_rate": 1.0697345262860635e-07, + "loss": 0.0132, + "num_tokens": 28775552.0, + "reward": 0.8411865234375, + "reward_std": 0.016277384012937546, + "rewards//mean": 0.8411865234375, + "rewards//std": 0.023900222033262253, + "step": 3951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7904, + "grad_norm": 1.680754542350769, + "kl": 0.3929339051246643, + "learning_rate": 1.0677736980438318e-07, + "loss": 0.0157, + "num_tokens": 28782872.0, + "reward": 0.8565673828125, + "reward_std": 0.021086938679218292, + "rewards//mean": 0.8565673828125, + "rewards//std": 0.03471231460571289, + "step": 3952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.7906, + "grad_norm": 1.5567240715026855, + "kl": 0.3831760808825493, + "learning_rate": 1.0658144537018842e-07, + "loss": -0.0002, + "num_tokens": 28790176.0, + "reward": 0.8138427734375, + "reward_std": 0.010845763608813286, + "rewards//mean": 0.8138427734375, + "rewards//std": 0.01623646728694439, + "step": 3953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7908, + "grad_norm": 1.9470819234848022, + "kl": 0.4085182324051857, + "learning_rate": 1.0638567940494059e-07, + "loss": 0.0163, + "num_tokens": 28797456.0, + "reward": 0.86419677734375, + "reward_std": 0.012465065345168114, + "rewards//mean": 0.86419677734375, + "rewards//std": 0.018891330808401108, + "step": 3954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.791, + "grad_norm": 1.502442717552185, + "kl": 0.334028922021389, + "learning_rate": 1.0619007198749386e-07, + "loss": 0.0134, + "num_tokens": 28804712.0, + "reward": 0.875, + "reward_std": 0.014862734824419022, + "rewards//mean": 0.875, + "rewards//std": 0.016948532313108444, + "step": 3955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7912, + "grad_norm": 1.3622030019760132, + "kl": 0.3719574213027954, + "learning_rate": 1.0599462319663904e-07, + "loss": 0.0149, + "num_tokens": 28811936.0, + "reward": 0.8397216796875, + "reward_std": 0.01642138510942459, + "rewards//mean": 0.8397216796875, + "rewards//std": 0.02270226553082466, + "step": 3956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7914, + "grad_norm": 1.5726033449172974, + "kl": 0.32778062485158443, + "learning_rate": 1.057993331111029e-07, + "loss": 0.0131, + "num_tokens": 28819192.0, + "reward": 0.87103271484375, + "reward_std": 0.015141322277486324, + "rewards//mean": 0.87103271484375, + "rewards//std": 0.021682072430849075, + "step": 3957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7916, + "grad_norm": 1.634971261024475, + "kl": 0.3724585920572281, + "learning_rate": 1.0560420180954838e-07, + "loss": 0.0149, + "num_tokens": 28826496.0, + "reward": 0.8421630859375, + "reward_std": 0.015521418303251266, + "rewards//mean": 0.8421630859375, + "rewards//std": 0.026435991749167442, + "step": 3958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7918, + "grad_norm": 1.75856351852417, + "kl": 0.346870593726635, + "learning_rate": 1.0540922937057405e-07, + "loss": 0.0139, + "num_tokens": 28833776.0, + "reward": 0.87530517578125, + "reward_std": 0.012026254087686539, + "rewards//mean": 0.87530517578125, + "rewards//std": 0.022264208644628525, + "step": 3959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.792, + "grad_norm": 1.7441824674606323, + "kl": 0.36063390225172043, + "learning_rate": 1.0521441587271496e-07, + "loss": 0.0144, + "num_tokens": 28841088.0, + "reward": 0.890380859375, + "reward_std": 0.019638067111372948, + "rewards//mean": 0.890380859375, + "rewards//std": 0.028072549030184746, + "step": 3960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7922, + "grad_norm": 1.5684216022491455, + "kl": 0.3428411688655615, + "learning_rate": 1.0501976139444191e-07, + "loss": 0.0137, + "num_tokens": 28848392.0, + "reward": 0.815673828125, + "reward_std": 0.01443458441644907, + "rewards//mean": 0.815673828125, + "rewards//std": 0.02261575125157833, + "step": 3961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7924, + "grad_norm": 1.6560554504394531, + "kl": 0.38571103289723396, + "learning_rate": 1.0482526601416186e-07, + "loss": 0.0154, + "num_tokens": 28855720.0, + "reward": 0.89251708984375, + "reward_std": 0.018141109496355057, + "rewards//mean": 0.89251708984375, + "rewards//std": 0.021887755021452904, + "step": 3962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.7926, + "grad_norm": 1.4662222862243652, + "kl": 0.3482392467558384, + "learning_rate": 1.0463092981021732e-07, + "loss": 0.0159, + "num_tokens": 28863147.0, + "reward": 0.85858154296875, + "reward_std": 0.018766067922115326, + "rewards//mean": 0.85858154296875, + "rewards//std": 0.02575342357158661, + "step": 3963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7928, + "grad_norm": 1.5191650390625, + "kl": 0.34770362824201584, + "learning_rate": 1.0443675286088694e-07, + "loss": 0.0139, + "num_tokens": 28870507.0, + "reward": 0.874755859375, + "reward_std": 0.014377930201590061, + "rewards//mean": 0.874755859375, + "rewards//std": 0.025881905108690262, + "step": 3964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.793, + "grad_norm": 1.469210147857666, + "kl": 0.36229852959513664, + "learning_rate": 1.042427352443852e-07, + "loss": 0.0145, + "num_tokens": 28877755.0, + "reward": 0.7838134765625, + "reward_std": 0.011514300480484962, + "rewards//mean": 0.7838134765625, + "rewards//std": 0.01491470169275999, + "step": 3965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7932, + "grad_norm": 1.4160213470458984, + "kl": 0.3863395266234875, + "learning_rate": 1.040488770388625e-07, + "loss": 0.0155, + "num_tokens": 28885107.0, + "reward": 0.804931640625, + "reward_std": 0.012352668680250645, + "rewards//mean": 0.804931640625, + "rewards//std": 0.015684951096773148, + "step": 3966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7934, + "grad_norm": 1.6100516319274902, + "kl": 0.3295171894133091, + "learning_rate": 1.038551783224047e-07, + "loss": 0.0132, + "num_tokens": 28892539.0, + "reward": 0.8173828125, + "reward_std": 0.01427968218922615, + "rewards//mean": 0.8173828125, + "rewards//std": 0.03323320299386978, + "step": 3967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.7936, + "grad_norm": 1.479176640510559, + "kl": 0.3777103088796139, + "learning_rate": 1.0366163917303367e-07, + "loss": -0.0051, + "num_tokens": 28899775.0, + "reward": 0.84173583984375, + "reward_std": 0.019178949296474457, + "rewards//mean": 0.84173583984375, + "rewards//std": 0.023219391703605652, + "step": 3968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7938, + "grad_norm": 1.358857274055481, + "kl": 0.2888956777751446, + "learning_rate": 1.034682596687071e-07, + "loss": 0.0116, + "num_tokens": 28907159.0, + "reward": 0.8634033203125, + "reward_std": 0.013294190168380737, + "rewards//mean": 0.8634033203125, + "rewards//std": 0.02001471072435379, + "step": 3969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.625, + "epoch": 0.794, + "grad_norm": 1.7517415285110474, + "kl": 0.31735413521528244, + "learning_rate": 1.0327503988731795e-07, + "loss": -0.007, + "num_tokens": 28914407.0, + "reward": 0.833984375, + "reward_std": 0.021728768944740295, + "rewards//mean": 0.833984375, + "rewards//std": 0.03041735664010048, + "step": 3970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7942, + "grad_norm": 1.616244912147522, + "kl": 0.3165026940405369, + "learning_rate": 1.0308197990669537e-07, + "loss": 0.0127, + "num_tokens": 28921655.0, + "reward": 0.87835693359375, + "reward_std": 0.01843341439962387, + "rewards//mean": 0.87835693359375, + "rewards//std": 0.03304952755570412, + "step": 3971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.7944, + "grad_norm": 1.648860216140747, + "kl": 0.3466194085776806, + "learning_rate": 1.0288907980460377e-07, + "loss": 0.0148, + "num_tokens": 28928982.0, + "reward": 0.8271484375, + "reward_std": 0.015025174245238304, + "rewards//mean": 0.8271484375, + "rewards//std": 0.020763462409377098, + "step": 3972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7946, + "grad_norm": 1.4071489572525024, + "kl": 0.3395378515124321, + "learning_rate": 1.0269633965874347e-07, + "loss": 0.0136, + "num_tokens": 28936166.0, + "reward": 0.7950439453125, + "reward_std": 0.009402807801961899, + "rewards//mean": 0.7950439453125, + "rewards//std": 0.015216141007840633, + "step": 3973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7948, + "grad_norm": 1.3881512880325317, + "kl": 0.3398167546838522, + "learning_rate": 1.025037595467499e-07, + "loss": 0.0136, + "num_tokens": 28943518.0, + "reward": 0.79327392578125, + "reward_std": 0.010093091055750847, + "rewards//mean": 0.79327392578125, + "rewards//std": 0.016518723219633102, + "step": 3974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.795, + "grad_norm": 1.5933054685592651, + "kl": 0.4079361967742443, + "learning_rate": 1.0231133954619447e-07, + "loss": 0.0163, + "num_tokens": 28950814.0, + "reward": 0.84814453125, + "reward_std": 0.014897291548550129, + "rewards//mean": 0.84814453125, + "rewards//std": 0.018394500017166138, + "step": 3975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7952, + "grad_norm": 1.461019515991211, + "kl": 0.32532784156501293, + "learning_rate": 1.021190797345839e-07, + "loss": 0.013, + "num_tokens": 28958030.0, + "reward": 0.87274169921875, + "reward_std": 0.020450744777917862, + "rewards//mean": 0.87274169921875, + "rewards//std": 0.030264124274253845, + "step": 3976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7954, + "grad_norm": 1.4060077667236328, + "kl": 0.332545617595315, + "learning_rate": 1.0192698018936058e-07, + "loss": 0.0133, + "num_tokens": 28965278.0, + "reward": 0.86181640625, + "reward_std": 0.01848115399479866, + "rewards//mean": 0.86181640625, + "rewards//std": 0.026369770988821983, + "step": 3977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7956, + "grad_norm": 1.3589857816696167, + "kl": 0.35631439089775085, + "learning_rate": 1.0173504098790186e-07, + "loss": 0.0143, + "num_tokens": 28972550.0, + "reward": 0.84283447265625, + "reward_std": 0.016522511839866638, + "rewards//mean": 0.84283447265625, + "rewards//std": 0.029771430417895317, + "step": 3978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7958, + "grad_norm": 1.6091893911361694, + "kl": 0.4103919994086027, + "learning_rate": 1.0154326220752107e-07, + "loss": 0.0164, + "num_tokens": 28979910.0, + "reward": 0.87896728515625, + "reward_std": 0.016728181391954422, + "rewards//mean": 0.87896728515625, + "rewards//std": 0.02766072191298008, + "step": 3979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.796, + "grad_norm": 1.4611760377883911, + "kl": 0.333572119474411, + "learning_rate": 1.0135164392546658e-07, + "loss": 0.0133, + "num_tokens": 28987118.0, + "reward": 0.8216552734375, + "reward_std": 0.01155116781592369, + "rewards//mean": 0.8216552734375, + "rewards//std": 0.016310881823301315, + "step": 3980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7962, + "grad_norm": 1.3270684480667114, + "kl": 0.33087583258748055, + "learning_rate": 1.0116018621892236e-07, + "loss": 0.0132, + "num_tokens": 28994334.0, + "reward": 0.868408203125, + "reward_std": 0.01079337578266859, + "rewards//mean": 0.868408203125, + "rewards//std": 0.016394658014178276, + "step": 3981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7964, + "grad_norm": 1.7798718214035034, + "kl": 0.389729093760252, + "learning_rate": 1.0096888916500734e-07, + "loss": 0.0156, + "num_tokens": 29001726.0, + "reward": 0.8167724609375, + "reward_std": 0.015615573152899742, + "rewards//mean": 0.8167724609375, + "rewards//std": 0.022859087213873863, + "step": 3982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.7966, + "grad_norm": 2.2034730911254883, + "kl": 0.3675316460430622, + "learning_rate": 1.00777752840776e-07, + "loss": 0.015, + "num_tokens": 29008940.0, + "reward": 0.86224365234375, + "reward_std": 0.02385636791586876, + "rewards//mean": 0.86224365234375, + "rewards//std": 0.02665063925087452, + "step": 3983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7968, + "grad_norm": 1.4645555019378662, + "kl": 0.30839747190475464, + "learning_rate": 1.0058677732321824e-07, + "loss": 0.0123, + "num_tokens": 29016300.0, + "reward": 0.84912109375, + "reward_std": 0.014120332896709442, + "rewards//mean": 0.84912109375, + "rewards//std": 0.03343302011489868, + "step": 3984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.797, + "grad_norm": 1.8944969177246094, + "kl": 0.3945834878832102, + "learning_rate": 1.0039596268925865e-07, + "loss": 0.0158, + "num_tokens": 29023700.0, + "reward": 0.86822509765625, + "reward_std": 0.014082114212214947, + "rewards//mean": 0.86822509765625, + "rewards//std": 0.02074294723570347, + "step": 3985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.7972, + "grad_norm": 1.2777612209320068, + "kl": 0.3686863034963608, + "learning_rate": 1.0020530901575752e-07, + "loss": 0.0191, + "num_tokens": 29030957.0, + "reward": 0.85955810546875, + "reward_std": 0.012851662933826447, + "rewards//mean": 0.85955810546875, + "rewards//std": 0.01681121252477169, + "step": 3986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7974, + "grad_norm": 1.247703194618225, + "kl": 0.3522316589951515, + "learning_rate": 1.0001481637951009e-07, + "loss": 0.0141, + "num_tokens": 29038245.0, + "reward": 0.8443603515625, + "reward_std": 0.01501266285777092, + "rewards//mean": 0.8443603515625, + "rewards//std": 0.02369667775928974, + "step": 3987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7976, + "grad_norm": 1.6273187398910522, + "kl": 0.409336194396019, + "learning_rate": 9.982448485724692e-08, + "loss": 0.0164, + "num_tokens": 29045581.0, + "reward": 0.8631591796875, + "reward_std": 0.011813142336905003, + "rewards//mean": 0.8631591796875, + "rewards//std": 0.018505671992897987, + "step": 3988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7978, + "grad_norm": 1.617476224899292, + "kl": 0.2893729005008936, + "learning_rate": 9.963431452563331e-08, + "loss": 0.0116, + "num_tokens": 29052877.0, + "reward": 0.84698486328125, + "reward_std": 0.020152349025011063, + "rewards//mean": 0.84698486328125, + "rewards//std": 0.02291026897728443, + "step": 3989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.798, + "grad_norm": 2.055994987487793, + "kl": 0.38745853677392006, + "learning_rate": 9.944430546126987e-08, + "loss": 0.0155, + "num_tokens": 29060125.0, + "reward": 0.8546142578125, + "reward_std": 0.01798183098435402, + "rewards//mean": 0.8546142578125, + "rewards//std": 0.01934395357966423, + "step": 3990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7982, + "grad_norm": 1.6020395755767822, + "kl": 0.309645127505064, + "learning_rate": 9.92544577406923e-08, + "loss": 0.0124, + "num_tokens": 29067405.0, + "reward": 0.807861328125, + "reward_std": 0.012451669201254845, + "rewards//mean": 0.807861328125, + "rewards//std": 0.01992868259549141, + "step": 3991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7984, + "grad_norm": 2.1577258110046387, + "kl": 0.36871641129255295, + "learning_rate": 9.90647714403714e-08, + "loss": 0.0147, + "num_tokens": 29074637.0, + "reward": 0.86810302734375, + "reward_std": 0.018937332555651665, + "rewards//mean": 0.86810302734375, + "rewards//std": 0.024207567796111107, + "step": 3992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7986, + "grad_norm": 1.408563256263733, + "kl": 0.35744758881628513, + "learning_rate": 9.887524663671243e-08, + "loss": 0.0143, + "num_tokens": 29081901.0, + "reward": 0.77630615234375, + "reward_std": 0.014103198423981667, + "rewards//mean": 0.77630615234375, + "rewards//std": 0.01971321739256382, + "step": 3993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7988, + "grad_norm": 1.5851213932037354, + "kl": 0.29951605200767517, + "learning_rate": 9.868588340605621e-08, + "loss": 0.012, + "num_tokens": 29089213.0, + "reward": 0.85174560546875, + "reward_std": 0.01841546595096588, + "rewards//mean": 0.85174560546875, + "rewards//std": 0.024144329130649567, + "step": 3994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.799, + "grad_norm": 1.3623952865600586, + "kl": 0.31909630075097084, + "learning_rate": 9.849668182467807e-08, + "loss": 0.0128, + "num_tokens": 29096493.0, + "reward": 0.8271484375, + "reward_std": 0.014874465763568878, + "rewards//mean": 0.8271484375, + "rewards//std": 0.02221732959151268, + "step": 3995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7992, + "grad_norm": 1.7546945810317993, + "kl": 0.3172786645591259, + "learning_rate": 9.830764196878871e-08, + "loss": 0.0127, + "num_tokens": 29103893.0, + "reward": 0.83587646484375, + "reward_std": 0.014482852071523666, + "rewards//mean": 0.83587646484375, + "rewards//std": 0.024154985323548317, + "step": 3996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.7994, + "grad_norm": 1.4492746591567993, + "kl": 0.3415291775017977, + "learning_rate": 9.811876391453294e-08, + "loss": 0.0129, + "num_tokens": 29111168.0, + "reward": 0.86865234375, + "reward_std": 0.017546415328979492, + "rewards//mean": 0.86865234375, + "rewards//std": 0.02359713427722454, + "step": 3997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.7996, + "grad_norm": 1.5276652574539185, + "kl": 0.33049282245337963, + "learning_rate": 9.793004773799102e-08, + "loss": 0.0132, + "num_tokens": 29118424.0, + "reward": 0.85638427734375, + "reward_std": 0.013910835608839989, + "rewards//mean": 0.85638427734375, + "rewards//std": 0.019459716975688934, + "step": 3998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.7998, + "grad_norm": 1.3987878561019897, + "kl": 0.3054155595600605, + "learning_rate": 9.774149351517774e-08, + "loss": 0.0121, + "num_tokens": 29125714.0, + "reward": 0.85595703125, + "reward_std": 0.016958769410848618, + "rewards//mean": 0.85595703125, + "rewards//std": 0.027031954377889633, + "step": 3999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8, + "grad_norm": 2.135067939758301, + "kl": 0.3523110318928957, + "learning_rate": 9.755310132204297e-08, + "loss": 0.0141, + "num_tokens": 29132994.0, + "reward": 0.8212890625, + "reward_std": 0.013080352917313576, + "rewards//mean": 0.8212890625, + "rewards//std": 0.016022957861423492, + "step": 4000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8002, + "grad_norm": 1.4587428569793701, + "kl": 0.3243343885987997, + "learning_rate": 9.736487123447068e-08, + "loss": 0.013, + "num_tokens": 29140314.0, + "reward": 0.8519287109375, + "reward_std": 0.013727152720093727, + "rewards//mean": 0.8519287109375, + "rewards//std": 0.02393566630780697, + "step": 4001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8004, + "grad_norm": 1.5289596319198608, + "kl": 0.31710324063897133, + "learning_rate": 9.717680332828015e-08, + "loss": 0.0127, + "num_tokens": 29147578.0, + "reward": 0.805908203125, + "reward_std": 0.010683877393603325, + "rewards//mean": 0.805908203125, + "rewards//std": 0.013671875931322575, + "step": 4002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8006, + "grad_norm": 1.7354755401611328, + "kl": 0.4241904951632023, + "learning_rate": 9.698889767922514e-08, + "loss": 0.017, + "num_tokens": 29154802.0, + "reward": 0.8017578125, + "reward_std": 0.019585680216550827, + "rewards//mean": 0.8017578125, + "rewards//std": 0.027374735102057457, + "step": 4003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8008, + "grad_norm": 1.762416124343872, + "kl": 0.31606124341487885, + "learning_rate": 9.680115436299385e-08, + "loss": 0.0126, + "num_tokens": 29162138.0, + "reward": 0.82281494140625, + "reward_std": 0.018070759251713753, + "rewards//mean": 0.82281494140625, + "rewards//std": 0.021936113014817238, + "step": 4004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.801, + "grad_norm": 1.9562832117080688, + "kl": 0.33096628822386265, + "learning_rate": 9.661357345520937e-08, + "loss": 0.0132, + "num_tokens": 29169490.0, + "reward": 0.83642578125, + "reward_std": 0.01788272336125374, + "rewards//mean": 0.83642578125, + "rewards//std": 0.025272304192185402, + "step": 4005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8012, + "grad_norm": 1.332939624786377, + "kl": 0.34954906068742275, + "learning_rate": 9.642615503142926e-08, + "loss": 0.014, + "num_tokens": 29176738.0, + "reward": 0.87200927734375, + "reward_std": 0.01470387913286686, + "rewards//mean": 0.87200927734375, + "rewards//std": 0.023362377658486366, + "step": 4006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8014, + "grad_norm": 1.4373011589050293, + "kl": 0.32684989646077156, + "learning_rate": 9.623889916714578e-08, + "loss": 0.0131, + "num_tokens": 29184026.0, + "reward": 0.8546142578125, + "reward_std": 0.012430232018232346, + "rewards//mean": 0.8546142578125, + "rewards//std": 0.018900642171502113, + "step": 4007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8016, + "grad_norm": 1.512559175491333, + "kl": 0.33992309495806694, + "learning_rate": 9.605180593778527e-08, + "loss": 0.0136, + "num_tokens": 29191330.0, + "reward": 0.81097412109375, + "reward_std": 0.015329939313232899, + "rewards//mean": 0.81097412109375, + "rewards//std": 0.02760923095047474, + "step": 4008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8018, + "grad_norm": 1.6342332363128662, + "kl": 0.328130716457963, + "learning_rate": 9.586487541870907e-08, + "loss": 0.0131, + "num_tokens": 29198682.0, + "reward": 0.86859130859375, + "reward_std": 0.015528233721852303, + "rewards//mean": 0.86859130859375, + "rewards//std": 0.023513510823249817, + "step": 4009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.802, + "grad_norm": 1.5103495121002197, + "kl": 0.29404704086482525, + "learning_rate": 9.567810768521267e-08, + "loss": 0.0118, + "num_tokens": 29205994.0, + "reward": 0.8480224609375, + "reward_std": 0.024138744920492172, + "rewards//mean": 0.8480224609375, + "rewards//std": 0.03307041898369789, + "step": 4010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8022, + "grad_norm": 1.615199327468872, + "kl": 0.37356414645910263, + "learning_rate": 9.549150281252632e-08, + "loss": 0.0149, + "num_tokens": 29213298.0, + "reward": 0.87677001953125, + "reward_std": 0.016417548060417175, + "rewards//mean": 0.87677001953125, + "rewards//std": 0.02917305752635002, + "step": 4011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.8024, + "grad_norm": 1.4725537300109863, + "kl": 0.37834105640649796, + "learning_rate": 9.530506087581408e-08, + "loss": 0.0172, + "num_tokens": 29220538.0, + "reward": 0.86639404296875, + "reward_std": 0.01716620847582817, + "rewards//mean": 0.86639404296875, + "rewards//std": 0.023726923391222954, + "step": 4012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8026, + "grad_norm": 1.414056420326233, + "kl": 0.3745747096836567, + "learning_rate": 9.511878195017498e-08, + "loss": 0.015, + "num_tokens": 29227746.0, + "reward": 0.69647216796875, + "reward_std": 0.011955620720982552, + "rewards//mean": 0.69647216796875, + "rewards//std": 0.022635070607066154, + "step": 4013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8028, + "grad_norm": 1.4026436805725098, + "kl": 0.3168282303959131, + "learning_rate": 9.493266611064205e-08, + "loss": 0.0127, + "num_tokens": 29235130.0, + "reward": 0.8568115234375, + "reward_std": 0.015675274655222893, + "rewards//mean": 0.8568115234375, + "rewards//std": 0.019290665164589882, + "step": 4014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.803, + "grad_norm": 1.5331435203552246, + "kl": 0.3006092309951782, + "learning_rate": 9.474671343218293e-08, + "loss": 0.0113, + "num_tokens": 29242430.0, + "reward": 0.81170654296875, + "reward_std": 0.01004455704241991, + "rewards//mean": 0.81170654296875, + "rewards//std": 0.018204964697360992, + "step": 4015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8032, + "grad_norm": 1.8348156213760376, + "kl": 0.36586709320545197, + "learning_rate": 9.4560923989699e-08, + "loss": 0.0146, + "num_tokens": 29249766.0, + "reward": 0.82672119140625, + "reward_std": 0.012936081737279892, + "rewards//mean": 0.82672119140625, + "rewards//std": 0.0233065877109766, + "step": 4016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8034, + "grad_norm": 1.462799072265625, + "kl": 0.4462288208305836, + "learning_rate": 9.437529785802644e-08, + "loss": 0.0178, + "num_tokens": 29256894.0, + "reward": 0.85052490234375, + "reward_std": 0.016644584015011787, + "rewards//mean": 0.85052490234375, + "rewards//std": 0.022088779136538506, + "step": 4017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8036, + "grad_norm": 1.4367918968200684, + "kl": 0.28982184641063213, + "learning_rate": 9.41898351119355e-08, + "loss": 0.0116, + "num_tokens": 29264126.0, + "reward": 0.840576171875, + "reward_std": 0.013386214151978493, + "rewards//mean": 0.840576171875, + "rewards//std": 0.017244266346096992, + "step": 4018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8038, + "grad_norm": 2.109316110610962, + "kl": 0.32935092598199844, + "learning_rate": 9.400453582613033e-08, + "loss": 0.0132, + "num_tokens": 29271518.0, + "reward": 0.85784912109375, + "reward_std": 0.014271185733377934, + "rewards//mean": 0.85784912109375, + "rewards//std": 0.020382221788167953, + "step": 4019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.804, + "grad_norm": 1.6226083040237427, + "kl": 0.3399031050503254, + "learning_rate": 9.381940007524974e-08, + "loss": 0.0136, + "num_tokens": 29278694.0, + "reward": 0.84796142578125, + "reward_std": 0.016645364463329315, + "rewards//mean": 0.84796142578125, + "rewards//std": 0.020603088662028313, + "step": 4020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8042, + "grad_norm": 1.9422663450241089, + "kl": 0.3925927300006151, + "learning_rate": 9.363442793386606e-08, + "loss": 0.0157, + "num_tokens": 29286054.0, + "reward": 0.82965087890625, + "reward_std": 0.012540956027805805, + "rewards//mean": 0.82965087890625, + "rewards//std": 0.016107644885778427, + "step": 4021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8044, + "grad_norm": 1.5588078498840332, + "kl": 0.31506332382559776, + "learning_rate": 9.344961947648622e-08, + "loss": 0.0126, + "num_tokens": 29293334.0, + "reward": 0.85174560546875, + "reward_std": 0.0134055744856596, + "rewards//mean": 0.85174560546875, + "rewards//std": 0.027554897591471672, + "step": 4022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.8046, + "grad_norm": 1.374786376953125, + "kl": 0.36646593734622, + "learning_rate": 9.326497477755113e-08, + "loss": 0.0161, + "num_tokens": 29300648.0, + "reward": 0.86981201171875, + "reward_std": 0.015178349800407887, + "rewards//mean": 0.86981201171875, + "rewards//std": 0.019187888130545616, + "step": 4023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8048, + "grad_norm": 1.4833754301071167, + "kl": 0.35235616378486156, + "learning_rate": 9.308049391143547e-08, + "loss": 0.0141, + "num_tokens": 29308120.0, + "reward": 0.86102294921875, + "reward_std": 0.015533100813627243, + "rewards//mean": 0.86102294921875, + "rewards//std": 0.01981814019382, + "step": 4024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.805, + "grad_norm": 1.7267820835113525, + "kl": 0.3149067461490631, + "learning_rate": 9.289617695244817e-08, + "loss": 0.0126, + "num_tokens": 29315368.0, + "reward": 0.83477783203125, + "reward_std": 0.014528170228004456, + "rewards//mean": 0.83477783203125, + "rewards//std": 0.01640654169023037, + "step": 4025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8052, + "grad_norm": 1.5061668157577515, + "kl": 0.38484643027186394, + "learning_rate": 9.271202397483213e-08, + "loss": 0.0154, + "num_tokens": 29322704.0, + "reward": 0.8380126953125, + "reward_std": 0.013345549814403057, + "rewards//mean": 0.8380126953125, + "rewards//std": 0.02212960459291935, + "step": 4026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8054, + "grad_norm": 1.8263343572616577, + "kl": 0.4530286453664303, + "learning_rate": 9.25280350527643e-08, + "loss": 0.0181, + "num_tokens": 29330040.0, + "reward": 0.86016845703125, + "reward_std": 0.017963070422410965, + "rewards//mean": 0.86016845703125, + "rewards//std": 0.023346174508333206, + "step": 4027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8056, + "grad_norm": 1.5556817054748535, + "kl": 0.3215796519070864, + "learning_rate": 9.234421026035505e-08, + "loss": 0.0129, + "num_tokens": 29337296.0, + "reward": 0.8966064453125, + "reward_std": 0.018503602594137192, + "rewards//mean": 0.8966064453125, + "rewards//std": 0.023683898150920868, + "step": 4028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8058, + "grad_norm": 1.5326390266418457, + "kl": 0.39177293330430984, + "learning_rate": 9.216054967164916e-08, + "loss": 0.0157, + "num_tokens": 29344568.0, + "reward": 0.86688232421875, + "reward_std": 0.022303424775600433, + "rewards//mean": 0.86688232421875, + "rewards//std": 0.028546778485178947, + "step": 4029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.806, + "grad_norm": 1.6529861688613892, + "kl": 0.3960293307900429, + "learning_rate": 9.197705336062516e-08, + "loss": 0.0158, + "num_tokens": 29351944.0, + "reward": 0.86474609375, + "reward_std": 0.016423877328634262, + "rewards//mean": 0.86474609375, + "rewards//std": 0.02145758457481861, + "step": 4030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8062, + "grad_norm": 1.9677897691726685, + "kl": 0.3480537496507168, + "learning_rate": 9.179372140119524e-08, + "loss": 0.0139, + "num_tokens": 29359264.0, + "reward": 0.81671142578125, + "reward_std": 0.010880688205361366, + "rewards//mean": 0.81671142578125, + "rewards//std": 0.01383791770786047, + "step": 4031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.8064, + "grad_norm": 1.6738923788070679, + "kl": 0.3848317824304104, + "learning_rate": 9.161055386720545e-08, + "loss": 0.0157, + "num_tokens": 29366543.0, + "reward": 0.85809326171875, + "reward_std": 0.014519911259412766, + "rewards//mean": 0.85809326171875, + "rewards//std": 0.025535015389323235, + "step": 4032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8066, + "grad_norm": 1.5317248106002808, + "kl": 0.36294709146022797, + "learning_rate": 9.142755083243575e-08, + "loss": 0.0145, + "num_tokens": 29373943.0, + "reward": 0.86328125, + "reward_std": 0.0155613599345088, + "rewards//mean": 0.86328125, + "rewards//std": 0.020255664363503456, + "step": 4033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8068, + "grad_norm": 1.7571994066238403, + "kl": 0.3547169528901577, + "learning_rate": 9.124471237059989e-08, + "loss": 0.0142, + "num_tokens": 29381167.0, + "reward": 0.86029052734375, + "reward_std": 0.014634073711931705, + "rewards//mean": 0.86029052734375, + "rewards//std": 0.026224639266729355, + "step": 4034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.807, + "grad_norm": 1.5464346408843994, + "kl": 0.3061304669827223, + "learning_rate": 9.106203855534478e-08, + "loss": 0.0122, + "num_tokens": 29388391.0, + "reward": 0.8167724609375, + "reward_std": 0.01092648133635521, + "rewards//mean": 0.8167724609375, + "rewards//std": 0.022148748859763145, + "step": 4035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8072, + "grad_norm": 1.825700283050537, + "kl": 0.4843115769326687, + "learning_rate": 9.087952946025174e-08, + "loss": 0.0194, + "num_tokens": 29395639.0, + "reward": 0.83660888671875, + "reward_std": 0.01717470958828926, + "rewards//mean": 0.83660888671875, + "rewards//std": 0.021036429330706596, + "step": 4036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8074, + "grad_norm": 2.249666929244995, + "kl": 0.45881129428744316, + "learning_rate": 9.069718515883524e-08, + "loss": 0.0184, + "num_tokens": 29402959.0, + "reward": 0.8758544921875, + "reward_std": 0.01726211979985237, + "rewards//mean": 0.8758544921875, + "rewards//std": 0.02444380708038807, + "step": 4037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8076, + "grad_norm": 1.5668352842330933, + "kl": 0.43363138660788536, + "learning_rate": 9.051500572454373e-08, + "loss": 0.0173, + "num_tokens": 29410295.0, + "reward": 0.84698486328125, + "reward_std": 0.016613787040114403, + "rewards//mean": 0.84698486328125, + "rewards//std": 0.024837825447320938, + "step": 4038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8078, + "grad_norm": 1.7633342742919922, + "kl": 0.3147842325270176, + "learning_rate": 9.033299123075883e-08, + "loss": 0.0126, + "num_tokens": 29417543.0, + "reward": 0.8773193359375, + "reward_std": 0.015306171961128712, + "rewards//mean": 0.8773193359375, + "rewards//std": 0.017520764842629433, + "step": 4039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.808, + "grad_norm": 1.5156644582748413, + "kl": 0.37166091054677963, + "learning_rate": 9.015114175079613e-08, + "loss": 0.0149, + "num_tokens": 29424831.0, + "reward": 0.82757568359375, + "reward_std": 0.017196418717503548, + "rewards//mean": 0.82757568359375, + "rewards//std": 0.02352895587682724, + "step": 4040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8082, + "grad_norm": 1.5108674764633179, + "kl": 0.3399793077260256, + "learning_rate": 8.996945735790446e-08, + "loss": 0.0136, + "num_tokens": 29432119.0, + "reward": 0.8756103515625, + "reward_std": 0.018779566511511803, + "rewards//mean": 0.8756103515625, + "rewards//std": 0.03075956553220749, + "step": 4041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8084, + "grad_norm": 1.4036959409713745, + "kl": 0.32833194732666016, + "learning_rate": 8.978793812526647e-08, + "loss": 0.0131, + "num_tokens": 29439319.0, + "reward": 0.8724365234375, + "reward_std": 0.012182984501123428, + "rewards//mean": 0.8724365234375, + "rewards//std": 0.02241506241261959, + "step": 4042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8086, + "grad_norm": 1.7700645923614502, + "kl": 0.3605250045657158, + "learning_rate": 8.960658412599781e-08, + "loss": 0.0144, + "num_tokens": 29446647.0, + "reward": 0.80596923828125, + "reward_std": 0.013495380990207195, + "rewards//mean": 0.80596923828125, + "rewards//std": 0.016558995470404625, + "step": 4043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8088, + "grad_norm": 1.6948775053024292, + "kl": 0.3622147664427757, + "learning_rate": 8.942539543314798e-08, + "loss": 0.0145, + "num_tokens": 29453983.0, + "reward": 0.820556640625, + "reward_std": 0.01657123863697052, + "rewards//mean": 0.820556640625, + "rewards//std": 0.022508401423692703, + "step": 4044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.809, + "grad_norm": 1.420899748802185, + "kl": 0.3230245281010866, + "learning_rate": 8.924437211969981e-08, + "loss": 0.0129, + "num_tokens": 29461295.0, + "reward": 0.82843017578125, + "reward_std": 0.013720327988266945, + "rewards//mean": 0.82843017578125, + "rewards//std": 0.023450979962944984, + "step": 4045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8092, + "grad_norm": 1.5133715867996216, + "kl": 0.3886445723474026, + "learning_rate": 8.90635142585695e-08, + "loss": 0.0155, + "num_tokens": 29468543.0, + "reward": 0.845947265625, + "reward_std": 0.017737004905939102, + "rewards//mean": 0.845947265625, + "rewards//std": 0.02722279727458954, + "step": 4046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8094, + "grad_norm": 1.856576919555664, + "kl": 0.3784852959215641, + "learning_rate": 8.888282192260643e-08, + "loss": 0.0151, + "num_tokens": 29475863.0, + "reward": 0.8651123046875, + "reward_std": 0.016636978834867477, + "rewards//mean": 0.8651123046875, + "rewards//std": 0.01932516321539879, + "step": 4047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8096, + "grad_norm": 1.5376306772232056, + "kl": 0.3248175475746393, + "learning_rate": 8.870229518459349e-08, + "loss": 0.013, + "num_tokens": 29483255.0, + "reward": 0.8682861328125, + "reward_std": 0.01662476919591427, + "rewards//mean": 0.8682861328125, + "rewards//std": 0.021000871434807777, + "step": 4048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8098, + "grad_norm": 1.3211756944656372, + "kl": 0.3073658440262079, + "learning_rate": 8.8521934117247e-08, + "loss": 0.0123, + "num_tokens": 29490575.0, + "reward": 0.8480224609375, + "reward_std": 0.013291878625750542, + "rewards//mean": 0.8480224609375, + "rewards//std": 0.021673431620001793, + "step": 4049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.81, + "grad_norm": 1.4569898843765259, + "kl": 0.3187287300825119, + "learning_rate": 8.834173879321615e-08, + "loss": 0.0127, + "num_tokens": 29497999.0, + "reward": 0.83148193359375, + "reward_std": 0.016650080680847168, + "rewards//mean": 0.83148193359375, + "rewards//std": 0.030527574941515923, + "step": 4050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8102, + "grad_norm": 1.6321340799331665, + "kl": 0.3339620288461447, + "learning_rate": 8.816170928508365e-08, + "loss": 0.0134, + "num_tokens": 29505287.0, + "reward": 0.86181640625, + "reward_std": 0.015079575590789318, + "rewards//mean": 0.86181640625, + "rewards//std": 0.01999692991375923, + "step": 4051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8104, + "grad_norm": 1.4220914840698242, + "kl": 0.3243557885289192, + "learning_rate": 8.798184566536538e-08, + "loss": 0.013, + "num_tokens": 29512591.0, + "reward": 0.873291015625, + "reward_std": 0.01945408433675766, + "rewards//mean": 0.873291015625, + "rewards//std": 0.03218073025345802, + "step": 4052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8106, + "grad_norm": 1.7433621883392334, + "kl": 0.38570239767432213, + "learning_rate": 8.780214800651059e-08, + "loss": 0.0154, + "num_tokens": 29519911.0, + "reward": 0.82763671875, + "reward_std": 0.015539292246103287, + "rewards//mean": 0.82763671875, + "rewards//std": 0.02312016673386097, + "step": 4053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8108, + "grad_norm": 1.4249589443206787, + "kl": 0.35746049508452415, + "learning_rate": 8.762261638090112e-08, + "loss": 0.0143, + "num_tokens": 29527247.0, + "reward": 0.88134765625, + "reward_std": 0.01964564621448517, + "rewards//mean": 0.88134765625, + "rewards//std": 0.030973635613918304, + "step": 4054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.811, + "grad_norm": 1.5641109943389893, + "kl": 0.37428586930036545, + "learning_rate": 8.744325086085247e-08, + "loss": 0.015, + "num_tokens": 29534551.0, + "reward": 0.85675048828125, + "reward_std": 0.01298134308308363, + "rewards//mean": 0.85675048828125, + "rewards//std": 0.01921311765909195, + "step": 4055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8112, + "grad_norm": 1.4449235200881958, + "kl": 0.3734564073383808, + "learning_rate": 8.726405151861299e-08, + "loss": 0.0149, + "num_tokens": 29541927.0, + "reward": 0.885009765625, + "reward_std": 0.017217546701431274, + "rewards//mean": 0.885009765625, + "rewards//std": 0.02055094949901104, + "step": 4056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8114, + "grad_norm": 1.5001360177993774, + "kl": 0.2946987710893154, + "learning_rate": 8.70850184263644e-08, + "loss": 0.0118, + "num_tokens": 29549183.0, + "reward": 0.84796142578125, + "reward_std": 0.010441591963171959, + "rewards//mean": 0.84796142578125, + "rewards//std": 0.017898043617606163, + "step": 4057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8116, + "grad_norm": 1.5520904064178467, + "kl": 0.3271983079612255, + "learning_rate": 8.690615165622083e-08, + "loss": 0.0131, + "num_tokens": 29556447.0, + "reward": 0.843017578125, + "reward_std": 0.016872640699148178, + "rewards//mean": 0.843017578125, + "rewards//std": 0.025428753346204758, + "step": 4058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.8118, + "grad_norm": 1.7050447463989258, + "kl": 0.3372139297425747, + "learning_rate": 8.672745128022996e-08, + "loss": 0.0158, + "num_tokens": 29563796.0, + "reward": 0.84173583984375, + "reward_std": 0.020819325000047684, + "rewards//mean": 0.84173583984375, + "rewards//std": 0.025505950674414635, + "step": 4059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.812, + "grad_norm": 1.5358121395111084, + "kl": 0.3630470968782902, + "learning_rate": 8.654891737037235e-08, + "loss": 0.0145, + "num_tokens": 29571052.0, + "reward": 0.84356689453125, + "reward_std": 0.02034541592001915, + "rewards//mean": 0.84356689453125, + "rewards//std": 0.02834348753094673, + "step": 4060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8122, + "grad_norm": 1.480872631072998, + "kl": 0.41383666545152664, + "learning_rate": 8.637054999856147e-08, + "loss": 0.0166, + "num_tokens": 29578292.0, + "reward": 0.86834716796875, + "reward_std": 0.010935262776911259, + "rewards//mean": 0.86834716796875, + "rewards//std": 0.01480813231319189, + "step": 4061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.8124, + "grad_norm": 1.4158928394317627, + "kl": 0.3611260037869215, + "learning_rate": 8.619234923664349e-08, + "loss": 0.0144, + "num_tokens": 29585675.0, + "reward": 0.84857177734375, + "reward_std": 0.02056610956788063, + "rewards//mean": 0.84857177734375, + "rewards//std": 0.02844850718975067, + "step": 4062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8126, + "grad_norm": 1.8008395433425903, + "kl": 0.35143566131591797, + "learning_rate": 8.601431515639768e-08, + "loss": 0.0141, + "num_tokens": 29592875.0, + "reward": 0.864013671875, + "reward_std": 0.01441624853760004, + "rewards//mean": 0.864013671875, + "rewards//std": 0.021143468096852303, + "step": 4063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8128, + "grad_norm": 1.593862771987915, + "kl": 0.37727850303053856, + "learning_rate": 8.583644782953642e-08, + "loss": 0.0151, + "num_tokens": 29600091.0, + "reward": 0.83477783203125, + "reward_std": 0.02365533635020256, + "rewards//mean": 0.83477783203125, + "rewards//std": 0.028979387134313583, + "step": 4064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.813, + "grad_norm": 1.6110950708389282, + "kl": 0.3739582300186157, + "learning_rate": 8.565874732770428e-08, + "loss": 0.015, + "num_tokens": 29607531.0, + "reward": 0.847412109375, + "reward_std": 0.015856323763728142, + "rewards//mean": 0.847412109375, + "rewards//std": 0.023239022120833397, + "step": 4065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8132, + "grad_norm": 1.4501993656158447, + "kl": 0.3683447651565075, + "learning_rate": 8.548121372247919e-08, + "loss": 0.0147, + "num_tokens": 29614923.0, + "reward": 0.8231201171875, + "reward_std": 0.01301217544823885, + "rewards//mean": 0.8231201171875, + "rewards//std": 0.01622527465224266, + "step": 4066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8134, + "grad_norm": 1.7003854513168335, + "kl": 0.3315230067819357, + "learning_rate": 8.530384708537159e-08, + "loss": 0.0133, + "num_tokens": 29622147.0, + "reward": 0.8631591796875, + "reward_std": 0.0194874070584774, + "rewards//mean": 0.8631591796875, + "rewards//std": 0.029023557901382446, + "step": 4067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.8136, + "grad_norm": 1.7231545448303223, + "kl": 0.3417908847332001, + "learning_rate": 8.512664748782494e-08, + "loss": -0.0008, + "num_tokens": 29629413.0, + "reward": 0.8896484375, + "reward_std": 0.021458551287651062, + "rewards//mean": 0.8896484375, + "rewards//std": 0.03230936452746391, + "step": 4068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.8138, + "grad_norm": 1.3951308727264404, + "kl": 0.34267251938581467, + "learning_rate": 8.4949615001215e-08, + "loss": 0.0121, + "num_tokens": 29636695.0, + "reward": 0.855712890625, + "reward_std": 0.015356099233031273, + "rewards//mean": 0.855712890625, + "rewards//std": 0.018647868186235428, + "step": 4069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.814, + "grad_norm": 1.708523154258728, + "kl": 0.3382199965417385, + "learning_rate": 8.477274969685045e-08, + "loss": 0.0135, + "num_tokens": 29644015.0, + "reward": 0.83685302734375, + "reward_std": 0.01064220629632473, + "rewards//mean": 0.83685302734375, + "rewards//std": 0.014360782690346241, + "step": 4070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.765625, + "epoch": 0.8142, + "grad_norm": 1.8484249114990234, + "kl": 0.31642797589302063, + "learning_rate": 8.459605164597267e-08, + "loss": 0.0036, + "num_tokens": 29651304.0, + "reward": 0.87353515625, + "reward_std": 0.014482909813523293, + "rewards//mean": 0.87353515625, + "rewards//std": 0.020546529442071915, + "step": 4071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8144, + "grad_norm": 1.3553528785705566, + "kl": 0.2938405480235815, + "learning_rate": 8.441952091975573e-08, + "loss": 0.0118, + "num_tokens": 29658488.0, + "reward": 0.85845947265625, + "reward_std": 0.013590282760560513, + "rewards//mean": 0.85845947265625, + "rewards//std": 0.031998585909605026, + "step": 4072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8146, + "grad_norm": 1.3125892877578735, + "kl": 0.3915628008544445, + "learning_rate": 8.424315758930595e-08, + "loss": 0.0157, + "num_tokens": 29665824.0, + "reward": 0.83441162109375, + "reward_std": 0.011935977265238762, + "rewards//mean": 0.83441162109375, + "rewards//std": 0.014969788491725922, + "step": 4073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8148, + "grad_norm": 1.4398919343948364, + "kl": 0.34620477445423603, + "learning_rate": 8.406696172566257e-08, + "loss": 0.0138, + "num_tokens": 29673080.0, + "reward": 0.85150146484375, + "reward_std": 0.014471301808953285, + "rewards//mean": 0.85150146484375, + "rewards//std": 0.02466106228530407, + "step": 4074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.815, + "grad_norm": 1.6354414224624634, + "kl": 0.34730883315205574, + "learning_rate": 8.389093339979725e-08, + "loss": 0.0139, + "num_tokens": 29680344.0, + "reward": 0.86761474609375, + "reward_std": 0.0167723186314106, + "rewards//mean": 0.86761474609375, + "rewards//std": 0.03159008175134659, + "step": 4075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8152, + "grad_norm": 1.2687550783157349, + "kl": 0.37139759212732315, + "learning_rate": 8.371507268261435e-08, + "loss": 0.0149, + "num_tokens": 29687616.0, + "reward": 0.85919189453125, + "reward_std": 0.012017352506518364, + "rewards//mean": 0.85919189453125, + "rewards//std": 0.020320484414696693, + "step": 4076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8154, + "grad_norm": 1.488584280014038, + "kl": 0.3846381939947605, + "learning_rate": 8.353937964495028e-08, + "loss": 0.0154, + "num_tokens": 29694912.0, + "reward": 0.82415771484375, + "reward_std": 0.01577061600983143, + "rewards//mean": 0.82415771484375, + "rewards//std": 0.019294099882245064, + "step": 4077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8156, + "grad_norm": 1.5773885250091553, + "kl": 0.38428251072764397, + "learning_rate": 8.336385435757426e-08, + "loss": 0.0154, + "num_tokens": 29702240.0, + "reward": 0.8299560546875, + "reward_std": 0.013626720756292343, + "rewards//mean": 0.8299560546875, + "rewards//std": 0.019758950918912888, + "step": 4078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8158, + "grad_norm": 1.5501823425292969, + "kl": 0.31198790296912193, + "learning_rate": 8.318849689118801e-08, + "loss": 0.0125, + "num_tokens": 29709448.0, + "reward": 0.83551025390625, + "reward_std": 0.010721169412136078, + "rewards//mean": 0.83551025390625, + "rewards//std": 0.018124129623174667, + "step": 4079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.816, + "grad_norm": 1.739965796470642, + "kl": 0.3023327849805355, + "learning_rate": 8.301330731642519e-08, + "loss": 0.012, + "num_tokens": 29716790.0, + "reward": 0.82379150390625, + "reward_std": 0.011935070157051086, + "rewards//mean": 0.82379150390625, + "rewards//std": 0.022043501958251, + "step": 4080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8162, + "grad_norm": 1.5276968479156494, + "kl": 0.3786236122250557, + "learning_rate": 8.283828570385237e-08, + "loss": 0.0151, + "num_tokens": 29724030.0, + "reward": 0.83746337890625, + "reward_std": 0.012060046195983887, + "rewards//mean": 0.83746337890625, + "rewards//std": 0.01490492932498455, + "step": 4081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.8164, + "grad_norm": 1.4301297664642334, + "kl": 0.3200769256800413, + "learning_rate": 8.26634321239682e-08, + "loss": 0.0109, + "num_tokens": 29731309.0, + "reward": 0.88043212890625, + "reward_std": 0.012733598239719868, + "rewards//mean": 0.88043212890625, + "rewards//std": 0.019815849140286446, + "step": 4082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8166, + "grad_norm": 1.5842399597167969, + "kl": 0.35028044506907463, + "learning_rate": 8.248874664720374e-08, + "loss": 0.014, + "num_tokens": 29738533.0, + "reward": 0.866943359375, + "reward_std": 0.013190014287829399, + "rewards//mean": 0.866943359375, + "rewards//std": 0.018187599256634712, + "step": 4083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8168, + "grad_norm": 1.3711832761764526, + "kl": 0.3304759208112955, + "learning_rate": 8.231422934392213e-08, + "loss": 0.0132, + "num_tokens": 29745813.0, + "reward": 0.7572021484375, + "reward_std": 0.011458509601652622, + "rewards//mean": 0.7572021484375, + "rewards//std": 0.02558395080268383, + "step": 4084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.817, + "grad_norm": 1.8062098026275635, + "kl": 0.42557596787810326, + "learning_rate": 8.213988028441893e-08, + "loss": 0.017, + "num_tokens": 29753085.0, + "reward": 0.83740234375, + "reward_std": 0.01895267516374588, + "rewards//mean": 0.83740234375, + "rewards//std": 0.02820703573524952, + "step": 4085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8172, + "grad_norm": 1.7063661813735962, + "kl": 0.4340951330959797, + "learning_rate": 8.196569953892201e-08, + "loss": 0.0174, + "num_tokens": 29760389.0, + "reward": 0.83367919921875, + "reward_std": 0.010717886500060558, + "rewards//mean": 0.83367919921875, + "rewards//std": 0.014311154372990131, + "step": 4086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8174, + "grad_norm": 2.082613468170166, + "kl": 0.39800751954317093, + "learning_rate": 8.179168717759144e-08, + "loss": 0.0159, + "num_tokens": 29767613.0, + "reward": 0.81634521484375, + "reward_std": 0.015658479183912277, + "rewards//mean": 0.81634521484375, + "rewards//std": 0.021648533642292023, + "step": 4087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8176, + "grad_norm": 1.8582857847213745, + "kl": 0.33859986811876297, + "learning_rate": 8.161784327051919e-08, + "loss": 0.0135, + "num_tokens": 29774885.0, + "reward": 0.84716796875, + "reward_std": 0.01733144000172615, + "rewards//mean": 0.84716796875, + "rewards//std": 0.028352633118629456, + "step": 4088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8178, + "grad_norm": 1.6356595754623413, + "kl": 0.356569591909647, + "learning_rate": 8.144416788772957e-08, + "loss": 0.0143, + "num_tokens": 29782165.0, + "reward": 0.84808349609375, + "reward_std": 0.019413206726312637, + "rewards//mean": 0.84808349609375, + "rewards//std": 0.024301188066601753, + "step": 4089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.818, + "grad_norm": 1.8514004945755005, + "kl": 0.36924328841269016, + "learning_rate": 8.127066109917908e-08, + "loss": 0.0148, + "num_tokens": 29789445.0, + "reward": 0.83837890625, + "reward_std": 0.012767795473337173, + "rewards//mean": 0.83837890625, + "rewards//std": 0.01692708395421505, + "step": 4090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8182, + "grad_norm": 1.4977058172225952, + "kl": 0.3728118911385536, + "learning_rate": 8.109732297475635e-08, + "loss": 0.0149, + "num_tokens": 29796741.0, + "reward": 0.83428955078125, + "reward_std": 0.012760505080223083, + "rewards//mean": 0.83428955078125, + "rewards//std": 0.01886647380888462, + "step": 4091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8184, + "grad_norm": 2.0038208961486816, + "kl": 0.4062570780515671, + "learning_rate": 8.092415358428173e-08, + "loss": 0.0163, + "num_tokens": 29804005.0, + "reward": 0.86676025390625, + "reward_std": 0.01874103583395481, + "rewards//mean": 0.86676025390625, + "rewards//std": 0.029356688261032104, + "step": 4092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8186, + "grad_norm": 1.303363561630249, + "kl": 0.31607090681791306, + "learning_rate": 8.075115299750796e-08, + "loss": 0.0126, + "num_tokens": 29811301.0, + "reward": 0.84368896484375, + "reward_std": 0.014583262614905834, + "rewards//mean": 0.84368896484375, + "rewards//std": 0.025943754240870476, + "step": 4093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8188, + "grad_norm": 1.6058518886566162, + "kl": 0.4020090326666832, + "learning_rate": 8.057832128411967e-08, + "loss": 0.0161, + "num_tokens": 29818485.0, + "reward": 0.8621826171875, + "reward_std": 0.01677882857620716, + "rewards//mean": 0.8621826171875, + "rewards//std": 0.02147979475557804, + "step": 4094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.819, + "grad_norm": 1.4212559461593628, + "kl": 0.33662329241633415, + "learning_rate": 8.040565851373332e-08, + "loss": 0.0135, + "num_tokens": 29825701.0, + "reward": 0.7811279296875, + "reward_std": 0.019172456115484238, + "rewards//mean": 0.7811279296875, + "rewards//std": 0.03170374408364296, + "step": 4095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8192, + "grad_norm": 1.47782564163208, + "kl": 0.3800143748521805, + "learning_rate": 8.023316475589754e-08, + "loss": 0.0152, + "num_tokens": 29833037.0, + "reward": 0.84552001953125, + "reward_std": 0.016507180407643318, + "rewards//mean": 0.84552001953125, + "rewards//std": 0.024199439212679863, + "step": 4096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8194, + "grad_norm": 1.5981013774871826, + "kl": 0.35446709766983986, + "learning_rate": 8.006084008009283e-08, + "loss": 0.0142, + "num_tokens": 29840357.0, + "reward": 0.82208251953125, + "reward_std": 0.014931393787264824, + "rewards//mean": 0.82208251953125, + "rewards//std": 0.018596554175019264, + "step": 4097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.8196, + "grad_norm": 1.4999812841415405, + "kl": 0.35436281748116016, + "learning_rate": 7.988868455573161e-08, + "loss": 0.008, + "num_tokens": 29847671.0, + "reward": 0.8663330078125, + "reward_std": 0.01571587473154068, + "rewards//mean": 0.8663330078125, + "rewards//std": 0.021524852141737938, + "step": 4098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8198, + "grad_norm": 1.4586131572723389, + "kl": 0.35615384206175804, + "learning_rate": 7.971669825215787e-08, + "loss": 0.0142, + "num_tokens": 29854943.0, + "reward": 0.79901123046875, + "reward_std": 0.012835171073675156, + "rewards//mean": 0.79901123046875, + "rewards//std": 0.0186355859041214, + "step": 4099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.82, + "grad_norm": 1.5493109226226807, + "kl": 0.3375002555549145, + "learning_rate": 7.954488123864783e-08, + "loss": 0.0135, + "num_tokens": 29862175.0, + "reward": 0.8758544921875, + "reward_std": 0.02242177166044712, + "rewards//mean": 0.8758544921875, + "rewards//std": 0.036315012723207474, + "step": 4100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8202, + "grad_norm": 1.8541486263275146, + "kl": 0.3447923883795738, + "learning_rate": 7.937323358440934e-08, + "loss": 0.0138, + "num_tokens": 29869511.0, + "reward": 0.87255859375, + "reward_std": 0.017785675823688507, + "rewards//mean": 0.87255859375, + "rewards//std": 0.018655983731150627, + "step": 4101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8204, + "grad_norm": 1.5164684057235718, + "kl": 0.3833223059773445, + "learning_rate": 7.92017553585822e-08, + "loss": 0.0153, + "num_tokens": 29876911.0, + "reward": 0.838623046875, + "reward_std": 0.0172768235206604, + "rewards//mean": 0.838623046875, + "rewards//std": 0.021017082035541534, + "step": 4102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8206, + "grad_norm": 1.3322234153747559, + "kl": 0.29799244552850723, + "learning_rate": 7.903044663023755e-08, + "loss": 0.0119, + "num_tokens": 29884191.0, + "reward": 0.88330078125, + "reward_std": 0.01576179265975952, + "rewards//mean": 0.88330078125, + "rewards//std": 0.031730711460113525, + "step": 4103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.6875, + "epoch": 0.8208, + "grad_norm": 1.684157371520996, + "kl": 0.4051645416766405, + "learning_rate": 7.885930746837865e-08, + "loss": -0.0081, + "num_tokens": 29891403.0, + "reward": 0.86920166015625, + "reward_std": 0.014816135168075562, + "rewards//mean": 0.86920166015625, + "rewards//std": 0.025209857150912285, + "step": 4104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.821, + "grad_norm": 1.4825447797775269, + "kl": 0.33482519537210464, + "learning_rate": 7.868833794194046e-08, + "loss": 0.0134, + "num_tokens": 29898723.0, + "reward": 0.87152099609375, + "reward_std": 0.013751237653195858, + "rewards//mean": 0.87152099609375, + "rewards//std": 0.02345936931669712, + "step": 4105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8212, + "grad_norm": 1.4513994455337524, + "kl": 0.389457393437624, + "learning_rate": 7.851753811978923e-08, + "loss": 0.0156, + "num_tokens": 29905995.0, + "reward": 0.80682373046875, + "reward_std": 0.015258923172950745, + "rewards//mean": 0.80682373046875, + "rewards//std": 0.02294592186808586, + "step": 4106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8214, + "grad_norm": 1.6346688270568848, + "kl": 0.2920389287173748, + "learning_rate": 7.834690807072342e-08, + "loss": 0.0117, + "num_tokens": 29913259.0, + "reward": 0.84716796875, + "reward_std": 0.013358289375901222, + "rewards//mean": 0.84716796875, + "rewards//std": 0.02312016673386097, + "step": 4107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8216, + "grad_norm": 2.010507345199585, + "kl": 0.40758688747882843, + "learning_rate": 7.817644786347245e-08, + "loss": 0.0163, + "num_tokens": 29920475.0, + "reward": 0.88262939453125, + "reward_std": 0.02124773897230625, + "rewards//mean": 0.88262939453125, + "rewards//std": 0.02603636309504509, + "step": 4108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8218, + "grad_norm": 1.8026753664016724, + "kl": 0.3326286692172289, + "learning_rate": 7.800615756669782e-08, + "loss": 0.0133, + "num_tokens": 29927803.0, + "reward": 0.8499755859375, + "reward_std": 0.009349241852760315, + "rewards//mean": 0.8499755859375, + "rewards//std": 0.013739800080657005, + "step": 4109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.822, + "grad_norm": 1.4375810623168945, + "kl": 0.2742121070623398, + "learning_rate": 7.783603724899257e-08, + "loss": 0.0094, + "num_tokens": 29935161.0, + "reward": 0.84320068359375, + "reward_std": 0.013526566326618195, + "rewards//mean": 0.84320068359375, + "rewards//std": 0.021529333665966988, + "step": 4110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.8222, + "grad_norm": 1.5723296403884888, + "kl": 0.4499308168888092, + "learning_rate": 7.766608697888094e-08, + "loss": 0.0208, + "num_tokens": 29942461.0, + "reward": 0.85711669921875, + "reward_std": 0.012958523817360401, + "rewards//mean": 0.85711669921875, + "rewards//std": 0.017494285479187965, + "step": 4111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8224, + "grad_norm": 1.6669658422470093, + "kl": 0.3587644509971142, + "learning_rate": 7.749630682481895e-08, + "loss": 0.0144, + "num_tokens": 29949701.0, + "reward": 0.85516357421875, + "reward_std": 0.01264805905520916, + "rewards//mean": 0.85516357421875, + "rewards//std": 0.022641757503151894, + "step": 4112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8226, + "grad_norm": 1.8298367261886597, + "kl": 0.36348821967840195, + "learning_rate": 7.732669685519405e-08, + "loss": 0.0145, + "num_tokens": 29956901.0, + "reward": 0.88726806640625, + "reward_std": 0.01320055965334177, + "rewards//mean": 0.88726806640625, + "rewards//std": 0.018491249531507492, + "step": 4113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8228, + "grad_norm": 1.5921134948730469, + "kl": 0.42984685115516186, + "learning_rate": 7.715725713832527e-08, + "loss": 0.0172, + "num_tokens": 29964221.0, + "reward": 0.82366943359375, + "reward_std": 0.012229678221046925, + "rewards//mean": 0.82366943359375, + "rewards//std": 0.020691798999905586, + "step": 4114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.823, + "grad_norm": 1.3327782154083252, + "kl": 0.3775964640080929, + "learning_rate": 7.698798774246257e-08, + "loss": 0.0151, + "num_tokens": 29971573.0, + "reward": 0.82940673828125, + "reward_std": 0.017017263919115067, + "rewards//mean": 0.82940673828125, + "rewards//std": 0.025363707914948463, + "step": 4115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8232, + "grad_norm": 1.655260443687439, + "kl": 0.3462710753083229, + "learning_rate": 7.681888873578785e-08, + "loss": 0.0139, + "num_tokens": 29978893.0, + "reward": 0.85052490234375, + "reward_std": 0.013916927389800549, + "rewards//mean": 0.85052490234375, + "rewards//std": 0.022533860057592392, + "step": 4116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8234, + "grad_norm": 1.7401143312454224, + "kl": 0.32749659195542336, + "learning_rate": 7.664996018641413e-08, + "loss": 0.0131, + "num_tokens": 29986093.0, + "reward": 0.86529541015625, + "reward_std": 0.01853601261973381, + "rewards//mean": 0.86529541015625, + "rewards//std": 0.02404947020113468, + "step": 4117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8236, + "grad_norm": 1.5902061462402344, + "kl": 0.34645141661167145, + "learning_rate": 7.648120216238596e-08, + "loss": 0.0139, + "num_tokens": 29993429.0, + "reward": 0.822509765625, + "reward_std": 0.01508716493844986, + "rewards//mean": 0.822509765625, + "rewards//std": 0.026683634147047997, + "step": 4118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.8238, + "grad_norm": 1.382941722869873, + "kl": 0.29912910237908363, + "learning_rate": 7.631261473167877e-08, + "loss": 0.0125, + "num_tokens": 30000676.0, + "reward": 0.86297607421875, + "reward_std": 0.01936020329594612, + "rewards//mean": 0.86297607421875, + "rewards//std": 0.02757137268781662, + "step": 4119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.824, + "grad_norm": 1.4612627029418945, + "kl": 0.346102949231863, + "learning_rate": 7.614419796219973e-08, + "loss": 0.0138, + "num_tokens": 30007924.0, + "reward": 0.879638671875, + "reward_std": 0.01815979927778244, + "rewards//mean": 0.879638671875, + "rewards//std": 0.025627993047237396, + "step": 4120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8242, + "grad_norm": 2.020421028137207, + "kl": 0.4099403955042362, + "learning_rate": 7.597595192178702e-08, + "loss": 0.0164, + "num_tokens": 30015156.0, + "reward": 0.8529052734375, + "reward_std": 0.016660112887620926, + "rewards//mean": 0.8529052734375, + "rewards//std": 0.030514493584632874, + "step": 4121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8244, + "grad_norm": 1.4302366971969604, + "kl": 0.32427364215254784, + "learning_rate": 7.580787667821032e-08, + "loss": 0.013, + "num_tokens": 30022476.0, + "reward": 0.86419677734375, + "reward_std": 0.015132268890738487, + "rewards//mean": 0.86419677734375, + "rewards//std": 0.033318668603897095, + "step": 4122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8246, + "grad_norm": 1.3811843395233154, + "kl": 0.3423639126121998, + "learning_rate": 7.563997229917002e-08, + "loss": 0.0137, + "num_tokens": 30029740.0, + "reward": 0.8883056640625, + "reward_std": 0.014657854102551937, + "rewards//mean": 0.8883056640625, + "rewards//std": 0.019107744097709656, + "step": 4123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8248, + "grad_norm": 1.5523297786712646, + "kl": 0.354934424161911, + "learning_rate": 7.547223885229814e-08, + "loss": 0.0142, + "num_tokens": 30037012.0, + "reward": 0.886962890625, + "reward_std": 0.016636880114674568, + "rewards//mean": 0.886962890625, + "rewards//std": 0.020959382876753807, + "step": 4124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.825, + "grad_norm": 1.706650972366333, + "kl": 0.3536223918199539, + "learning_rate": 7.53046764051578e-08, + "loss": 0.0141, + "num_tokens": 30044252.0, + "reward": 0.81689453125, + "reward_std": 0.01404641754925251, + "rewards//mean": 0.81689453125, + "rewards//std": 0.01729510724544525, + "step": 4125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.8252, + "grad_norm": 1.5872907638549805, + "kl": 0.3939905297011137, + "learning_rate": 7.513728502524286e-08, + "loss": 0.012, + "num_tokens": 30051499.0, + "reward": 0.8387451171875, + "reward_std": 0.014675314538180828, + "rewards//mean": 0.8387451171875, + "rewards//std": 0.02163708209991455, + "step": 4126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8254, + "grad_norm": 1.4254542589187622, + "kl": 0.39181446097791195, + "learning_rate": 7.497006477997874e-08, + "loss": 0.0157, + "num_tokens": 30058763.0, + "reward": 0.84686279296875, + "reward_std": 0.01810998097062111, + "rewards//mean": 0.84686279296875, + "rewards//std": 0.02393021062016487, + "step": 4127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8256, + "grad_norm": 1.47821843624115, + "kl": 0.37221933528780937, + "learning_rate": 7.48030157367217e-08, + "loss": 0.0149, + "num_tokens": 30066083.0, + "reward": 0.8372802734375, + "reward_std": 0.015584672801196575, + "rewards//mean": 0.8372802734375, + "rewards//std": 0.01994500681757927, + "step": 4128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8258, + "grad_norm": 1.4267338514328003, + "kl": 0.31398200802505016, + "learning_rate": 7.46361379627592e-08, + "loss": 0.0126, + "num_tokens": 30073379.0, + "reward": 0.86712646484375, + "reward_std": 0.023829560726881027, + "rewards//mean": 0.86712646484375, + "rewards//std": 0.034635644406080246, + "step": 4129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.826, + "grad_norm": 1.5147639513015747, + "kl": 0.3327608145773411, + "learning_rate": 7.446943152530932e-08, + "loss": 0.0133, + "num_tokens": 30080651.0, + "reward": 0.810791015625, + "reward_std": 0.012397469021379948, + "rewards//mean": 0.810791015625, + "rewards//std": 0.01507069543004036, + "step": 4130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8262, + "grad_norm": 1.6987361907958984, + "kl": 0.38902031257748604, + "learning_rate": 7.430289649152155e-08, + "loss": 0.0156, + "num_tokens": 30087979.0, + "reward": 0.84283447265625, + "reward_std": 0.01901824399828911, + "rewards//mean": 0.84283447265625, + "rewards//std": 0.022236313670873642, + "step": 4131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8264, + "grad_norm": 1.8657431602478027, + "kl": 0.36869295313954353, + "learning_rate": 7.413653292847616e-08, + "loss": 0.0147, + "num_tokens": 30095235.0, + "reward": 0.81494140625, + "reward_std": 0.01587899588048458, + "rewards//mean": 0.81494140625, + "rewards//std": 0.024384593591094017, + "step": 4132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8266, + "grad_norm": 1.5861928462982178, + "kl": 0.32039105147123337, + "learning_rate": 7.397034090318454e-08, + "loss": 0.0128, + "num_tokens": 30102523.0, + "reward": 0.86773681640625, + "reward_std": 0.016474295407533646, + "rewards//mean": 0.86773681640625, + "rewards//std": 0.025806859135627747, + "step": 4133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8268, + "grad_norm": 1.6661603450775146, + "kl": 0.3371716272085905, + "learning_rate": 7.38043204825885e-08, + "loss": 0.0135, + "num_tokens": 30109795.0, + "reward": 0.836669921875, + "reward_std": 0.016841208562254906, + "rewards//mean": 0.836669921875, + "rewards//std": 0.02246531844139099, + "step": 4134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.827, + "grad_norm": 2.1376659870147705, + "kl": 0.3253227435052395, + "learning_rate": 7.363847173356119e-08, + "loss": 0.013, + "num_tokens": 30116947.0, + "reward": 0.83929443359375, + "reward_std": 0.015198031440377235, + "rewards//mean": 0.83929443359375, + "rewards//std": 0.029395850375294685, + "step": 4135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.8272, + "grad_norm": 1.6437451839447021, + "kl": 0.4100830554962158, + "learning_rate": 7.347279472290646e-08, + "loss": 0.0168, + "num_tokens": 30124258.0, + "reward": 0.84820556640625, + "reward_std": 0.013441551476716995, + "rewards//mean": 0.84820556640625, + "rewards//std": 0.016126427799463272, + "step": 4136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8274, + "grad_norm": 1.7211542129516602, + "kl": 0.36370253935456276, + "learning_rate": 7.330728951735915e-08, + "loss": 0.0145, + "num_tokens": 30131586.0, + "reward": 0.87158203125, + "reward_std": 0.015908069908618927, + "rewards//mean": 0.87158203125, + "rewards//std": 0.028709176927804947, + "step": 4137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8276, + "grad_norm": 1.507262110710144, + "kl": 0.34347201883792877, + "learning_rate": 7.314195618358448e-08, + "loss": 0.0137, + "num_tokens": 30138938.0, + "reward": 0.85015869140625, + "reward_std": 0.016421623528003693, + "rewards//mean": 0.85015869140625, + "rewards//std": 0.024962451308965683, + "step": 4138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8278, + "grad_norm": 1.6567319631576538, + "kl": 0.3418978378176689, + "learning_rate": 7.297679478817881e-08, + "loss": 0.0137, + "num_tokens": 30146186.0, + "reward": 0.85162353515625, + "reward_std": 0.01781689003109932, + "rewards//mean": 0.85162353515625, + "rewards//std": 0.024473747238516808, + "step": 4139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.828, + "grad_norm": 1.6429100036621094, + "kl": 0.36844154819846153, + "learning_rate": 7.281180539766923e-08, + "loss": 0.0147, + "num_tokens": 30153458.0, + "reward": 0.83392333984375, + "reward_std": 0.016645651310682297, + "rewards//mean": 0.83392333984375, + "rewards//std": 0.021098941564559937, + "step": 4140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8282, + "grad_norm": 1.8658661842346191, + "kl": 0.368048470467329, + "learning_rate": 7.264698807851327e-08, + "loss": 0.0147, + "num_tokens": 30160714.0, + "reward": 0.8546142578125, + "reward_std": 0.015715952962636948, + "rewards//mean": 0.8546142578125, + "rewards//std": 0.021138813346624374, + "step": 4141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8284, + "grad_norm": 1.690598487854004, + "kl": 0.363025376573205, + "learning_rate": 7.248234289709942e-08, + "loss": 0.0145, + "num_tokens": 30167970.0, + "reward": 0.8726806640625, + "reward_std": 0.015768669545650482, + "rewards//mean": 0.8726806640625, + "rewards//std": 0.021198881790041924, + "step": 4142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8286, + "grad_norm": 1.3703508377075195, + "kl": 0.34328310191631317, + "learning_rate": 7.23178699197467e-08, + "loss": 0.0137, + "num_tokens": 30175258.0, + "reward": 0.858642578125, + "reward_std": 0.018447861075401306, + "rewards//mean": 0.858642578125, + "rewards//std": 0.026801373809576035, + "step": 4143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8288, + "grad_norm": 1.3817570209503174, + "kl": 0.353482685983181, + "learning_rate": 7.215356921270494e-08, + "loss": 0.0141, + "num_tokens": 30182546.0, + "reward": 0.7794189453125, + "reward_std": 0.009102189913392067, + "rewards//mean": 0.7794189453125, + "rewards//std": 0.01554681546986103, + "step": 4144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.829, + "grad_norm": 1.5956313610076904, + "kl": 0.33730224519968033, + "learning_rate": 7.198944084215419e-08, + "loss": 0.0135, + "num_tokens": 30189842.0, + "reward": 0.850830078125, + "reward_std": 0.01352802012115717, + "rewards//mean": 0.850830078125, + "rewards//std": 0.024947969242930412, + "step": 4145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8292, + "grad_norm": 1.5322051048278809, + "kl": 0.3750957138836384, + "learning_rate": 7.182548487420553e-08, + "loss": 0.015, + "num_tokens": 30197098.0, + "reward": 0.83367919921875, + "reward_std": 0.018974825739860535, + "rewards//mean": 0.83367919921875, + "rewards//std": 0.021963009610772133, + "step": 4146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8294, + "grad_norm": 1.4648103713989258, + "kl": 0.3336730785667896, + "learning_rate": 7.166170137490035e-08, + "loss": 0.0133, + "num_tokens": 30204338.0, + "reward": 0.8831787109375, + "reward_std": 0.01254812628030777, + "rewards//mean": 0.8831787109375, + "rewards//std": 0.018371030688285828, + "step": 4147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8296, + "grad_norm": 1.867192268371582, + "kl": 0.38375599682331085, + "learning_rate": 7.149809041021072e-08, + "loss": 0.0154, + "num_tokens": 30211626.0, + "reward": 0.86865234375, + "reward_std": 0.018890218809247017, + "rewards//mean": 0.86865234375, + "rewards//std": 0.02568109892308712, + "step": 4148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.8298, + "grad_norm": 1.5756747722625732, + "kl": 0.3887878768146038, + "learning_rate": 7.133465204603895e-08, + "loss": 0.0184, + "num_tokens": 30218860.0, + "reward": 0.8760986328125, + "reward_std": 0.019448939710855484, + "rewards//mean": 0.8760986328125, + "rewards//std": 0.027103258296847343, + "step": 4149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.83, + "grad_norm": 1.4178553819656372, + "kl": 0.3373007755726576, + "learning_rate": 7.117138634821807e-08, + "loss": 0.0135, + "num_tokens": 30226204.0, + "reward": 0.8563232421875, + "reward_std": 0.013052058406174183, + "rewards//mean": 0.8563232421875, + "rewards//std": 0.018642591312527657, + "step": 4150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8302, + "grad_norm": 1.3825595378875732, + "kl": 0.3136715032160282, + "learning_rate": 7.100829338251146e-08, + "loss": 0.0125, + "num_tokens": 30233572.0, + "reward": 0.82220458984375, + "reward_std": 0.01488727517426014, + "rewards//mean": 0.82220458984375, + "rewards//std": 0.02182125858962536, + "step": 4151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8304, + "grad_norm": 2.0596957206726074, + "kl": 0.4419318661093712, + "learning_rate": 7.08453732146131e-08, + "loss": 0.0177, + "num_tokens": 30240852.0, + "reward": 0.84417724609375, + "reward_std": 0.014524003490805626, + "rewards//mean": 0.84417724609375, + "rewards//std": 0.02840057760477066, + "step": 4152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.8306, + "grad_norm": 1.5526533126831055, + "kl": 0.3522126153111458, + "learning_rate": 7.068262591014695e-08, + "loss": 0.0121, + "num_tokens": 30248192.0, + "reward": 0.80078125, + "reward_std": 0.012444879859685898, + "rewards//mean": 0.80078125, + "rewards//std": 0.018242448568344116, + "step": 4153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8308, + "grad_norm": 1.5228701829910278, + "kl": 0.39497118815779686, + "learning_rate": 7.052005153466778e-08, + "loss": 0.0158, + "num_tokens": 30255584.0, + "reward": 0.86572265625, + "reward_std": 0.018349846825003624, + "rewards//mean": 0.86572265625, + "rewards//std": 0.025157036259770393, + "step": 4154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.831, + "grad_norm": 1.7526928186416626, + "kl": 0.35203467309474945, + "learning_rate": 7.035765015366046e-08, + "loss": 0.0141, + "num_tokens": 30262936.0, + "reward": 0.8199462890625, + "reward_std": 0.017536986619234085, + "rewards//mean": 0.8199462890625, + "rewards//std": 0.02284848876297474, + "step": 4155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.8312, + "grad_norm": 1.8977348804473877, + "kl": 0.3264260068535805, + "learning_rate": 7.019542183254046e-08, + "loss": 0.014, + "num_tokens": 30270294.0, + "reward": 0.8507080078125, + "reward_std": 0.017528468742966652, + "rewards//mean": 0.8507080078125, + "rewards//std": 0.03050854057073593, + "step": 4156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.8314, + "grad_norm": 1.571997046470642, + "kl": 0.34120296128094196, + "learning_rate": 7.003336663665293e-08, + "loss": 0.0055, + "num_tokens": 30277586.0, + "reward": 0.862060546875, + "reward_std": 0.020738881081342697, + "rewards//mean": 0.862060546875, + "rewards//std": 0.02652888000011444, + "step": 4157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8316, + "grad_norm": 1.5971670150756836, + "kl": 0.2944768685847521, + "learning_rate": 6.987148463127396e-08, + "loss": 0.0118, + "num_tokens": 30284850.0, + "reward": 0.8255615234375, + "reward_std": 0.01217835582792759, + "rewards//mean": 0.8255615234375, + "rewards//std": 0.0184171162545681, + "step": 4158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8318, + "grad_norm": 1.6331725120544434, + "kl": 0.2932616798207164, + "learning_rate": 6.970977588160965e-08, + "loss": 0.0117, + "num_tokens": 30292194.0, + "reward": 0.80145263671875, + "reward_std": 0.01188056543469429, + "rewards//mean": 0.80145263671875, + "rewards//std": 0.027966050431132317, + "step": 4159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.832, + "grad_norm": 1.7616389989852905, + "kl": 0.35427936539053917, + "learning_rate": 6.954824045279605e-08, + "loss": 0.0142, + "num_tokens": 30299522.0, + "reward": 0.806884765625, + "reward_std": 0.012054547667503357, + "rewards//mean": 0.806884765625, + "rewards//std": 0.014480582438409328, + "step": 4160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8322, + "grad_norm": 1.6606204509735107, + "kl": 0.34222229942679405, + "learning_rate": 6.938687840989971e-08, + "loss": 0.0137, + "num_tokens": 30306730.0, + "reward": 0.8492431640625, + "reward_std": 0.01749461516737938, + "rewards//mean": 0.8492431640625, + "rewards//std": 0.030038511380553246, + "step": 4161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.8324, + "grad_norm": 1.7304515838623047, + "kl": 0.34861917421221733, + "learning_rate": 6.922568981791726e-08, + "loss": 0.0134, + "num_tokens": 30314119.0, + "reward": 0.87750244140625, + "reward_std": 0.02197084203362465, + "rewards//mean": 0.87750244140625, + "rewards//std": 0.029292678460478783, + "step": 4162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8326, + "grad_norm": 1.6722725629806519, + "kl": 0.3248895965516567, + "learning_rate": 6.906467474177558e-08, + "loss": 0.013, + "num_tokens": 30321439.0, + "reward": 0.84979248046875, + "reward_std": 0.01853764057159424, + "rewards//mean": 0.84979248046875, + "rewards//std": 0.025026045739650726, + "step": 4163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8328, + "grad_norm": 1.4653687477111816, + "kl": 0.30781616643071175, + "learning_rate": 6.89038332463312e-08, + "loss": 0.0123, + "num_tokens": 30328743.0, + "reward": 0.87109375, + "reward_std": 0.015249529853463173, + "rewards//mean": 0.87109375, + "rewards//std": 0.02462673932313919, + "step": 4164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.833, + "grad_norm": 1.3936896324157715, + "kl": 0.3568453937768936, + "learning_rate": 6.874316539637126e-08, + "loss": 0.0143, + "num_tokens": 30335911.0, + "reward": 0.85308837890625, + "reward_std": 0.011545232497155666, + "rewards//mean": 0.85308837890625, + "rewards//std": 0.020310796797275543, + "step": 4165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.8332, + "grad_norm": 1.4864909648895264, + "kl": 0.3724188394844532, + "learning_rate": 6.858267125661272e-08, + "loss": 0.0026, + "num_tokens": 30343186.0, + "reward": 0.8638916015625, + "reward_std": 0.015064583159983158, + "rewards//mean": 0.8638916015625, + "rewards//std": 0.020660776644945145, + "step": 4166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8334, + "grad_norm": 1.4240492582321167, + "kl": 0.3248819187283516, + "learning_rate": 6.842235089170273e-08, + "loss": 0.013, + "num_tokens": 30350450.0, + "reward": 0.88323974609375, + "reward_std": 0.016130397096276283, + "rewards//mean": 0.88323974609375, + "rewards//std": 0.025958919897675514, + "step": 4167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.8336, + "grad_norm": 1.9529290199279785, + "kl": 0.36861285008490086, + "learning_rate": 6.82622043662181e-08, + "loss": 0.0155, + "num_tokens": 30357649.0, + "reward": 0.82568359375, + "reward_std": 0.019015291705727577, + "rewards//mean": 0.82568359375, + "rewards//std": 0.029194390401244164, + "step": 4168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8338, + "grad_norm": 1.8382891416549683, + "kl": 0.41858918592333794, + "learning_rate": 6.810223174466589e-08, + "loss": 0.0167, + "num_tokens": 30364929.0, + "reward": 0.8526611328125, + "reward_std": 0.015231667086482048, + "rewards//mean": 0.8526611328125, + "rewards//std": 0.030165238305926323, + "step": 4169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.834, + "grad_norm": 2.0938758850097656, + "kl": 0.5069857314229012, + "learning_rate": 6.794243309148306e-08, + "loss": 0.0203, + "num_tokens": 30372217.0, + "reward": 0.837646484375, + "reward_std": 0.02031940035521984, + "rewards//mean": 0.837646484375, + "rewards//std": 0.022966446354985237, + "step": 4170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.8342, + "grad_norm": 1.4974007606506348, + "kl": 0.3430071920156479, + "learning_rate": 6.778280847103667e-08, + "loss": 0.0168, + "num_tokens": 30379483.0, + "reward": 0.82257080078125, + "reward_std": 0.013651066459715366, + "rewards//mean": 0.82257080078125, + "rewards//std": 0.01760469377040863, + "step": 4171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8344, + "grad_norm": 1.708443284034729, + "kl": 0.2769802287220955, + "learning_rate": 6.76233579476232e-08, + "loss": 0.0111, + "num_tokens": 30386763.0, + "reward": 0.8829345703125, + "reward_std": 0.0198160819709301, + "rewards//mean": 0.8829345703125, + "rewards//std": 0.03495913743972778, + "step": 4172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8346, + "grad_norm": 1.2899459600448608, + "kl": 0.3379499651491642, + "learning_rate": 6.746408158546945e-08, + "loss": 0.0135, + "num_tokens": 30394067.0, + "reward": 0.878173828125, + "reward_std": 0.01553349755704403, + "rewards//mean": 0.878173828125, + "rewards//std": 0.02394736371934414, + "step": 4173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8348, + "grad_norm": 2.043485164642334, + "kl": 0.44319694116711617, + "learning_rate": 6.730497944873203e-08, + "loss": 0.0177, + "num_tokens": 30401427.0, + "reward": 0.862548828125, + "reward_std": 0.016385518014431, + "rewards//mean": 0.862548828125, + "rewards//std": 0.021908579394221306, + "step": 4174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.835, + "grad_norm": 1.6940995454788208, + "kl": 0.3788539990782738, + "learning_rate": 6.7146051601497e-08, + "loss": 0.0152, + "num_tokens": 30408779.0, + "reward": 0.87841796875, + "reward_std": 0.015360425226390362, + "rewards//mean": 0.87841796875, + "rewards//std": 0.02203672006726265, + "step": 4175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.8352, + "grad_norm": 1.6368697881698608, + "kl": 0.38528451323509216, + "learning_rate": 6.698729810778064e-08, + "loss": 0.0103, + "num_tokens": 30416152.0, + "reward": 0.84814453125, + "reward_std": 0.011729227378964424, + "rewards//mean": 0.84814453125, + "rewards//std": 0.015694599598646164, + "step": 4176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8354, + "grad_norm": 1.777003526687622, + "kl": 0.3411503341048956, + "learning_rate": 6.682871903152886e-08, + "loss": 0.0136, + "num_tokens": 30423416.0, + "reward": 0.88134765625, + "reward_std": 0.0153652373701334, + "rewards//mean": 0.88134765625, + "rewards//std": 0.026734642684459686, + "step": 4177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.8356, + "grad_norm": 1.5812376737594604, + "kl": 0.2790288105607033, + "learning_rate": 6.667031443661731e-08, + "loss": 0.0109, + "num_tokens": 30430710.0, + "reward": 0.8575439453125, + "reward_std": 0.0177525095641613, + "rewards//mean": 0.8575439453125, + "rewards//std": 0.02409953810274601, + "step": 4178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8358, + "grad_norm": 1.8070846796035767, + "kl": 0.2969173528254032, + "learning_rate": 6.651208438685119e-08, + "loss": 0.0119, + "num_tokens": 30438038.0, + "reward": 0.86529541015625, + "reward_std": 0.02065226435661316, + "rewards//mean": 0.86529541015625, + "rewards//std": 0.027813533321022987, + "step": 4179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.836, + "grad_norm": 1.4530234336853027, + "kl": 0.3521423414349556, + "learning_rate": 6.635402894596565e-08, + "loss": 0.0133, + "num_tokens": 30445281.0, + "reward": 0.79986572265625, + "reward_std": 0.014211688190698624, + "rewards//mean": 0.79986572265625, + "rewards//std": 0.019670167937874794, + "step": 4180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8362, + "grad_norm": 1.7848131656646729, + "kl": 0.39825922437012196, + "learning_rate": 6.619614817762536e-08, + "loss": 0.0159, + "num_tokens": 30452505.0, + "reward": 0.86993408203125, + "reward_std": 0.01547317299991846, + "rewards//mean": 0.86993408203125, + "rewards//std": 0.01900557056069374, + "step": 4181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8364, + "grad_norm": 2.6904664039611816, + "kl": 0.4621147848665714, + "learning_rate": 6.603844214542486e-08, + "loss": 0.0185, + "num_tokens": 30459753.0, + "reward": 0.85748291015625, + "reward_std": 0.018254414200782776, + "rewards//mean": 0.85748291015625, + "rewards//std": 0.023354601114988327, + "step": 4182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8366, + "grad_norm": 1.5700494050979614, + "kl": 0.36780533008277416, + "learning_rate": 6.588091091288784e-08, + "loss": 0.0147, + "num_tokens": 30467073.0, + "reward": 0.832763671875, + "reward_std": 0.01916869729757309, + "rewards//mean": 0.832763671875, + "rewards//std": 0.024821441620588303, + "step": 4183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8368, + "grad_norm": 1.4889140129089355, + "kl": 0.3919342942535877, + "learning_rate": 6.572355454346801e-08, + "loss": 0.0157, + "num_tokens": 30474249.0, + "reward": 0.82891845703125, + "reward_std": 0.017988702282309532, + "rewards//mean": 0.82891845703125, + "rewards//std": 0.02580040507018566, + "step": 4184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.837, + "grad_norm": 1.6079210042953491, + "kl": 0.32415612414479256, + "learning_rate": 6.55663731005484e-08, + "loss": 0.013, + "num_tokens": 30481489.0, + "reward": 0.818115234375, + "reward_std": 0.014325767755508423, + "rewards//mean": 0.818115234375, + "rewards//std": 0.015592026524245739, + "step": 4185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8372, + "grad_norm": 1.6629170179367065, + "kl": 0.3660959079861641, + "learning_rate": 6.540936664744196e-08, + "loss": 0.0146, + "num_tokens": 30488721.0, + "reward": 0.85418701171875, + "reward_std": 0.01999533548951149, + "rewards//mean": 0.85418701171875, + "rewards//std": 0.025967666879296303, + "step": 4186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8374, + "grad_norm": 1.4989869594573975, + "kl": 0.35443374887108803, + "learning_rate": 6.52525352473905e-08, + "loss": 0.0142, + "num_tokens": 30496073.0, + "reward": 0.8607177734375, + "reward_std": 0.022352121770381927, + "rewards//mean": 0.8607177734375, + "rewards//std": 0.02879946306347847, + "step": 4187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8376, + "grad_norm": 1.37493896484375, + "kl": 0.3285062275826931, + "learning_rate": 6.509587896356583e-08, + "loss": 0.0131, + "num_tokens": 30503265.0, + "reward": 0.8646240234375, + "reward_std": 0.018227731809020042, + "rewards//mean": 0.8646240234375, + "rewards//std": 0.028418494388461113, + "step": 4188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8378, + "grad_norm": 1.4591484069824219, + "kl": 0.3627197667956352, + "learning_rate": 6.493939785906927e-08, + "loss": 0.0145, + "num_tokens": 30510481.0, + "reward": 0.84759521484375, + "reward_std": 0.017830006778240204, + "rewards//mean": 0.84759521484375, + "rewards//std": 0.023831959813833237, + "step": 4189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.838, + "grad_norm": 1.5061126947402954, + "kl": 0.3023313507437706, + "learning_rate": 6.478309199693105e-08, + "loss": 0.0121, + "num_tokens": 30517761.0, + "reward": 0.80621337890625, + "reward_std": 0.015547076240181923, + "rewards//mean": 0.80621337890625, + "rewards//std": 0.029656296595931053, + "step": 4190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8382, + "grad_norm": 1.7681204080581665, + "kl": 0.33582404255867004, + "learning_rate": 6.462696144011148e-08, + "loss": 0.0134, + "num_tokens": 30525025.0, + "reward": 0.8277587890625, + "reward_std": 0.012799203395843506, + "rewards//mean": 0.8277587890625, + "rewards//std": 0.024297216907143593, + "step": 4191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8384, + "grad_norm": 1.7224639654159546, + "kl": 0.2939874418079853, + "learning_rate": 6.447100625149965e-08, + "loss": 0.0118, + "num_tokens": 30532305.0, + "reward": 0.80035400390625, + "reward_std": 0.02430972084403038, + "rewards//mean": 0.80035400390625, + "rewards//std": 0.02693876437842846, + "step": 4192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8386, + "grad_norm": 1.7367408275604248, + "kl": 0.3167004734277725, + "learning_rate": 6.431522649391447e-08, + "loss": 0.0127, + "num_tokens": 30539561.0, + "reward": 0.8621826171875, + "reward_std": 0.014010723680257797, + "rewards//mean": 0.8621826171875, + "rewards//std": 0.0170265045017004, + "step": 4193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8388, + "grad_norm": 1.4172163009643555, + "kl": 0.37505969032645226, + "learning_rate": 6.415962223010401e-08, + "loss": 0.015, + "num_tokens": 30546865.0, + "reward": 0.88165283203125, + "reward_std": 0.01907765492796898, + "rewards//mean": 0.88165283203125, + "rewards//std": 0.03065277449786663, + "step": 4194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.839, + "grad_norm": 1.9580590724945068, + "kl": 0.4316723421216011, + "learning_rate": 6.40041935227455e-08, + "loss": 0.0175, + "num_tokens": 30554124.0, + "reward": 0.84173583984375, + "reward_std": 0.018283255398273468, + "rewards//mean": 0.84173583984375, + "rewards//std": 0.025296177715063095, + "step": 4195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8392, + "grad_norm": 1.3618199825286865, + "kl": 0.3283780701458454, + "learning_rate": 6.384894043444566e-08, + "loss": 0.0131, + "num_tokens": 30561412.0, + "reward": 0.86297607421875, + "reward_std": 0.012483914382755756, + "rewards//mean": 0.86297607421875, + "rewards//std": 0.015173676423728466, + "step": 4196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8394, + "grad_norm": 2.0767548084259033, + "kl": 0.49945345520973206, + "learning_rate": 6.36938630277405e-08, + "loss": 0.02, + "num_tokens": 30568740.0, + "reward": 0.8533935546875, + "reward_std": 0.019180841743946075, + "rewards//mean": 0.8533935546875, + "rewards//std": 0.020704690366983414, + "step": 4197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8396, + "grad_norm": 1.5610337257385254, + "kl": 0.356452152132988, + "learning_rate": 6.353896136509524e-08, + "loss": 0.0143, + "num_tokens": 30575964.0, + "reward": 0.85052490234375, + "reward_std": 0.017636314034461975, + "rewards//mean": 0.85052490234375, + "rewards//std": 0.021645737811923027, + "step": 4198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8398, + "grad_norm": 1.7192341089248657, + "kl": 0.31639330089092255, + "learning_rate": 6.338423550890404e-08, + "loss": 0.0127, + "num_tokens": 30583196.0, + "reward": 0.8323974609375, + "reward_std": 0.010326013900339603, + "rewards//mean": 0.8323974609375, + "rewards//std": 0.020335843786597252, + "step": 4199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.84, + "grad_norm": 1.5714980363845825, + "kl": 0.3420712612569332, + "learning_rate": 6.322968552149055e-08, + "loss": 0.0137, + "num_tokens": 30590652.0, + "reward": 0.88201904296875, + "reward_std": 0.015533480793237686, + "rewards//mean": 0.88201904296875, + "rewards//std": 0.024490440264344215, + "step": 4200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8402, + "grad_norm": 1.4752475023269653, + "kl": 0.3044797535985708, + "learning_rate": 6.307531146510753e-08, + "loss": 0.0122, + "num_tokens": 30597932.0, + "reward": 0.85064697265625, + "reward_std": 0.012345178052783012, + "rewards//mean": 0.85064697265625, + "rewards//std": 0.01712965779006481, + "step": 4201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8404, + "grad_norm": 1.459265947341919, + "kl": 0.3690438810735941, + "learning_rate": 6.29211134019369e-08, + "loss": 0.0148, + "num_tokens": 30605140.0, + "reward": 0.8082275390625, + "reward_std": 0.01584613509476185, + "rewards//mean": 0.8082275390625, + "rewards//std": 0.025692589581012726, + "step": 4202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8406, + "grad_norm": 1.9187153577804565, + "kl": 0.3553988803178072, + "learning_rate": 6.276709139408937e-08, + "loss": 0.0142, + "num_tokens": 30612516.0, + "reward": 0.85015869140625, + "reward_std": 0.015810515731573105, + "rewards//mean": 0.85015869140625, + "rewards//std": 0.01810072734951973, + "step": 4203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8408, + "grad_norm": 1.4425702095031738, + "kl": 0.39313754439353943, + "learning_rate": 6.26132455036052e-08, + "loss": 0.0157, + "num_tokens": 30619764.0, + "reward": 0.8441162109375, + "reward_std": 0.01753091998398304, + "rewards//mean": 0.8441162109375, + "rewards//std": 0.023310242220759392, + "step": 4204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.841, + "grad_norm": 1.699601173400879, + "kl": 0.37467612512409687, + "learning_rate": 6.245957579245348e-08, + "loss": 0.015, + "num_tokens": 30627036.0, + "reward": 0.79669189453125, + "reward_std": 0.014877783134579659, + "rewards//mean": 0.79669189453125, + "rewards//std": 0.022717846557497978, + "step": 4205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8412, + "grad_norm": 1.6022018194198608, + "kl": 0.36483835615217686, + "learning_rate": 6.230608232253226e-08, + "loss": 0.0146, + "num_tokens": 30634348.0, + "reward": 0.88665771484375, + "reward_std": 0.016205789521336555, + "rewards//mean": 0.88665771484375, + "rewards//std": 0.03110622428357601, + "step": 4206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.8414, + "grad_norm": 1.5429767370224, + "kl": 0.32073400914669037, + "learning_rate": 6.215276515566869e-08, + "loss": 0.0043, + "num_tokens": 30641583.0, + "reward": 0.874267578125, + "reward_std": 0.021581748500466347, + "rewards//mean": 0.874267578125, + "rewards//std": 0.024684453383088112, + "step": 4207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8416, + "grad_norm": 1.51478111743927, + "kl": 0.39857068471610546, + "learning_rate": 6.1999624353619e-08, + "loss": 0.0159, + "num_tokens": 30648847.0, + "reward": 0.8409423828125, + "reward_std": 0.013487032614648342, + "rewards//mean": 0.8409423828125, + "rewards//std": 0.01622527465224266, + "step": 4208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8418, + "grad_norm": 1.792966604232788, + "kl": 0.35144662111997604, + "learning_rate": 6.184665997806831e-08, + "loss": 0.0141, + "num_tokens": 30656143.0, + "reward": 0.833740234375, + "reward_std": 0.013487039133906364, + "rewards//mean": 0.833740234375, + "rewards//std": 0.016096480190753937, + "step": 4209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.842, + "grad_norm": 1.535818099975586, + "kl": 0.3856894411146641, + "learning_rate": 6.169387209063048e-08, + "loss": 0.0154, + "num_tokens": 30663431.0, + "reward": 0.87542724609375, + "reward_std": 0.016512826085090637, + "rewards//mean": 0.87542724609375, + "rewards//std": 0.023736489936709404, + "step": 4210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8422, + "grad_norm": 1.7069778442382812, + "kl": 0.4124010670930147, + "learning_rate": 6.154126075284855e-08, + "loss": 0.0165, + "num_tokens": 30670719.0, + "reward": 0.88311767578125, + "reward_std": 0.017471684142947197, + "rewards//mean": 0.88311767578125, + "rewards//std": 0.02055601216852665, + "step": 4211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8424, + "grad_norm": 1.775516152381897, + "kl": 0.35660505294799805, + "learning_rate": 6.138882602619439e-08, + "loss": 0.0143, + "num_tokens": 30678039.0, + "reward": 0.840576171875, + "reward_std": 0.013436557725071907, + "rewards//mean": 0.840576171875, + "rewards//std": 0.021040117368102074, + "step": 4212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8426, + "grad_norm": 1.5806456804275513, + "kl": 0.3283389285206795, + "learning_rate": 6.123656797206872e-08, + "loss": 0.0131, + "num_tokens": 30685295.0, + "reward": 0.833251953125, + "reward_std": 0.01695084571838379, + "rewards//mean": 0.833251953125, + "rewards//std": 0.02811565436422825, + "step": 4213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8428, + "grad_norm": 1.3744453191757202, + "kl": 0.3064657524228096, + "learning_rate": 6.108448665180089e-08, + "loss": 0.0123, + "num_tokens": 30692663.0, + "reward": 0.85809326171875, + "reward_std": 0.01313976664096117, + "rewards//mean": 0.85809326171875, + "rewards//std": 0.01872796006500721, + "step": 4214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.843, + "grad_norm": 1.7298420667648315, + "kl": 0.3472846783697605, + "learning_rate": 6.093258212664937e-08, + "loss": 0.0139, + "num_tokens": 30699951.0, + "reward": 0.8519287109375, + "reward_std": 0.019602475687861443, + "rewards//mean": 0.8519287109375, + "rewards//std": 0.031610019505023956, + "step": 4215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8432, + "grad_norm": 1.5256788730621338, + "kl": 0.3150588907301426, + "learning_rate": 6.078085445780129e-08, + "loss": 0.0126, + "num_tokens": 30707167.0, + "reward": 0.86865234375, + "reward_std": 0.015942919999361038, + "rewards//mean": 0.86865234375, + "rewards//std": 0.02320382185280323, + "step": 4216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8434, + "grad_norm": 1.860878825187683, + "kl": 0.3728674277663231, + "learning_rate": 6.06293037063726e-08, + "loss": 0.0149, + "num_tokens": 30714399.0, + "reward": 0.854248046875, + "reward_std": 0.0186779648065567, + "rewards//mean": 0.854248046875, + "rewards//std": 0.026012586429715157, + "step": 4217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8436, + "grad_norm": 1.3818442821502686, + "kl": 0.3635284751653671, + "learning_rate": 6.047792993340766e-08, + "loss": 0.0145, + "num_tokens": 30721679.0, + "reward": 0.86627197265625, + "reward_std": 0.016150740906596184, + "rewards//mean": 0.86627197265625, + "rewards//std": 0.01985400728881359, + "step": 4218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.8438, + "grad_norm": 1.6471796035766602, + "kl": 0.39348940551280975, + "learning_rate": 6.032673319988007e-08, + "loss": 0.0198, + "num_tokens": 30728895.0, + "reward": 0.81304931640625, + "reward_std": 0.018365202471613884, + "rewards//mean": 0.81304931640625, + "rewards//std": 0.02969098463654518, + "step": 4219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.844, + "grad_norm": 1.5512596368789673, + "kl": 0.34560961462557316, + "learning_rate": 6.017571356669182e-08, + "loss": 0.0138, + "num_tokens": 30736255.0, + "reward": 0.8590087890625, + "reward_std": 0.011834013275802135, + "rewards//mean": 0.8590087890625, + "rewards//std": 0.02674568071961403, + "step": 4220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.8442, + "grad_norm": 1.336822271347046, + "kl": 0.33576761931180954, + "learning_rate": 6.002487109467347e-08, + "loss": 0.0079, + "num_tokens": 30743489.0, + "reward": 0.9027099609375, + "reward_std": 0.015463953837752342, + "rewards//mean": 0.9027099609375, + "rewards//std": 0.020513717085123062, + "step": 4221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8444, + "grad_norm": 1.6376539468765259, + "kl": 0.36225810274481773, + "learning_rate": 5.987420584458441e-08, + "loss": 0.0145, + "num_tokens": 30750929.0, + "reward": 0.8240966796875, + "reward_std": 0.014547135680913925, + "rewards//mean": 0.8240966796875, + "rewards//std": 0.018662068992853165, + "step": 4222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8446, + "grad_norm": 4.216272354125977, + "kl": 0.7501542400568724, + "learning_rate": 5.972371787711261e-08, + "loss": 0.03, + "num_tokens": 30758257.0, + "reward": 0.86077880859375, + "reward_std": 0.01387504581362009, + "rewards//mean": 0.86077880859375, + "rewards//std": 0.017612431198358536, + "step": 4223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8448, + "grad_norm": 1.4341508150100708, + "kl": 0.3678742554038763, + "learning_rate": 5.957340725287474e-08, + "loss": 0.0147, + "num_tokens": 30765585.0, + "reward": 0.88128662109375, + "reward_std": 0.013175055384635925, + "rewards//mean": 0.88128662109375, + "rewards//std": 0.019791388884186745, + "step": 4224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.845, + "grad_norm": 1.437766671180725, + "kl": 0.3289389479905367, + "learning_rate": 5.942327403241559e-08, + "loss": 0.0132, + "num_tokens": 30772889.0, + "reward": 0.83941650390625, + "reward_std": 0.019028345122933388, + "rewards//mean": 0.83941650390625, + "rewards//std": 0.029141388833522797, + "step": 4225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8452, + "grad_norm": 1.5393935441970825, + "kl": 0.3198437727987766, + "learning_rate": 5.927331827620902e-08, + "loss": 0.0128, + "num_tokens": 30780169.0, + "reward": 0.81256103515625, + "reward_std": 0.0174168199300766, + "rewards//mean": 0.81256103515625, + "rewards//std": 0.02548220008611679, + "step": 4226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8454, + "grad_norm": 1.5062516927719116, + "kl": 0.35923222079873085, + "learning_rate": 5.9123540044657085e-08, + "loss": 0.0144, + "num_tokens": 30787561.0, + "reward": 0.82257080078125, + "reward_std": 0.014424780383706093, + "rewards//mean": 0.82257080078125, + "rewards//std": 0.02036067098379135, + "step": 4227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8456, + "grad_norm": 1.6760934591293335, + "kl": 0.35801392421126366, + "learning_rate": 5.897393939809064e-08, + "loss": 0.0143, + "num_tokens": 30794833.0, + "reward": 0.85870361328125, + "reward_std": 0.01828211173415184, + "rewards//mean": 0.85870361328125, + "rewards//std": 0.03267873451113701, + "step": 4228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8458, + "grad_norm": 1.4969907999038696, + "kl": 0.377358291298151, + "learning_rate": 5.882451639676855e-08, + "loss": 0.0151, + "num_tokens": 30802089.0, + "reward": 0.8197021484375, + "reward_std": 0.016350986436009407, + "rewards//mean": 0.8197021484375, + "rewards//std": 0.018948636949062347, + "step": 4229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.846, + "grad_norm": 1.4851903915405273, + "kl": 0.371263787150383, + "learning_rate": 5.867527110087855e-08, + "loss": 0.0149, + "num_tokens": 30809473.0, + "reward": 0.810791015625, + "reward_std": 0.016513079404830933, + "rewards//mean": 0.810791015625, + "rewards//std": 0.028475167229771614, + "step": 4230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8462, + "grad_norm": 1.4228731393814087, + "kl": 0.3614963497966528, + "learning_rate": 5.8526203570536504e-08, + "loss": 0.0145, + "num_tokens": 30816697.0, + "reward": 0.8741455078125, + "reward_std": 0.0124689731746912, + "rewards//mean": 0.8741455078125, + "rewards//std": 0.018102087080478668, + "step": 4231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8464, + "grad_norm": 1.497279405593872, + "kl": 0.3391733579337597, + "learning_rate": 5.837731386578698e-08, + "loss": 0.0136, + "num_tokens": 30824049.0, + "reward": 0.83758544921875, + "reward_std": 0.024331092834472656, + "rewards//mean": 0.83758544921875, + "rewards//std": 0.03068535029888153, + "step": 4232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8466, + "grad_norm": 1.7280094623565674, + "kl": 0.3685675449669361, + "learning_rate": 5.822860204660251e-08, + "loss": 0.0147, + "num_tokens": 30831265.0, + "reward": 0.80010986328125, + "reward_std": 0.01622788794338703, + "rewards//mean": 0.80010986328125, + "rewards//std": 0.021185576915740967, + "step": 4233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8468, + "grad_norm": 1.3955436944961548, + "kl": 0.31931326910853386, + "learning_rate": 5.808006817288436e-08, + "loss": 0.0128, + "num_tokens": 30838569.0, + "reward": 0.895751953125, + "reward_std": 0.016062894836068153, + "rewards//mean": 0.895751953125, + "rewards//std": 0.02555227465927601, + "step": 4234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.847, + "grad_norm": 1.7473810911178589, + "kl": 0.334266722202301, + "learning_rate": 5.7931712304461976e-08, + "loss": 0.0134, + "num_tokens": 30845905.0, + "reward": 0.7586669921875, + "reward_std": 0.013537571765482426, + "rewards//mean": 0.7586669921875, + "rewards//std": 0.028198180720210075, + "step": 4235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.8472, + "grad_norm": 1.535911202430725, + "kl": 0.4721258617937565, + "learning_rate": 5.778353450109286e-08, + "loss": 0.0193, + "num_tokens": 30853307.0, + "reward": 0.83001708984375, + "reward_std": 0.01271449401974678, + "rewards//mean": 0.83001708984375, + "rewards//std": 0.025353560224175453, + "step": 4236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8474, + "grad_norm": 1.739741563796997, + "kl": 0.4936726465821266, + "learning_rate": 5.763553482246319e-08, + "loss": 0.0197, + "num_tokens": 30860515.0, + "reward": 0.8758544921875, + "reward_std": 0.01915540173649788, + "rewards//mean": 0.8758544921875, + "rewards//std": 0.023266039788722992, + "step": 4237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.8476, + "grad_norm": 1.599661111831665, + "kl": 0.3256378620862961, + "learning_rate": 5.7487713328187246e-08, + "loss": -0.015, + "num_tokens": 30867882.0, + "reward": 0.84600830078125, + "reward_std": 0.01870877668261528, + "rewards//mean": 0.84600830078125, + "rewards//std": 0.022529156878590584, + "step": 4238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8478, + "grad_norm": 4.199921131134033, + "kl": 0.5584206134080887, + "learning_rate": 5.73400700778075e-08, + "loss": 0.0223, + "num_tokens": 30875218.0, + "reward": 0.84368896484375, + "reward_std": 0.017905058339238167, + "rewards//mean": 0.84368896484375, + "rewards//std": 0.02752026543021202, + "step": 4239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.848, + "grad_norm": 1.5143473148345947, + "kl": 0.2909140884876251, + "learning_rate": 5.719260513079449e-08, + "loss": 0.0116, + "num_tokens": 30882482.0, + "reward": 0.865234375, + "reward_std": 0.013429409824311733, + "rewards//mean": 0.865234375, + "rewards//std": 0.018584443256258965, + "step": 4240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8482, + "grad_norm": 1.4943419694900513, + "kl": 0.3747611753642559, + "learning_rate": 5.70453185465472e-08, + "loss": 0.015, + "num_tokens": 30889890.0, + "reward": 0.7982177734375, + "reward_std": 0.011206010356545448, + "rewards//mean": 0.7982177734375, + "rewards//std": 0.01750001683831215, + "step": 4241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8484, + "grad_norm": 1.4851951599121094, + "kl": 0.335858603939414, + "learning_rate": 5.689821038439263e-08, + "loss": 0.0134, + "num_tokens": 30897042.0, + "reward": 0.75018310546875, + "reward_std": 0.01846522092819214, + "rewards//mean": 0.75018310546875, + "rewards//std": 0.02859869785606861, + "step": 4242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8486, + "grad_norm": 1.4918593168258667, + "kl": 0.3503509797155857, + "learning_rate": 5.675128070358598e-08, + "loss": 0.014, + "num_tokens": 30904378.0, + "reward": 0.8072509765625, + "reward_std": 0.014288939535617828, + "rewards//mean": 0.8072509765625, + "rewards//std": 0.022447455674409866, + "step": 4243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8488, + "grad_norm": 1.6127610206604004, + "kl": 0.3381691053509712, + "learning_rate": 5.660452956331041e-08, + "loss": 0.0135, + "num_tokens": 30911642.0, + "reward": 0.80389404296875, + "reward_std": 0.01227780431509018, + "rewards//mean": 0.80389404296875, + "rewards//std": 0.017121702432632446, + "step": 4244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.849, + "grad_norm": 1.397808313369751, + "kl": 0.3589520640671253, + "learning_rate": 5.6457957022677307e-08, + "loss": 0.0144, + "num_tokens": 30918842.0, + "reward": 0.852783203125, + "reward_std": 0.015099626034498215, + "rewards//mean": 0.852783203125, + "rewards//std": 0.017003819346427917, + "step": 4245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8492, + "grad_norm": 1.53096342086792, + "kl": 0.3228977546095848, + "learning_rate": 5.6311563140726045e-08, + "loss": 0.0129, + "num_tokens": 30926106.0, + "reward": 0.82977294921875, + "reward_std": 0.01380319893360138, + "rewards//mean": 0.82977294921875, + "rewards//std": 0.02271118201315403, + "step": 4246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8494, + "grad_norm": 1.476199746131897, + "kl": 0.340336661785841, + "learning_rate": 5.616534797642419e-08, + "loss": 0.0136, + "num_tokens": 30933378.0, + "reward": 0.83099365234375, + "reward_std": 0.015326269902288914, + "rewards//mean": 0.83099365234375, + "rewards//std": 0.018325138837099075, + "step": 4247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8496, + "grad_norm": 1.9347679615020752, + "kl": 0.33257778361439705, + "learning_rate": 5.601931158866702e-08, + "loss": 0.0133, + "num_tokens": 30940650.0, + "reward": 0.85772705078125, + "reward_std": 0.015171999111771584, + "rewards//mean": 0.85772705078125, + "rewards//std": 0.022753801196813583, + "step": 4248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8498, + "grad_norm": 1.8129996061325073, + "kl": 0.4067627191543579, + "learning_rate": 5.5873454036278023e-08, + "loss": 0.0163, + "num_tokens": 30947834.0, + "reward": 0.809814453125, + "reward_std": 0.015569254755973816, + "rewards//mean": 0.809814453125, + "rewards//std": 0.02233557030558586, + "step": 4249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.85, + "grad_norm": 1.5309576988220215, + "kl": 0.3840240240097046, + "learning_rate": 5.5727775378008714e-08, + "loss": 0.0154, + "num_tokens": 30955146.0, + "reward": 0.85302734375, + "reward_std": 0.020503655076026917, + "rewards//mean": 0.85302734375, + "rewards//std": 0.02822420373558998, + "step": 4250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8502, + "grad_norm": 1.4817177057266235, + "kl": 0.3282468542456627, + "learning_rate": 5.5582275672538316e-08, + "loss": 0.0131, + "num_tokens": 30962410.0, + "reward": 0.826904296875, + "reward_std": 0.017185721546411514, + "rewards//mean": 0.826904296875, + "rewards//std": 0.02617964707314968, + "step": 4251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8504, + "grad_norm": 1.4195289611816406, + "kl": 0.2941933088004589, + "learning_rate": 5.543695497847406e-08, + "loss": 0.0118, + "num_tokens": 30969690.0, + "reward": 0.80621337890625, + "reward_std": 0.009995119646191597, + "rewards//mean": 0.80621337890625, + "rewards//std": 0.025473881512880325, + "step": 4252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8506, + "grad_norm": 1.6700268983840942, + "kl": 0.384139958769083, + "learning_rate": 5.529181335435124e-08, + "loss": 0.0154, + "num_tokens": 30976890.0, + "reward": 0.83660888671875, + "reward_std": 0.012913815677165985, + "rewards//mean": 0.83660888671875, + "rewards//std": 0.0216715969145298, + "step": 4253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8508, + "grad_norm": 1.6265356540679932, + "kl": 0.34718688018620014, + "learning_rate": 5.5146850858632854e-08, + "loss": 0.0139, + "num_tokens": 30984194.0, + "reward": 0.8404541015625, + "reward_std": 0.018445227295160294, + "rewards//mean": 0.8404541015625, + "rewards//std": 0.022339297458529472, + "step": 4254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.851, + "grad_norm": 1.541718602180481, + "kl": 0.2964393924921751, + "learning_rate": 5.500206754970965e-08, + "loss": 0.0119, + "num_tokens": 30991450.0, + "reward": 0.80084228515625, + "reward_std": 0.012238239869475365, + "rewards//mean": 0.80084228515625, + "rewards//std": 0.01939426362514496, + "step": 4255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.8512, + "grad_norm": 1.5295214653015137, + "kl": 0.32345325872302055, + "learning_rate": 5.485746348590048e-08, + "loss": 0.0099, + "num_tokens": 30998674.0, + "reward": 0.799072265625, + "reward_std": 0.013293981552124023, + "rewards//mean": 0.799072265625, + "rewards//std": 0.0177426990121603, + "step": 4256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.8514, + "grad_norm": 1.6038769483566284, + "kl": 0.376309797167778, + "learning_rate": 5.4713038725451744e-08, + "loss": 0.0113, + "num_tokens": 31005892.0, + "reward": 0.84716796875, + "reward_std": 0.016014551743865013, + "rewards//mean": 0.84716796875, + "rewards//std": 0.0267799012362957, + "step": 4257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.8516, + "grad_norm": 1.5502102375030518, + "kl": 0.3773011229932308, + "learning_rate": 5.456879332653785e-08, + "loss": 0.0157, + "num_tokens": 31013123.0, + "reward": 0.8302001953125, + "reward_std": 0.01692294143140316, + "rewards//mean": 0.8302001953125, + "rewards//std": 0.028185293078422546, + "step": 4258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8518, + "grad_norm": 1.6494120359420776, + "kl": 0.38958122953772545, + "learning_rate": 5.4424727347260614e-08, + "loss": 0.0156, + "num_tokens": 31020371.0, + "reward": 0.84429931640625, + "reward_std": 0.01452619582414627, + "rewards//mean": 0.84429931640625, + "rewards//std": 0.023228516802191734, + "step": 4259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.852, + "grad_norm": 1.6155531406402588, + "kl": 0.3344910889863968, + "learning_rate": 5.428084084564999e-08, + "loss": 0.0168, + "num_tokens": 31027617.0, + "reward": 0.87432861328125, + "reward_std": 0.02024097368121147, + "rewards//mean": 0.87432861328125, + "rewards//std": 0.028995053842663765, + "step": 4260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8522, + "grad_norm": 2.0520122051239014, + "kl": 0.42967952229082584, + "learning_rate": 5.4137133879663287e-08, + "loss": 0.0172, + "num_tokens": 31035009.0, + "reward": 0.79656982421875, + "reward_std": 0.01785283163189888, + "rewards//mean": 0.79656982421875, + "rewards//std": 0.021791407838463783, + "step": 4261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.8524, + "grad_norm": 1.7794435024261475, + "kl": 0.4123074747622013, + "learning_rate": 5.399360650718593e-08, + "loss": 0.0165, + "num_tokens": 31042197.0, + "reward": 0.86456298828125, + "reward_std": 0.01679547131061554, + "rewards//mean": 0.86456298828125, + "rewards//std": 0.02082234062254429, + "step": 4262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8526, + "grad_norm": 1.800512433052063, + "kl": 0.347666934132576, + "learning_rate": 5.385025878603039e-08, + "loss": 0.0139, + "num_tokens": 31049517.0, + "reward": 0.8739013671875, + "reward_std": 0.018539678305387497, + "rewards//mean": 0.8739013671875, + "rewards//std": 0.022200630977749825, + "step": 4263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8528, + "grad_norm": 1.5363327264785767, + "kl": 0.31126645766198635, + "learning_rate": 5.37070907739372e-08, + "loss": 0.0125, + "num_tokens": 31056781.0, + "reward": 0.84979248046875, + "reward_std": 0.015417426824569702, + "rewards//mean": 0.84979248046875, + "rewards//std": 0.019990021362900734, + "step": 4264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.853, + "grad_norm": 1.7754722833633423, + "kl": 0.4415670819580555, + "learning_rate": 5.3564102528574574e-08, + "loss": 0.0177, + "num_tokens": 31064029.0, + "reward": 0.85211181640625, + "reward_std": 0.015117152594029903, + "rewards//mean": 0.85211181640625, + "rewards//std": 0.019708609208464622, + "step": 4265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8532, + "grad_norm": 1.4821736812591553, + "kl": 0.3456269558519125, + "learning_rate": 5.342129410753809e-08, + "loss": 0.0138, + "num_tokens": 31071325.0, + "reward": 0.86077880859375, + "reward_std": 0.01723666489124298, + "rewards//mean": 0.86077880859375, + "rewards//std": 0.025357142090797424, + "step": 4266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8534, + "grad_norm": 1.6352554559707642, + "kl": 0.3849274292588234, + "learning_rate": 5.327866556835087e-08, + "loss": 0.0154, + "num_tokens": 31078549.0, + "reward": 0.864013671875, + "reward_std": 0.021458754315972328, + "rewards//mean": 0.864013671875, + "rewards//std": 0.025984639301896095, + "step": 4267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8536, + "grad_norm": 1.3669281005859375, + "kl": 0.3350668791681528, + "learning_rate": 5.313621696846371e-08, + "loss": 0.0134, + "num_tokens": 31085789.0, + "reward": 0.87921142578125, + "reward_std": 0.01732531748712063, + "rewards//mean": 0.87921142578125, + "rewards//std": 0.028614573180675507, + "step": 4268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8538, + "grad_norm": 1.6182578802108765, + "kl": 0.31415930762887, + "learning_rate": 5.299394836525506e-08, + "loss": 0.0126, + "num_tokens": 31093037.0, + "reward": 0.870361328125, + "reward_std": 0.02380114048719406, + "rewards//mean": 0.870361328125, + "rewards//std": 0.04419349133968353, + "step": 4269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.854, + "grad_norm": 1.6550828218460083, + "kl": 0.3287584502249956, + "learning_rate": 5.285185981603041e-08, + "loss": 0.0132, + "num_tokens": 31100477.0, + "reward": 0.8486328125, + "reward_std": 0.01386962365359068, + "rewards//mean": 0.8486328125, + "rewards//std": 0.02079842798411846, + "step": 4270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.8542, + "grad_norm": 1.5166598558425903, + "kl": 0.3294871523976326, + "learning_rate": 5.270995137802314e-08, + "loss": -0.0211, + "num_tokens": 31107795.0, + "reward": 0.84698486328125, + "reward_std": 0.01574988290667534, + "rewards//mean": 0.84698486328125, + "rewards//std": 0.020286187529563904, + "step": 4271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8544, + "grad_norm": 1.9090089797973633, + "kl": 0.37578022852540016, + "learning_rate": 5.256822310839404e-08, + "loss": 0.015, + "num_tokens": 31115067.0, + "reward": 0.87567138671875, + "reward_std": 0.015199854969978333, + "rewards//mean": 0.87567138671875, + "rewards//std": 0.019741609692573547, + "step": 4272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8546, + "grad_norm": 1.6779329776763916, + "kl": 0.4351811520755291, + "learning_rate": 5.2426675064231206e-08, + "loss": 0.0174, + "num_tokens": 31122323.0, + "reward": 0.82220458984375, + "reward_std": 0.015192839317023754, + "rewards//mean": 0.82220458984375, + "rewards//std": 0.015359099954366684, + "step": 4273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.8548, + "grad_norm": 1.6820476055145264, + "kl": 0.37596421875059605, + "learning_rate": 5.228530730255004e-08, + "loss": 0.0227, + "num_tokens": 31129527.0, + "reward": 0.8331298828125, + "reward_std": 0.01792309805750847, + "rewards//mean": 0.8331298828125, + "rewards//std": 0.021804343909025192, + "step": 4274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.855, + "grad_norm": 1.5738712549209595, + "kl": 0.35564643144607544, + "learning_rate": 5.2144119880293544e-08, + "loss": 0.0142, + "num_tokens": 31136847.0, + "reward": 0.8409423828125, + "reward_std": 0.011381283402442932, + "rewards//mean": 0.8409423828125, + "rewards//std": 0.0190918929874897, + "step": 4275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8552, + "grad_norm": 1.3998780250549316, + "kl": 0.42089030519127846, + "learning_rate": 5.200311285433212e-08, + "loss": 0.0168, + "num_tokens": 31144143.0, + "reward": 0.866455078125, + "reward_std": 0.011947710067033768, + "rewards//mean": 0.866455078125, + "rewards//std": 0.01708906888961792, + "step": 4276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8554, + "grad_norm": 1.6143746376037598, + "kl": 0.35695548355579376, + "learning_rate": 5.186228628146316e-08, + "loss": 0.0143, + "num_tokens": 31151439.0, + "reward": 0.854736328125, + "reward_std": 0.01709626615047455, + "rewards//mean": 0.854736328125, + "rewards//std": 0.022183235734701157, + "step": 4277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8556, + "grad_norm": 1.425134539604187, + "kl": 0.32531359791755676, + "learning_rate": 5.172164021841174e-08, + "loss": 0.013, + "num_tokens": 31158687.0, + "reward": 0.78997802734375, + "reward_std": 0.009341209195554256, + "rewards//mean": 0.78997802734375, + "rewards//std": 0.0175210889428854, + "step": 4278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.8558, + "grad_norm": 1.9598307609558105, + "kl": 0.38326945155858994, + "learning_rate": 5.158117472182999e-08, + "loss": 0.0156, + "num_tokens": 31166021.0, + "reward": 0.81512451171875, + "reward_std": 0.020300373435020447, + "rewards//mean": 0.81512451171875, + "rewards//std": 0.02549704536795616, + "step": 4279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.856, + "grad_norm": 1.5565712451934814, + "kl": 0.33779706060886383, + "learning_rate": 5.144088984829742e-08, + "loss": 0.0135, + "num_tokens": 31173357.0, + "reward": 0.8585205078125, + "reward_std": 0.012846752069890499, + "rewards//mean": 0.8585205078125, + "rewards//std": 0.0190028827637434, + "step": 4280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.8562, + "grad_norm": 1.4262027740478516, + "kl": 0.3161669336259365, + "learning_rate": 5.1300785654320886e-08, + "loss": 0.0182, + "num_tokens": 31180676.0, + "reward": 0.85546875, + "reward_std": 0.01609819382429123, + "rewards//mean": 0.85546875, + "rewards//std": 0.02399914152920246, + "step": 4281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.8564, + "grad_norm": 1.8352020978927612, + "kl": 0.37403643503785133, + "learning_rate": 5.1160862196334144e-08, + "loss": 0.0088, + "num_tokens": 31187876.0, + "reward": 0.86285400390625, + "reward_std": 0.01637028343975544, + "rewards//mean": 0.86285400390625, + "rewards//std": 0.02267182245850563, + "step": 4282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8566, + "grad_norm": 1.3691680431365967, + "kl": 0.30535553582012653, + "learning_rate": 5.1021119530698434e-08, + "loss": 0.0122, + "num_tokens": 31195116.0, + "reward": 0.83551025390625, + "reward_std": 0.01590568572282791, + "rewards//mean": 0.83551025390625, + "rewards//std": 0.019821958616375923, + "step": 4283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.8568, + "grad_norm": 1.7967513799667358, + "kl": 0.39002611860632896, + "learning_rate": 5.088155771370206e-08, + "loss": 0.0112, + "num_tokens": 31202422.0, + "reward": 0.837158203125, + "reward_std": 0.012829482555389404, + "rewards//mean": 0.837158203125, + "rewards//std": 0.018200911581516266, + "step": 4284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.857, + "grad_norm": 1.5569275617599487, + "kl": 0.36706992611289024, + "learning_rate": 5.074217680156062e-08, + "loss": 0.0097, + "num_tokens": 31209679.0, + "reward": 0.828369140625, + "reward_std": 0.011700037866830826, + "rewards//mean": 0.828369140625, + "rewards//std": 0.014597195200622082, + "step": 4285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8572, + "grad_norm": 1.649536371231079, + "kl": 0.3309887982904911, + "learning_rate": 5.060297685041659e-08, + "loss": 0.0132, + "num_tokens": 31216943.0, + "reward": 0.80096435546875, + "reward_std": 0.013680942356586456, + "rewards//mean": 0.80096435546875, + "rewards//std": 0.023182202130556107, + "step": 4286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8574, + "grad_norm": 1.5085866451263428, + "kl": 0.2959523983299732, + "learning_rate": 5.0463957916339675e-08, + "loss": 0.0118, + "num_tokens": 31224247.0, + "reward": 0.831787109375, + "reward_std": 0.01846975088119507, + "rewards//mean": 0.831787109375, + "rewards//std": 0.02747960016131401, + "step": 4287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.8576, + "grad_norm": 1.5198442935943604, + "kl": 0.3550422526896, + "learning_rate": 5.0325120055326797e-08, + "loss": 0.0219, + "num_tokens": 31231499.0, + "reward": 0.8349609375, + "reward_std": 0.015116861090064049, + "rewards//mean": 0.8349609375, + "rewards//std": 0.018242448568344116, + "step": 4288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8578, + "grad_norm": 1.7027066946029663, + "kl": 0.39274836145341396, + "learning_rate": 5.01864633233019e-08, + "loss": 0.0157, + "num_tokens": 31238811.0, + "reward": 0.87457275390625, + "reward_std": 0.02066972106695175, + "rewards//mean": 0.87457275390625, + "rewards//std": 0.030418287962675095, + "step": 4289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.858, + "grad_norm": 1.6548658609390259, + "kl": 0.3616466484963894, + "learning_rate": 5.004798777611563e-08, + "loss": 0.0145, + "num_tokens": 31246051.0, + "reward": 0.88287353515625, + "reward_std": 0.017114635556936264, + "rewards//mean": 0.88287353515625, + "rewards//std": 0.02181570790708065, + "step": 4290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8582, + "grad_norm": 1.538705587387085, + "kl": 0.3266626540571451, + "learning_rate": 4.9909693469546097e-08, + "loss": 0.0131, + "num_tokens": 31253307.0, + "reward": 0.8404541015625, + "reward_std": 0.012178991921246052, + "rewards//mean": 0.8404541015625, + "rewards//std": 0.016370171681046486, + "step": 4291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.8584, + "grad_norm": 1.6120730638504028, + "kl": 0.34754152223467827, + "learning_rate": 4.9771580459298245e-08, + "loss": 0.0143, + "num_tokens": 31260630.0, + "reward": 0.8702392578125, + "reward_std": 0.01990106701850891, + "rewards//mean": 0.8702392578125, + "rewards//std": 0.025002216920256615, + "step": 4292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8586, + "grad_norm": 1.4386358261108398, + "kl": 0.3566423486918211, + "learning_rate": 4.963364880100401e-08, + "loss": 0.0143, + "num_tokens": 31268014.0, + "reward": 0.8529052734375, + "reward_std": 0.01603141613304615, + "rewards//mean": 0.8529052734375, + "rewards//std": 0.024510597810149193, + "step": 4293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8588, + "grad_norm": 1.3883957862854004, + "kl": 0.303422536700964, + "learning_rate": 4.949589855022207e-08, + "loss": 0.0121, + "num_tokens": 31275262.0, + "reward": 0.87640380859375, + "reward_std": 0.019576329737901688, + "rewards//mean": 0.87640380859375, + "rewards//std": 0.032446760684251785, + "step": 4294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.515625, + "epoch": 0.859, + "grad_norm": 1.790102243423462, + "kl": 0.3373217452317476, + "learning_rate": 4.935832976243831e-08, + "loss": 0.0062, + "num_tokens": 31282535.0, + "reward": 0.7481689453125, + "reward_std": 0.01309402845799923, + "rewards//mean": 0.7481689453125, + "rewards//std": 0.016688868403434753, + "step": 4295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8592, + "grad_norm": 1.856990933418274, + "kl": 0.3649642113596201, + "learning_rate": 4.922094249306558e-08, + "loss": 0.0146, + "num_tokens": 31289815.0, + "reward": 0.8516845703125, + "reward_std": 0.019844459369778633, + "rewards//mean": 0.8516845703125, + "rewards//std": 0.025151921436190605, + "step": 4296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8594, + "grad_norm": 1.6882449388504028, + "kl": 0.40094659850001335, + "learning_rate": 4.908373679744315e-08, + "loss": 0.016, + "num_tokens": 31297127.0, + "reward": 0.84930419921875, + "reward_std": 0.012871384620666504, + "rewards//mean": 0.84930419921875, + "rewards//std": 0.020467452704906464, + "step": 4297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8596, + "grad_norm": 1.9884932041168213, + "kl": 0.31045131012797356, + "learning_rate": 4.894671273083767e-08, + "loss": 0.0124, + "num_tokens": 31304423.0, + "reward": 0.77398681640625, + "reward_std": 0.01156381331384182, + "rewards//mean": 0.77398681640625, + "rewards//std": 0.022226782515645027, + "step": 4298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.8598, + "grad_norm": 1.3008369207382202, + "kl": 0.35963777638971806, + "learning_rate": 4.8809870348442306e-08, + "loss": 0.0134, + "num_tokens": 31311725.0, + "reward": 0.8892822265625, + "reward_std": 0.013062011450529099, + "rewards//mean": 0.8892822265625, + "rewards//std": 0.01918995939195156, + "step": 4299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.86, + "grad_norm": 1.4316363334655762, + "kl": 0.35475622676312923, + "learning_rate": 4.867320970537736e-08, + "loss": 0.0142, + "num_tokens": 31318965.0, + "reward": 0.8134765625, + "reward_std": 0.013587046414613724, + "rewards//mean": 0.8134765625, + "rewards//std": 0.02287265844643116, + "step": 4300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8602, + "grad_norm": 1.7026633024215698, + "kl": 0.4280840363353491, + "learning_rate": 4.853673085668947e-08, + "loss": 0.0171, + "num_tokens": 31326229.0, + "reward": 0.80169677734375, + "reward_std": 0.015974031761288643, + "rewards//mean": 0.80169677734375, + "rewards//std": 0.019719360396265984, + "step": 4301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8604, + "grad_norm": 1.8313426971435547, + "kl": 0.37967269495129585, + "learning_rate": 4.8400433857352375e-08, + "loss": 0.0152, + "num_tokens": 31333549.0, + "reward": 0.82427978515625, + "reward_std": 0.014316444285213947, + "rewards//mean": 0.82427978515625, + "rewards//std": 0.023638075217604637, + "step": 4302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8606, + "grad_norm": 1.52655029296875, + "kl": 0.3305170461535454, + "learning_rate": 4.82643187622665e-08, + "loss": 0.0132, + "num_tokens": 31340901.0, + "reward": 0.87481689453125, + "reward_std": 0.025627393275499344, + "rewards//mean": 0.87481689453125, + "rewards//std": 0.0317683182656765, + "step": 4303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8608, + "grad_norm": 1.5007883310317993, + "kl": 0.4010661393404007, + "learning_rate": 4.812838562625915e-08, + "loss": 0.016, + "num_tokens": 31348173.0, + "reward": 0.84588623046875, + "reward_std": 0.01727578416466713, + "rewards//mean": 0.84588623046875, + "rewards//std": 0.024398788809776306, + "step": 4304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.861, + "grad_norm": 1.587082028388977, + "kl": 0.41398731246590614, + "learning_rate": 4.799263450408386e-08, + "loss": 0.0166, + "num_tokens": 31355493.0, + "reward": 0.86553955078125, + "reward_std": 0.016258377581834793, + "rewards//mean": 0.86553955078125, + "rewards//std": 0.019058067351579666, + "step": 4305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.8612, + "grad_norm": 1.5323902368545532, + "kl": 0.38004960119724274, + "learning_rate": 4.78570654504214e-08, + "loss": 0.0071, + "num_tokens": 31362684.0, + "reward": 0.8096923828125, + "reward_std": 0.01308352779597044, + "rewards//mean": 0.8096923828125, + "rewards//std": 0.023067397996783257, + "step": 4306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8614, + "grad_norm": 1.6398415565490723, + "kl": 0.3729514256119728, + "learning_rate": 4.7721678519878904e-08, + "loss": 0.0149, + "num_tokens": 31369996.0, + "reward": 0.8349609375, + "reward_std": 0.015554562211036682, + "rewards//mean": 0.8349609375, + "rewards//std": 0.020434236153960228, + "step": 4307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.8616, + "grad_norm": 2.313824415206909, + "kl": 0.35661060363054276, + "learning_rate": 4.758647376699032e-08, + "loss": 0.0131, + "num_tokens": 31377310.0, + "reward": 0.86572265625, + "reward_std": 0.016161344945430756, + "rewards//mean": 0.86572265625, + "rewards//std": 0.02501220442354679, + "step": 4308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8618, + "grad_norm": 1.5274909734725952, + "kl": 0.3894195146858692, + "learning_rate": 4.7451451246215855e-08, + "loss": 0.0156, + "num_tokens": 31384494.0, + "reward": 0.82080078125, + "reward_std": 0.01541917584836483, + "rewards//mean": 0.82080078125, + "rewards//std": 0.021264830604195595, + "step": 4309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.862, + "grad_norm": 1.6492449045181274, + "kl": 0.31915193796157837, + "learning_rate": 4.731661101194273e-08, + "loss": 0.0128, + "num_tokens": 31391734.0, + "reward": 0.8641357421875, + "reward_std": 0.015817657113075256, + "rewards//mean": 0.8641357421875, + "rewards//std": 0.02441406436264515, + "step": 4310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8622, + "grad_norm": 1.4329378604888916, + "kl": 0.3558424413204193, + "learning_rate": 4.718195311848455e-08, + "loss": 0.0142, + "num_tokens": 31399174.0, + "reward": 0.8536376953125, + "reward_std": 0.018128424882888794, + "rewards//mean": 0.8536376953125, + "rewards//std": 0.024329591542482376, + "step": 4311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8624, + "grad_norm": 1.3776302337646484, + "kl": 0.34046757593750954, + "learning_rate": 4.70474776200816e-08, + "loss": 0.0136, + "num_tokens": 31406486.0, + "reward": 0.81475830078125, + "reward_std": 0.017596881836652756, + "rewards//mean": 0.81475830078125, + "rewards//std": 0.026881949976086617, + "step": 4312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.8626, + "grad_norm": 1.4672794342041016, + "kl": 0.3665420822799206, + "learning_rate": 4.6913184570900435e-08, + "loss": 0.0075, + "num_tokens": 31413672.0, + "reward": 0.86273193359375, + "reward_std": 0.02056157775223255, + "rewards//mean": 0.86273193359375, + "rewards//std": 0.03365229442715645, + "step": 4313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8628, + "grad_norm": 2.1598100662231445, + "kl": 0.39104755595326424, + "learning_rate": 4.677907402503428e-08, + "loss": 0.0156, + "num_tokens": 31420952.0, + "reward": 0.85089111328125, + "reward_std": 0.01275339350104332, + "rewards//mean": 0.85089111328125, + "rewards//std": 0.03084869682788849, + "step": 4314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.863, + "grad_norm": 1.6364399194717407, + "kl": 0.34472657553851604, + "learning_rate": 4.664514603650305e-08, + "loss": 0.0138, + "num_tokens": 31428248.0, + "reward": 0.8663330078125, + "reward_std": 0.016403352841734886, + "rewards//mean": 0.8663330078125, + "rewards//std": 0.019801806658506393, + "step": 4315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5625, + "epoch": 0.8632, + "grad_norm": 1.8325942754745483, + "kl": 0.32486645318567753, + "learning_rate": 4.6511400659252685e-08, + "loss": -0.0158, + "num_tokens": 31435628.0, + "reward": 0.8504638671875, + "reward_std": 0.016022779047489166, + "rewards//mean": 0.8504638671875, + "rewards//std": 0.02371966280043125, + "step": 4316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8634, + "grad_norm": 1.447759747505188, + "kl": 0.3700493052601814, + "learning_rate": 4.6377837947155886e-08, + "loss": 0.0148, + "num_tokens": 31442940.0, + "reward": 0.8316650390625, + "reward_std": 0.019863657653331757, + "rewards//mean": 0.8316650390625, + "rewards//std": 0.02603905275464058, + "step": 4317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8636, + "grad_norm": 1.7643446922302246, + "kl": 0.39722635969519615, + "learning_rate": 4.624445795401172e-08, + "loss": 0.0159, + "num_tokens": 31450212.0, + "reward": 0.85552978515625, + "reward_std": 0.021936144679784775, + "rewards//mean": 0.85552978515625, + "rewards//std": 0.026752684265375137, + "step": 4318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8638, + "grad_norm": 1.6874953508377075, + "kl": 0.33584577962756157, + "learning_rate": 4.611126073354571e-08, + "loss": 0.0134, + "num_tokens": 31457476.0, + "reward": 0.84814453125, + "reward_std": 0.013482915237545967, + "rewards//mean": 0.84814453125, + "rewards//std": 0.02808656543493271, + "step": 4319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.864, + "grad_norm": 1.4465926885604858, + "kl": 0.35477934032678604, + "learning_rate": 4.597824633940955e-08, + "loss": 0.0142, + "num_tokens": 31464788.0, + "reward": 0.83966064453125, + "reward_std": 0.015494177117943764, + "rewards//mean": 0.83966064453125, + "rewards//std": 0.019959706813097, + "step": 4320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8642, + "grad_norm": 1.9176064729690552, + "kl": 0.31718420051038265, + "learning_rate": 4.5845414825181394e-08, + "loss": 0.0127, + "num_tokens": 31472108.0, + "reward": 0.86907958984375, + "reward_std": 0.0152147077023983, + "rewards//mean": 0.86907958984375, + "rewards//std": 0.01945038139820099, + "step": 4321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8644, + "grad_norm": 1.4377477169036865, + "kl": 0.34695965237915516, + "learning_rate": 4.5712766244365874e-08, + "loss": 0.0139, + "num_tokens": 31479428.0, + "reward": 0.8370361328125, + "reward_std": 0.012695148587226868, + "rewards//mean": 0.8370361328125, + "rewards//std": 0.024423982948064804, + "step": 4322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.8646, + "grad_norm": 1.9421366453170776, + "kl": 0.33142516389489174, + "learning_rate": 4.558030065039386e-08, + "loss": 0.0144, + "num_tokens": 31486712.0, + "reward": 0.82611083984375, + "reward_std": 0.019100558012723923, + "rewards//mean": 0.82611083984375, + "rewards//std": 0.022691844031214714, + "step": 4323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.8648, + "grad_norm": 1.5254710912704468, + "kl": 0.3253202252089977, + "learning_rate": 4.5448018096622355e-08, + "loss": 0.0072, + "num_tokens": 31493917.0, + "reward": 0.83721923828125, + "reward_std": 0.011628461070358753, + "rewards//mean": 0.83721923828125, + "rewards//std": 0.01855907216668129, + "step": 4324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.865, + "grad_norm": 1.6423691511154175, + "kl": 0.452313918620348, + "learning_rate": 4.531591863633477e-08, + "loss": 0.0181, + "num_tokens": 31501125.0, + "reward": 0.83978271484375, + "reward_std": 0.013749266043305397, + "rewards//mean": 0.83978271484375, + "rewards//std": 0.016240544617176056, + "step": 4325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8652, + "grad_norm": 1.5513684749603271, + "kl": 0.31380167603492737, + "learning_rate": 4.518400232274078e-08, + "loss": 0.0126, + "num_tokens": 31508485.0, + "reward": 0.86236572265625, + "reward_std": 0.01641712337732315, + "rewards//mean": 0.86236572265625, + "rewards//std": 0.024594679474830627, + "step": 4326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8654, + "grad_norm": 1.650884747505188, + "kl": 0.38525577262043953, + "learning_rate": 4.505226920897637e-08, + "loss": 0.0154, + "num_tokens": 31515685.0, + "reward": 0.795654296875, + "reward_std": 0.01724625751376152, + "rewards//mean": 0.795654296875, + "rewards//std": 0.02534288540482521, + "step": 4327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8656, + "grad_norm": 1.5133346319198608, + "kl": 0.35317208990454674, + "learning_rate": 4.492071934810343e-08, + "loss": 0.0141, + "num_tokens": 31523053.0, + "reward": 0.84783935546875, + "reward_std": 0.013614693656563759, + "rewards//mean": 0.84783935546875, + "rewards//std": 0.019430914893746376, + "step": 4328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8658, + "grad_norm": 1.4757647514343262, + "kl": 0.3538968823850155, + "learning_rate": 4.4789352793110305e-08, + "loss": 0.0142, + "num_tokens": 31530245.0, + "reward": 0.81610107421875, + "reward_std": 0.011585007421672344, + "rewards//mean": 0.81610107421875, + "rewards//std": 0.012914460152387619, + "step": 4329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.866, + "grad_norm": 1.6317075490951538, + "kl": 0.386222954839468, + "learning_rate": 4.465816959691149e-08, + "loss": 0.0154, + "num_tokens": 31537493.0, + "reward": 0.79644775390625, + "reward_std": 0.013004375621676445, + "rewards//mean": 0.79644775390625, + "rewards//std": 0.01908743381500244, + "step": 4330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8662, + "grad_norm": 1.385238766670227, + "kl": 0.39118445105850697, + "learning_rate": 4.452716981234744e-08, + "loss": 0.0156, + "num_tokens": 31544741.0, + "reward": 0.865478515625, + "reward_std": 0.0188966765999794, + "rewards//mean": 0.865478515625, + "rewards//std": 0.02747960016131401, + "step": 4331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.8664, + "grad_norm": 1.8337881565093994, + "kl": 0.33867961168289185, + "learning_rate": 4.439635349218496e-08, + "loss": 0.0122, + "num_tokens": 31552046.0, + "reward": 0.8494873046875, + "reward_std": 0.014910105615854263, + "rewards//mean": 0.8494873046875, + "rewards//std": 0.021564198657870293, + "step": 4332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8666, + "grad_norm": 1.3257395029067993, + "kl": 0.3165564127266407, + "learning_rate": 4.426572068911677e-08, + "loss": 0.0127, + "num_tokens": 31559286.0, + "reward": 0.84130859375, + "reward_std": 0.012637436389923096, + "rewards//mean": 0.84130859375, + "rewards//std": 0.01945670321583748, + "step": 4333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8668, + "grad_norm": 1.5579930543899536, + "kl": 0.3351917080581188, + "learning_rate": 4.41352714557619e-08, + "loss": 0.0134, + "num_tokens": 31566574.0, + "reward": 0.7891845703125, + "reward_std": 0.011185454204678535, + "rewards//mean": 0.7891845703125, + "rewards//std": 0.018518757075071335, + "step": 4334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.867, + "grad_norm": 1.4502034187316895, + "kl": 0.40190870501101017, + "learning_rate": 4.400500584466504e-08, + "loss": 0.0161, + "num_tokens": 31573830.0, + "reward": 0.8399658203125, + "reward_std": 0.015223093330860138, + "rewards//mean": 0.8399658203125, + "rewards//std": 0.01701939105987549, + "step": 4335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8672, + "grad_norm": 1.688428282737732, + "kl": 0.41891827061772346, + "learning_rate": 4.387492390829734e-08, + "loss": 0.0168, + "num_tokens": 31581086.0, + "reward": 0.75982666015625, + "reward_std": 0.013309262692928314, + "rewards//mean": 0.75982666015625, + "rewards//std": 0.016631057485938072, + "step": 4336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.8674, + "grad_norm": 1.5599359273910522, + "kl": 0.3449965640902519, + "learning_rate": 4.374502569905569e-08, + "loss": 0.0149, + "num_tokens": 31588404.0, + "reward": 0.8145751953125, + "reward_std": 0.013749840669333935, + "rewards//mean": 0.8145751953125, + "rewards//std": 0.01873977668583393, + "step": 4337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8676, + "grad_norm": 1.5709589719772339, + "kl": 0.3674721159040928, + "learning_rate": 4.3615311269263264e-08, + "loss": 0.0147, + "num_tokens": 31595772.0, + "reward": 0.8511962890625, + "reward_std": 0.018322251737117767, + "rewards//mean": 0.8511962890625, + "rewards//std": 0.029990093782544136, + "step": 4338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8678, + "grad_norm": 2.0163190364837646, + "kl": 0.41342082992196083, + "learning_rate": 4.3485780671168816e-08, + "loss": 0.0165, + "num_tokens": 31602948.0, + "reward": 0.840576171875, + "reward_std": 0.023362714797258377, + "rewards//mean": 0.840576171875, + "rewards//std": 0.032887961715459824, + "step": 4339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.868, + "grad_norm": 1.558245062828064, + "kl": 0.34168388321995735, + "learning_rate": 4.335643395694727e-08, + "loss": 0.0137, + "num_tokens": 31610196.0, + "reward": 0.86181640625, + "reward_std": 0.01455664075911045, + "rewards//mean": 0.86181640625, + "rewards//std": 0.019605513662099838, + "step": 4340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.8682, + "grad_norm": 1.6723392009735107, + "kl": 0.41322190314531326, + "learning_rate": 4.322727117869951e-08, + "loss": 0.0082, + "num_tokens": 31617496.0, + "reward": 0.8287353515625, + "reward_std": 0.018064696341753006, + "rewards//mean": 0.8287353515625, + "rewards//std": 0.030179286375641823, + "step": 4341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8684, + "grad_norm": 1.38756263256073, + "kl": 0.3709533363580704, + "learning_rate": 4.309829238845242e-08, + "loss": 0.0148, + "num_tokens": 31624712.0, + "reward": 0.845458984375, + "reward_std": 0.01776747591793537, + "rewards//mean": 0.845458984375, + "rewards//std": 0.026891591027379036, + "step": 4342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8686, + "grad_norm": 1.5433884859085083, + "kl": 0.3305229749530554, + "learning_rate": 4.296949763815838e-08, + "loss": 0.0132, + "num_tokens": 31632016.0, + "reward": 0.84405517578125, + "reward_std": 0.012692307122051716, + "rewards//mean": 0.84405517578125, + "rewards//std": 0.01602756232023239, + "step": 4343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8688, + "grad_norm": 1.337644100189209, + "kl": 0.2740225028246641, + "learning_rate": 4.2840886979696064e-08, + "loss": 0.011, + "num_tokens": 31639240.0, + "reward": 0.79559326171875, + "reward_std": 0.013282544910907745, + "rewards//mean": 0.79559326171875, + "rewards//std": 0.023261727765202522, + "step": 4344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.869, + "grad_norm": 1.5546361207962036, + "kl": 0.35059504210948944, + "learning_rate": 4.2712460464869926e-08, + "loss": 0.014, + "num_tokens": 31646504.0, + "reward": 0.90460205078125, + "reward_std": 0.01876232586801052, + "rewards//mean": 0.90460205078125, + "rewards//std": 0.025418557226657867, + "step": 4345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8692, + "grad_norm": 1.716528296470642, + "kl": 0.3393856957554817, + "learning_rate": 4.258421814540991e-08, + "loss": 0.0136, + "num_tokens": 31653784.0, + "reward": 0.82806396484375, + "reward_std": 0.016474425792694092, + "rewards//mean": 0.82806396484375, + "rewards//std": 0.030229590833187103, + "step": 4346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8694, + "grad_norm": 1.58834707736969, + "kl": 0.30020437203347683, + "learning_rate": 4.245616007297209e-08, + "loss": 0.012, + "num_tokens": 31661096.0, + "reward": 0.8511962890625, + "reward_std": 0.017060955986380577, + "rewards//mean": 0.8511962890625, + "rewards//std": 0.022667566314339638, + "step": 4347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8696, + "grad_norm": 1.4673447608947754, + "kl": 0.3455250635743141, + "learning_rate": 4.232828629913831e-08, + "loss": 0.0138, + "num_tokens": 31668424.0, + "reward": 0.86419677734375, + "reward_std": 0.017808355391025543, + "rewards//mean": 0.86419677734375, + "rewards//std": 0.02222541905939579, + "step": 4348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8698, + "grad_norm": 1.7092657089233398, + "kl": 0.3410078175365925, + "learning_rate": 4.220059687541616e-08, + "loss": 0.0136, + "num_tokens": 31675608.0, + "reward": 0.85467529296875, + "reward_std": 0.011381398886442184, + "rewards//mean": 0.85467529296875, + "rewards//std": 0.015219498425722122, + "step": 4349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.87, + "grad_norm": 1.6388574838638306, + "kl": 0.33446183428168297, + "learning_rate": 4.207309185323876e-08, + "loss": 0.0134, + "num_tokens": 31682832.0, + "reward": 0.8427734375, + "reward_std": 0.017076876014471054, + "rewards//mean": 0.8427734375, + "rewards//std": 0.019648704677820206, + "step": 4350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8702, + "grad_norm": 1.7502591609954834, + "kl": 0.4233454689383507, + "learning_rate": 4.19457712839652e-08, + "loss": 0.0169, + "num_tokens": 31690072.0, + "reward": 0.8482666015625, + "reward_std": 0.02173193171620369, + "rewards//mean": 0.8482666015625, + "rewards//std": 0.030704397708177567, + "step": 4351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.8704, + "grad_norm": 1.9159340858459473, + "kl": 0.4850265048444271, + "learning_rate": 4.181863521888018e-08, + "loss": 0.0128, + "num_tokens": 31697281.0, + "reward": 0.83392333984375, + "reward_std": 0.017593543976545334, + "rewards//mean": 0.83392333984375, + "rewards//std": 0.01825561746954918, + "step": 4352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8706, + "grad_norm": 1.5526963472366333, + "kl": 0.36149363219738007, + "learning_rate": 4.169168370919418e-08, + "loss": 0.0145, + "num_tokens": 31704545.0, + "reward": 0.853759765625, + "reward_std": 0.02310981974005699, + "rewards//mean": 0.853759765625, + "rewards//std": 0.02738247625529766, + "step": 4353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8708, + "grad_norm": 1.3744797706604004, + "kl": 0.3312084823846817, + "learning_rate": 4.156491680604307e-08, + "loss": 0.0132, + "num_tokens": 31711833.0, + "reward": 0.85577392578125, + "reward_std": 0.014316041022539139, + "rewards//mean": 0.85577392578125, + "rewards//std": 0.019897419959306717, + "step": 4354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.871, + "grad_norm": 1.4205459356307983, + "kl": 0.4273208677768707, + "learning_rate": 4.1438334560488673e-08, + "loss": 0.0171, + "num_tokens": 31719073.0, + "reward": 0.81927490234375, + "reward_std": 0.014196357689797878, + "rewards//mean": 0.81927490234375, + "rewards//std": 0.02121627889573574, + "step": 4355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8712, + "grad_norm": 1.7366597652435303, + "kl": 0.3769746646285057, + "learning_rate": 4.131193702351826e-08, + "loss": 0.0151, + "num_tokens": 31726385.0, + "reward": 0.8446044921875, + "reward_std": 0.022515375167131424, + "rewards//mean": 0.8446044921875, + "rewards//std": 0.02820676751434803, + "step": 4356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8714, + "grad_norm": 1.5871922969818115, + "kl": 0.3650573566555977, + "learning_rate": 4.118572424604489e-08, + "loss": 0.0146, + "num_tokens": 31733705.0, + "reward": 0.8621826171875, + "reward_std": 0.022056495770812035, + "rewards//mean": 0.8621826171875, + "rewards//std": 0.030141141265630722, + "step": 4357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8716, + "grad_norm": 1.4620230197906494, + "kl": 0.30615602619946003, + "learning_rate": 4.105969627890682e-08, + "loss": 0.0122, + "num_tokens": 31740921.0, + "reward": 0.840576171875, + "reward_std": 0.013250255957245827, + "rewards//mean": 0.840576171875, + "rewards//std": 0.022259533405303955, + "step": 4358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8718, + "grad_norm": 1.9812614917755127, + "kl": 0.3708654195070267, + "learning_rate": 4.0933853172868185e-08, + "loss": 0.0148, + "num_tokens": 31748425.0, + "reward": 0.8245849609375, + "reward_std": 0.02088150382041931, + "rewards//mean": 0.8245849609375, + "rewards//std": 0.024456189945340157, + "step": 4359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.872, + "grad_norm": 1.4006843566894531, + "kl": 0.3527573198080063, + "learning_rate": 4.08081949786187e-08, + "loss": 0.0141, + "num_tokens": 31755897.0, + "reward": 0.85540771484375, + "reward_std": 0.014590341597795486, + "rewards//mean": 0.85540771484375, + "rewards//std": 0.017669066786766052, + "step": 4360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8722, + "grad_norm": 1.5837286710739136, + "kl": 0.4390038512647152, + "learning_rate": 4.068272174677334e-08, + "loss": 0.0176, + "num_tokens": 31763209.0, + "reward": 0.8531494140625, + "reward_std": 0.016324587166309357, + "rewards//mean": 0.8531494140625, + "rewards//std": 0.020946016535162926, + "step": 4361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8724, + "grad_norm": 1.4215898513793945, + "kl": 0.38274479657411575, + "learning_rate": 4.0557433527872666e-08, + "loss": 0.0153, + "num_tokens": 31770513.0, + "reward": 0.8837890625, + "reward_std": 0.010937494225800037, + "rewards//mean": 0.8837890625, + "rewards//std": 0.027603816241025925, + "step": 4362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8726, + "grad_norm": 1.207544207572937, + "kl": 0.27096839994192123, + "learning_rate": 4.043233037238281e-08, + "loss": 0.0108, + "num_tokens": 31777857.0, + "reward": 0.875732421875, + "reward_std": 0.017700091004371643, + "rewards//mean": 0.875732421875, + "rewards//std": 0.034413836896419525, + "step": 4363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8728, + "grad_norm": 1.5660760402679443, + "kl": 0.3184626176953316, + "learning_rate": 4.0307412330695345e-08, + "loss": 0.0127, + "num_tokens": 31785169.0, + "reward": 0.8624267578125, + "reward_std": 0.015787314623594284, + "rewards//mean": 0.8624267578125, + "rewards//std": 0.017266638576984406, + "step": 4364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.873, + "grad_norm": 1.7658641338348389, + "kl": 0.3751383349299431, + "learning_rate": 4.018267945312731e-08, + "loss": 0.015, + "num_tokens": 31792625.0, + "reward": 0.88037109375, + "reward_std": 0.014494216069579124, + "rewards//mean": 0.88037109375, + "rewards//std": 0.018823999911546707, + "step": 4365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8732, + "grad_norm": 1.4723654985427856, + "kl": 0.3596403896808624, + "learning_rate": 4.00581317899209e-08, + "loss": 0.0144, + "num_tokens": 31799953.0, + "reward": 0.87890625, + "reward_std": 0.02106696367263794, + "rewards//mean": 0.87890625, + "rewards//std": 0.029421651735901833, + "step": 4366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8734, + "grad_norm": 1.6253868341445923, + "kl": 0.3065449260175228, + "learning_rate": 3.993376939124399e-08, + "loss": 0.0123, + "num_tokens": 31807169.0, + "reward": 0.78466796875, + "reward_std": 0.015563479624688625, + "rewards//mean": 0.78466796875, + "rewards//std": 0.021849077194929123, + "step": 4367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8736, + "grad_norm": 1.3052564859390259, + "kl": 0.36036044359207153, + "learning_rate": 3.980959230718972e-08, + "loss": 0.0144, + "num_tokens": 31814489.0, + "reward": 0.87811279296875, + "reward_std": 0.01561515312641859, + "rewards//mean": 0.87811279296875, + "rewards//std": 0.023777907714247704, + "step": 4368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8738, + "grad_norm": 1.4159760475158691, + "kl": 0.3253336437046528, + "learning_rate": 3.9685600587776815e-08, + "loss": 0.013, + "num_tokens": 31821897.0, + "reward": 0.85595703125, + "reward_std": 0.012988989241421223, + "rewards//mean": 0.85595703125, + "rewards//std": 0.022080639377236366, + "step": 4369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.874, + "grad_norm": 1.92479407787323, + "kl": 0.38342833891510963, + "learning_rate": 3.9561794282948756e-08, + "loss": 0.0138, + "num_tokens": 31829188.0, + "reward": 0.8597412109375, + "reward_std": 0.015016221441328526, + "rewards//mean": 0.8597412109375, + "rewards//std": 0.019694490358233452, + "step": 4370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8742, + "grad_norm": 1.5148792266845703, + "kl": 0.358619486913085, + "learning_rate": 3.9438173442575e-08, + "loss": 0.0143, + "num_tokens": 31836460.0, + "reward": 0.8577880859375, + "reward_std": 0.016543248668313026, + "rewards//mean": 0.8577880859375, + "rewards//std": 0.02316431887447834, + "step": 4371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8744, + "grad_norm": 1.577394962310791, + "kl": 0.3763664122670889, + "learning_rate": 3.9314738116449806e-08, + "loss": 0.0151, + "num_tokens": 31843764.0, + "reward": 0.87451171875, + "reward_std": 0.019105711951851845, + "rewards//mean": 0.87451171875, + "rewards//std": 0.02524353563785553, + "step": 4372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8746, + "grad_norm": 1.6471099853515625, + "kl": 0.3701088707894087, + "learning_rate": 3.919148835429314e-08, + "loss": 0.0148, + "num_tokens": 31851028.0, + "reward": 0.87396240234375, + "reward_std": 0.015467820689082146, + "rewards//mean": 0.87396240234375, + "rewards//std": 0.020766286179423332, + "step": 4373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8748, + "grad_norm": 1.5749318599700928, + "kl": 0.32524202577769756, + "learning_rate": 3.9068424205749794e-08, + "loss": 0.013, + "num_tokens": 31858268.0, + "reward": 0.8712158203125, + "reward_std": 0.018588120117783546, + "rewards//mean": 0.8712158203125, + "rewards//std": 0.021986860781908035, + "step": 4374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.875, + "grad_norm": 1.6993379592895508, + "kl": 0.37194090336561203, + "learning_rate": 3.8945545720389995e-08, + "loss": 0.0149, + "num_tokens": 31865524.0, + "reward": 0.81268310546875, + "reward_std": 0.013444069772958755, + "rewards//mean": 0.81268310546875, + "rewards//std": 0.02076847478747368, + "step": 4375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.8752, + "grad_norm": 1.4846996068954468, + "kl": 0.3343331068754196, + "learning_rate": 3.882285294770937e-08, + "loss": 0.0202, + "num_tokens": 31872834.0, + "reward": 0.83123779296875, + "reward_std": 0.015915218740701675, + "rewards//mean": 0.83123779296875, + "rewards//std": 0.020672038197517395, + "step": 4376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.8754, + "grad_norm": 1.6570696830749512, + "kl": 0.35910502448678017, + "learning_rate": 3.8700345937128344e-08, + "loss": 0.0237, + "num_tokens": 31880165.0, + "reward": 0.8670654296875, + "reward_std": 0.01938692107796669, + "rewards//mean": 0.8670654296875, + "rewards//std": 0.02249864861369133, + "step": 4377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8756, + "grad_norm": 1.451913595199585, + "kl": 0.3881080895662308, + "learning_rate": 3.857802473799282e-08, + "loss": 0.0155, + "num_tokens": 31887485.0, + "reward": 0.8670654296875, + "reward_std": 0.014795191586017609, + "rewards//mean": 0.8670654296875, + "rewards//std": 0.026802221313118935, + "step": 4378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8758, + "grad_norm": 1.5494000911712646, + "kl": 0.40036890283226967, + "learning_rate": 3.845588939957373e-08, + "loss": 0.016, + "num_tokens": 31894877.0, + "reward": 0.827392578125, + "reward_std": 0.011914758011698723, + "rewards//mean": 0.827392578125, + "rewards//std": 0.01698956824839115, + "step": 4379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.876, + "grad_norm": 1.752338171005249, + "kl": 0.33363544195890427, + "learning_rate": 3.8333939971067265e-08, + "loss": 0.0105, + "num_tokens": 31902180.0, + "reward": 0.86737060546875, + "reward_std": 0.012686425819993019, + "rewards//mean": 0.86737060546875, + "rewards//std": 0.023841485381126404, + "step": 4380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8762, + "grad_norm": 1.4871864318847656, + "kl": 0.3151228465139866, + "learning_rate": 3.821217650159453e-08, + "loss": 0.0126, + "num_tokens": 31909452.0, + "reward": 0.87744140625, + "reward_std": 0.01871931552886963, + "rewards//mean": 0.87744140625, + "rewards//std": 0.032170381397008896, + "step": 4381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8764, + "grad_norm": 1.2929878234863281, + "kl": 0.3298631124198437, + "learning_rate": 3.8090599040201846e-08, + "loss": 0.0132, + "num_tokens": 31916668.0, + "reward": 0.87445068359375, + "reward_std": 0.01558179035782814, + "rewards//mean": 0.87445068359375, + "rewards//std": 0.02181987091898918, + "step": 4382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8766, + "grad_norm": 1.6105625629425049, + "kl": 0.37234168499708176, + "learning_rate": 3.796920763586059e-08, + "loss": 0.0149, + "num_tokens": 31923964.0, + "reward": 0.83831787109375, + "reward_std": 0.021022185683250427, + "rewards//mean": 0.83831787109375, + "rewards//std": 0.027113519608974457, + "step": 4383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8768, + "grad_norm": 1.675588846206665, + "kl": 0.413856141269207, + "learning_rate": 3.784800233746738e-08, + "loss": 0.0166, + "num_tokens": 31931228.0, + "reward": 0.85400390625, + "reward_std": 0.019194453954696655, + "rewards//mean": 0.85400390625, + "rewards//std": 0.02358686737716198, + "step": 4384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.877, + "grad_norm": 1.4969428777694702, + "kl": 0.32888463139533997, + "learning_rate": 3.7726983193843485e-08, + "loss": 0.0132, + "num_tokens": 31938644.0, + "reward": 0.81640625, + "reward_std": 0.0169911477714777, + "rewards//mean": 0.81640625, + "rewards//std": 0.02530582621693611, + "step": 4385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8772, + "grad_norm": 1.5794289112091064, + "kl": 0.3445751406252384, + "learning_rate": 3.7606150253735424e-08, + "loss": 0.0138, + "num_tokens": 31945932.0, + "reward": 0.861328125, + "reward_std": 0.02093343809247017, + "rewards//mean": 0.861328125, + "rewards//std": 0.03164280578494072, + "step": 4386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8774, + "grad_norm": 1.751267910003662, + "kl": 0.35636717826128006, + "learning_rate": 3.748550356581481e-08, + "loss": 0.0143, + "num_tokens": 31953188.0, + "reward": 0.8759765625, + "reward_std": 0.02019195631146431, + "rewards//mean": 0.8759765625, + "rewards//std": 0.027969907969236374, + "step": 4387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8776, + "grad_norm": 1.611501932144165, + "kl": 0.3413221277296543, + "learning_rate": 3.7365043178678114e-08, + "loss": 0.0137, + "num_tokens": 31960508.0, + "reward": 0.8497314453125, + "reward_std": 0.014824304729700089, + "rewards//mean": 0.8497314453125, + "rewards//std": 0.018255306407809258, + "step": 4388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8778, + "grad_norm": 1.5389647483825684, + "kl": 0.4013797231018543, + "learning_rate": 3.724476914084657e-08, + "loss": 0.0161, + "num_tokens": 31967764.0, + "reward": 0.8359375, + "reward_std": 0.01817270740866661, + "rewards//mean": 0.8359375, + "rewards//std": 0.02377607673406601, + "step": 4389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.878, + "grad_norm": 1.9975334405899048, + "kl": 0.39948026090860367, + "learning_rate": 3.7124681500766696e-08, + "loss": 0.016, + "num_tokens": 31975076.0, + "reward": 0.8433837890625, + "reward_std": 0.013185643590986729, + "rewards//mean": 0.8433837890625, + "rewards//std": 0.02020440623164177, + "step": 4390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8782, + "grad_norm": 1.7739497423171997, + "kl": 0.34156813472509384, + "learning_rate": 3.700478030680987e-08, + "loss": 0.0137, + "num_tokens": 31982420.0, + "reward": 0.86309814453125, + "reward_std": 0.014670537784695625, + "rewards//mean": 0.86309814453125, + "rewards//std": 0.021703705191612244, + "step": 4391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8784, + "grad_norm": 1.5503315925598145, + "kl": 0.3905604910105467, + "learning_rate": 3.688506560727206e-08, + "loss": 0.0156, + "num_tokens": 31989852.0, + "reward": 0.88067626953125, + "reward_std": 0.01753605715930462, + "rewards//mean": 0.88067626953125, + "rewards//std": 0.025457831099629402, + "step": 4392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8786, + "grad_norm": 1.9559335708618164, + "kl": 0.434989795088768, + "learning_rate": 3.6765537450374473e-08, + "loss": 0.0174, + "num_tokens": 31997156.0, + "reward": 0.81982421875, + "reward_std": 0.014114649966359138, + "rewards//mean": 0.81982421875, + "rewards//std": 0.017026949673891068, + "step": 4393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8788, + "grad_norm": 1.4853277206420898, + "kl": 0.37657763436436653, + "learning_rate": 3.6646195884262985e-08, + "loss": 0.0151, + "num_tokens": 32004532.0, + "reward": 0.851318359375, + "reward_std": 0.014094673097133636, + "rewards//mean": 0.851318359375, + "rewards//std": 0.018412595614790916, + "step": 4394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.879, + "grad_norm": 1.6832538843154907, + "kl": 0.3766142912209034, + "learning_rate": 3.652704095700848e-08, + "loss": 0.0151, + "num_tokens": 32011788.0, + "reward": 0.79937744140625, + "reward_std": 0.015566828660666943, + "rewards//mean": 0.79937744140625, + "rewards//std": 0.027773229405283928, + "step": 4395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8792, + "grad_norm": 1.5601757764816284, + "kl": 0.3502756245434284, + "learning_rate": 3.6408072716606345e-08, + "loss": 0.014, + "num_tokens": 32019004.0, + "reward": 0.84210205078125, + "reward_std": 0.021091943606734276, + "rewards//mean": 0.84210205078125, + "rewards//std": 0.029698632657527924, + "step": 4396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8794, + "grad_norm": 1.401410698890686, + "kl": 0.29431716725230217, + "learning_rate": 3.6289291210977066e-08, + "loss": 0.0118, + "num_tokens": 32026140.0, + "reward": 0.85028076171875, + "reward_std": 0.017908649519085884, + "rewards//mean": 0.85028076171875, + "rewards//std": 0.02417064644396305, + "step": 4397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8796, + "grad_norm": 2.357546806335449, + "kl": 0.467760793864727, + "learning_rate": 3.617069648796589e-08, + "loss": 0.0187, + "num_tokens": 32033404.0, + "reward": 0.8575439453125, + "reward_std": 0.01531791128218174, + "rewards//mean": 0.8575439453125, + "rewards//std": 0.024871094152331352, + "step": 4398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.8798, + "grad_norm": 2.132086992263794, + "kl": 0.4035695306956768, + "learning_rate": 3.605228859534271e-08, + "loss": 0.0189, + "num_tokens": 32040731.0, + "reward": 0.86895751953125, + "reward_std": 0.01605762168765068, + "rewards//mean": 0.86895751953125, + "rewards//std": 0.028322117403149605, + "step": 4399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.88, + "grad_norm": 1.72798490524292, + "kl": 0.37902181781828403, + "learning_rate": 3.5934067580802195e-08, + "loss": 0.0152, + "num_tokens": 32047987.0, + "reward": 0.811767578125, + "reward_std": 0.016867097467184067, + "rewards//mean": 0.811767578125, + "rewards//std": 0.0221941526979208, + "step": 4400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8802, + "grad_norm": 1.8839863538742065, + "kl": 0.3522583954036236, + "learning_rate": 3.581603349196371e-08, + "loss": 0.0141, + "num_tokens": 32055483.0, + "reward": 0.8045654296875, + "reward_std": 0.01767471432685852, + "rewards//mean": 0.8045654296875, + "rewards//std": 0.022999048233032227, + "step": 4401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8804, + "grad_norm": 2.0020010471343994, + "kl": 0.41889311745762825, + "learning_rate": 3.569818637637145e-08, + "loss": 0.0168, + "num_tokens": 32062763.0, + "reward": 0.84375, + "reward_std": 0.012096276506781578, + "rewards//mean": 0.84375, + "rewards//std": 0.017839696258306503, + "step": 4402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8806, + "grad_norm": 1.7458064556121826, + "kl": 0.37332963943481445, + "learning_rate": 3.5580526281494215e-08, + "loss": 0.0149, + "num_tokens": 32070003.0, + "reward": 0.85137939453125, + "reward_std": 0.021029453724622726, + "rewards//mean": 0.85137939453125, + "rewards//std": 0.027660174295306206, + "step": 4403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8808, + "grad_norm": 1.7608835697174072, + "kl": 0.37125079333782196, + "learning_rate": 3.546305325472543e-08, + "loss": 0.0149, + "num_tokens": 32077355.0, + "reward": 0.80560302734375, + "reward_std": 0.010669346898794174, + "rewards//mean": 0.80560302734375, + "rewards//std": 0.011042891070246696, + "step": 4404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.881, + "grad_norm": 1.77800452709198, + "kl": 0.35210656374692917, + "learning_rate": 3.534576734338324e-08, + "loss": 0.0141, + "num_tokens": 32084627.0, + "reward": 0.86279296875, + "reward_std": 0.01624513417482376, + "rewards//mean": 0.86279296875, + "rewards//std": 0.023046717047691345, + "step": 4405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8812, + "grad_norm": 1.5840259790420532, + "kl": 0.4180618990212679, + "learning_rate": 3.5228668594710465e-08, + "loss": 0.0167, + "num_tokens": 32091955.0, + "reward": 0.80810546875, + "reward_std": 0.010110899806022644, + "rewards//mean": 0.80810546875, + "rewards//std": 0.012004615738987923, + "step": 4406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8814, + "grad_norm": 1.508863091468811, + "kl": 0.3241861816495657, + "learning_rate": 3.5111757055874326e-08, + "loss": 0.013, + "num_tokens": 32099219.0, + "reward": 0.794677734375, + "reward_std": 0.011078650131821632, + "rewards//mean": 0.794677734375, + "rewards//std": 0.014193391427397728, + "step": 4407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8816, + "grad_norm": 1.442732572555542, + "kl": 0.30993764102458954, + "learning_rate": 3.499503277396687e-08, + "loss": 0.0124, + "num_tokens": 32106499.0, + "reward": 0.87054443359375, + "reward_std": 0.012329284101724625, + "rewards//mean": 0.87054443359375, + "rewards//std": 0.019198141992092133, + "step": 4408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.8818, + "grad_norm": 1.628668189048767, + "kl": 0.46213459596037865, + "learning_rate": 3.487849579600455e-08, + "loss": 0.0191, + "num_tokens": 32113754.0, + "reward": 0.85430908203125, + "reward_std": 0.010931744240224361, + "rewards//mean": 0.85430908203125, + "rewards//std": 0.016785981133580208, + "step": 4409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.882, + "grad_norm": 1.366586685180664, + "kl": 0.3455500788986683, + "learning_rate": 3.476214616892864e-08, + "loss": 0.0138, + "num_tokens": 32121026.0, + "reward": 0.80621337890625, + "reward_std": 0.015023964457213879, + "rewards//mean": 0.80621337890625, + "rewards//std": 0.02544534206390381, + "step": 4410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8822, + "grad_norm": 1.498936653137207, + "kl": 0.3626771569252014, + "learning_rate": 3.464598393960449e-08, + "loss": 0.0145, + "num_tokens": 32128346.0, + "reward": 0.86309814453125, + "reward_std": 0.02250540629029274, + "rewards//mean": 0.86309814453125, + "rewards//std": 0.031477268785238266, + "step": 4411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.8824, + "grad_norm": 1.4399572610855103, + "kl": 0.29080308601260185, + "learning_rate": 3.45300091548224e-08, + "loss": 0.0176, + "num_tokens": 32135580.0, + "reward": 0.87017822265625, + "reward_std": 0.0155226681381464, + "rewards//mean": 0.87017822265625, + "rewards//std": 0.02375306561589241, + "step": 4412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8826, + "grad_norm": 1.519689679145813, + "kl": 0.40468044206500053, + "learning_rate": 3.441422186129689e-08, + "loss": 0.0162, + "num_tokens": 32142900.0, + "reward": 0.85784912109375, + "reward_std": 0.015589980408549309, + "rewards//mean": 0.85784912109375, + "rewards//std": 0.01908108778297901, + "step": 4413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8828, + "grad_norm": 1.9416172504425049, + "kl": 0.280993165448308, + "learning_rate": 3.429862210566731e-08, + "loss": 0.0112, + "num_tokens": 32150148.0, + "reward": 0.82794189453125, + "reward_std": 0.009820308536291122, + "rewards//mean": 0.82794189453125, + "rewards//std": 0.01747523806989193, + "step": 4414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.883, + "grad_norm": 1.502389907836914, + "kl": 0.3674459867179394, + "learning_rate": 3.4183209934496914e-08, + "loss": 0.0147, + "num_tokens": 32157404.0, + "reward": 0.849609375, + "reward_std": 0.018505766987800598, + "rewards//mean": 0.849609375, + "rewards//std": 0.027410103008151054, + "step": 4415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8832, + "grad_norm": 1.7214075326919556, + "kl": 0.33178932033479214, + "learning_rate": 3.4067985394273855e-08, + "loss": 0.0133, + "num_tokens": 32164900.0, + "reward": 0.8363037109375, + "reward_std": 0.016827652230858803, + "rewards//mean": 0.8363037109375, + "rewards//std": 0.02705182507634163, + "step": 4416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8834, + "grad_norm": 1.7327353954315186, + "kl": 0.4884509705007076, + "learning_rate": 3.395294853141056e-08, + "loss": 0.0195, + "num_tokens": 32172124.0, + "reward": 0.8221435546875, + "reward_std": 0.01908591017127037, + "rewards//mean": 0.8221435546875, + "rewards//std": 0.02491001784801483, + "step": 4417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8836, + "grad_norm": 1.4532880783081055, + "kl": 0.31318217888474464, + "learning_rate": 3.3838099392243915e-08, + "loss": 0.0125, + "num_tokens": 32179332.0, + "reward": 0.85699462890625, + "reward_std": 0.018031029030680656, + "rewards//mean": 0.85699462890625, + "rewards//std": 0.026979193091392517, + "step": 4418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8838, + "grad_norm": 1.6246733665466309, + "kl": 0.34744178503751755, + "learning_rate": 3.3723438023035065e-08, + "loss": 0.0139, + "num_tokens": 32186556.0, + "reward": 0.87591552734375, + "reward_std": 0.02194386161863804, + "rewards//mean": 0.87591552734375, + "rewards//std": 0.02619691751897335, + "step": 4419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.884, + "grad_norm": 1.309566855430603, + "kl": 0.4033825732767582, + "learning_rate": 3.360896446996958e-08, + "loss": 0.0084, + "num_tokens": 32193806.0, + "reward": 0.84442138671875, + "reward_std": 0.012104297056794167, + "rewards//mean": 0.84442138671875, + "rewards//std": 0.017587488517165184, + "step": 4420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8842, + "grad_norm": 2.0345911979675293, + "kl": 0.4713810980319977, + "learning_rate": 3.349467877915746e-08, + "loss": 0.0189, + "num_tokens": 32201054.0, + "reward": 0.81658935546875, + "reward_std": 0.017843633890151978, + "rewards//mean": 0.81658935546875, + "rewards//std": 0.02125263586640358, + "step": 4421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8844, + "grad_norm": 1.5323821306228638, + "kl": 0.3606094643473625, + "learning_rate": 3.338058099663299e-08, + "loss": 0.0144, + "num_tokens": 32208334.0, + "reward": 0.8609619140625, + "reward_std": 0.015292910858988762, + "rewards//mean": 0.8609619140625, + "rewards//std": 0.02407691441476345, + "step": 4422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.8846, + "grad_norm": 1.3606394529342651, + "kl": 0.337992150336504, + "learning_rate": 3.3266671168354634e-08, + "loss": 0.0095, + "num_tokens": 32215588.0, + "reward": 0.87744140625, + "reward_std": 0.014404719695448875, + "rewards//mean": 0.87744140625, + "rewards//std": 0.024523256346583366, + "step": 4423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8848, + "grad_norm": 1.6605690717697144, + "kl": 0.3492238111793995, + "learning_rate": 3.31529493402053e-08, + "loss": 0.014, + "num_tokens": 32222796.0, + "reward": 0.865966796875, + "reward_std": 0.019085507839918137, + "rewards//mean": 0.865966796875, + "rewards//std": 0.033355962485075, + "step": 4424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.885, + "grad_norm": 1.6532717943191528, + "kl": 0.3705015294253826, + "learning_rate": 3.3039415557992224e-08, + "loss": 0.0148, + "num_tokens": 32230052.0, + "reward": 0.78424072265625, + "reward_std": 0.01223844289779663, + "rewards//mean": 0.78424072265625, + "rewards//std": 0.02293338254094124, + "step": 4425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8852, + "grad_norm": 1.4872429370880127, + "kl": 0.30795399472117424, + "learning_rate": 3.292606986744667e-08, + "loss": 0.0123, + "num_tokens": 32237324.0, + "reward": 0.84259033203125, + "reward_std": 0.017266640439629555, + "rewards//mean": 0.84259033203125, + "rewards//std": 0.025149136781692505, + "step": 4426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8854, + "grad_norm": 1.4635041952133179, + "kl": 0.37962037324905396, + "learning_rate": 3.2812912314224285e-08, + "loss": 0.0152, + "num_tokens": 32244508.0, + "reward": 0.81756591796875, + "reward_std": 0.016941070556640625, + "rewards//mean": 0.81756591796875, + "rewards//std": 0.029501227661967278, + "step": 4427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.8856, + "grad_norm": 1.4916117191314697, + "kl": 0.32618922367691994, + "learning_rate": 3.269994294390493e-08, + "loss": 0.0125, + "num_tokens": 32251817.0, + "reward": 0.8292236328125, + "reward_std": 0.012120665051043034, + "rewards//mean": 0.8292236328125, + "rewards//std": 0.018862159922719002, + "step": 4428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8858, + "grad_norm": 1.411091685295105, + "kl": 0.35962180607020855, + "learning_rate": 3.258716180199278e-08, + "loss": 0.0144, + "num_tokens": 32259105.0, + "reward": 0.84320068359375, + "reward_std": 0.010310797020792961, + "rewards//mean": 0.84320068359375, + "rewards//std": 0.016989456489682198, + "step": 4429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.886, + "grad_norm": 1.388874888420105, + "kl": 0.3493846580386162, + "learning_rate": 3.247456893391592e-08, + "loss": 0.014, + "num_tokens": 32266393.0, + "reward": 0.8480224609375, + "reward_std": 0.012448560446500778, + "rewards//mean": 0.8480224609375, + "rewards//std": 0.014139428734779358, + "step": 4430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8862, + "grad_norm": 1.4588563442230225, + "kl": 0.3259659204632044, + "learning_rate": 3.23621643850267e-08, + "loss": 0.013, + "num_tokens": 32273785.0, + "reward": 0.83294677734375, + "reward_std": 0.014750637114048004, + "rewards//mean": 0.83294677734375, + "rewards//std": 0.016318652778863907, + "step": 4431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8864, + "grad_norm": 1.3843640089035034, + "kl": 0.3157553318887949, + "learning_rate": 3.224994820060184e-08, + "loss": 0.0126, + "num_tokens": 32281017.0, + "reward": 0.8922119140625, + "reward_std": 0.01606578379869461, + "rewards//mean": 0.8922119140625, + "rewards//std": 0.024819916114211082, + "step": 4432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8866, + "grad_norm": 1.310791254043579, + "kl": 0.3262499123811722, + "learning_rate": 3.2137920425841904e-08, + "loss": 0.013, + "num_tokens": 32288353.0, + "reward": 0.8553466796875, + "reward_std": 0.012090086936950684, + "rewards//mean": 0.8553466796875, + "rewards//std": 0.01907285489141941, + "step": 4433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8868, + "grad_norm": 1.7222439050674438, + "kl": 0.36421194300055504, + "learning_rate": 3.202608110587163e-08, + "loss": 0.0146, + "num_tokens": 32295697.0, + "reward": 0.85968017578125, + "reward_std": 0.011979794129729271, + "rewards//mean": 0.85968017578125, + "rewards//std": 0.020295139402151108, + "step": 4434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.887, + "grad_norm": 1.6157634258270264, + "kl": 0.392425823956728, + "learning_rate": 3.191443028573992e-08, + "loss": 0.0157, + "num_tokens": 32303033.0, + "reward": 0.859619140625, + "reward_std": 0.02137252315878868, + "rewards//mean": 0.859619140625, + "rewards//std": 0.024576283991336823, + "step": 4435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8872, + "grad_norm": 1.8093223571777344, + "kl": 0.37034439481794834, + "learning_rate": 3.18029680104197e-08, + "loss": 0.0148, + "num_tokens": 32310273.0, + "reward": 0.85552978515625, + "reward_std": 0.014529233798384666, + "rewards//mean": 0.85552978515625, + "rewards//std": 0.01630008965730667, + "step": 4436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.8874, + "grad_norm": 1.2986258268356323, + "kl": 0.3132203072309494, + "learning_rate": 3.1691694324808063e-08, + "loss": 0.0074, + "num_tokens": 32317552.0, + "reward": 0.8375244140625, + "reward_std": 0.01660093292593956, + "rewards//mean": 0.8375244140625, + "rewards//std": 0.024247324094176292, + "step": 4437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8876, + "grad_norm": 1.5250868797302246, + "kl": 0.3895719088613987, + "learning_rate": 3.158060927372586e-08, + "loss": 0.0156, + "num_tokens": 32324840.0, + "reward": 0.82293701171875, + "reward_std": 0.01171296089887619, + "rewards//mean": 0.82293701171875, + "rewards//std": 0.016008662059903145, + "step": 4438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8878, + "grad_norm": 1.4100642204284668, + "kl": 0.3054699655622244, + "learning_rate": 3.1469712901918244e-08, + "loss": 0.0122, + "num_tokens": 32332112.0, + "reward": 0.858154296875, + "reward_std": 0.016490796580910683, + "rewards//mean": 0.858154296875, + "rewards//std": 0.021663999184966087, + "step": 4439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.888, + "grad_norm": 1.4429901838302612, + "kl": 0.3383251465857029, + "learning_rate": 3.135900525405427e-08, + "loss": 0.0135, + "num_tokens": 32339472.0, + "reward": 0.83258056640625, + "reward_std": 0.014049920253455639, + "rewards//mean": 0.83258056640625, + "rewards//std": 0.01749514974653721, + "step": 4440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8882, + "grad_norm": 1.8765872716903687, + "kl": 0.33504759334027767, + "learning_rate": 3.124848637472688e-08, + "loss": 0.0134, + "num_tokens": 32346728.0, + "reward": 0.817626953125, + "reward_std": 0.012948301620781422, + "rewards//mean": 0.817626953125, + "rewards//std": 0.016875134781003, + "step": 4441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.8884, + "grad_norm": 1.273861289024353, + "kl": 0.30360346660017967, + "learning_rate": 3.1138156308453176e-08, + "loss": -0.0016, + "num_tokens": 32353984.0, + "reward": 0.84527587890625, + "reward_std": 0.015239219181239605, + "rewards//mean": 0.84527587890625, + "rewards//std": 0.020933276042342186, + "step": 4442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8886, + "grad_norm": 1.6496193408966064, + "kl": 0.31934549286961555, + "learning_rate": 3.1028015099673953e-08, + "loss": 0.0128, + "num_tokens": 32361216.0, + "reward": 0.7728271484375, + "reward_std": 0.015214920043945312, + "rewards//mean": 0.7728271484375, + "rewards//std": 0.02948101796209812, + "step": 4443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8888, + "grad_norm": 1.5884482860565186, + "kl": 0.4688630439341068, + "learning_rate": 3.091806279275433e-08, + "loss": 0.0188, + "num_tokens": 32368528.0, + "reward": 0.754638671875, + "reward_std": 0.014316722750663757, + "rewards//mean": 0.754638671875, + "rewards//std": 0.026647301390767097, + "step": 4444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.889, + "grad_norm": 1.587321400642395, + "kl": 0.3843678869307041, + "learning_rate": 3.0808299431982766e-08, + "loss": 0.0154, + "num_tokens": 32375728.0, + "reward": 0.8709716796875, + "reward_std": 0.01677863486111164, + "rewards//mean": 0.8709716796875, + "rewards//std": 0.02282993122935295, + "step": 4445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8892, + "grad_norm": 1.8169115781784058, + "kl": 0.49708353728055954, + "learning_rate": 3.069872506157212e-08, + "loss": 0.0199, + "num_tokens": 32383056.0, + "reward": 0.86285400390625, + "reward_std": 0.014853041619062424, + "rewards//mean": 0.86285400390625, + "rewards//std": 0.01884479820728302, + "step": 4446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8894, + "grad_norm": 1.5774621963500977, + "kl": 0.36171188578009605, + "learning_rate": 3.058933972565897e-08, + "loss": 0.0145, + "num_tokens": 32390320.0, + "reward": 0.8031005859375, + "reward_std": 0.0186658576130867, + "rewards//mean": 0.8031005859375, + "rewards//std": 0.028010739013552666, + "step": 4447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8896, + "grad_norm": 1.3695982694625854, + "kl": 0.3198714330792427, + "learning_rate": 3.0480143468303574e-08, + "loss": 0.0128, + "num_tokens": 32397680.0, + "reward": 0.83441162109375, + "reward_std": 0.013837151229381561, + "rewards//mean": 0.83441162109375, + "rewards//std": 0.020131129771471024, + "step": 4448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8898, + "grad_norm": 1.7570843696594238, + "kl": 0.36380529031157494, + "learning_rate": 3.037113633349031e-08, + "loss": 0.0146, + "num_tokens": 32404928.0, + "reward": 0.86578369140625, + "reward_std": 0.01671033538877964, + "rewards//mean": 0.86578369140625, + "rewards//std": 0.023285795003175735, + "step": 4449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.89, + "grad_norm": 1.676579475402832, + "kl": 0.3459554649889469, + "learning_rate": 3.026231836512705e-08, + "loss": 0.0138, + "num_tokens": 32412312.0, + "reward": 0.83099365234375, + "reward_std": 0.01627012901008129, + "rewards//mean": 0.83099365234375, + "rewards//std": 0.025308143347501755, + "step": 4450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8902, + "grad_norm": 1.7147327661514282, + "kl": 0.5098309032619, + "learning_rate": 3.015368960704584e-08, + "loss": 0.0204, + "num_tokens": 32419544.0, + "reward": 0.80889892578125, + "reward_std": 0.01438160240650177, + "rewards//mean": 0.80889892578125, + "rewards//std": 0.02381988801062107, + "step": 4451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8904, + "grad_norm": 1.370969533920288, + "kl": 0.38032881915569305, + "learning_rate": 3.004525010300229e-08, + "loss": 0.0152, + "num_tokens": 32426912.0, + "reward": 0.869384765625, + "reward_std": 0.016517607495188713, + "rewards//mean": 0.869384765625, + "rewards//std": 0.019782302901148796, + "step": 4452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8906, + "grad_norm": 1.5862177610397339, + "kl": 0.3670230507850647, + "learning_rate": 2.993699989667575e-08, + "loss": 0.0147, + "num_tokens": 32434200.0, + "reward": 0.86962890625, + "reward_std": 0.01823972724378109, + "rewards//mean": 0.86962890625, + "rewards//std": 0.022793101146817207, + "step": 4453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8908, + "grad_norm": 1.359232783317566, + "kl": 0.32295599207282066, + "learning_rate": 2.982893903166944e-08, + "loss": 0.0129, + "num_tokens": 32441480.0, + "reward": 0.7821044921875, + "reward_std": 0.009971541352570057, + "rewards//mean": 0.7821044921875, + "rewards//std": 0.02004494145512581, + "step": 4454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.891, + "grad_norm": 1.6181820631027222, + "kl": 0.32340400479733944, + "learning_rate": 2.972106755151027e-08, + "loss": 0.0129, + "num_tokens": 32448824.0, + "reward": 0.87164306640625, + "reward_std": 0.01608187146484852, + "rewards//mean": 0.87164306640625, + "rewards//std": 0.025965917855501175, + "step": 4455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8912, + "grad_norm": 1.6797953844070435, + "kl": 0.37060083262622356, + "learning_rate": 2.9613385499648925e-08, + "loss": 0.0148, + "num_tokens": 32456040.0, + "reward": 0.87542724609375, + "reward_std": 0.018633823841810226, + "rewards//mean": 0.87542724609375, + "rewards//std": 0.02369564026594162, + "step": 4456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8914, + "grad_norm": 1.452024221420288, + "kl": 0.3686896935105324, + "learning_rate": 2.950589291945954e-08, + "loss": 0.0147, + "num_tokens": 32463408.0, + "reward": 0.8734130859375, + "reward_std": 0.019787058234214783, + "rewards//mean": 0.8734130859375, + "rewards//std": 0.031076829880475998, + "step": 4457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8916, + "grad_norm": 1.6069614887237549, + "kl": 0.30727800726890564, + "learning_rate": 2.9398589854240263e-08, + "loss": 0.0123, + "num_tokens": 32470848.0, + "reward": 0.83056640625, + "reward_std": 0.014695718884468079, + "rewards//mean": 0.83056640625, + "rewards//std": 0.016955677419900894, + "step": 4458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8918, + "grad_norm": 1.426316738128662, + "kl": 0.33929793536663055, + "learning_rate": 2.9291476347212685e-08, + "loss": 0.0136, + "num_tokens": 32478120.0, + "reward": 0.822509765625, + "reward_std": 0.015592760406434536, + "rewards//mean": 0.822509765625, + "rewards//std": 0.021120544523000717, + "step": 4459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.892, + "grad_norm": 1.5058932304382324, + "kl": 0.438001561909914, + "learning_rate": 2.9184552441522236e-08, + "loss": 0.0175, + "num_tokens": 32485376.0, + "reward": 0.87335205078125, + "reward_std": 0.013975276611745358, + "rewards//mean": 0.87335205078125, + "rewards//std": 0.024168141186237335, + "step": 4460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8922, + "grad_norm": 2.1488022804260254, + "kl": 0.4643558729439974, + "learning_rate": 2.907781818023769e-08, + "loss": 0.0186, + "num_tokens": 32492728.0, + "reward": 0.838623046875, + "reward_std": 0.016914114356040955, + "rewards//mean": 0.838623046875, + "rewards//std": 0.026810409501194954, + "step": 4461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8924, + "grad_norm": 1.6770350933074951, + "kl": 0.3884905371814966, + "learning_rate": 2.8971273606351655e-08, + "loss": 0.0155, + "num_tokens": 32500040.0, + "reward": 0.7945556640625, + "reward_std": 0.013534259982407093, + "rewards//mean": 0.7945556640625, + "rewards//std": 0.021718084812164307, + "step": 4462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8926, + "grad_norm": 2.123645782470703, + "kl": 0.42405153438448906, + "learning_rate": 2.8864918762780298e-08, + "loss": 0.017, + "num_tokens": 32507344.0, + "reward": 0.83349609375, + "reward_std": 0.02016177773475647, + "rewards//mean": 0.83349609375, + "rewards//std": 0.02860775962471962, + "step": 4463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8928, + "grad_norm": 1.5367326736450195, + "kl": 0.36079894192516804, + "learning_rate": 2.8758753692363358e-08, + "loss": 0.0144, + "num_tokens": 32514616.0, + "reward": 0.86431884765625, + "reward_std": 0.014619659632444382, + "rewards//mean": 0.86431884765625, + "rewards//std": 0.01689474657177925, + "step": 4464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.893, + "grad_norm": 1.4845688343048096, + "kl": 0.40177371725440025, + "learning_rate": 2.865277843786401e-08, + "loss": 0.0161, + "num_tokens": 32521848.0, + "reward": 0.85888671875, + "reward_std": 0.01761963590979576, + "rewards//mean": 0.85888671875, + "rewards//std": 0.02649804763495922, + "step": 4465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8932, + "grad_norm": 1.6397887468338013, + "kl": 0.38952793180942535, + "learning_rate": 2.854699304196917e-08, + "loss": 0.0156, + "num_tokens": 32529112.0, + "reward": 0.8675537109375, + "reward_std": 0.017097553238272667, + "rewards//mean": 0.8675537109375, + "rewards//std": 0.02438676543533802, + "step": 4466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8934, + "grad_norm": 1.3146075010299683, + "kl": 0.38657253235578537, + "learning_rate": 2.8441397547289137e-08, + "loss": 0.0155, + "num_tokens": 32536328.0, + "reward": 0.78839111328125, + "reward_std": 0.01726074516773224, + "rewards//mean": 0.78839111328125, + "rewards//std": 0.02196025289595127, + "step": 4467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8936, + "grad_norm": 1.527091145515442, + "kl": 0.3304063640534878, + "learning_rate": 2.833599199635783e-08, + "loss": 0.0132, + "num_tokens": 32543600.0, + "reward": 0.826904296875, + "reward_std": 0.014080574735999107, + "rewards//mean": 0.826904296875, + "rewards//std": 0.02046828344464302, + "step": 4468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8938, + "grad_norm": 1.5547235012054443, + "kl": 0.3579777367413044, + "learning_rate": 2.823077643163252e-08, + "loss": 0.0143, + "num_tokens": 32550840.0, + "reward": 0.8697509765625, + "reward_std": 0.015280396677553654, + "rewards//mean": 0.8697509765625, + "rewards//std": 0.024805273860692978, + "step": 4469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.894, + "grad_norm": 1.7757577896118164, + "kl": 0.35699666664004326, + "learning_rate": 2.8125750895494015e-08, + "loss": 0.0143, + "num_tokens": 32558096.0, + "reward": 0.8502197265625, + "reward_std": 0.014702596701681614, + "rewards//mean": 0.8502197265625, + "rewards//std": 0.019762014970183372, + "step": 4470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8942, + "grad_norm": 1.6924188137054443, + "kl": 0.36695364117622375, + "learning_rate": 2.8020915430246706e-08, + "loss": 0.0147, + "num_tokens": 32565520.0, + "reward": 0.8095703125, + "reward_std": 0.012304997071623802, + "rewards//mean": 0.8095703125, + "rewards//std": 0.02428007684648037, + "step": 4471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8944, + "grad_norm": 1.8585814237594604, + "kl": 0.41152974404394627, + "learning_rate": 2.7916270078118087e-08, + "loss": 0.0165, + "num_tokens": 32572808.0, + "reward": 0.85723876953125, + "reward_std": 0.01537230797111988, + "rewards//mean": 0.85723876953125, + "rewards//std": 0.02366303652524948, + "step": 4472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8946, + "grad_norm": 1.7264291048049927, + "kl": 0.35942405462265015, + "learning_rate": 2.78118148812595e-08, + "loss": 0.0144, + "num_tokens": 32580048.0, + "reward": 0.864013671875, + "reward_std": 0.019826307892799377, + "rewards//mean": 0.864013671875, + "rewards//std": 0.023145031183958054, + "step": 4473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8948, + "grad_norm": 1.532377004623413, + "kl": 0.39626041054725647, + "learning_rate": 2.7707549881745397e-08, + "loss": 0.0159, + "num_tokens": 32587360.0, + "reward": 0.8720703125, + "reward_std": 0.01947236992418766, + "rewards//mean": 0.8720703125, + "rewards//std": 0.029355719685554504, + "step": 4474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.895, + "grad_norm": 1.4868652820587158, + "kl": 0.3672832138836384, + "learning_rate": 2.7603475121573737e-08, + "loss": 0.0147, + "num_tokens": 32594712.0, + "reward": 0.82122802734375, + "reward_std": 0.013827813789248466, + "rewards//mean": 0.82122802734375, + "rewards//std": 0.019987748935818672, + "step": 4475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.8952, + "grad_norm": 1.709123134613037, + "kl": 0.39046735689044, + "learning_rate": 2.749959064266577e-08, + "loss": 0.0094, + "num_tokens": 32601968.0, + "reward": 0.84454345703125, + "reward_std": 0.02122376672923565, + "rewards//mean": 0.84454345703125, + "rewards//std": 0.023273438215255737, + "step": 4476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8954, + "grad_norm": 1.4249610900878906, + "kl": 0.2781844027340412, + "learning_rate": 2.739589648686619e-08, + "loss": 0.0111, + "num_tokens": 32609216.0, + "reward": 0.85882568359375, + "reward_std": 0.01326545886695385, + "rewards//mean": 0.85882568359375, + "rewards//std": 0.01739797182381153, + "step": 4477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8956, + "grad_norm": 1.4311909675598145, + "kl": 0.37098195403814316, + "learning_rate": 2.729239269594302e-08, + "loss": 0.0148, + "num_tokens": 32616552.0, + "reward": 0.85736083984375, + "reward_std": 0.01248359214514494, + "rewards//mean": 0.85736083984375, + "rewards//std": 0.020879695191979408, + "step": 4478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8958, + "grad_norm": 1.4833438396453857, + "kl": 0.3357194494456053, + "learning_rate": 2.7189079311587593e-08, + "loss": 0.0134, + "num_tokens": 32623896.0, + "reward": 0.80169677734375, + "reward_std": 0.011829864233732224, + "rewards//mean": 0.80169677734375, + "rewards//std": 0.019497022032737732, + "step": 4479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.896, + "grad_norm": 1.4626429080963135, + "kl": 0.3222733326256275, + "learning_rate": 2.7085956375414387e-08, + "loss": 0.0129, + "num_tokens": 32631176.0, + "reward": 0.8739013671875, + "reward_std": 0.018052183091640472, + "rewards//mean": 0.8739013671875, + "rewards//std": 0.021502334624528885, + "step": 4480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8962, + "grad_norm": 1.4633063077926636, + "kl": 0.37089305371046066, + "learning_rate": 2.69830239289614e-08, + "loss": 0.0148, + "num_tokens": 32638416.0, + "reward": 0.83111572265625, + "reward_std": 0.011965997517108917, + "rewards//mean": 0.83111572265625, + "rewards//std": 0.016873229295015335, + "step": 4481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8964, + "grad_norm": 2.014796257019043, + "kl": 0.4790822025388479, + "learning_rate": 2.6880282013689803e-08, + "loss": 0.0192, + "num_tokens": 32645672.0, + "reward": 0.85211181640625, + "reward_std": 0.012764000333845615, + "rewards//mean": 0.85211181640625, + "rewards//std": 0.017131425440311432, + "step": 4482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8966, + "grad_norm": 1.6659165620803833, + "kl": 0.35121648013591766, + "learning_rate": 2.67777306709841e-08, + "loss": 0.014, + "num_tokens": 32652928.0, + "reward": 0.851806640625, + "reward_std": 0.01987200602889061, + "rewards//mean": 0.851806640625, + "rewards//std": 0.02262645959854126, + "step": 4483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8968, + "grad_norm": 1.3849561214447021, + "kl": 0.35634196922183037, + "learning_rate": 2.667536994215186e-08, + "loss": 0.0143, + "num_tokens": 32660216.0, + "reward": 0.83233642578125, + "reward_std": 0.01604793220758438, + "rewards//mean": 0.83233642578125, + "rewards//std": 0.02564150094985962, + "step": 4484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.897, + "grad_norm": 2.080922842025757, + "kl": 0.30679450929164886, + "learning_rate": 2.6573199868423934e-08, + "loss": 0.0123, + "num_tokens": 32667448.0, + "reward": 0.8607177734375, + "reward_std": 0.011388814076781273, + "rewards//mean": 0.8607177734375, + "rewards//std": 0.01707976497709751, + "step": 4485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8972, + "grad_norm": 1.6807440519332886, + "kl": 0.33188471570611, + "learning_rate": 2.6471220490954626e-08, + "loss": 0.0133, + "num_tokens": 32674800.0, + "reward": 0.84716796875, + "reward_std": 0.01388736441731453, + "rewards//mean": 0.84716796875, + "rewards//std": 0.017628002911806107, + "step": 4486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8974, + "grad_norm": 1.6390451192855835, + "kl": 0.3420492634177208, + "learning_rate": 2.6369431850820966e-08, + "loss": 0.0137, + "num_tokens": 32682024.0, + "reward": 0.8157958984375, + "reward_std": 0.01354975812137127, + "rewards//mean": 0.8157958984375, + "rewards//std": 0.01994500681757927, + "step": 4487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8976, + "grad_norm": 1.7935255765914917, + "kl": 0.3920958638191223, + "learning_rate": 2.6267833989023546e-08, + "loss": 0.0157, + "num_tokens": 32689392.0, + "reward": 0.83953857421875, + "reward_std": 0.021847596392035484, + "rewards//mean": 0.83953857421875, + "rewards//std": 0.03341711312532425, + "step": 4488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8978, + "grad_norm": 1.6170144081115723, + "kl": 0.41324716433882713, + "learning_rate": 2.616642694648591e-08, + "loss": 0.0165, + "num_tokens": 32696616.0, + "reward": 0.82904052734375, + "reward_std": 0.02043680101633072, + "rewards//mean": 0.82904052734375, + "rewards//std": 0.02955915406346321, + "step": 4489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.898, + "grad_norm": 1.4323885440826416, + "kl": 0.3466543275862932, + "learning_rate": 2.6065210764054936e-08, + "loss": 0.0139, + "num_tokens": 32703872.0, + "reward": 0.84423828125, + "reward_std": 0.011618506163358688, + "rewards//mean": 0.84423828125, + "rewards//std": 0.01596996560692787, + "step": 4490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8982, + "grad_norm": 1.6140698194503784, + "kl": 0.3647391200065613, + "learning_rate": 2.596418548250029e-08, + "loss": 0.0146, + "num_tokens": 32711184.0, + "reward": 0.8492431640625, + "reward_std": 0.02024863287806511, + "rewards//mean": 0.8492431640625, + "rewards//std": 0.02849084511399269, + "step": 4491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8984, + "grad_norm": 1.4863587617874146, + "kl": 0.3433086145669222, + "learning_rate": 2.5863351142515035e-08, + "loss": 0.0137, + "num_tokens": 32718512.0, + "reward": 0.853759765625, + "reward_std": 0.015460994094610214, + "rewards//mean": 0.853759765625, + "rewards//std": 0.02274390123784542, + "step": 4492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8986, + "grad_norm": 1.6466515064239502, + "kl": 0.29330265522003174, + "learning_rate": 2.5762707784715287e-08, + "loss": 0.0117, + "num_tokens": 32725840.0, + "reward": 0.84149169921875, + "reward_std": 0.01594429835677147, + "rewards//mean": 0.84149169921875, + "rewards//std": 0.023331904783844948, + "step": 4493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8988, + "grad_norm": 1.5130976438522339, + "kl": 0.3584822863340378, + "learning_rate": 2.5662255449640125e-08, + "loss": 0.0143, + "num_tokens": 32733104.0, + "reward": 0.841552734375, + "reward_std": 0.01875952072441578, + "rewards//mean": 0.841552734375, + "rewards//std": 0.02617964707314968, + "step": 4494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.899, + "grad_norm": 1.7252254486083984, + "kl": 0.29944756254553795, + "learning_rate": 2.5561994177751732e-08, + "loss": 0.012, + "num_tokens": 32740360.0, + "reward": 0.8389892578125, + "reward_std": 0.011317943222820759, + "rewards//mean": 0.8389892578125, + "rewards//std": 0.017069127410650253, + "step": 4495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8992, + "grad_norm": 1.2977688312530518, + "kl": 0.2761100046336651, + "learning_rate": 2.5461924009435364e-08, + "loss": 0.011, + "num_tokens": 32747712.0, + "reward": 0.8624267578125, + "reward_std": 0.017425062134861946, + "rewards//mean": 0.8624267578125, + "rewards//std": 0.026231348514556885, + "step": 4496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8994, + "grad_norm": 1.3790017366409302, + "kl": 0.305893637239933, + "learning_rate": 2.536204498499922e-08, + "loss": 0.0122, + "num_tokens": 32755024.0, + "reward": 0.8497314453125, + "reward_std": 0.011500324122607708, + "rewards//mean": 0.8497314453125, + "rewards//std": 0.015421680174767971, + "step": 4497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.8996, + "grad_norm": 1.484510898590088, + "kl": 0.32821579463779926, + "learning_rate": 2.5262357144674673e-08, + "loss": 0.0162, + "num_tokens": 32762362.0, + "reward": 0.8477783203125, + "reward_std": 0.015296096913516521, + "rewards//mean": 0.8477783203125, + "rewards//std": 0.024596910923719406, + "step": 4498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.8998, + "grad_norm": 1.5096423625946045, + "kl": 0.36688076704740524, + "learning_rate": 2.5162860528615826e-08, + "loss": 0.0147, + "num_tokens": 32769666.0, + "reward": 0.80810546875, + "reward_std": 0.008945461362600327, + "rewards//mean": 0.80810546875, + "rewards//std": 0.017572958022356033, + "step": 4499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9, + "grad_norm": 1.5877500772476196, + "kl": 0.32299715653061867, + "learning_rate": 2.506355517689995e-08, + "loss": 0.0129, + "num_tokens": 32776914.0, + "reward": 0.89111328125, + "reward_std": 0.01559511385858059, + "rewards//mean": 0.89111328125, + "rewards//std": 0.019580790773034096, + "step": 4500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.9002, + "grad_norm": 1.9948774576187134, + "kl": 0.3662331886589527, + "learning_rate": 2.4964441129527335e-08, + "loss": 0.0103, + "num_tokens": 32784215.0, + "reward": 0.84881591796875, + "reward_std": 0.017433658242225647, + "rewards//mean": 0.84881591796875, + "rewards//std": 0.021862149238586426, + "step": 4501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9004, + "grad_norm": 1.4551469087600708, + "kl": 0.31662929616868496, + "learning_rate": 2.4865518426420984e-08, + "loss": 0.0127, + "num_tokens": 32791503.0, + "reward": 0.8372802734375, + "reward_std": 0.016640465706586838, + "rewards//mean": 0.8372802734375, + "rewards//std": 0.020719308406114578, + "step": 4502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9006, + "grad_norm": 1.784532070159912, + "kl": 0.37760264426469803, + "learning_rate": 2.4766787107426966e-08, + "loss": 0.0151, + "num_tokens": 32798847.0, + "reward": 0.84832763671875, + "reward_std": 0.01783980056643486, + "rewards//mean": 0.84832763671875, + "rewards//std": 0.025444746017456055, + "step": 4503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.9008, + "grad_norm": 1.5003454685211182, + "kl": 0.3038846254348755, + "learning_rate": 2.4668247212314253e-08, + "loss": 0.0066, + "num_tokens": 32806078.0, + "reward": 0.85693359375, + "reward_std": 0.012660158798098564, + "rewards//mean": 0.85693359375, + "rewards//std": 0.019142966717481613, + "step": 4504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.901, + "grad_norm": 1.4837403297424316, + "kl": 0.32518533430993557, + "learning_rate": 2.4569898780774812e-08, + "loss": 0.013, + "num_tokens": 32813382.0, + "reward": 0.86083984375, + "reward_std": 0.016472090035676956, + "rewards//mean": 0.86083984375, + "rewards//std": 0.022298941388726234, + "step": 4505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.9012, + "grad_norm": 1.5979838371276855, + "kl": 0.4858356639742851, + "learning_rate": 2.4471741852423233e-08, + "loss": 0.0112, + "num_tokens": 32820741.0, + "reward": 0.8095703125, + "reward_std": 0.01391022652387619, + "rewards//mean": 0.8095703125, + "rewards//std": 0.017839696258306503, + "step": 4506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9014, + "grad_norm": 1.253968358039856, + "kl": 0.33172405138611794, + "learning_rate": 2.437377646679717e-08, + "loss": 0.0133, + "num_tokens": 32828021.0, + "reward": 0.8414306640625, + "reward_std": 0.015980685129761696, + "rewards//mean": 0.8414306640625, + "rewards//std": 0.022795425727963448, + "step": 4507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9016, + "grad_norm": 1.4650753736495972, + "kl": 0.38719886541366577, + "learning_rate": 2.4276002663357e-08, + "loss": 0.0155, + "num_tokens": 32835333.0, + "reward": 0.79595947265625, + "reward_std": 0.012778854928910732, + "rewards//mean": 0.79595947265625, + "rewards//std": 0.020946288481354713, + "step": 4508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9018, + "grad_norm": 1.7424938678741455, + "kl": 0.34398496709764004, + "learning_rate": 2.4178420481486215e-08, + "loss": 0.0138, + "num_tokens": 32842565.0, + "reward": 0.8350830078125, + "reward_std": 0.01265160646289587, + "rewards//mean": 0.8350830078125, + "rewards//std": 0.023462999612092972, + "step": 4509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.902, + "grad_norm": 1.3495311737060547, + "kl": 0.4236586019396782, + "learning_rate": 2.4081029960490663e-08, + "loss": 0.0169, + "num_tokens": 32849781.0, + "reward": 0.88958740234375, + "reward_std": 0.015290739946067333, + "rewards//mean": 0.88958740234375, + "rewards//std": 0.02054348960518837, + "step": 4510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.9022, + "grad_norm": 1.605559229850769, + "kl": 0.34961652010679245, + "learning_rate": 2.3983831139599286e-08, + "loss": 0.014, + "num_tokens": 32857088.0, + "reward": 0.84783935546875, + "reward_std": 0.014366204850375652, + "rewards//mean": 0.84783935546875, + "rewards//std": 0.017096929252147675, + "step": 4511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.9024, + "grad_norm": 1.6359548568725586, + "kl": 0.36244893446564674, + "learning_rate": 2.388682405796383e-08, + "loss": 0.0074, + "num_tokens": 32864368.0, + "reward": 0.8450927734375, + "reward_std": 0.01221681758761406, + "rewards//mean": 0.8450927734375, + "rewards//std": 0.019847620278596878, + "step": 4512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9026, + "grad_norm": 1.7240041494369507, + "kl": 0.3717503249645233, + "learning_rate": 2.379000875465881e-08, + "loss": 0.0149, + "num_tokens": 32871688.0, + "reward": 0.85235595703125, + "reward_std": 0.014558492228388786, + "rewards//mean": 0.85235595703125, + "rewards//std": 0.027570275589823723, + "step": 4513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9028, + "grad_norm": 1.6703267097473145, + "kl": 0.3465522304177284, + "learning_rate": 2.36933852686812e-08, + "loss": 0.0139, + "num_tokens": 32878976.0, + "reward": 0.84124755859375, + "reward_std": 0.01837472803890705, + "rewards//mean": 0.84124755859375, + "rewards//std": 0.021703006699681282, + "step": 4514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.903, + "grad_norm": 4.254644870758057, + "kl": 0.6923658847808838, + "learning_rate": 2.359695363895109e-08, + "loss": 0.0277, + "num_tokens": 32886272.0, + "reward": 0.86810302734375, + "reward_std": 0.01704494282603264, + "rewards//mean": 0.86810302734375, + "rewards//std": 0.020216671749949455, + "step": 4515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.9032, + "grad_norm": 1.4336652755737305, + "kl": 0.31744270771741867, + "learning_rate": 2.350071390431102e-08, + "loss": 0.0153, + "num_tokens": 32893527.0, + "reward": 0.88885498046875, + "reward_std": 0.015518415719270706, + "rewards//mean": 0.88885498046875, + "rewards//std": 0.021269723773002625, + "step": 4516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.9034, + "grad_norm": 1.4750488996505737, + "kl": 0.2838269993662834, + "learning_rate": 2.3404666103526537e-08, + "loss": 0.0117, + "num_tokens": 32900906.0, + "reward": 0.84857177734375, + "reward_std": 0.020793620496988297, + "rewards//mean": 0.84857177734375, + "rewards//std": 0.02854200452566147, + "step": 4517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.9036, + "grad_norm": 1.5815972089767456, + "kl": 0.45490653067827225, + "learning_rate": 2.3308810275285416e-08, + "loss": 0.0029, + "num_tokens": 32908137.0, + "reward": 0.84881591796875, + "reward_std": 0.01996319554746151, + "rewards//mean": 0.84881591796875, + "rewards//std": 0.025052646175026894, + "step": 4518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9038, + "grad_norm": 1.4869297742843628, + "kl": 0.36391124315559864, + "learning_rate": 2.3213146458198553e-08, + "loss": 0.0146, + "num_tokens": 32915409.0, + "reward": 0.8487548828125, + "reward_std": 0.013554678298532963, + "rewards//mean": 0.8487548828125, + "rewards//std": 0.02018941566348076, + "step": 4519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.904, + "grad_norm": 2.777827739715576, + "kl": 0.4561477266252041, + "learning_rate": 2.311767469079934e-08, + "loss": 0.0182, + "num_tokens": 32922769.0, + "reward": 0.854248046875, + "reward_std": 0.0131695456802845, + "rewards//mean": 0.854248046875, + "rewards//std": 0.017591899260878563, + "step": 4520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9042, + "grad_norm": 1.7089899778366089, + "kl": 0.3517397791147232, + "learning_rate": 2.3022395011543682e-08, + "loss": 0.0141, + "num_tokens": 32929953.0, + "reward": 0.803466796875, + "reward_std": 0.018367884680628777, + "rewards//mean": 0.803466796875, + "rewards//std": 0.021551910787820816, + "step": 4521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9044, + "grad_norm": 1.4580795764923096, + "kl": 0.3690270483493805, + "learning_rate": 2.2927307458810218e-08, + "loss": 0.0148, + "num_tokens": 32937345.0, + "reward": 0.83770751953125, + "reward_std": 0.013674251735210419, + "rewards//mean": 0.83770751953125, + "rewards//std": 0.02227100543677807, + "step": 4522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9046, + "grad_norm": 1.4114179611206055, + "kl": 0.3433406502008438, + "learning_rate": 2.283241207090031e-08, + "loss": 0.0137, + "num_tokens": 32944625.0, + "reward": 0.85870361328125, + "reward_std": 0.01400452945381403, + "rewards//mean": 0.85870361328125, + "rewards//std": 0.01904217340052128, + "step": 4523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9048, + "grad_norm": 1.5013134479522705, + "kl": 0.3480869308114052, + "learning_rate": 2.273770888603782e-08, + "loss": 0.0139, + "num_tokens": 32951929.0, + "reward": 0.87420654296875, + "reward_std": 0.020277278497815132, + "rewards//mean": 0.87420654296875, + "rewards//std": 0.024251926690340042, + "step": 4524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.905, + "grad_norm": 1.5016202926635742, + "kl": 0.2852180153131485, + "learning_rate": 2.264319794236902e-08, + "loss": 0.0116, + "num_tokens": 32959179.0, + "reward": 0.83843994140625, + "reward_std": 0.013162139803171158, + "rewards//mean": 0.83843994140625, + "rewards//std": 0.019922509789466858, + "step": 4525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9052, + "grad_norm": 1.7164223194122314, + "kl": 0.5064308159053326, + "learning_rate": 2.2548879277963063e-08, + "loss": 0.0203, + "num_tokens": 32966403.0, + "reward": 0.87042236328125, + "reward_std": 0.015761129558086395, + "rewards//mean": 0.87042236328125, + "rewards//std": 0.018355676904320717, + "step": 4526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9054, + "grad_norm": 1.5532580614089966, + "kl": 0.5003973357379436, + "learning_rate": 2.2454752930811393e-08, + "loss": 0.02, + "num_tokens": 32973595.0, + "reward": 0.8448486328125, + "reward_std": 0.01613466814160347, + "rewards//mean": 0.8448486328125, + "rewards//std": 0.019765079021453857, + "step": 4527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9056, + "grad_norm": 1.5018435716629028, + "kl": 0.3610619381070137, + "learning_rate": 2.2360818938828187e-08, + "loss": 0.0144, + "num_tokens": 32980859.0, + "reward": 0.8382568359375, + "reward_std": 0.012993678450584412, + "rewards//mean": 0.8382568359375, + "rewards//std": 0.023732423782348633, + "step": 4528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9058, + "grad_norm": 1.6301559209823608, + "kl": 0.3432910144329071, + "learning_rate": 2.226707733984995e-08, + "loss": 0.0137, + "num_tokens": 32988211.0, + "reward": 0.850341796875, + "reward_std": 0.014470256865024567, + "rewards//mean": 0.850341796875, + "rewards//std": 0.01701805740594864, + "step": 4529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.906, + "grad_norm": 1.5379111766815186, + "kl": 0.3234483767300844, + "learning_rate": 2.2173528171635814e-08, + "loss": 0.0129, + "num_tokens": 32995459.0, + "reward": 0.8106689453125, + "reward_std": 0.012701722793281078, + "rewards//mean": 0.8106689453125, + "rewards//std": 0.021258778870105743, + "step": 4530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9062, + "grad_norm": 1.6622810363769531, + "kl": 0.3268700521439314, + "learning_rate": 2.208017147186736e-08, + "loss": 0.0131, + "num_tokens": 33002771.0, + "reward": 0.87103271484375, + "reward_std": 0.014483044855296612, + "rewards//mean": 0.87103271484375, + "rewards//std": 0.02081579715013504, + "step": 4531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.9064, + "grad_norm": 1.583402395248413, + "kl": 0.369486540555954, + "learning_rate": 2.198700727814884e-08, + "loss": 0.0142, + "num_tokens": 33009998.0, + "reward": 0.84893798828125, + "reward_std": 0.016334760934114456, + "rewards//mean": 0.84893798828125, + "rewards//std": 0.025620238855481148, + "step": 4532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9066, + "grad_norm": 1.552356481552124, + "kl": 0.3337889052927494, + "learning_rate": 2.1894035628006515e-08, + "loss": 0.0134, + "num_tokens": 33017318.0, + "reward": 0.78424072265625, + "reward_std": 0.01142684556543827, + "rewards//mean": 0.78424072265625, + "rewards//std": 0.014677731320261955, + "step": 4533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9068, + "grad_norm": 1.4767251014709473, + "kl": 0.3716042637825012, + "learning_rate": 2.180125655888948e-08, + "loss": 0.0149, + "num_tokens": 33024558.0, + "reward": 0.8458251953125, + "reward_std": 0.016410503536462784, + "rewards//mean": 0.8458251953125, + "rewards//std": 0.021854272112250328, + "step": 4534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.907, + "grad_norm": 1.4505834579467773, + "kl": 0.4246042910963297, + "learning_rate": 2.170867010816907e-08, + "loss": 0.017, + "num_tokens": 33031742.0, + "reward": 0.80206298828125, + "reward_std": 0.016411446034908295, + "rewards//mean": 0.80206298828125, + "rewards//std": 0.024050099775195122, + "step": 4535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9072, + "grad_norm": 1.365291714668274, + "kl": 0.36390843242406845, + "learning_rate": 2.1616276313139225e-08, + "loss": 0.0146, + "num_tokens": 33039038.0, + "reward": 0.80535888671875, + "reward_std": 0.01191666815429926, + "rewards//mean": 0.80535888671875, + "rewards//std": 0.02000969834625721, + "step": 4536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9074, + "grad_norm": 1.4279018640518188, + "kl": 0.3326105084270239, + "learning_rate": 2.1524075211016013e-08, + "loss": 0.0133, + "num_tokens": 33046398.0, + "reward": 0.85809326171875, + "reward_std": 0.018369358032941818, + "rewards//mean": 0.85809326171875, + "rewards//std": 0.02274581603705883, + "step": 4537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9076, + "grad_norm": 1.5274666547775269, + "kl": 0.36576418578624725, + "learning_rate": 2.1432066838938056e-08, + "loss": 0.0146, + "num_tokens": 33053638.0, + "reward": 0.8062744140625, + "reward_std": 0.012244398705661297, + "rewards//mean": 0.8062744140625, + "rewards//std": 0.022581923753023148, + "step": 4538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9078, + "grad_norm": 2.408151388168335, + "kl": 0.476013520732522, + "learning_rate": 2.1340251233966377e-08, + "loss": 0.019, + "num_tokens": 33060830.0, + "reward": 0.86981201171875, + "reward_std": 0.014559960924088955, + "rewards//mean": 0.86981201171875, + "rewards//std": 0.025287199765443802, + "step": 4539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.908, + "grad_norm": 1.53707754611969, + "kl": 0.33742501214146614, + "learning_rate": 2.1248628433084337e-08, + "loss": 0.0088, + "num_tokens": 33068113.0, + "reward": 0.88531494140625, + "reward_std": 0.019272083416581154, + "rewards//mean": 0.88531494140625, + "rewards//std": 0.02553797885775566, + "step": 4540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9082, + "grad_norm": 1.6993283033370972, + "kl": 0.32757556438446045, + "learning_rate": 2.1157198473197413e-08, + "loss": 0.0131, + "num_tokens": 33075553.0, + "reward": 0.81646728515625, + "reward_std": 0.011613219045102596, + "rewards//mean": 0.81646728515625, + "rewards//std": 0.016754386946558952, + "step": 4541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9084, + "grad_norm": 1.90936279296875, + "kl": 0.3403279520571232, + "learning_rate": 2.10659613911337e-08, + "loss": 0.0136, + "num_tokens": 33082833.0, + "reward": 0.8038330078125, + "reward_std": 0.01780633255839348, + "rewards//mean": 0.8038330078125, + "rewards//std": 0.022856438532471657, + "step": 4542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9086, + "grad_norm": 1.3995543718338013, + "kl": 0.3553305082023144, + "learning_rate": 2.0974917223643417e-08, + "loss": 0.0142, + "num_tokens": 33090065.0, + "reward": 0.82098388671875, + "reward_std": 0.014105871319770813, + "rewards//mean": 0.82098388671875, + "rewards//std": 0.0230787992477417, + "step": 4543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.9088, + "grad_norm": 1.8175773620605469, + "kl": 0.42927582934498787, + "learning_rate": 2.0884066007399337e-08, + "loss": 0.0022, + "num_tokens": 33097334.0, + "reward": 0.85858154296875, + "reward_std": 0.020147085189819336, + "rewards//mean": 0.85858154296875, + "rewards//std": 0.02371671237051487, + "step": 4544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.909, + "grad_norm": 1.5954092741012573, + "kl": 0.3226478863507509, + "learning_rate": 2.079340777899602e-08, + "loss": 0.0129, + "num_tokens": 33104622.0, + "reward": 0.86431884765625, + "reward_std": 0.013793978840112686, + "rewards//mean": 0.86431884765625, + "rewards//std": 0.01645904965698719, + "step": 4545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9092, + "grad_norm": 1.7037746906280518, + "kl": 0.3776575066149235, + "learning_rate": 2.070294257495081e-08, + "loss": 0.0151, + "num_tokens": 33111870.0, + "reward": 0.87603759765625, + "reward_std": 0.022713765501976013, + "rewards//mean": 0.87603759765625, + "rewards//std": 0.026532230898737907, + "step": 4546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.9094, + "grad_norm": 1.9099295139312744, + "kl": 0.3551754467189312, + "learning_rate": 2.0612670431703062e-08, + "loss": -0.0175, + "num_tokens": 33119140.0, + "reward": 0.8748779296875, + "reward_std": 0.015727322548627853, + "rewards//mean": 0.8748779296875, + "rewards//std": 0.026245195418596268, + "step": 4547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.9096, + "grad_norm": 1.7044501304626465, + "kl": 0.37031604163348675, + "learning_rate": 2.052259138561424e-08, + "loss": 0.0146, + "num_tokens": 33126403.0, + "reward": 0.76824951171875, + "reward_std": 0.014178345911204815, + "rewards//mean": 0.76824951171875, + "rewards//std": 0.02202082984149456, + "step": 4548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.9098, + "grad_norm": 1.8954366445541382, + "kl": 0.33687857538461685, + "learning_rate": 2.0432705472968325e-08, + "loss": -0.0137, + "num_tokens": 33133733.0, + "reward": 0.8499755859375, + "reward_std": 0.017492497339844704, + "rewards//mean": 0.8499755859375, + "rewards//std": 0.02615969255566597, + "step": 4549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.91, + "grad_norm": 1.5480455160140991, + "kl": 0.34940280951559544, + "learning_rate": 2.0343012729971243e-08, + "loss": 0.014, + "num_tokens": 33141013.0, + "reward": 0.84869384765625, + "reward_std": 0.017550308257341385, + "rewards//mean": 0.84869384765625, + "rewards//std": 0.023263679817318916, + "step": 4550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9102, + "grad_norm": 1.4821313619613647, + "kl": 0.36272356659173965, + "learning_rate": 2.025351319275137e-08, + "loss": 0.0145, + "num_tokens": 33148285.0, + "reward": 0.86138916015625, + "reward_std": 0.016581695526838303, + "rewards//mean": 0.86138916015625, + "rewards//std": 0.024419255554676056, + "step": 4551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9104, + "grad_norm": 1.5356181859970093, + "kl": 0.35149649903178215, + "learning_rate": 2.0164206897358927e-08, + "loss": 0.0141, + "num_tokens": 33155565.0, + "reward": 0.8587646484375, + "reward_std": 0.01269182562828064, + "rewards//mean": 0.8587646484375, + "rewards//std": 0.01776103302836418, + "step": 4552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.578125, + "epoch": 0.9106, + "grad_norm": 1.6154439449310303, + "kl": 0.3281309213489294, + "learning_rate": 2.007509387976658e-08, + "loss": -0.0173, + "num_tokens": 33162898.0, + "reward": 0.85345458984375, + "reward_std": 0.015360802412033081, + "rewards//mean": 0.85345458984375, + "rewards//std": 0.02374541573226452, + "step": 4553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9108, + "grad_norm": 1.7102718353271484, + "kl": 0.3223665989935398, + "learning_rate": 1.9986174175869008e-08, + "loss": 0.0129, + "num_tokens": 33170186.0, + "reward": 0.85906982421875, + "reward_std": 0.013588209636509418, + "rewards//mean": 0.85906982421875, + "rewards//std": 0.013465343043208122, + "step": 4554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.911, + "grad_norm": 1.4130429029464722, + "kl": 0.3592616692185402, + "learning_rate": 1.9897447821483115e-08, + "loss": 0.0076, + "num_tokens": 33177407.0, + "reward": 0.8656005859375, + "reward_std": 0.01411796547472477, + "rewards//mean": 0.8656005859375, + "rewards//std": 0.02355829067528248, + "step": 4555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9112, + "grad_norm": 1.4583559036254883, + "kl": 0.33746632747352123, + "learning_rate": 1.9808914852347812e-08, + "loss": 0.0135, + "num_tokens": 33184679.0, + "reward": 0.80596923828125, + "reward_std": 0.014706170186400414, + "rewards//mean": 0.80596923828125, + "rewards//std": 0.01999153383076191, + "step": 4556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9114, + "grad_norm": 1.6453462839126587, + "kl": 0.38000495359301567, + "learning_rate": 1.972057530412413e-08, + "loss": 0.0152, + "num_tokens": 33192087.0, + "reward": 0.87237548828125, + "reward_std": 0.012961952947080135, + "rewards//mean": 0.87237548828125, + "rewards//std": 0.019010350108146667, + "step": 4557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9116, + "grad_norm": 2.36561918258667, + "kl": 0.48010801896452904, + "learning_rate": 1.963242921239533e-08, + "loss": 0.0192, + "num_tokens": 33199415.0, + "reward": 0.8807373046875, + "reward_std": 0.01628914102911949, + "rewards//mean": 0.8807373046875, + "rewards//std": 0.024919738993048668, + "step": 4558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9118, + "grad_norm": 1.3408225774765015, + "kl": 0.33757750503718853, + "learning_rate": 1.9544476612666672e-08, + "loss": 0.0135, + "num_tokens": 33206743.0, + "reward": 0.8321533203125, + "reward_std": 0.01470895018428564, + "rewards//mean": 0.8321533203125, + "rewards//std": 0.0241998303681612, + "step": 4559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.912, + "grad_norm": 1.6005979776382446, + "kl": 0.41016847640275955, + "learning_rate": 1.9456717540365264e-08, + "loss": 0.0164, + "num_tokens": 33214103.0, + "reward": 0.767333984375, + "reward_std": 0.01862824521958828, + "rewards//mean": 0.767333984375, + "rewards//std": 0.02683749608695507, + "step": 4560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9122, + "grad_norm": 1.839993953704834, + "kl": 0.3701367750763893, + "learning_rate": 1.936915203084055e-08, + "loss": 0.0148, + "num_tokens": 33221511.0, + "reward": 0.83636474609375, + "reward_std": 0.013489417731761932, + "rewards//mean": 0.83636474609375, + "rewards//std": 0.017219560220837593, + "step": 4561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9124, + "grad_norm": 1.2368366718292236, + "kl": 0.33463942632079124, + "learning_rate": 1.928178011936399e-08, + "loss": 0.0134, + "num_tokens": 33228735.0, + "reward": 0.8551025390625, + "reward_std": 0.011095106601715088, + "rewards//mean": 0.8551025390625, + "rewards//std": 0.017984632402658463, + "step": 4562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9126, + "grad_norm": 1.6291916370391846, + "kl": 0.34026291593909264, + "learning_rate": 1.9194601841128922e-08, + "loss": 0.0136, + "num_tokens": 33235967.0, + "reward": 0.87060546875, + "reward_std": 0.013285147957503796, + "rewards//mean": 0.87060546875, + "rewards//std": 0.025614995509386063, + "step": 4563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9128, + "grad_norm": 1.455024242401123, + "kl": 0.3373980727046728, + "learning_rate": 1.9107617231250707e-08, + "loss": 0.0135, + "num_tokens": 33243271.0, + "reward": 0.8984375, + "reward_std": 0.018022645264863968, + "rewards//mean": 0.8984375, + "rewards//std": 0.02244509570300579, + "step": 4564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.913, + "grad_norm": 1.4973222017288208, + "kl": 0.34061742573976517, + "learning_rate": 1.9020826324766702e-08, + "loss": 0.0155, + "num_tokens": 33250584.0, + "reward": 0.84844970703125, + "reward_std": 0.01784159243106842, + "rewards//mean": 0.84844970703125, + "rewards//std": 0.026549911126494408, + "step": 4565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9132, + "grad_norm": 1.3206374645233154, + "kl": 0.28921122290194035, + "learning_rate": 1.893422915663645e-08, + "loss": 0.0116, + "num_tokens": 33257800.0, + "reward": 0.814208984375, + "reward_std": 0.013477440923452377, + "rewards//mean": 0.814208984375, + "rewards//std": 0.017481409013271332, + "step": 4566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9134, + "grad_norm": 1.716090440750122, + "kl": 0.31614818051457405, + "learning_rate": 1.88478257617411e-08, + "loss": 0.0126, + "num_tokens": 33265152.0, + "reward": 0.83673095703125, + "reward_std": 0.017183100804686546, + "rewards//mean": 0.83673095703125, + "rewards//std": 0.018625836819410324, + "step": 4567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.9136, + "grad_norm": 1.4333384037017822, + "kl": 0.3626956082880497, + "learning_rate": 1.8761616174883977e-08, + "loss": 0.0051, + "num_tokens": 33272462.0, + "reward": 0.83575439453125, + "reward_std": 0.013897364027798176, + "rewards//mean": 0.83575439453125, + "rewards//std": 0.020403748378157616, + "step": 4568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9138, + "grad_norm": 1.6345735788345337, + "kl": 0.44061511382460594, + "learning_rate": 1.8675600430790306e-08, + "loss": 0.0176, + "num_tokens": 33279622.0, + "reward": 0.88165283203125, + "reward_std": 0.018394194543361664, + "rewards//mean": 0.88165283203125, + "rewards//std": 0.026889830827713013, + "step": 4569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.914, + "grad_norm": 1.6505047082901, + "kl": 0.377115823328495, + "learning_rate": 1.8589778564107262e-08, + "loss": 0.0151, + "num_tokens": 33286886.0, + "reward": 0.81524658203125, + "reward_std": 0.014645840972661972, + "rewards//mean": 0.81524658203125, + "rewards//std": 0.02034728415310383, + "step": 4570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9142, + "grad_norm": 1.5877903699874878, + "kl": 0.40958403050899506, + "learning_rate": 1.8504150609403856e-08, + "loss": 0.0164, + "num_tokens": 33294222.0, + "reward": 0.83367919921875, + "reward_std": 0.016292477026581764, + "rewards//mean": 0.83367919921875, + "rewards//std": 0.02341480366885662, + "step": 4571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9144, + "grad_norm": 1.4832303524017334, + "kl": 0.29857940413057804, + "learning_rate": 1.8418716601170947e-08, + "loss": 0.0119, + "num_tokens": 33301502.0, + "reward": 0.81927490234375, + "reward_std": 0.013630274683237076, + "rewards//mean": 0.81927490234375, + "rewards//std": 0.021193435415625572, + "step": 4572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9146, + "grad_norm": 1.5987635850906372, + "kl": 0.36742229014635086, + "learning_rate": 1.8333476573821394e-08, + "loss": 0.0147, + "num_tokens": 33308822.0, + "reward": 0.82293701171875, + "reward_std": 0.011042647995054722, + "rewards//mean": 0.82293701171875, + "rewards//std": 0.014466854743659496, + "step": 4573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9148, + "grad_norm": 1.7073404788970947, + "kl": 0.49276550114154816, + "learning_rate": 1.8248430561689954e-08, + "loss": 0.0197, + "num_tokens": 33316070.0, + "reward": 0.8692626953125, + "reward_std": 0.018821457400918007, + "rewards//mean": 0.8692626953125, + "rewards//std": 0.026910440996289253, + "step": 4574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.915, + "grad_norm": 1.4092655181884766, + "kl": 0.34774870052933693, + "learning_rate": 1.8163578599033004e-08, + "loss": -0.0196, + "num_tokens": 33323452.0, + "reward": 0.86370849609375, + "reward_std": 0.01813596487045288, + "rewards//mean": 0.86370849609375, + "rewards//std": 0.027892882004380226, + "step": 4575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9152, + "grad_norm": 1.5556390285491943, + "kl": 0.32791332341730595, + "learning_rate": 1.807892072002898e-08, + "loss": 0.0131, + "num_tokens": 33330820.0, + "reward": 0.86053466796875, + "reward_std": 0.014032919891178608, + "rewards//mean": 0.86053466796875, + "rewards//std": 0.022290028631687164, + "step": 4576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9154, + "grad_norm": 1.4349385499954224, + "kl": 0.40412434563040733, + "learning_rate": 1.799445695877805e-08, + "loss": 0.0162, + "num_tokens": 33338140.0, + "reward": 0.84716796875, + "reward_std": 0.010296618565917015, + "rewards//mean": 0.84716796875, + "rewards//std": 0.013858823105692863, + "step": 4577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9156, + "grad_norm": 1.8287577629089355, + "kl": 0.3625735305249691, + "learning_rate": 1.7910187349302275e-08, + "loss": 0.0145, + "num_tokens": 33345460.0, + "reward": 0.84417724609375, + "reward_std": 0.017469875514507294, + "rewards//mean": 0.84417724609375, + "rewards//std": 0.023407690227031708, + "step": 4578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9158, + "grad_norm": 1.9869333505630493, + "kl": 0.34451817721128464, + "learning_rate": 1.782611192554534e-08, + "loss": 0.0138, + "num_tokens": 33352788.0, + "reward": 0.86474609375, + "reward_std": 0.015079973265528679, + "rewards//mean": 0.86474609375, + "rewards//std": 0.016492240130901337, + "step": 4579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.916, + "grad_norm": 1.8295468091964722, + "kl": 0.47221143916249275, + "learning_rate": 1.774223072137282e-08, + "loss": 0.0189, + "num_tokens": 33360068.0, + "reward": 0.8385009765625, + "reward_std": 0.015066659078001976, + "rewards//mean": 0.8385009765625, + "rewards//std": 0.021078573539853096, + "step": 4580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.9162, + "grad_norm": 1.410413384437561, + "kl": 0.28660681284964085, + "learning_rate": 1.7658543770572186e-08, + "loss": 0.0106, + "num_tokens": 33367513.0, + "reward": 0.7967529296875, + "reward_std": 0.013072768226265907, + "rewards//mean": 0.7967529296875, + "rewards//std": 0.02221698872745037, + "step": 4581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.9164, + "grad_norm": 1.3638978004455566, + "kl": 0.3303449060767889, + "learning_rate": 1.757505110685237e-08, + "loss": 0.0084, + "num_tokens": 33374815.0, + "reward": 0.86053466796875, + "reward_std": 0.011543812230229378, + "rewards//mean": 0.86053466796875, + "rewards//std": 0.01514771580696106, + "step": 4582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9166, + "grad_norm": 2.2223012447357178, + "kl": 0.4743575472384691, + "learning_rate": 1.7491752763844292e-08, + "loss": 0.019, + "num_tokens": 33382087.0, + "reward": 0.86407470703125, + "reward_std": 0.01917937956750393, + "rewards//mean": 0.86407470703125, + "rewards//std": 0.024716857820749283, + "step": 4583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9168, + "grad_norm": 1.640597939491272, + "kl": 0.3358616456389427, + "learning_rate": 1.7408648775100455e-08, + "loss": 0.0134, + "num_tokens": 33389327.0, + "reward": 0.80767822265625, + "reward_std": 0.023933619260787964, + "rewards//mean": 0.80767822265625, + "rewards//std": 0.03681362792849541, + "step": 4584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.917, + "grad_norm": 1.350784182548523, + "kl": 0.3531596437096596, + "learning_rate": 1.73257391740953e-08, + "loss": 0.0141, + "num_tokens": 33396751.0, + "reward": 0.86956787109375, + "reward_std": 0.014509846456348896, + "rewards//mean": 0.86956787109375, + "rewards//std": 0.02307945489883423, + "step": 4585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9172, + "grad_norm": 1.6743545532226562, + "kl": 0.38045815750956535, + "learning_rate": 1.724302399422456e-08, + "loss": 0.0152, + "num_tokens": 33404031.0, + "reward": 0.81805419921875, + "reward_std": 0.01970217376947403, + "rewards//mean": 0.81805419921875, + "rewards//std": 0.024918144568800926, + "step": 4586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9174, + "grad_norm": 1.5268124341964722, + "kl": 0.3031890280544758, + "learning_rate": 1.7160503268806082e-08, + "loss": 0.0121, + "num_tokens": 33411255.0, + "reward": 0.83929443359375, + "reward_std": 0.015268910676240921, + "rewards//mean": 0.83929443359375, + "rewards//std": 0.021619146689772606, + "step": 4587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9176, + "grad_norm": 1.5377789735794067, + "kl": 0.4053388349711895, + "learning_rate": 1.707817703107911e-08, + "loss": 0.0162, + "num_tokens": 33418519.0, + "reward": 0.8487548828125, + "reward_std": 0.017768707126379013, + "rewards//mean": 0.8487548828125, + "rewards//std": 0.021298617124557495, + "step": 4588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9178, + "grad_norm": 1.571900486946106, + "kl": 0.3303512968122959, + "learning_rate": 1.699604531420473e-08, + "loss": 0.0132, + "num_tokens": 33425903.0, + "reward": 0.86920166015625, + "reward_std": 0.019488748162984848, + "rewards//mean": 0.86920166015625, + "rewards//std": 0.025036325678229332, + "step": 4589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.918, + "grad_norm": 1.4416414499282837, + "kl": 0.3520708605647087, + "learning_rate": 1.691410815126554e-08, + "loss": 0.0141, + "num_tokens": 33433175.0, + "reward": 0.82269287109375, + "reward_std": 0.014576714485883713, + "rewards//mean": 0.82269287109375, + "rewards//std": 0.03295228257775307, + "step": 4590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9182, + "grad_norm": 1.7973352670669556, + "kl": 0.31708630733191967, + "learning_rate": 1.683236557526574e-08, + "loss": 0.0127, + "num_tokens": 33440487.0, + "reward": 0.821533203125, + "reward_std": 0.013134103268384933, + "rewards//mean": 0.821533203125, + "rewards//std": 0.02662002108991146, + "step": 4591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.9184, + "grad_norm": 1.4609709978103638, + "kl": 0.3406752645969391, + "learning_rate": 1.675081761913133e-08, + "loss": 0.007, + "num_tokens": 33447733.0, + "reward": 0.85699462890625, + "reward_std": 0.015192190185189247, + "rewards//mean": 0.85699462890625, + "rewards//std": 0.02519664354622364, + "step": 4592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9186, + "grad_norm": 1.694146752357483, + "kl": 0.39293621480464935, + "learning_rate": 1.666946431570987e-08, + "loss": 0.0157, + "num_tokens": 33454965.0, + "reward": 0.88037109375, + "reward_std": 0.01732385903596878, + "rewards//mean": 0.88037109375, + "rewards//std": 0.031207343563437462, + "step": 4593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.9188, + "grad_norm": 1.742443561553955, + "kl": 0.3574513643980026, + "learning_rate": 1.658830569777031e-08, + "loss": 0.0118, + "num_tokens": 33462256.0, + "reward": 0.85723876953125, + "reward_std": 0.014932326972484589, + "rewards//mean": 0.85723876953125, + "rewards//std": 0.01790650002658367, + "step": 4594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.919, + "grad_norm": 1.3734056949615479, + "kl": 0.28707289323210716, + "learning_rate": 1.6507341798003394e-08, + "loss": 0.0115, + "num_tokens": 33469464.0, + "reward": 0.85198974609375, + "reward_std": 0.01347331516444683, + "rewards//mean": 0.85198974609375, + "rewards//std": 0.019489256665110588, + "step": 4595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9192, + "grad_norm": 1.558181881904602, + "kl": 0.30588754266500473, + "learning_rate": 1.6426572649021474e-08, + "loss": 0.0122, + "num_tokens": 33476824.0, + "reward": 0.86126708984375, + "reward_std": 0.014095829799771309, + "rewards//mean": 0.86126708984375, + "rewards//std": 0.0180957093834877, + "step": 4596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.9194, + "grad_norm": 1.403488039970398, + "kl": 0.32893023267388344, + "learning_rate": 1.6345998283358143e-08, + "loss": 0.0165, + "num_tokens": 33484098.0, + "reward": 0.87353515625, + "reward_std": 0.015671808272600174, + "rewards//mean": 0.87353515625, + "rewards//std": 0.020908765494823456, + "step": 4597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9196, + "grad_norm": 1.4121499061584473, + "kl": 0.3456408344209194, + "learning_rate": 1.6265618733468933e-08, + "loss": 0.0138, + "num_tokens": 33491298.0, + "reward": 0.77569580078125, + "reward_std": 0.015675555914640427, + "rewards//mean": 0.77569580078125, + "rewards//std": 0.02539949305355549, + "step": 4598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9198, + "grad_norm": 1.4072465896606445, + "kl": 0.3406732529401779, + "learning_rate": 1.6185434031730617e-08, + "loss": 0.0136, + "num_tokens": 33498682.0, + "reward": 0.83135986328125, + "reward_std": 0.01256217435002327, + "rewards//mean": 0.83135986328125, + "rewards//std": 0.015161700546741486, + "step": 4599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.625, + "epoch": 0.92, + "grad_norm": 1.652827501296997, + "kl": 0.4358820542693138, + "learning_rate": 1.6105444210441686e-08, + "loss": -0.0133, + "num_tokens": 33506002.0, + "reward": 0.846923828125, + "reward_std": 0.012428149580955505, + "rewards//mean": 0.846923828125, + "rewards//std": 0.016081426292657852, + "step": 4600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.484375, + "epoch": 0.9202, + "grad_norm": 1.9533042907714844, + "kl": 0.4232810400426388, + "learning_rate": 1.6025649301821875e-08, + "loss": -0.0162, + "num_tokens": 33513345.0, + "reward": 0.8248291015625, + "reward_std": 0.01585240662097931, + "rewards//mean": 0.8248291015625, + "rewards//std": 0.019215185195207596, + "step": 4601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9204, + "grad_norm": 1.8739873170852661, + "kl": 0.3638824410736561, + "learning_rate": 1.5946049338012635e-08, + "loss": 0.0146, + "num_tokens": 33520561.0, + "reward": 0.85205078125, + "reward_std": 0.01785585843026638, + "rewards//mean": 0.85205078125, + "rewards//std": 0.02708566002547741, + "step": 4602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.9206, + "grad_norm": 1.4545316696166992, + "kl": 0.3984760381281376, + "learning_rate": 1.5866644351076874e-08, + "loss": 0.0192, + "num_tokens": 33527827.0, + "reward": 0.87066650390625, + "reward_std": 0.01747879944741726, + "rewards//mean": 0.87066650390625, + "rewards//std": 0.021042905747890472, + "step": 4603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9208, + "grad_norm": 1.6888371706008911, + "kl": 0.36684535816311836, + "learning_rate": 1.5787434372998953e-08, + "loss": 0.0147, + "num_tokens": 33535139.0, + "reward": 0.80181884765625, + "reward_std": 0.016411006450653076, + "rewards//mean": 0.80181884765625, + "rewards//std": 0.023242847993969917, + "step": 4604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.921, + "grad_norm": 5.529165267944336, + "kl": 0.6817955877631903, + "learning_rate": 1.570841943568446e-08, + "loss": 0.0112, + "num_tokens": 33542378.0, + "reward": 0.77105712890625, + "reward_std": 0.016466820612549782, + "rewards//mean": 0.77105712890625, + "rewards//std": 0.024622974917292595, + "step": 4605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9212, + "grad_norm": 1.5066628456115723, + "kl": 0.3138388395309448, + "learning_rate": 1.5629599570960716e-08, + "loss": 0.0126, + "num_tokens": 33549642.0, + "reward": 0.8485107421875, + "reward_std": 0.01757938042283058, + "rewards//mean": 0.8485107421875, + "rewards//std": 0.02580311894416809, + "step": 4606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9214, + "grad_norm": 1.699554681777954, + "kl": 0.3205305617302656, + "learning_rate": 1.555097481057632e-08, + "loss": 0.0128, + "num_tokens": 33557026.0, + "reward": 0.84466552734375, + "reward_std": 0.012945768423378468, + "rewards//mean": 0.84466552734375, + "rewards//std": 0.029129957780241966, + "step": 4607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9216, + "grad_norm": 1.6375833749771118, + "kl": 0.34347896464169025, + "learning_rate": 1.547254518620139e-08, + "loss": 0.0137, + "num_tokens": 33564274.0, + "reward": 0.8177490234375, + "reward_std": 0.013596137054264545, + "rewards//mean": 0.8177490234375, + "rewards//std": 0.015456976369023323, + "step": 4608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9218, + "grad_norm": 1.6670026779174805, + "kl": 0.3843751475214958, + "learning_rate": 1.539431072942726e-08, + "loss": 0.0154, + "num_tokens": 33571546.0, + "reward": 0.8675537109375, + "reward_std": 0.016160568222403526, + "rewards//mean": 0.8675537109375, + "rewards//std": 0.02437683194875717, + "step": 4609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.922, + "grad_norm": 1.5215321779251099, + "kl": 0.3502209819853306, + "learning_rate": 1.531627147176684e-08, + "loss": 0.014, + "num_tokens": 33578826.0, + "reward": 0.78887939453125, + "reward_std": 0.013354992493987083, + "rewards//mean": 0.78887939453125, + "rewards//std": 0.017054377123713493, + "step": 4610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9222, + "grad_norm": 1.5046569108963013, + "kl": 0.3279954958707094, + "learning_rate": 1.5238427444654367e-08, + "loss": 0.0131, + "num_tokens": 33586090.0, + "reward": 0.88226318359375, + "reward_std": 0.02023148350417614, + "rewards//mean": 0.88226318359375, + "rewards//std": 0.028788095340132713, + "step": 4611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.9224, + "grad_norm": 1.7810089588165283, + "kl": 0.33720095828175545, + "learning_rate": 1.5160778679445263e-08, + "loss": 0.003, + "num_tokens": 33593466.0, + "reward": 0.85064697265625, + "reward_std": 0.012497048825025558, + "rewards//mean": 0.85064697265625, + "rewards//std": 0.01794450171291828, + "step": 4612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9226, + "grad_norm": 1.4079134464263916, + "kl": 0.3369548488408327, + "learning_rate": 1.5083325207416565e-08, + "loss": 0.0135, + "num_tokens": 33600754.0, + "reward": 0.8419189453125, + "reward_std": 0.018291622400283813, + "rewards//mean": 0.8419189453125, + "rewards//std": 0.018570998683571815, + "step": 4613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.703125, + "epoch": 0.9228, + "grad_norm": 1.5666649341583252, + "kl": 0.3403737507760525, + "learning_rate": 1.500606705976648e-08, + "loss": -0.008, + "num_tokens": 33608007.0, + "reward": 0.8331298828125, + "reward_std": 0.016042746603488922, + "rewards//mean": 0.8331298828125, + "rewards//std": 0.02136673964560032, + "step": 4614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.923, + "grad_norm": 1.602552056312561, + "kl": 0.38950016535818577, + "learning_rate": 1.492900426761462e-08, + "loss": 0.0156, + "num_tokens": 33615327.0, + "reward": 0.82177734375, + "reward_std": 0.014361120760440826, + "rewards//mean": 0.82177734375, + "rewards//std": 0.019494013860821724, + "step": 4615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.9232, + "grad_norm": 1.401540756225586, + "kl": 0.3511277958750725, + "learning_rate": 1.4852136862001763e-08, + "loss": 0.01, + "num_tokens": 33622530.0, + "reward": 0.8253173828125, + "reward_std": 0.014680089429020882, + "rewards//mean": 0.8253173828125, + "rewards//std": 0.018288444727659225, + "step": 4616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9234, + "grad_norm": 1.5277020931243896, + "kl": 0.3343350123614073, + "learning_rate": 1.4775464873890253e-08, + "loss": 0.0134, + "num_tokens": 33629778.0, + "reward": 0.81591796875, + "reward_std": 0.012161925435066223, + "rewards//mean": 0.81591796875, + "rewards//std": 0.020237719640135765, + "step": 4617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.9236, + "grad_norm": 1.7681467533111572, + "kl": 0.3288610577583313, + "learning_rate": 1.4698988334163388e-08, + "loss": 0.0169, + "num_tokens": 33637147.0, + "reward": 0.80419921875, + "reward_std": 0.01321513019502163, + "rewards//mean": 0.80419921875, + "rewards//std": 0.017239000648260117, + "step": 4618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9238, + "grad_norm": 1.8800532817840576, + "kl": 0.4223090894520283, + "learning_rate": 1.4622707273625967e-08, + "loss": 0.0169, + "num_tokens": 33644547.0, + "reward": 0.87823486328125, + "reward_std": 0.01599426567554474, + "rewards//mean": 0.87823486328125, + "rewards//std": 0.025624966248869896, + "step": 4619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.924, + "grad_norm": 1.4205137491226196, + "kl": 0.31413524225354195, + "learning_rate": 1.4546621723004083e-08, + "loss": 0.0126, + "num_tokens": 33651851.0, + "reward": 0.84814453125, + "reward_std": 0.012257950380444527, + "rewards//mean": 0.84814453125, + "rewards//std": 0.021592609584331512, + "step": 4620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9242, + "grad_norm": 1.886273980140686, + "kl": 0.4030495882034302, + "learning_rate": 1.4470731712944883e-08, + "loss": 0.0161, + "num_tokens": 33658987.0, + "reward": 0.8846435546875, + "reward_std": 0.022089164704084396, + "rewards//mean": 0.8846435546875, + "rewards//std": 0.028116464614868164, + "step": 4621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.9244, + "grad_norm": 1.5876567363739014, + "kl": 0.3064971435815096, + "learning_rate": 1.4395037274016863e-08, + "loss": 0.0138, + "num_tokens": 33666328.0, + "reward": 0.876220703125, + "reward_std": 0.017311884090304375, + "rewards//mean": 0.876220703125, + "rewards//std": 0.02461567334830761, + "step": 4622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9246, + "grad_norm": 1.6576391458511353, + "kl": 0.3171660155057907, + "learning_rate": 1.4319538436709743e-08, + "loss": 0.0127, + "num_tokens": 33673568.0, + "reward": 0.79571533203125, + "reward_std": 0.01702665537595749, + "rewards//mean": 0.79571533203125, + "rewards//std": 0.030904587358236313, + "step": 4623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9248, + "grad_norm": 1.5752099752426147, + "kl": 0.3477589525282383, + "learning_rate": 1.4244235231434531e-08, + "loss": 0.0139, + "num_tokens": 33680912.0, + "reward": 0.873779296875, + "reward_std": 0.019731123000383377, + "rewards//mean": 0.873779296875, + "rewards//std": 0.02591930888593197, + "step": 4624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.925, + "grad_norm": 1.5605982542037964, + "kl": 0.40671668387949467, + "learning_rate": 1.4169127688523185e-08, + "loss": 0.0163, + "num_tokens": 33688184.0, + "reward": 0.79730224609375, + "reward_std": 0.01554492861032486, + "rewards//mean": 0.79730224609375, + "rewards//std": 0.01950167864561081, + "step": 4625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9252, + "grad_norm": 1.7340707778930664, + "kl": 0.3243843000382185, + "learning_rate": 1.4094215838229172e-08, + "loss": 0.013, + "num_tokens": 33695464.0, + "reward": 0.867431640625, + "reward_std": 0.013585293665528297, + "rewards//mean": 0.867431640625, + "rewards//std": 0.0206450168043375, + "step": 4626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9254, + "grad_norm": 1.6338239908218384, + "kl": 0.3364904969930649, + "learning_rate": 1.4019499710726911e-08, + "loss": 0.0135, + "num_tokens": 33702736.0, + "reward": 0.8280029296875, + "reward_std": 0.020608825609087944, + "rewards//mean": 0.8280029296875, + "rewards//std": 0.02563832886517048, + "step": 4627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9256, + "grad_norm": 1.662854552268982, + "kl": 0.36761413887143135, + "learning_rate": 1.394497933611205e-08, + "loss": 0.0147, + "num_tokens": 33709976.0, + "reward": 0.8433837890625, + "reward_std": 0.017200134694576263, + "rewards//mean": 0.8433837890625, + "rewards//std": 0.022699598222970963, + "step": 4628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9258, + "grad_norm": 1.2743134498596191, + "kl": 0.310068279504776, + "learning_rate": 1.3870654744401356e-08, + "loss": 0.0124, + "num_tokens": 33717192.0, + "reward": 0.86053466796875, + "reward_std": 0.013666962273418903, + "rewards//mean": 0.86053466796875, + "rewards//std": 0.01588144712150097, + "step": 4629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.926, + "grad_norm": 1.379326343536377, + "kl": 0.2839972097426653, + "learning_rate": 1.3796525965532767e-08, + "loss": 0.0114, + "num_tokens": 33724448.0, + "reward": 0.88092041015625, + "reward_std": 0.01801542192697525, + "rewards//mean": 0.88092041015625, + "rewards//std": 0.02624541148543358, + "step": 4630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9262, + "grad_norm": 1.5303194522857666, + "kl": 0.36898949928581715, + "learning_rate": 1.3722593029365459e-08, + "loss": 0.0148, + "num_tokens": 33731656.0, + "reward": 0.8499755859375, + "reward_std": 0.017976049333810806, + "rewards//mean": 0.8499755859375, + "rewards//std": 0.019384603947401047, + "step": 4631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9264, + "grad_norm": 1.7568378448486328, + "kl": 0.40444274991750717, + "learning_rate": 1.3648855965679496e-08, + "loss": 0.0162, + "num_tokens": 33738888.0, + "reward": 0.87548828125, + "reward_std": 0.018784508109092712, + "rewards//mean": 0.87548828125, + "rewards//std": 0.02711247280240059, + "step": 4632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9266, + "grad_norm": 1.6396690607070923, + "kl": 0.36089838668704033, + "learning_rate": 1.3575314804176174e-08, + "loss": 0.0144, + "num_tokens": 33746120.0, + "reward": 0.8466796875, + "reward_std": 0.020809084177017212, + "rewards//mean": 0.8466796875, + "rewards//std": 0.027027472853660583, + "step": 4633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.9268, + "grad_norm": 1.6190179586410522, + "kl": 0.37009620293974876, + "learning_rate": 1.3501969574477856e-08, + "loss": 0.0158, + "num_tokens": 33753390.0, + "reward": 0.8597412109375, + "reward_std": 0.022162698209285736, + "rewards//mean": 0.8597412109375, + "rewards//std": 0.03050258569419384, + "step": 4634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.927, + "grad_norm": 1.4321017265319824, + "kl": 0.34745950996875763, + "learning_rate": 1.3428820306128075e-08, + "loss": 0.0116, + "num_tokens": 33760664.0, + "reward": 0.807861328125, + "reward_std": 0.010911739431321621, + "rewards//mean": 0.807861328125, + "rewards//std": 0.02157437428832054, + "step": 4635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.9272, + "grad_norm": 1.5425961017608643, + "kl": 0.36517898738384247, + "learning_rate": 1.3355867028591206e-08, + "loss": -0.0039, + "num_tokens": 33767911.0, + "reward": 0.8555908203125, + "reward_std": 0.014448745176196098, + "rewards//mean": 0.8555908203125, + "rewards//std": 0.018932653591036797, + "step": 4636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9274, + "grad_norm": 1.3180999755859375, + "kl": 0.3609759993851185, + "learning_rate": 1.3283109771252965e-08, + "loss": 0.0144, + "num_tokens": 33775279.0, + "reward": 0.864013671875, + "reward_std": 0.014265619218349457, + "rewards//mean": 0.864013671875, + "rewards//std": 0.018425745889544487, + "step": 4637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9276, + "grad_norm": 1.4095191955566406, + "kl": 0.29140516743063927, + "learning_rate": 1.3210548563419855e-08, + "loss": 0.0117, + "num_tokens": 33782511.0, + "reward": 0.8548583984375, + "reward_std": 0.014002842828631401, + "rewards//mean": 0.8548583984375, + "rewards//std": 0.017651600763201714, + "step": 4638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9278, + "grad_norm": 1.6290132999420166, + "kl": 0.32163828797638416, + "learning_rate": 1.3138183434319661e-08, + "loss": 0.0129, + "num_tokens": 33789791.0, + "reward": 0.83416748046875, + "reward_std": 0.016960669308900833, + "rewards//mean": 0.83416748046875, + "rewards//std": 0.027026841416954994, + "step": 4639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.928, + "grad_norm": 2.0735812187194824, + "kl": 0.32829706743359566, + "learning_rate": 1.3066014413100846e-08, + "loss": 0.0131, + "num_tokens": 33797055.0, + "reward": 0.86871337890625, + "reward_std": 0.01435087714344263, + "rewards//mean": 0.86871337890625, + "rewards//std": 0.017366621643304825, + "step": 4640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9282, + "grad_norm": 1.6044586896896362, + "kl": 0.35022495687007904, + "learning_rate": 1.2994041528833267e-08, + "loss": 0.014, + "num_tokens": 33804255.0, + "reward": 0.81231689453125, + "reward_std": 0.018247392028570175, + "rewards//mean": 0.81231689453125, + "rewards//std": 0.021535661071538925, + "step": 4641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9284, + "grad_norm": 1.419110655784607, + "kl": 0.3567805103957653, + "learning_rate": 1.292226481050751e-08, + "loss": 0.0143, + "num_tokens": 33811575.0, + "reward": 0.84197998046875, + "reward_std": 0.01056685857474804, + "rewards//mean": 0.84197998046875, + "rewards//std": 0.01557733491063118, + "step": 4642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9286, + "grad_norm": 4.619170665740967, + "kl": 0.6737064272165298, + "learning_rate": 1.285068428703523e-08, + "loss": 0.0269, + "num_tokens": 33818807.0, + "reward": 0.86553955078125, + "reward_std": 0.018104489892721176, + "rewards//mean": 0.86553955078125, + "rewards//std": 0.02660117670893669, + "step": 4643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9288, + "grad_norm": 1.6501569747924805, + "kl": 0.2983449064195156, + "learning_rate": 1.2779299987249025e-08, + "loss": 0.0119, + "num_tokens": 33826055.0, + "reward": 0.8868408203125, + "reward_std": 0.01628311723470688, + "rewards//mean": 0.8868408203125, + "rewards//std": 0.02789159305393696, + "step": 4644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.929, + "grad_norm": 1.537192702293396, + "kl": 0.3037367947399616, + "learning_rate": 1.2708111939902565e-08, + "loss": 0.0121, + "num_tokens": 33833303.0, + "reward": 0.82647705078125, + "reward_std": 0.01441875472664833, + "rewards//mean": 0.82647705078125, + "rewards//std": 0.019976384937763214, + "step": 4645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9292, + "grad_norm": 1.7259591817855835, + "kl": 0.31959059834480286, + "learning_rate": 1.2637120173670358e-08, + "loss": 0.0128, + "num_tokens": 33840607.0, + "reward": 0.87603759765625, + "reward_std": 0.018616005778312683, + "rewards//mean": 0.87603759765625, + "rewards//std": 0.035958267748355865, + "step": 4646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9294, + "grad_norm": 2.2733888626098633, + "kl": 0.32221030816435814, + "learning_rate": 1.2566324717147802e-08, + "loss": 0.0129, + "num_tokens": 33848055.0, + "reward": 0.8778076171875, + "reward_std": 0.019676607102155685, + "rewards//mean": 0.8778076171875, + "rewards//std": 0.02563360333442688, + "step": 4647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9296, + "grad_norm": 1.5456295013427734, + "kl": 0.3624178934842348, + "learning_rate": 1.249572559885137e-08, + "loss": 0.0145, + "num_tokens": 33855423.0, + "reward": 0.852294921875, + "reward_std": 0.017561784014105797, + "rewards//mean": 0.852294921875, + "rewards//std": 0.026891591027379036, + "step": 4648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9298, + "grad_norm": 1.5479904413223267, + "kl": 0.3341165855526924, + "learning_rate": 1.2425322847218367e-08, + "loss": 0.0134, + "num_tokens": 33862807.0, + "reward": 0.832275390625, + "reward_std": 0.014236100018024445, + "rewards//mean": 0.832275390625, + "rewards//std": 0.018147604539990425, + "step": 4649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.93, + "grad_norm": 1.3249244689941406, + "kl": 0.41383425518870354, + "learning_rate": 1.2355116490607109e-08, + "loss": 0.0166, + "num_tokens": 33870207.0, + "reward": 0.8477783203125, + "reward_std": 0.016079094260931015, + "rewards//mean": 0.8477783203125, + "rewards//std": 0.02890649251639843, + "step": 4650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9302, + "grad_norm": 1.6747719049453735, + "kl": 0.34105646796524525, + "learning_rate": 1.2285106557296476e-08, + "loss": 0.0136, + "num_tokens": 33877487.0, + "reward": 0.865966796875, + "reward_std": 0.014571290463209152, + "rewards//mean": 0.865966796875, + "rewards//std": 0.01973326876759529, + "step": 4651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9304, + "grad_norm": 1.7099974155426025, + "kl": 0.3707960397005081, + "learning_rate": 1.2215293075486687e-08, + "loss": 0.0148, + "num_tokens": 33884967.0, + "reward": 0.8514404296875, + "reward_std": 0.01899847202003002, + "rewards//mean": 0.8514404296875, + "rewards//std": 0.026013460010290146, + "step": 4652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9306, + "grad_norm": 1.4071778059005737, + "kl": 0.3432564176619053, + "learning_rate": 1.2145676073298472e-08, + "loss": 0.0137, + "num_tokens": 33892207.0, + "reward": 0.8555908203125, + "reward_std": 0.018992770463228226, + "rewards//mean": 0.8555908203125, + "rewards//std": 0.022691594436764717, + "step": 4653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9308, + "grad_norm": 1.361726999282837, + "kl": 0.2967512421309948, + "learning_rate": 1.207625557877362e-08, + "loss": 0.0119, + "num_tokens": 33899519.0, + "reward": 0.8681640625, + "reward_std": 0.011541148647665977, + "rewards//mean": 0.8681640625, + "rewards//std": 0.0156868826597929, + "step": 4654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.931, + "grad_norm": 1.7046258449554443, + "kl": 0.4417959488928318, + "learning_rate": 1.2007031619874652e-08, + "loss": 0.0177, + "num_tokens": 33906775.0, + "reward": 0.8035888671875, + "reward_std": 0.013113148510456085, + "rewards//mean": 0.8035888671875, + "rewards//std": 0.0190918929874897, + "step": 4655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9312, + "grad_norm": 1.685572624206543, + "kl": 0.3626176193356514, + "learning_rate": 1.1938004224484988e-08, + "loss": 0.0145, + "num_tokens": 33913999.0, + "reward": 0.862060546875, + "reward_std": 0.01783212274312973, + "rewards//mean": 0.862060546875, + "rewards//std": 0.02276518940925598, + "step": 4656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9314, + "grad_norm": 1.7095988988876343, + "kl": 0.32518097199499607, + "learning_rate": 1.1869173420408884e-08, + "loss": 0.013, + "num_tokens": 33921271.0, + "reward": 0.79473876953125, + "reward_std": 0.015575402416288853, + "rewards//mean": 0.79473876953125, + "rewards//std": 0.02633695863187313, + "step": 4657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.9316, + "grad_norm": 1.976167917251587, + "kl": 0.40971802547574043, + "learning_rate": 1.1800539235371331e-08, + "loss": 0.0092, + "num_tokens": 33928604.0, + "reward": 0.86248779296875, + "reward_std": 0.016747474670410156, + "rewards//mean": 0.86248779296875, + "rewards//std": 0.0296282097697258, + "step": 4658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9318, + "grad_norm": 1.9084550142288208, + "kl": 0.37687700241804123, + "learning_rate": 1.173210169701816e-08, + "loss": 0.0151, + "num_tokens": 33936012.0, + "reward": 0.82037353515625, + "reward_std": 0.012139962986111641, + "rewards//mean": 0.82037353515625, + "rewards//std": 0.01810240000486374, + "step": 4659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.932, + "grad_norm": 1.6546666622161865, + "kl": 0.3332790117710829, + "learning_rate": 1.166386083291604e-08, + "loss": 0.0133, + "num_tokens": 33943340.0, + "reward": 0.86541748046875, + "reward_std": 0.019707251340150833, + "rewards//mean": 0.86541748046875, + "rewards//std": 0.032725948840379715, + "step": 4660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9322, + "grad_norm": 2.058159112930298, + "kl": 0.46955742686986923, + "learning_rate": 1.1595816670552428e-08, + "loss": 0.0188, + "num_tokens": 33950668.0, + "reward": 0.84521484375, + "reward_std": 0.019051920622587204, + "rewards//mean": 0.84521484375, + "rewards//std": 0.026397312059998512, + "step": 4661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9324, + "grad_norm": 1.7703251838684082, + "kl": 0.35799674317240715, + "learning_rate": 1.1527969237335455e-08, + "loss": 0.0143, + "num_tokens": 33958036.0, + "reward": 0.77508544921875, + "reward_std": 0.015151070430874825, + "rewards//mean": 0.77508544921875, + "rewards//std": 0.03231539577245712, + "step": 4662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9326, + "grad_norm": 1.880821704864502, + "kl": 0.4844995327293873, + "learning_rate": 1.1460318560593985e-08, + "loss": 0.0194, + "num_tokens": 33965316.0, + "reward": 0.8414306640625, + "reward_std": 0.01560099795460701, + "rewards//mean": 0.8414306640625, + "rewards//std": 0.025512849912047386, + "step": 4663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9328, + "grad_norm": 1.4754722118377686, + "kl": 0.3888169154524803, + "learning_rate": 1.1392864667577828e-08, + "loss": 0.0156, + "num_tokens": 33972492.0, + "reward": 0.87005615234375, + "reward_std": 0.020654262974858284, + "rewards//mean": 0.87005615234375, + "rewards//std": 0.030924662947654724, + "step": 4664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.933, + "grad_norm": 1.9464852809906006, + "kl": 0.3622848279774189, + "learning_rate": 1.1325607585457365e-08, + "loss": 0.0145, + "num_tokens": 33979804.0, + "reward": 0.818603515625, + "reward_std": 0.017939098179340363, + "rewards//mean": 0.818603515625, + "rewards//std": 0.021952755749225616, + "step": 4665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9332, + "grad_norm": 1.4293502569198608, + "kl": 0.33374173380434513, + "learning_rate": 1.1258547341323698e-08, + "loss": 0.0133, + "num_tokens": 33987140.0, + "reward": 0.8447265625, + "reward_std": 0.014863544143736362, + "rewards//mean": 0.8447265625, + "rewards//std": 0.022755878046154976, + "step": 4666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9334, + "grad_norm": 1.7260041236877441, + "kl": 0.3467239961028099, + "learning_rate": 1.1191683962188724e-08, + "loss": 0.0139, + "num_tokens": 33994428.0, + "reward": 0.865478515625, + "reward_std": 0.01247878186404705, + "rewards//mean": 0.865478515625, + "rewards//std": 0.014210445806384087, + "step": 4667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9336, + "grad_norm": 1.5770965814590454, + "kl": 0.3047757614403963, + "learning_rate": 1.1125017474984954e-08, + "loss": 0.0122, + "num_tokens": 34001772.0, + "reward": 0.8316650390625, + "reward_std": 0.016867462545633316, + "rewards//mean": 0.8316650390625, + "rewards//std": 0.03223971277475357, + "step": 4668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9338, + "grad_norm": 1.4913557767868042, + "kl": 0.34347541630268097, + "learning_rate": 1.1058547906565741e-08, + "loss": 0.0137, + "num_tokens": 34009092.0, + "reward": 0.85601806640625, + "reward_std": 0.013784472830593586, + "rewards//mean": 0.85601806640625, + "rewards//std": 0.024228820577263832, + "step": 4669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.934, + "grad_norm": 1.602810263633728, + "kl": 0.29277483746409416, + "learning_rate": 1.0992275283704944e-08, + "loss": 0.0117, + "num_tokens": 34016444.0, + "reward": 0.8519287109375, + "reward_std": 0.016733959317207336, + "rewards//mean": 0.8519287109375, + "rewards//std": 0.02632582001388073, + "step": 4670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.9342, + "grad_norm": 1.8868054151535034, + "kl": 0.3969796672463417, + "learning_rate": 1.0926199633097154e-08, + "loss": 0.0166, + "num_tokens": 34023715.0, + "reward": 0.84478759765625, + "reward_std": 0.012507634237408638, + "rewards//mean": 0.84478759765625, + "rewards//std": 0.014817329123616219, + "step": 4671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9344, + "grad_norm": 1.400521159172058, + "kl": 0.35249847173690796, + "learning_rate": 1.0860320981357696e-08, + "loss": 0.0141, + "num_tokens": 34030995.0, + "reward": 0.828857421875, + "reward_std": 0.014721591956913471, + "rewards//mean": 0.828857421875, + "rewards//std": 0.022117629647254944, + "step": 4672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.65625, + "epoch": 0.9346, + "grad_norm": 2.9261763095855713, + "kl": 0.5304894112050533, + "learning_rate": 1.0794639355022505e-08, + "loss": -0.0063, + "num_tokens": 34038285.0, + "reward": 0.81243896484375, + "reward_std": 0.015336016193032265, + "rewards//mean": 0.81243896484375, + "rewards//std": 0.01709781400859356, + "step": 4673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9348, + "grad_norm": 1.696968674659729, + "kl": 0.32913639582693577, + "learning_rate": 1.0729154780547977e-08, + "loss": 0.0132, + "num_tokens": 34045533.0, + "reward": 0.87457275390625, + "reward_std": 0.020305117592215538, + "rewards//mean": 0.87457275390625, + "rewards//std": 0.03040236048400402, + "step": 4674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.935, + "grad_norm": 1.7245067358016968, + "kl": 0.34522201120853424, + "learning_rate": 1.0663867284311457e-08, + "loss": 0.0138, + "num_tokens": 34052773.0, + "reward": 0.82470703125, + "reward_std": 0.012849615886807442, + "rewards//mean": 0.82470703125, + "rewards//std": 0.017968211323022842, + "step": 4675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9352, + "grad_norm": 1.6007561683654785, + "kl": 0.395024336874485, + "learning_rate": 1.0598776892610684e-08, + "loss": 0.0158, + "num_tokens": 34060069.0, + "reward": 0.85040283203125, + "reward_std": 0.017693953588604927, + "rewards//mean": 0.85040283203125, + "rewards//std": 0.023527026176452637, + "step": 4676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9354, + "grad_norm": 1.5932676792144775, + "kl": 0.3580753020942211, + "learning_rate": 1.0533883631663964e-08, + "loss": 0.0143, + "num_tokens": 34067541.0, + "reward": 0.83111572265625, + "reward_std": 0.012572882696986198, + "rewards//mean": 0.83111572265625, + "rewards//std": 0.021426433697342873, + "step": 4677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9356, + "grad_norm": 1.4196768999099731, + "kl": 0.3704220224171877, + "learning_rate": 1.0469187527610446e-08, + "loss": 0.0148, + "num_tokens": 34074877.0, + "reward": 0.8555908203125, + "reward_std": 0.009638081304728985, + "rewards//mean": 0.8555908203125, + "rewards//std": 0.010781572200357914, + "step": 4678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9358, + "grad_norm": 1.8200583457946777, + "kl": 0.486427903175354, + "learning_rate": 1.0404688606509615e-08, + "loss": 0.0195, + "num_tokens": 34082253.0, + "reward": 0.8382568359375, + "reward_std": 0.01647966541349888, + "rewards//mean": 0.8382568359375, + "rewards//std": 0.0241070743650198, + "step": 4679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.936, + "grad_norm": 1.9612797498703003, + "kl": 0.482926644384861, + "learning_rate": 1.0340386894341747e-08, + "loss": 0.0193, + "num_tokens": 34089493.0, + "reward": 0.8204345703125, + "reward_std": 0.015358454547822475, + "rewards//mean": 0.8204345703125, + "rewards//std": 0.028159502893686295, + "step": 4680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9362, + "grad_norm": 1.3373068571090698, + "kl": 0.35141442343592644, + "learning_rate": 1.0276282417007399e-08, + "loss": 0.0141, + "num_tokens": 34096805.0, + "reward": 0.81805419921875, + "reward_std": 0.014636575244367123, + "rewards//mean": 0.81805419921875, + "rewards//std": 0.021483583375811577, + "step": 4681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.9364, + "grad_norm": 1.2757453918457031, + "kl": 0.33718907088041306, + "learning_rate": 1.0212375200327972e-08, + "loss": 0.0083, + "num_tokens": 34104078.0, + "reward": 0.819091796875, + "reward_std": 0.012040581554174423, + "rewards//mean": 0.819091796875, + "rewards//std": 0.01323988102376461, + "step": 4682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9366, + "grad_norm": 1.5718172788619995, + "kl": 0.41901638731360435, + "learning_rate": 1.0148665270045209e-08, + "loss": 0.0168, + "num_tokens": 34111398.0, + "reward": 0.82965087890625, + "reward_std": 0.014855468645691872, + "rewards//mean": 0.82965087890625, + "rewards//std": 0.025061707943677902, + "step": 4683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9368, + "grad_norm": 1.4591695070266724, + "kl": 0.3157491460442543, + "learning_rate": 1.0085152651821527e-08, + "loss": 0.0126, + "num_tokens": 34118718.0, + "reward": 0.84417724609375, + "reward_std": 0.014837794005870819, + "rewards//mean": 0.84417724609375, + "rewards//std": 0.0184814240783453, + "step": 4684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.937, + "grad_norm": 1.5390300750732422, + "kl": 0.3169529438018799, + "learning_rate": 1.0021837371239739e-08, + "loss": 0.0127, + "num_tokens": 34126038.0, + "reward": 0.82635498046875, + "reward_std": 0.008911725133657455, + "rewards//mean": 0.82635498046875, + "rewards//std": 0.0194246806204319, + "step": 4685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9372, + "grad_norm": 1.9167721271514893, + "kl": 0.3937947452068329, + "learning_rate": 9.958719453803276e-09, + "loss": 0.0158, + "num_tokens": 34133286.0, + "reward": 0.8289794921875, + "reward_std": 0.020447466522455215, + "rewards//mean": 0.8289794921875, + "rewards//std": 0.028522705659270287, + "step": 4686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9374, + "grad_norm": 1.738019347190857, + "kl": 0.30956181697547436, + "learning_rate": 9.895798924936028e-09, + "loss": 0.0124, + "num_tokens": 34140566.0, + "reward": 0.86529541015625, + "reward_std": 0.014169572852551937, + "rewards//mean": 0.86529541015625, + "rewards//std": 0.020327188074588776, + "step": 4687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9376, + "grad_norm": 1.5056109428405762, + "kl": 0.32084956392645836, + "learning_rate": 9.833075809982383e-09, + "loss": 0.0128, + "num_tokens": 34147878.0, + "reward": 0.85479736328125, + "reward_std": 0.014305813238024712, + "rewards//mean": 0.85479736328125, + "rewards//std": 0.021816402673721313, + "step": 4688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.671875, + "epoch": 0.9378, + "grad_norm": 1.6349847316741943, + "kl": 0.3430243097245693, + "learning_rate": 9.770550134207134e-09, + "loss": -0.0135, + "num_tokens": 34155233.0, + "reward": 0.8096923828125, + "reward_std": 0.01618008315563202, + "rewards//mean": 0.8096923828125, + "rewards//std": 0.03289969637989998, + "step": 4689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.938, + "grad_norm": 1.4947112798690796, + "kl": 0.3312462102621794, + "learning_rate": 9.70822192279569e-09, + "loss": 0.0132, + "num_tokens": 34162481.0, + "reward": 0.819091796875, + "reward_std": 0.011458508670330048, + "rewards//mean": 0.819091796875, + "rewards//std": 0.0152146490290761, + "step": 4690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9382, + "grad_norm": 1.41661536693573, + "kl": 0.4659327492117882, + "learning_rate": 9.646091200853801e-09, + "loss": 0.0186, + "num_tokens": 34169737.0, + "reward": 0.8173828125, + "reward_std": 0.01073689665645361, + "rewards//mean": 0.8173828125, + "rewards//std": 0.01550049614161253, + "step": 4691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9384, + "grad_norm": 1.5933407545089722, + "kl": 0.38159171119332314, + "learning_rate": 9.584157993407782e-09, + "loss": 0.0153, + "num_tokens": 34177001.0, + "reward": 0.85357666015625, + "reward_std": 0.015319755300879478, + "rewards//mean": 0.85357666015625, + "rewards//std": 0.01814916916191578, + "step": 4692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9386, + "grad_norm": 1.4227607250213623, + "kl": 0.34601306542754173, + "learning_rate": 9.522422325404233e-09, + "loss": 0.0138, + "num_tokens": 34184257.0, + "reward": 0.8724365234375, + "reward_std": 0.017260145395994186, + "rewards//mean": 0.8724365234375, + "rewards//std": 0.031205160543322563, + "step": 4693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9388, + "grad_norm": 1.5174833536148071, + "kl": 0.3042658194899559, + "learning_rate": 9.460884221710264e-09, + "loss": 0.0122, + "num_tokens": 34191481.0, + "reward": 0.84478759765625, + "reward_std": 0.012060677632689476, + "rewards//mean": 0.84478759765625, + "rewards//std": 0.02711016871035099, + "step": 4694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.939, + "grad_norm": 1.6122314929962158, + "kl": 0.28196057491004467, + "learning_rate": 9.3995437071136e-09, + "loss": 0.0113, + "num_tokens": 34198745.0, + "reward": 0.81793212890625, + "reward_std": 0.014866150915622711, + "rewards//mean": 0.81793212890625, + "rewards//std": 0.02170579694211483, + "step": 4695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.9392, + "grad_norm": 1.908390760421753, + "kl": 0.3341179136186838, + "learning_rate": 9.338400806321977e-09, + "loss": 0.0126, + "num_tokens": 34206080.0, + "reward": 0.837158203125, + "reward_std": 0.01518004946410656, + "rewards//mean": 0.837158203125, + "rewards//std": 0.021143468096852303, + "step": 4696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9394, + "grad_norm": 1.4262806177139282, + "kl": 0.37257977575063705, + "learning_rate": 9.277455543963808e-09, + "loss": 0.0149, + "num_tokens": 34213328.0, + "reward": 0.864013671875, + "reward_std": 0.023346373811364174, + "rewards//mean": 0.864013671875, + "rewards//std": 0.031155884265899658, + "step": 4697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9396, + "grad_norm": 1.5666983127593994, + "kl": 0.3093798868358135, + "learning_rate": 9.216707944587897e-09, + "loss": 0.0124, + "num_tokens": 34220720.0, + "reward": 0.748046875, + "reward_std": 0.011676718480885029, + "rewards//mean": 0.748046875, + "rewards//std": 0.019375620409846306, + "step": 4698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9398, + "grad_norm": 1.5146872997283936, + "kl": 0.2930257711559534, + "learning_rate": 9.156158032663397e-09, + "loss": 0.0117, + "num_tokens": 34228144.0, + "reward": 0.87945556640625, + "reward_std": 0.013446742668747902, + "rewards//mean": 0.87945556640625, + "rewards//std": 0.027151457965373993, + "step": 4699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.94, + "grad_norm": 1.339426875114441, + "kl": 0.33625999093055725, + "learning_rate": 9.095805832579683e-09, + "loss": 0.0135, + "num_tokens": 34235536.0, + "reward": 0.80352783203125, + "reward_std": 0.0139263104647398, + "rewards//mean": 0.80352783203125, + "rewards//std": 0.017767317593097687, + "step": 4700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9402, + "grad_norm": 1.952547311782837, + "kl": 0.37341562286019325, + "learning_rate": 9.035651368646646e-09, + "loss": 0.0149, + "num_tokens": 34242888.0, + "reward": 0.83563232421875, + "reward_std": 0.016614213585853577, + "rewards//mean": 0.83563232421875, + "rewards//std": 0.024501565843820572, + "step": 4701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9404, + "grad_norm": 1.5794960260391235, + "kl": 0.3555312156677246, + "learning_rate": 8.975694665094512e-09, + "loss": 0.0142, + "num_tokens": 34250176.0, + "reward": 0.85125732421875, + "reward_std": 0.014155249111354351, + "rewards//mean": 0.85125732421875, + "rewards//std": 0.022598927840590477, + "step": 4702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.9406, + "grad_norm": 1.6960153579711914, + "kl": 0.34780067577958107, + "learning_rate": 8.915935746073966e-09, + "loss": 0.0123, + "num_tokens": 34257506.0, + "reward": 0.8248291015625, + "reward_std": 0.012515103444457054, + "rewards//mean": 0.8248291015625, + "rewards//std": 0.020578552037477493, + "step": 4703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9408, + "grad_norm": 1.5334835052490234, + "kl": 0.3448377139866352, + "learning_rate": 8.856374635655695e-09, + "loss": 0.0138, + "num_tokens": 34264730.0, + "reward": 0.8125, + "reward_std": 0.014583462849259377, + "rewards//mean": 0.8125, + "rewards//std": 0.025448985397815704, + "step": 4704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.941, + "grad_norm": 1.5368980169296265, + "kl": 0.4655034802854061, + "learning_rate": 8.797011357830952e-09, + "loss": 0.0186, + "num_tokens": 34271970.0, + "reward": 0.87750244140625, + "reward_std": 0.014913380146026611, + "rewards//mean": 0.87750244140625, + "rewards//std": 0.022653119638562202, + "step": 4705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.9412, + "grad_norm": 1.4565691947937012, + "kl": 0.4364870637655258, + "learning_rate": 8.737845936511334e-09, + "loss": 0.0119, + "num_tokens": 34279221.0, + "reward": 0.86004638671875, + "reward_std": 0.014443136751651764, + "rewards//mean": 0.86004638671875, + "rewards//std": 0.02513829991221428, + "step": 4706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9414, + "grad_norm": 1.3570581674575806, + "kl": 0.29647129960358143, + "learning_rate": 8.678878395528666e-09, + "loss": 0.0119, + "num_tokens": 34286429.0, + "reward": 0.85870361328125, + "reward_std": 0.021685095503926277, + "rewards//mean": 0.85870361328125, + "rewards//std": 0.028642069548368454, + "step": 4707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9416, + "grad_norm": 1.4600880146026611, + "kl": 0.3749280031770468, + "learning_rate": 8.620108758634948e-09, + "loss": 0.015, + "num_tokens": 34293717.0, + "reward": 0.82171630859375, + "reward_std": 0.014866610988974571, + "rewards//mean": 0.82171630859375, + "rewards//std": 0.019342681393027306, + "step": 4708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9418, + "grad_norm": 1.6179548501968384, + "kl": 0.3939981833100319, + "learning_rate": 8.561537049502687e-09, + "loss": 0.0158, + "num_tokens": 34301021.0, + "reward": 0.860107421875, + "reward_std": 0.014425528235733509, + "rewards//mean": 0.860107421875, + "rewards//std": 0.022139519453048706, + "step": 4709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.942, + "grad_norm": 1.7052768468856812, + "kl": 0.35144367441534996, + "learning_rate": 8.503163291724514e-09, + "loss": 0.0141, + "num_tokens": 34308285.0, + "reward": 0.86407470703125, + "reward_std": 0.011555705219507217, + "rewards//mean": 0.86407470703125, + "rewards//std": 0.0154563644900918, + "step": 4710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9422, + "grad_norm": 1.5587245225906372, + "kl": 0.37431276962161064, + "learning_rate": 8.44498750881345e-09, + "loss": 0.015, + "num_tokens": 34315517.0, + "reward": 0.7816162109375, + "reward_std": 0.010704953223466873, + "rewards//mean": 0.7816162109375, + "rewards//std": 0.015323207713663578, + "step": 4711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9424, + "grad_norm": 1.3332446813583374, + "kl": 0.3186302538961172, + "learning_rate": 8.387009724202531e-09, + "loss": 0.0127, + "num_tokens": 34322789.0, + "reward": 0.87811279296875, + "reward_std": 0.01678089052438736, + "rewards//mean": 0.87811279296875, + "rewards//std": 0.023242197930812836, + "step": 4712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9426, + "grad_norm": 1.5532786846160889, + "kl": 0.3359874412417412, + "learning_rate": 8.329229961245354e-09, + "loss": 0.0134, + "num_tokens": 34330013.0, + "reward": 0.86663818359375, + "reward_std": 0.014510509558022022, + "rewards//mean": 0.86663818359375, + "rewards//std": 0.03178689256310463, + "step": 4713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9428, + "grad_norm": 3.2346277236938477, + "kl": 0.5762649849057198, + "learning_rate": 8.27164824321558e-09, + "loss": 0.0231, + "num_tokens": 34337389.0, + "reward": 0.85345458984375, + "reward_std": 0.021126478910446167, + "rewards//mean": 0.85345458984375, + "rewards//std": 0.02821984700858593, + "step": 4714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.943, + "grad_norm": 1.7565299272537231, + "kl": 0.4073692783713341, + "learning_rate": 8.214264593307096e-09, + "loss": 0.0163, + "num_tokens": 34344685.0, + "reward": 0.76812744140625, + "reward_std": 0.012290867045521736, + "rewards//mean": 0.76812744140625, + "rewards//std": 0.015769533812999725, + "step": 4715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9432, + "grad_norm": 1.4303007125854492, + "kl": 0.3228004761040211, + "learning_rate": 8.157079034633973e-09, + "loss": 0.0129, + "num_tokens": 34352013.0, + "reward": 0.8624267578125, + "reward_std": 0.016198312863707542, + "rewards//mean": 0.8624267578125, + "rewards//std": 0.0224824957549572, + "step": 4716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9434, + "grad_norm": 1.8022536039352417, + "kl": 0.3902216609567404, + "learning_rate": 8.100091590230617e-09, + "loss": 0.0156, + "num_tokens": 34359269.0, + "reward": 0.865966796875, + "reward_std": 0.016153736039996147, + "rewards//mean": 0.865966796875, + "rewards//std": 0.024288803339004517, + "step": 4717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9436, + "grad_norm": 1.679481863975525, + "kl": 0.39477379620075226, + "learning_rate": 8.043302283051501e-09, + "loss": 0.0158, + "num_tokens": 34366581.0, + "reward": 0.761962890625, + "reward_std": 0.010515645146369934, + "rewards//mean": 0.761962890625, + "rewards//std": 0.018160944804549217, + "step": 4718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9438, + "grad_norm": 1.6326032876968384, + "kl": 0.3461551070213318, + "learning_rate": 7.98671113597149e-09, + "loss": 0.0138, + "num_tokens": 34373869.0, + "reward": 0.81396484375, + "reward_std": 0.01390068605542183, + "rewards//mean": 0.81396484375, + "rewards//std": 0.01567916013300419, + "step": 4719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.944, + "grad_norm": 1.2495405673980713, + "kl": 0.3490316327661276, + "learning_rate": 7.930318171785355e-09, + "loss": 0.014, + "num_tokens": 34381093.0, + "reward": 0.82586669921875, + "reward_std": 0.01188234519213438, + "rewards//mean": 0.82586669921875, + "rewards//std": 0.019272902980446815, + "step": 4720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9442, + "grad_norm": 1.339251160621643, + "kl": 0.32650359347462654, + "learning_rate": 7.874123413208145e-09, + "loss": 0.0131, + "num_tokens": 34388317.0, + "reward": 0.840087890625, + "reward_std": 0.014489980414509773, + "rewards//mean": 0.840087890625, + "rewards//std": 0.029272064566612244, + "step": 4721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9444, + "grad_norm": 1.568811058998108, + "kl": 0.3907836936414242, + "learning_rate": 7.818126882875254e-09, + "loss": 0.0156, + "num_tokens": 34395605.0, + "reward": 0.84600830078125, + "reward_std": 0.017509199678897858, + "rewards//mean": 0.84600830078125, + "rewards//std": 0.02031303383409977, + "step": 4722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.9446, + "grad_norm": 1.4673041105270386, + "kl": 0.3126402795314789, + "learning_rate": 7.762328603341973e-09, + "loss": -0.0079, + "num_tokens": 34402860.0, + "reward": 0.81927490234375, + "reward_std": 0.011257503181695938, + "rewards//mean": 0.81927490234375, + "rewards//std": 0.021055851131677628, + "step": 4723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9448, + "grad_norm": 1.7855360507965088, + "kl": 0.4457605481147766, + "learning_rate": 7.706728597083878e-09, + "loss": 0.0178, + "num_tokens": 34410108.0, + "reward": 0.8541259765625, + "reward_std": 0.01742073893547058, + "rewards//mean": 0.8541259765625, + "rewards//std": 0.021734807640314102, + "step": 4724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.945, + "grad_norm": 1.6469132900238037, + "kl": 0.409717807546258, + "learning_rate": 7.651326886496612e-09, + "loss": 0.0164, + "num_tokens": 34417500.0, + "reward": 0.84564208984375, + "reward_std": 0.014993098564445972, + "rewards//mean": 0.84564208984375, + "rewards//std": 0.023857353255152702, + "step": 4725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.9452, + "grad_norm": 1.5285273790359497, + "kl": 0.27474839985370636, + "learning_rate": 7.59612349389599e-09, + "loss": 0.0115, + "num_tokens": 34424819.0, + "reward": 0.84588623046875, + "reward_std": 0.012445235624909401, + "rewards//mean": 0.84588623046875, + "rewards//std": 0.02478291280567646, + "step": 4726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.9454, + "grad_norm": 1.6298582553863525, + "kl": 0.45400806330144405, + "learning_rate": 7.541118441517946e-09, + "loss": 0.0148, + "num_tokens": 34432064.0, + "reward": 0.803955078125, + "reward_std": 0.015719035640358925, + "rewards//mean": 0.803955078125, + "rewards//std": 0.018660852685570717, + "step": 4727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9456, + "grad_norm": 1.7961888313293457, + "kl": 0.4118168130517006, + "learning_rate": 7.486311751518481e-09, + "loss": 0.0165, + "num_tokens": 34439352.0, + "reward": 0.85040283203125, + "reward_std": 0.017572451382875443, + "rewards//mean": 0.85040283203125, + "rewards//std": 0.02932005375623703, + "step": 4728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.953125, + "epoch": 0.9458, + "grad_norm": 1.5051331520080566, + "kl": 0.47903033159673214, + "learning_rate": 7.431703445973769e-09, + "loss": 0.0167, + "num_tokens": 34446645.0, + "reward": 0.8216552734375, + "reward_std": 0.01252724975347519, + "rewards//mean": 0.8216552734375, + "rewards//std": 0.01915205828845501, + "step": 4729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.946, + "grad_norm": 1.686604380607605, + "kl": 0.3506283648312092, + "learning_rate": 7.377293546880048e-09, + "loss": 0.014, + "num_tokens": 34453941.0, + "reward": 0.80780029296875, + "reward_std": 0.015126289799809456, + "rewards//mean": 0.80780029296875, + "rewards//std": 0.020951345562934875, + "step": 4730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.78125, + "epoch": 0.9462, + "grad_norm": 1.6884669065475464, + "kl": 0.36281212233006954, + "learning_rate": 7.323082076153508e-09, + "loss": 0.0158, + "num_tokens": 34461255.0, + "reward": 0.86151123046875, + "reward_std": 0.01324511505663395, + "rewards//mean": 0.86151123046875, + "rewards//std": 0.01988067477941513, + "step": 4731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9464, + "grad_norm": 1.4923486709594727, + "kl": 0.3908032104372978, + "learning_rate": 7.269069055630628e-09, + "loss": 0.0156, + "num_tokens": 34468591.0, + "reward": 0.80206298828125, + "reward_std": 0.012313421815633774, + "rewards//mean": 0.80206298828125, + "rewards//std": 0.017553025856614113, + "step": 4732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9466, + "grad_norm": 1.5295242071151733, + "kl": 0.3190116100013256, + "learning_rate": 7.215254507067781e-09, + "loss": 0.0128, + "num_tokens": 34475871.0, + "reward": 0.873046875, + "reward_std": 0.012086309492588043, + "rewards//mean": 0.873046875, + "rewards//std": 0.016188381239771843, + "step": 4733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9468, + "grad_norm": 1.3924345970153809, + "kl": 0.3045340161770582, + "learning_rate": 7.1616384521415164e-09, + "loss": 0.0122, + "num_tokens": 34483087.0, + "reward": 0.87371826171875, + "reward_std": 0.016002232208848, + "rewards//mean": 0.87371826171875, + "rewards//std": 0.025411410257220268, + "step": 4734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.947, + "grad_norm": 1.510418176651001, + "kl": 0.36744603514671326, + "learning_rate": 7.1082209124482815e-09, + "loss": 0.0147, + "num_tokens": 34490343.0, + "reward": 0.8740234375, + "reward_std": 0.017742328345775604, + "rewards//mean": 0.8740234375, + "rewards//std": 0.029659423977136612, + "step": 4735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.9472, + "grad_norm": 1.5820646286010742, + "kl": 0.3908033836632967, + "learning_rate": 7.055001909504754e-09, + "loss": 0.0158, + "num_tokens": 34497646.0, + "reward": 0.8726806640625, + "reward_std": 0.017267785966396332, + "rewards//mean": 0.8726806640625, + "rewards//std": 0.024515537545084953, + "step": 4736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9474, + "grad_norm": 1.48491632938385, + "kl": 0.37096014618873596, + "learning_rate": 7.0019814647475636e-09, + "loss": 0.0148, + "num_tokens": 34504958.0, + "reward": 0.8472900390625, + "reward_std": 0.013281015679240227, + "rewards//mean": 0.8472900390625, + "rewards//std": 0.019926784560084343, + "step": 4737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9476, + "grad_norm": 1.6187870502471924, + "kl": 0.3716888017952442, + "learning_rate": 6.949159599533239e-09, + "loss": 0.0149, + "num_tokens": 34512214.0, + "reward": 0.866943359375, + "reward_std": 0.014120902866125107, + "rewards//mean": 0.866943359375, + "rewards//std": 0.017453676089644432, + "step": 4738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9478, + "grad_norm": 1.5136545896530151, + "kl": 0.3543064408004284, + "learning_rate": 6.8965363351384255e-09, + "loss": 0.0142, + "num_tokens": 34519510.0, + "reward": 0.8369140625, + "reward_std": 0.0156100420281291, + "rewards//mean": 0.8369140625, + "rewards//std": 0.017510825768113136, + "step": 4739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.948, + "grad_norm": 1.3846619129180908, + "kl": 0.32339096814393997, + "learning_rate": 6.844111692759835e-09, + "loss": 0.0129, + "num_tokens": 34526774.0, + "reward": 0.8629150390625, + "reward_std": 0.01374409906566143, + "rewards//mean": 0.8629150390625, + "rewards//std": 0.020180417224764824, + "step": 4740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9482, + "grad_norm": 1.7266995906829834, + "kl": 0.33808812499046326, + "learning_rate": 6.791885693514132e-09, + "loss": 0.0135, + "num_tokens": 34533998.0, + "reward": 0.85595703125, + "reward_std": 0.016363611444830894, + "rewards//mean": 0.85595703125, + "rewards//std": 0.021058816462755203, + "step": 4741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9484, + "grad_norm": 1.7505500316619873, + "kl": 0.35323931463062763, + "learning_rate": 6.739858358437822e-09, + "loss": 0.0141, + "num_tokens": 34541334.0, + "reward": 0.8167724609375, + "reward_std": 0.010155614465475082, + "rewards//mean": 0.8167724609375, + "rewards//std": 0.017917169257998466, + "step": 4742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9486, + "grad_norm": 1.51130211353302, + "kl": 0.3158059660345316, + "learning_rate": 6.688029708487586e-09, + "loss": 0.0126, + "num_tokens": 34548654.0, + "reward": 0.88275146484375, + "reward_std": 0.01166527159512043, + "rewards//mean": 0.88275146484375, + "rewards//std": 0.026296118274331093, + "step": 4743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.9488, + "grad_norm": 1.9021966457366943, + "kl": 0.4198130574077368, + "learning_rate": 6.636399764540002e-09, + "loss": 0.0163, + "num_tokens": 34555925.0, + "reward": 0.84014892578125, + "reward_std": 0.013495172373950481, + "rewards//mean": 0.84014892578125, + "rewards//std": 0.01870531402528286, + "step": 4744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.949, + "grad_norm": 1.560039758682251, + "kl": 0.34768814593553543, + "learning_rate": 6.584968547391656e-09, + "loss": 0.0139, + "num_tokens": 34563341.0, + "reward": 0.88055419921875, + "reward_std": 0.014748362824320793, + "rewards//mean": 0.88055419921875, + "rewards//std": 0.017285367473959923, + "step": 4745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9492, + "grad_norm": 1.5619407892227173, + "kl": 0.3552377037703991, + "learning_rate": 6.533736077758867e-09, + "loss": 0.0142, + "num_tokens": 34570661.0, + "reward": 0.83184814453125, + "reward_std": 0.014576803892850876, + "rewards//mean": 0.83184814453125, + "rewards//std": 0.023276040330529213, + "step": 4746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9494, + "grad_norm": 1.600672960281372, + "kl": 0.3345373533666134, + "learning_rate": 6.482702376278237e-09, + "loss": 0.0134, + "num_tokens": 34577941.0, + "reward": 0.78350830078125, + "reward_std": 0.017385775223374367, + "rewards//mean": 0.78350830078125, + "rewards//std": 0.021996067836880684, + "step": 4747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9496, + "grad_norm": 1.7312920093536377, + "kl": 0.33878943137824535, + "learning_rate": 6.431867463506046e-09, + "loss": 0.0136, + "num_tokens": 34585197.0, + "reward": 0.83001708984375, + "reward_std": 0.01128983311355114, + "rewards//mean": 0.83001708984375, + "rewards//std": 0.01851906254887581, + "step": 4748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9498, + "grad_norm": 1.8158444166183472, + "kl": 0.38457716442644596, + "learning_rate": 6.381231359918637e-09, + "loss": 0.0154, + "num_tokens": 34592541.0, + "reward": 0.8580322265625, + "reward_std": 0.012611869722604752, + "rewards//mean": 0.8580322265625, + "rewards//std": 0.02155577391386032, + "step": 4749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.95, + "grad_norm": 1.4219367504119873, + "kl": 0.3224663808941841, + "learning_rate": 6.330794085912195e-09, + "loss": 0.0129, + "num_tokens": 34599805.0, + "reward": 0.88299560546875, + "reward_std": 0.01473300438374281, + "rewards//mean": 0.88299560546875, + "rewards//std": 0.026479117572307587, + "step": 4750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9502, + "grad_norm": 1.3263863325119019, + "kl": 0.316728126257658, + "learning_rate": 6.280555661802856e-09, + "loss": 0.0127, + "num_tokens": 34607109.0, + "reward": 0.8402099609375, + "reward_std": 0.011978057213127613, + "rewards//mean": 0.8402099609375, + "rewards//std": 0.019926784560084343, + "step": 4751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9504, + "grad_norm": 1.6687452793121338, + "kl": 0.32372505962848663, + "learning_rate": 6.230516107826656e-09, + "loss": 0.0129, + "num_tokens": 34614397.0, + "reward": 0.8721923828125, + "reward_std": 0.01800365187227726, + "rewards//mean": 0.8721923828125, + "rewards//std": 0.031446777284145355, + "step": 4752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9506, + "grad_norm": 1.8539459705352783, + "kl": 0.33790944889187813, + "learning_rate": 6.180675444139527e-09, + "loss": 0.0135, + "num_tokens": 34621677.0, + "reward": 0.82696533203125, + "reward_std": 0.014899062924087048, + "rewards//mean": 0.82696533203125, + "rewards//std": 0.018118280917406082, + "step": 4753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9508, + "grad_norm": 1.3952925205230713, + "kl": 0.3292773775756359, + "learning_rate": 6.131033690817244e-09, + "loss": 0.0132, + "num_tokens": 34629021.0, + "reward": 0.8382568359375, + "reward_std": 0.01650029793381691, + "rewards//mean": 0.8382568359375, + "rewards//std": 0.025720855221152306, + "step": 4754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.951, + "grad_norm": 3.534182548522949, + "kl": 0.5691559184342623, + "learning_rate": 6.081590867855535e-09, + "loss": 0.0228, + "num_tokens": 34636365.0, + "reward": 0.80889892578125, + "reward_std": 0.014736739918589592, + "rewards//mean": 0.80889892578125, + "rewards//std": 0.024708282202482224, + "step": 4755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9512, + "grad_norm": 1.7524240016937256, + "kl": 0.3950022589415312, + "learning_rate": 6.032346995169968e-09, + "loss": 0.0158, + "num_tokens": 34643669.0, + "reward": 0.7900390625, + "reward_std": 0.01457544881850481, + "rewards//mean": 0.7900390625, + "rewards//std": 0.02309396117925644, + "step": 4756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9514, + "grad_norm": 1.4977092742919922, + "kl": 0.3447002302855253, + "learning_rate": 5.983302092595954e-09, + "loss": 0.0138, + "num_tokens": 34650941.0, + "reward": 0.8233642578125, + "reward_std": 0.012695788405835629, + "rewards//mean": 0.8233642578125, + "rewards//std": 0.02016540989279747, + "step": 4757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.9516, + "grad_norm": 1.6013820171356201, + "kl": 0.43205204233527184, + "learning_rate": 5.934456179888803e-09, + "loss": 0.0141, + "num_tokens": 34658233.0, + "reward": 0.80108642578125, + "reward_std": 0.013474996201694012, + "rewards//mean": 0.80108642578125, + "rewards//std": 0.01669374294579029, + "step": 4758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9518, + "grad_norm": 1.421121597290039, + "kl": 0.3717142902314663, + "learning_rate": 5.8858092767236076e-09, + "loss": 0.0149, + "num_tokens": 34665545.0, + "reward": 0.8536376953125, + "reward_std": 0.013682223856449127, + "rewards//mean": 0.8536376953125, + "rewards//std": 0.018440116196870804, + "step": 4759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.952, + "grad_norm": 1.660895824432373, + "kl": 0.3786383904516697, + "learning_rate": 5.837361402695362e-09, + "loss": 0.0151, + "num_tokens": 34672953.0, + "reward": 0.8853759765625, + "reward_std": 0.01890169270336628, + "rewards//mean": 0.8853759765625, + "rewards//std": 0.028170250356197357, + "step": 4760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9522, + "grad_norm": 1.3667136430740356, + "kl": 0.3840652368962765, + "learning_rate": 5.789112577318789e-09, + "loss": 0.0154, + "num_tokens": 34680233.0, + "reward": 0.8037109375, + "reward_std": 0.011502102948725224, + "rewards//mean": 0.8037109375, + "rewards//std": 0.021372759714722633, + "step": 4761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9524, + "grad_norm": 1.6021872758865356, + "kl": 0.3483203686773777, + "learning_rate": 5.741062820028619e-09, + "loss": 0.0139, + "num_tokens": 34687481.0, + "reward": 0.86199951171875, + "reward_std": 0.012174916453659534, + "rewards//mean": 0.86199951171875, + "rewards//std": 0.019624710083007812, + "step": 4762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9526, + "grad_norm": 1.494627833366394, + "kl": 0.4446861892938614, + "learning_rate": 5.693212150179205e-09, + "loss": 0.0178, + "num_tokens": 34694673.0, + "reward": 0.796630859375, + "reward_std": 0.013271160423755646, + "rewards//mean": 0.796630859375, + "rewards//std": 0.019071664661169052, + "step": 4763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9528, + "grad_norm": 1.4651333093643188, + "kl": 0.3390200287103653, + "learning_rate": 5.6455605870448506e-09, + "loss": 0.0136, + "num_tokens": 34701993.0, + "reward": 0.78094482421875, + "reward_std": 0.012380458414554596, + "rewards//mean": 0.78094482421875, + "rewards//std": 0.02445208840072155, + "step": 4764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.953, + "grad_norm": 1.6541699171066284, + "kl": 0.2873194385319948, + "learning_rate": 5.598108149819536e-09, + "loss": 0.0115, + "num_tokens": 34709321.0, + "reward": 0.8489990234375, + "reward_std": 0.015250557102262974, + "rewards//mean": 0.8489990234375, + "rewards//std": 0.021101541817188263, + "step": 4765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9532, + "grad_norm": 1.5778707265853882, + "kl": 0.36702675372362137, + "learning_rate": 5.550854857617193e-09, + "loss": 0.0147, + "num_tokens": 34716601.0, + "reward": 0.8909912109375, + "reward_std": 0.021183934062719345, + "rewards//mean": 0.8909912109375, + "rewards//std": 0.030842134729027748, + "step": 4766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9534, + "grad_norm": 1.6136507987976074, + "kl": 0.3599730394780636, + "learning_rate": 5.503800729471375e-09, + "loss": 0.0144, + "num_tokens": 34723865.0, + "reward": 0.861572265625, + "reward_std": 0.014068357646465302, + "rewards//mean": 0.861572265625, + "rewards//std": 0.01845201663672924, + "step": 4767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9536, + "grad_norm": 2.2721095085144043, + "kl": 0.3242312856018543, + "learning_rate": 5.456945784335421e-09, + "loss": 0.013, + "num_tokens": 34731113.0, + "reward": 0.8590087890625, + "reward_std": 0.016098683699965477, + "rewards//mean": 0.8590087890625, + "rewards//std": 0.028062572702765465, + "step": 4768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9538, + "grad_norm": 1.1745376586914062, + "kl": 0.27957226522266865, + "learning_rate": 5.4102900410826215e-09, + "loss": 0.0112, + "num_tokens": 34738417.0, + "reward": 0.81005859375, + "reward_std": 0.011502699926495552, + "rewards//mean": 0.81005859375, + "rewards//std": 0.017026949673891068, + "step": 4769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.954, + "grad_norm": 1.4499071836471558, + "kl": 0.33668995648622513, + "learning_rate": 5.3638335185058335e-09, + "loss": 0.0131, + "num_tokens": 34745688.0, + "reward": 0.84625244140625, + "reward_std": 0.0174288097769022, + "rewards//mean": 0.84625244140625, + "rewards//std": 0.0228553619235754, + "step": 4770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9542, + "grad_norm": 1.81228506565094, + "kl": 0.3327188454568386, + "learning_rate": 5.317576235317756e-09, + "loss": 0.0133, + "num_tokens": 34752912.0, + "reward": 0.86376953125, + "reward_std": 0.022460952401161194, + "rewards//mean": 0.86376953125, + "rewards//std": 0.03488653153181076, + "step": 4771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9544, + "grad_norm": 1.63291597366333, + "kl": 0.37071678787469864, + "learning_rate": 5.271518210150816e-09, + "loss": 0.0148, + "num_tokens": 34760152.0, + "reward": 0.7825927734375, + "reward_std": 0.011638655327260494, + "rewards//mean": 0.7825927734375, + "rewards//std": 0.01705138012766838, + "step": 4772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9546, + "grad_norm": 1.570784568786621, + "kl": 0.34013552218675613, + "learning_rate": 5.2256594615571745e-09, + "loss": 0.0136, + "num_tokens": 34767600.0, + "reward": 0.80303955078125, + "reward_std": 0.017225481569767, + "rewards//mean": 0.80303955078125, + "rewards//std": 0.023112880066037178, + "step": 4773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9548, + "grad_norm": 1.9612780809402466, + "kl": 0.31901793740689754, + "learning_rate": 5.180000008008723e-09, + "loss": 0.0128, + "num_tokens": 34774904.0, + "reward": 0.8787841796875, + "reward_std": 0.018391672521829605, + "rewards//mean": 0.8787841796875, + "rewards//std": 0.024968286976218224, + "step": 4774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.955, + "grad_norm": 1.4933252334594727, + "kl": 0.3274306822568178, + "learning_rate": 5.134539867897081e-09, + "loss": 0.0131, + "num_tokens": 34782152.0, + "reward": 0.8831787109375, + "reward_std": 0.015438959002494812, + "rewards//mean": 0.8831787109375, + "rewards//std": 0.019246671348810196, + "step": 4775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9552, + "grad_norm": 1.5904371738433838, + "kl": 0.3301115371286869, + "learning_rate": 5.0892790595336575e-09, + "loss": 0.0132, + "num_tokens": 34789408.0, + "reward": 0.85577392578125, + "reward_std": 0.015082028694450855, + "rewards//mean": 0.85577392578125, + "rewards//std": 0.018847206607460976, + "step": 4776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9554, + "grad_norm": 1.6464992761611938, + "kl": 0.32859107106924057, + "learning_rate": 5.04421760114937e-09, + "loss": 0.0131, + "num_tokens": 34796640.0, + "reward": 0.79071044921875, + "reward_std": 0.011411339044570923, + "rewards//mean": 0.79071044921875, + "rewards//std": 0.01691713184118271, + "step": 4777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9556, + "grad_norm": 1.977237582206726, + "kl": 0.3288893327116966, + "learning_rate": 4.999355510895087e-09, + "loss": 0.0132, + "num_tokens": 34803992.0, + "reward": 0.8267822265625, + "reward_std": 0.01572568342089653, + "rewards//mean": 0.8267822265625, + "rewards//std": 0.016793755814433098, + "step": 4778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.9558, + "grad_norm": 1.6689069271087646, + "kl": 0.37577610462903976, + "learning_rate": 4.954692806841187e-09, + "loss": -0.0011, + "num_tokens": 34811271.0, + "reward": 0.8011474609375, + "reward_std": 0.013162641786038876, + "rewards//mean": 0.8011474609375, + "rewards//std": 0.024878397583961487, + "step": 4779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.956, + "grad_norm": 1.8979978561401367, + "kl": 0.3544388972222805, + "learning_rate": 4.910229506977837e-09, + "loss": 0.0142, + "num_tokens": 34818623.0, + "reward": 0.83966064453125, + "reward_std": 0.011146994307637215, + "rewards//mean": 0.83966064453125, + "rewards//std": 0.01939348503947258, + "step": 4780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9562, + "grad_norm": 2.1719629764556885, + "kl": 0.39195709861814976, + "learning_rate": 4.865965629214819e-09, + "loss": 0.0157, + "num_tokens": 34825927.0, + "reward": 0.83831787109375, + "reward_std": 0.01101045124232769, + "rewards//mean": 0.83831787109375, + "rewards//std": 0.01561906561255455, + "step": 4781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9564, + "grad_norm": 1.6535974740982056, + "kl": 0.35681261494755745, + "learning_rate": 4.82190119138165e-09, + "loss": 0.0143, + "num_tokens": 34833191.0, + "reward": 0.86474609375, + "reward_std": 0.019234728068113327, + "rewards//mean": 0.86474609375, + "rewards//std": 0.02109329029917717, + "step": 4782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9566, + "grad_norm": 1.369400978088379, + "kl": 0.3545556589961052, + "learning_rate": 4.778036211227465e-09, + "loss": 0.0142, + "num_tokens": 34840479.0, + "reward": 0.885986328125, + "reward_std": 0.017470749095082283, + "rewards//mean": 0.885986328125, + "rewards//std": 0.027986139059066772, + "step": 4783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9568, + "grad_norm": 1.330815315246582, + "kl": 0.3364972844719887, + "learning_rate": 4.734370706421076e-09, + "loss": 0.0135, + "num_tokens": 34847695.0, + "reward": 0.8603515625, + "reward_std": 0.013539858162403107, + "rewards//mean": 0.8603515625, + "rewards//std": 0.017758049070835114, + "step": 4784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.957, + "grad_norm": 1.5991706848144531, + "kl": 0.42889444902539253, + "learning_rate": 4.690904694550912e-09, + "loss": 0.0249, + "num_tokens": 34854887.0, + "reward": 0.83941650390625, + "reward_std": 0.012195726856589317, + "rewards//mean": 0.83941650390625, + "rewards//std": 0.02006484940648079, + "step": 4785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9572, + "grad_norm": 1.7357984781265259, + "kl": 0.31372356228530407, + "learning_rate": 4.647638193125137e-09, + "loss": 0.0125, + "num_tokens": 34862207.0, + "reward": 0.8701171875, + "reward_std": 0.013492703437805176, + "rewards//mean": 0.8701171875, + "rewards//std": 0.018907450139522552, + "step": 4786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9574, + "grad_norm": 2.2090110778808594, + "kl": 0.41609225422143936, + "learning_rate": 4.604571219571473e-09, + "loss": 0.0166, + "num_tokens": 34869487.0, + "reward": 0.8585205078125, + "reward_std": 0.015598912723362446, + "rewards//mean": 0.8585205078125, + "rewards//std": 0.023709449917078018, + "step": 4787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9576, + "grad_norm": 1.5306243896484375, + "kl": 0.3517100401222706, + "learning_rate": 4.56170379123727e-09, + "loss": 0.0141, + "num_tokens": 34876775.0, + "reward": 0.84814453125, + "reward_std": 0.01694408431649208, + "rewards//mean": 0.84814453125, + "rewards//std": 0.027652032673358917, + "step": 4788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9578, + "grad_norm": 1.4215234518051147, + "kl": 0.356245506554842, + "learning_rate": 4.519035925389491e-09, + "loss": 0.0142, + "num_tokens": 34884015.0, + "reward": 0.8004150390625, + "reward_std": 0.013126879930496216, + "rewards//mean": 0.8004150390625, + "rewards//std": 0.021762648597359657, + "step": 4789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.958, + "grad_norm": 1.495995283126831, + "kl": 0.3604093287140131, + "learning_rate": 4.476567639214779e-09, + "loss": 0.0144, + "num_tokens": 34891327.0, + "reward": 0.80322265625, + "reward_std": 0.01317135151475668, + "rewards//mean": 0.80322265625, + "rewards//std": 0.02470039390027523, + "step": 4790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9582, + "grad_norm": 1.6540287733078003, + "kl": 0.3137865886092186, + "learning_rate": 4.434298949819448e-09, + "loss": 0.0126, + "num_tokens": 34898615.0, + "reward": 0.862060546875, + "reward_std": 0.01894480176270008, + "rewards//mean": 0.862060546875, + "rewards//std": 0.024118687957525253, + "step": 4791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9584, + "grad_norm": 1.6673585176467896, + "kl": 0.3370441012084484, + "learning_rate": 4.3922298742291585e-09, + "loss": 0.0135, + "num_tokens": 34905951.0, + "reward": 0.86041259765625, + "reward_std": 0.018529430031776428, + "rewards//mean": 0.86041259765625, + "rewards//std": 0.02939894050359726, + "step": 4792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9586, + "grad_norm": 1.3649471998214722, + "kl": 0.3508693315088749, + "learning_rate": 4.35036042938941e-09, + "loss": 0.014, + "num_tokens": 34913167.0, + "reward": 0.83013916015625, + "reward_std": 0.013233350589871407, + "rewards//mean": 0.83013916015625, + "rewards//std": 0.02338569238781929, + "step": 4793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9588, + "grad_norm": 1.8300386667251587, + "kl": 0.40834821946918964, + "learning_rate": 4.308690632165213e-09, + "loss": 0.0163, + "num_tokens": 34920495.0, + "reward": 0.8394775390625, + "reward_std": 0.016343556344509125, + "rewards//mean": 0.8394775390625, + "rewards//std": 0.021084317937493324, + "step": 4794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.959, + "grad_norm": 1.4113802909851074, + "kl": 0.32526713237166405, + "learning_rate": 4.2672204993411955e-09, + "loss": 0.013, + "num_tokens": 34927743.0, + "reward": 0.81976318359375, + "reward_std": 0.012741511687636375, + "rewards//mean": 0.81976318359375, + "rewards//std": 0.016468243673443794, + "step": 4795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9592, + "grad_norm": 2.4517223834991455, + "kl": 0.47669027000665665, + "learning_rate": 4.22595004762144e-09, + "loss": 0.0191, + "num_tokens": 34934983.0, + "reward": 0.816162109375, + "reward_std": 0.010870839469134808, + "rewards//mean": 0.816162109375, + "rewards//std": 0.0152146490290761, + "step": 4796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9594, + "grad_norm": 1.7102011442184448, + "kl": 0.3752759322524071, + "learning_rate": 4.184879293629706e-09, + "loss": 0.015, + "num_tokens": 34942295.0, + "reward": 0.8480224609375, + "reward_std": 0.01684337854385376, + "rewards//mean": 0.8480224609375, + "rewards//std": 0.024036642163991928, + "step": 4797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9596, + "grad_norm": 1.6712647676467896, + "kl": 0.3384042792022228, + "learning_rate": 4.1440082539093705e-09, + "loss": 0.0135, + "num_tokens": 34949631.0, + "reward": 0.853515625, + "reward_std": 0.01424287911504507, + "rewards//mean": 0.853515625, + "rewards//std": 0.020231734961271286, + "step": 4798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9598, + "grad_norm": 1.519855260848999, + "kl": 0.38865454867482185, + "learning_rate": 4.103336944923153e-09, + "loss": 0.0155, + "num_tokens": 34956983.0, + "reward": 0.85382080078125, + "reward_std": 0.018429629504680634, + "rewards//mean": 0.85382080078125, + "rewards//std": 0.02471318282186985, + "step": 4799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.96, + "grad_norm": 1.4850401878356934, + "kl": 0.279587110504508, + "learning_rate": 4.062865383053504e-09, + "loss": 0.0112, + "num_tokens": 34964223.0, + "reward": 0.89141845703125, + "reward_std": 0.01625317893922329, + "rewards//mean": 0.89141845703125, + "rewards//std": 0.020519159734249115, + "step": 4800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.9602, + "grad_norm": 1.7206153869628906, + "kl": 0.39388834685087204, + "learning_rate": 4.022593584602329e-09, + "loss": 0.0189, + "num_tokens": 34971475.0, + "reward": 0.84417724609375, + "reward_std": 0.015337115153670311, + "rewards//mean": 0.84417724609375, + "rewards//std": 0.019870774820446968, + "step": 4801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9604, + "grad_norm": 1.5450139045715332, + "kl": 0.3588743004947901, + "learning_rate": 3.982521565791264e-09, + "loss": 0.0144, + "num_tokens": 34978819.0, + "reward": 0.843505859375, + "reward_std": 0.013149967417120934, + "rewards//mean": 0.843505859375, + "rewards//std": 0.021675176918506622, + "step": 4802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.9606, + "grad_norm": 1.6365841627120972, + "kl": 0.34901024773716927, + "learning_rate": 3.9426493427611175e-09, + "loss": 0.0131, + "num_tokens": 34986033.0, + "reward": 0.8309326171875, + "reward_std": 0.014335734769701958, + "rewards//mean": 0.8309326171875, + "rewards//std": 0.019960181787610054, + "step": 4803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9608, + "grad_norm": 1.9375921487808228, + "kl": 0.4370863139629364, + "learning_rate": 3.902976931572488e-09, + "loss": 0.0175, + "num_tokens": 34993409.0, + "reward": 0.82708740234375, + "reward_std": 0.016343433409929276, + "rewards//mean": 0.82708740234375, + "rewards//std": 0.021330133080482483, + "step": 4804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.961, + "grad_norm": 1.5911625623703003, + "kl": 0.3179664220660925, + "learning_rate": 3.8635043482054266e-09, + "loss": 0.0127, + "num_tokens": 35000713.0, + "reward": 0.823486328125, + "reward_std": 0.012743920087814331, + "rewards//mean": 0.823486328125, + "rewards//std": 0.021996842697262764, + "step": 4805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9612, + "grad_norm": 1.5126700401306152, + "kl": 0.3535656835883856, + "learning_rate": 3.8242316085594915e-09, + "loss": 0.0141, + "num_tokens": 35007969.0, + "reward": 0.73577880859375, + "reward_std": 0.015123605728149414, + "rewards//mean": 0.73577880859375, + "rewards//std": 0.0258958637714386, + "step": 4806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9614, + "grad_norm": 1.4406359195709229, + "kl": 0.3616410568356514, + "learning_rate": 3.785158728453752e-09, + "loss": 0.0145, + "num_tokens": 35015249.0, + "reward": 0.81207275390625, + "reward_std": 0.011947871185839176, + "rewards//mean": 0.81207275390625, + "rewards//std": 0.01742926798760891, + "step": 4807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9616, + "grad_norm": 1.5012662410736084, + "kl": 0.35851363092660904, + "learning_rate": 3.746285723626729e-09, + "loss": 0.0143, + "num_tokens": 35022505.0, + "reward": 0.78656005859375, + "reward_std": 0.015920285135507584, + "rewards//mean": 0.78656005859375, + "rewards//std": 0.01750207133591175, + "step": 4808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9618, + "grad_norm": 1.6132220029830933, + "kl": 0.35909099131822586, + "learning_rate": 3.707612609736399e-09, + "loss": 0.0144, + "num_tokens": 35029745.0, + "reward": 0.855712890625, + "reward_std": 0.012067515403032303, + "rewards//mean": 0.855712890625, + "rewards//std": 0.028939202427864075, + "step": 4809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.546875, + "epoch": 0.962, + "grad_norm": 1.7421917915344238, + "kl": 0.38004862144589424, + "learning_rate": 3.669139402360466e-09, + "loss": 0.0012, + "num_tokens": 35036996.0, + "reward": 0.85040283203125, + "reward_std": 0.016412882134318352, + "rewards//mean": 0.85040283203125, + "rewards//std": 0.021187005564570427, + "step": 4810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9622, + "grad_norm": 1.9665318727493286, + "kl": 0.38347260281443596, + "learning_rate": 3.6308661169957565e-09, + "loss": 0.0153, + "num_tokens": 35044276.0, + "reward": 0.81390380859375, + "reward_std": 0.018308144062757492, + "rewards//mean": 0.81390380859375, + "rewards//std": 0.022685173898935318, + "step": 4811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9624, + "grad_norm": 1.2427629232406616, + "kl": 0.3307101409882307, + "learning_rate": 3.5927927690588278e-09, + "loss": 0.0132, + "num_tokens": 35051564.0, + "reward": 0.8787841796875, + "reward_std": 0.01443537138402462, + "rewards//mean": 0.8787841796875, + "rewards//std": 0.020111288875341415, + "step": 4812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9626, + "grad_norm": 1.4993481636047363, + "kl": 0.3570174388587475, + "learning_rate": 3.554919373885634e-09, + "loss": 0.0143, + "num_tokens": 35058820.0, + "reward": 0.81304931640625, + "reward_std": 0.01347489096224308, + "rewards//mean": 0.81304931640625, + "rewards//std": 0.02004975453019142, + "step": 4813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9628, + "grad_norm": 1.4033595323562622, + "kl": 0.33580660447478294, + "learning_rate": 3.5172459467315286e-09, + "loss": 0.0134, + "num_tokens": 35066204.0, + "reward": 0.7427978515625, + "reward_std": 0.013774673454463482, + "rewards//mean": 0.7427978515625, + "rewards//std": 0.020625578239560127, + "step": 4814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.963, + "grad_norm": 1.6315659284591675, + "kl": 0.38000498712062836, + "learning_rate": 3.479772502771372e-09, + "loss": 0.0075, + "num_tokens": 35073467.0, + "reward": 0.82000732421875, + "reward_std": 0.015232747420668602, + "rewards//mean": 0.82000732421875, + "rewards//std": 0.02560368925333023, + "step": 4815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9632, + "grad_norm": 1.8232678174972534, + "kl": 0.3255794048309326, + "learning_rate": 3.4424990570994796e-09, + "loss": 0.013, + "num_tokens": 35080859.0, + "reward": 0.82989501953125, + "reward_std": 0.018792826682329178, + "rewards//mean": 0.82989501953125, + "rewards//std": 0.019610822200775146, + "step": 4816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9634, + "grad_norm": 1.4625208377838135, + "kl": 0.2996787205338478, + "learning_rate": 3.405425624729619e-09, + "loss": 0.012, + "num_tokens": 35088195.0, + "reward": 0.8502197265625, + "reward_std": 0.020717410370707512, + "rewards//mean": 0.8502197265625, + "rewards//std": 0.03339473903179169, + "step": 4817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9636, + "grad_norm": 1.4139375686645508, + "kl": 0.3126769382506609, + "learning_rate": 3.3685522205949003e-09, + "loss": 0.0125, + "num_tokens": 35095523.0, + "reward": 0.79974365234375, + "reward_std": 0.014857356436550617, + "rewards//mean": 0.79974365234375, + "rewards//std": 0.01996956393122673, + "step": 4818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9638, + "grad_norm": 1.4831478595733643, + "kl": 0.3957001008093357, + "learning_rate": 3.331878859547943e-09, + "loss": 0.0158, + "num_tokens": 35102835.0, + "reward": 0.8406982421875, + "reward_std": 0.013864737004041672, + "rewards//mean": 0.8406982421875, + "rewards//std": 0.018380915746092796, + "step": 4819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.964, + "grad_norm": 1.5363819599151611, + "kl": 0.3347139172255993, + "learning_rate": 3.2954055563608197e-09, + "loss": 0.0134, + "num_tokens": 35110059.0, + "reward": 0.7744140625, + "reward_std": 0.014219533652067184, + "rewards//mean": 0.7744140625, + "rewards//std": 0.016277901828289032, + "step": 4820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9642, + "grad_norm": 1.5181599855422974, + "kl": 0.3285191208124161, + "learning_rate": 3.2591323257248894e-09, + "loss": 0.0131, + "num_tokens": 35117331.0, + "reward": 0.8043212890625, + "reward_std": 0.013528214767575264, + "rewards//mean": 0.8043212890625, + "rewards//std": 0.020096229389309883, + "step": 4821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.59375, + "epoch": 0.9644, + "grad_norm": 1.6595962047576904, + "kl": 0.34726752154529095, + "learning_rate": 3.2230591822510756e-09, + "loss": -0.0005, + "num_tokens": 35124593.0, + "reward": 0.8692626953125, + "reward_std": 0.016333896666765213, + "rewards//mean": 0.8692626953125, + "rewards//std": 0.02408948540687561, + "step": 4822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9646, + "grad_norm": 1.986414909362793, + "kl": 0.33945267647504807, + "learning_rate": 3.1871861404696444e-09, + "loss": 0.0136, + "num_tokens": 35131825.0, + "reward": 0.88983154296875, + "reward_std": 0.024044491350650787, + "rewards//mean": 0.88983154296875, + "rewards//std": 0.027876053005456924, + "step": 4823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9648, + "grad_norm": 1.7148033380508423, + "kl": 0.342111699283123, + "learning_rate": 3.1515132148302038e-09, + "loss": 0.0137, + "num_tokens": 35139145.0, + "reward": 0.79022216796875, + "reward_std": 0.017488688230514526, + "rewards//mean": 0.79022216796875, + "rewards//std": 0.03117768093943596, + "step": 4824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.796875, + "epoch": 0.965, + "grad_norm": 1.5482666492462158, + "kl": 0.32368583604693413, + "learning_rate": 3.116040419701815e-09, + "loss": 0.0026, + "num_tokens": 35146460.0, + "reward": 0.87982177734375, + "reward_std": 0.020754970610141754, + "rewards//mean": 0.87982177734375, + "rewards//std": 0.02908002771437168, + "step": 4825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9652, + "grad_norm": 1.4910794496536255, + "kl": 0.36638421565294266, + "learning_rate": 3.0807677693729385e-09, + "loss": 0.0147, + "num_tokens": 35153868.0, + "reward": 0.83648681640625, + "reward_std": 0.012763193808495998, + "rewards//mean": 0.83648681640625, + "rewards//std": 0.0165269672870636, + "step": 4826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9654, + "grad_norm": 1.604996919631958, + "kl": 0.4308152012526989, + "learning_rate": 3.0456952780513747e-09, + "loss": 0.0172, + "num_tokens": 35161156.0, + "reward": 0.85369873046875, + "reward_std": 0.016836853697896004, + "rewards//mean": 0.85369873046875, + "rewards//std": 0.02044525183737278, + "step": 4827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9656, + "grad_norm": 1.4455245733261108, + "kl": 0.37491138465702534, + "learning_rate": 3.010822959864323e-09, + "loss": 0.015, + "num_tokens": 35168596.0, + "reward": 0.85980224609375, + "reward_std": 0.019456181675195694, + "rewards//mean": 0.85980224609375, + "rewards//std": 0.033190298825502396, + "step": 4828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9658, + "grad_norm": 1.1953821182250977, + "kl": 0.353719474747777, + "learning_rate": 2.976150828858326e-09, + "loss": 0.0141, + "num_tokens": 35175788.0, + "reward": 0.86297607421875, + "reward_std": 0.012019362300634384, + "rewards//mean": 0.86297607421875, + "rewards//std": 0.023595130071043968, + "step": 4829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.966, + "grad_norm": 1.7058582305908203, + "kl": 0.3492080830037594, + "learning_rate": 2.941678898999378e-09, + "loss": 0.013, + "num_tokens": 35183107.0, + "reward": 0.8824462890625, + "reward_std": 0.01289602555334568, + "rewards//mean": 0.8824462890625, + "rewards//std": 0.02181822434067726, + "step": 4830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9662, + "grad_norm": 1.5485382080078125, + "kl": 0.35370236448943615, + "learning_rate": 2.9074071841727054e-09, + "loss": 0.0141, + "num_tokens": 35190347.0, + "reward": 0.84832763671875, + "reward_std": 0.014124991372227669, + "rewards//mean": 0.84832763671875, + "rewards//std": 0.025869546458125114, + "step": 4831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9664, + "grad_norm": 1.5124542713165283, + "kl": 0.3150057289749384, + "learning_rate": 2.873335698182988e-09, + "loss": 0.0126, + "num_tokens": 35197587.0, + "reward": 0.767333984375, + "reward_std": 0.014515913091599941, + "rewards//mean": 0.767333984375, + "rewards//std": 0.021719828248023987, + "step": 4832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9666, + "grad_norm": 1.5244609117507935, + "kl": 0.34798533469438553, + "learning_rate": 2.839464454754137e-09, + "loss": 0.0139, + "num_tokens": 35204875.0, + "reward": 0.84619140625, + "reward_std": 0.016064245253801346, + "rewards//mean": 0.84619140625, + "rewards//std": 0.02116207405924797, + "step": 4833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9668, + "grad_norm": 1.6013902425765991, + "kl": 0.3519063629209995, + "learning_rate": 2.8057934675296268e-09, + "loss": 0.0141, + "num_tokens": 35212083.0, + "reward": 0.87591552734375, + "reward_std": 0.018797818571329117, + "rewards//mean": 0.87591552734375, + "rewards//std": 0.026518534868955612, + "step": 4834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.967, + "grad_norm": 1.5831148624420166, + "kl": 0.273804796859622, + "learning_rate": 2.772322750071998e-09, + "loss": 0.011, + "num_tokens": 35219419.0, + "reward": 0.87078857421875, + "reward_std": 0.01880044862627983, + "rewards//mean": 0.87078857421875, + "rewards//std": 0.02658751606941223, + "step": 4835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9672, + "grad_norm": 1.798590064048767, + "kl": 0.31937896460294724, + "learning_rate": 2.739052315863355e-09, + "loss": 0.0128, + "num_tokens": 35226803.0, + "reward": 0.863525390625, + "reward_std": 0.011680962517857552, + "rewards//mean": 0.863525390625, + "rewards//std": 0.017591899260878563, + "step": 4836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9674, + "grad_norm": 1.4979737997055054, + "kl": 0.38325751200318336, + "learning_rate": 2.705982178304922e-09, + "loss": 0.0153, + "num_tokens": 35234195.0, + "reward": 0.85369873046875, + "reward_std": 0.019051048904657364, + "rewards//mean": 0.85369873046875, + "rewards//std": 0.03458271920681, + "step": 4837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9676, + "grad_norm": 1.7646653652191162, + "kl": 0.3275521583855152, + "learning_rate": 2.6731123507174324e-09, + "loss": 0.0131, + "num_tokens": 35241491.0, + "reward": 0.88238525390625, + "reward_std": 0.014918937347829342, + "rewards//mean": 0.88238525390625, + "rewards//std": 0.01769132725894451, + "step": 4838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9678, + "grad_norm": 1.5148659944534302, + "kl": 0.3304231148213148, + "learning_rate": 2.640442846340796e-09, + "loss": 0.0132, + "num_tokens": 35248875.0, + "reward": 0.85284423828125, + "reward_std": 0.014814004302024841, + "rewards//mean": 0.85284423828125, + "rewards//std": 0.02018444798886776, + "step": 4839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.968, + "grad_norm": 1.6659876108169556, + "kl": 0.32332315668463707, + "learning_rate": 2.6079736783343187e-09, + "loss": 0.0129, + "num_tokens": 35256155.0, + "reward": 0.8739013671875, + "reward_std": 0.015971723943948746, + "rewards//mean": 0.8739013671875, + "rewards//std": 0.025358112528920174, + "step": 4840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9682, + "grad_norm": 1.5159579515457153, + "kl": 0.37280111387372017, + "learning_rate": 2.5757048597765395e-09, + "loss": 0.0149, + "num_tokens": 35263403.0, + "reward": 0.85479736328125, + "reward_std": 0.015227770432829857, + "rewards//mean": 0.85479736328125, + "rewards//std": 0.027005551382899284, + "step": 4841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9684, + "grad_norm": 1.6285183429718018, + "kl": 0.27141095139086246, + "learning_rate": 2.5436364036653393e-09, + "loss": 0.0109, + "num_tokens": 35270627.0, + "reward": 0.820068359375, + "reward_std": 0.013083484023809433, + "rewards//mean": 0.820068359375, + "rewards//std": 0.015761971473693848, + "step": 4842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9686, + "grad_norm": 3.269968271255493, + "kl": 0.5199231952428818, + "learning_rate": 2.51176832291794e-09, + "loss": 0.0208, + "num_tokens": 35277915.0, + "reward": 0.8623046875, + "reward_std": 0.012993669137358665, + "rewards//mean": 0.8623046875, + "rewards//std": 0.02005135826766491, + "step": 4843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9688, + "grad_norm": 1.6606500148773193, + "kl": 0.3982977345585823, + "learning_rate": 2.480100630370796e-09, + "loss": 0.0159, + "num_tokens": 35285227.0, + "reward": 0.843505859375, + "reward_std": 0.012403905391693115, + "rewards//mean": 0.843505859375, + "rewards//std": 0.018000196665525436, + "step": 4844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.969, + "grad_norm": 1.5609487295150757, + "kl": 0.42801303416490555, + "learning_rate": 2.448633338779593e-09, + "loss": 0.0171, + "num_tokens": 35292475.0, + "reward": 0.86370849609375, + "reward_std": 0.019725071266293526, + "rewards//mean": 0.86370849609375, + "rewards//std": 0.025912227109074593, + "step": 4845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.9692, + "grad_norm": 1.5515614748001099, + "kl": 0.43616129644215107, + "learning_rate": 2.417366460819359e-09, + "loss": 0.017, + "num_tokens": 35299778.0, + "reward": 0.8145751953125, + "reward_std": 0.015177986584603786, + "rewards//mean": 0.8145751953125, + "rewards//std": 0.01783927157521248, + "step": 4846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9694, + "grad_norm": 1.8687266111373901, + "kl": 0.4099589101970196, + "learning_rate": 2.3863000090844076e-09, + "loss": 0.0164, + "num_tokens": 35307138.0, + "reward": 0.8045654296875, + "reward_std": 0.011616850271821022, + "rewards//mean": 0.8045654296875, + "rewards//std": 0.01796104945242405, + "step": 4847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9696, + "grad_norm": 1.6017037630081177, + "kl": 0.3976080622524023, + "learning_rate": 2.3554339960883407e-09, + "loss": 0.0159, + "num_tokens": 35314450.0, + "reward": 0.818359375, + "reward_std": 0.013033047318458557, + "rewards//mean": 0.818359375, + "rewards//std": 0.02142934687435627, + "step": 4848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9698, + "grad_norm": 2.042386770248413, + "kl": 0.3053513541817665, + "learning_rate": 2.324768434263935e-09, + "loss": 0.0122, + "num_tokens": 35321754.0, + "reward": 0.857421875, + "reward_std": 0.015209203585982323, + "rewards//mean": 0.857421875, + "rewards//std": 0.025334522128105164, + "step": 4849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.97, + "grad_norm": 1.5372663736343384, + "kl": 0.36574243381619453, + "learning_rate": 2.2943033359632546e-09, + "loss": 0.0093, + "num_tokens": 35329058.0, + "reward": 0.87054443359375, + "reward_std": 0.014328816905617714, + "rewards//mean": 0.87054443359375, + "rewards//std": 0.022302929311990738, + "step": 4850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.328125, + "epoch": 0.9702, + "grad_norm": 1.4898346662521362, + "kl": 0.3831529915332794, + "learning_rate": 2.2640387134577053e-09, + "loss": -0.0301, + "num_tokens": 35336311.0, + "reward": 0.80267333984375, + "reward_std": 0.015568515285849571, + "rewards//mean": 0.80267333984375, + "rewards//std": 0.023489033803343773, + "step": 4851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9704, + "grad_norm": 1.5791990756988525, + "kl": 0.41659272462129593, + "learning_rate": 2.233974578937814e-09, + "loss": 0.0167, + "num_tokens": 35343503.0, + "reward": 0.81512451171875, + "reward_std": 0.013336120173335075, + "rewards//mean": 0.81512451171875, + "rewards//std": 0.01921311765909195, + "step": 4852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9706, + "grad_norm": 1.607378602027893, + "kl": 0.3678699266165495, + "learning_rate": 2.2041109445134488e-09, + "loss": 0.0147, + "num_tokens": 35350711.0, + "reward": 0.78546142578125, + "reward_std": 0.015425069257616997, + "rewards//mean": 0.78546142578125, + "rewards//std": 0.020378507673740387, + "step": 4853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9708, + "grad_norm": 1.8571914434432983, + "kl": 0.41165316477417946, + "learning_rate": 2.1744478222136543e-09, + "loss": 0.0165, + "num_tokens": 35357975.0, + "reward": 0.86065673828125, + "reward_std": 0.020128078758716583, + "rewards//mean": 0.86065673828125, + "rewards//std": 0.02504660189151764, + "step": 4854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.971, + "grad_norm": 1.5091907978057861, + "kl": 0.462164718657732, + "learning_rate": 2.1449852239868173e-09, + "loss": 0.0185, + "num_tokens": 35365287.0, + "reward": 0.83502197265625, + "reward_std": 0.01925741136074066, + "rewards//mean": 0.83502197265625, + "rewards//std": 0.021785156801342964, + "step": 4855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9712, + "grad_norm": 1.4049150943756104, + "kl": 0.36213384196162224, + "learning_rate": 2.115723161700278e-09, + "loss": 0.0145, + "num_tokens": 35372543.0, + "reward": 0.82720947265625, + "reward_std": 0.013309131376445293, + "rewards//mean": 0.82720947265625, + "rewards//std": 0.020443031564354897, + "step": 4856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9714, + "grad_norm": 1.735480785369873, + "kl": 0.37802253663539886, + "learning_rate": 2.086661647140997e-09, + "loss": 0.0151, + "num_tokens": 35379903.0, + "reward": 0.84552001953125, + "reward_std": 0.01746436394751072, + "rewards//mean": 0.84552001953125, + "rewards//std": 0.023734577000141144, + "step": 4857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9716, + "grad_norm": 1.3320856094360352, + "kl": 0.28780791722238064, + "learning_rate": 2.057800692014833e-09, + "loss": 0.0115, + "num_tokens": 35387159.0, + "reward": 0.84332275390625, + "reward_std": 0.010679153725504875, + "rewards//mean": 0.84332275390625, + "rewards//std": 0.015392573550343513, + "step": 4858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9718, + "grad_norm": 1.4447962045669556, + "kl": 0.3260059393942356, + "learning_rate": 2.029140307946986e-09, + "loss": 0.013, + "num_tokens": 35394471.0, + "reward": 0.822998046875, + "reward_std": 0.015427544713020325, + "rewards//mean": 0.822998046875, + "rewards//std": 0.019160354509949684, + "step": 4859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.921875, + "epoch": 0.972, + "grad_norm": 1.605102300643921, + "kl": 0.33628041110932827, + "learning_rate": 2.000680506481889e-09, + "loss": 0.0094, + "num_tokens": 35401706.0, + "reward": 0.82342529296875, + "reward_std": 0.015508405864238739, + "rewards//mean": 0.82342529296875, + "rewards//std": 0.017121702432632446, + "step": 4860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9722, + "grad_norm": 1.5789241790771484, + "kl": 0.32810851372778416, + "learning_rate": 1.9724212990830936e-09, + "loss": 0.0131, + "num_tokens": 35409082.0, + "reward": 0.84747314453125, + "reward_std": 0.014937498606741428, + "rewards//mean": 0.84747314453125, + "rewards//std": 0.02486584521830082, + "step": 4861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9724, + "grad_norm": 1.6849124431610107, + "kl": 0.39616289362311363, + "learning_rate": 1.9443626971334946e-09, + "loss": 0.0158, + "num_tokens": 35416322.0, + "reward": 0.83544921875, + "reward_std": 0.017523813992738724, + "rewards//mean": 0.83544921875, + "rewards//std": 0.02558661252260208, + "step": 4862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9726, + "grad_norm": 1.7540698051452637, + "kl": 0.44232622161507607, + "learning_rate": 1.9165047119349966e-09, + "loss": 0.0177, + "num_tokens": 35423498.0, + "reward": 0.818603515625, + "reward_std": 0.012506465427577496, + "rewards//mean": 0.818603515625, + "rewards//std": 0.021234910935163498, + "step": 4863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9728, + "grad_norm": 1.441293478012085, + "kl": 0.33493490517139435, + "learning_rate": 1.8888473547088445e-09, + "loss": 0.0134, + "num_tokens": 35430986.0, + "reward": 0.8275146484375, + "reward_std": 0.0110073983669281, + "rewards//mean": 0.8275146484375, + "rewards//std": 0.018535098060965538, + "step": 4864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.75, + "epoch": 0.973, + "grad_norm": 2.277129650115967, + "kl": 0.3752974830567837, + "learning_rate": 1.8613906365954612e-09, + "loss": 0.0006, + "num_tokens": 35438250.0, + "reward": 0.87823486328125, + "reward_std": 0.022599805146455765, + "rewards//mean": 0.87823486328125, + "rewards//std": 0.033541906625032425, + "step": 4865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9732, + "grad_norm": 1.419501781463623, + "kl": 0.3669318463653326, + "learning_rate": 1.8341345686543331e-09, + "loss": 0.0147, + "num_tokens": 35445434.0, + "reward": 0.8634033203125, + "reward_std": 0.016175398603081703, + "rewards//mean": 0.8634033203125, + "rewards//std": 0.030024398118257523, + "step": 4866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9734, + "grad_norm": 1.4142467975616455, + "kl": 0.3593587316572666, + "learning_rate": 1.8070791618641778e-09, + "loss": 0.0144, + "num_tokens": 35452642.0, + "reward": 0.854736328125, + "reward_std": 0.01783338189125061, + "rewards//mean": 0.854736328125, + "rewards//std": 0.024348560720682144, + "step": 4867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.734375, + "epoch": 0.9736, + "grad_norm": 1.413886308670044, + "kl": 0.33903209678828716, + "learning_rate": 1.7802244271230004e-09, + "loss": -0.0057, + "num_tokens": 35459857.0, + "reward": 0.88116455078125, + "reward_std": 0.019553670659661293, + "rewards//mean": 0.88116455078125, + "rewards//std": 0.027759600430727005, + "step": 4868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9738, + "grad_norm": 1.5008416175842285, + "kl": 0.3341696634888649, + "learning_rate": 1.7535703752478147e-09, + "loss": 0.0134, + "num_tokens": 35467161.0, + "reward": 0.85028076171875, + "reward_std": 0.01675577461719513, + "rewards//mean": 0.85028076171875, + "rewards//std": 0.02623675763607025, + "step": 4869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.974, + "grad_norm": 1.2949012517929077, + "kl": 0.3160394821316004, + "learning_rate": 1.7271170169749216e-09, + "loss": 0.0126, + "num_tokens": 35474449.0, + "reward": 0.83758544921875, + "reward_std": 0.014323633164167404, + "rewards//mean": 0.83758544921875, + "rewards//std": 0.017073892056941986, + "step": 4870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9742, + "grad_norm": 1.2877492904663086, + "kl": 0.286435192450881, + "learning_rate": 1.7008643629596864e-09, + "loss": 0.0115, + "num_tokens": 35481721.0, + "reward": 0.888427734375, + "reward_std": 0.010766400955617428, + "rewards//mean": 0.888427734375, + "rewards//std": 0.02555227465927601, + "step": 4871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9744, + "grad_norm": 1.4381568431854248, + "kl": 0.3377153314650059, + "learning_rate": 1.6748124237767058e-09, + "loss": 0.0135, + "num_tokens": 35488985.0, + "reward": 0.82330322265625, + "reward_std": 0.011207811534404755, + "rewards//mean": 0.82330322265625, + "rewards//std": 0.019221782684326172, + "step": 4872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9746, + "grad_norm": 1.5111219882965088, + "kl": 0.33056943118572235, + "learning_rate": 1.6489612099197526e-09, + "loss": 0.0132, + "num_tokens": 35496209.0, + "reward": 0.8353271484375, + "reward_std": 0.014957181178033352, + "rewards//mean": 0.8353271484375, + "rewards//std": 0.019340822473168373, + "step": 4873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9748, + "grad_norm": 1.715509295463562, + "kl": 0.33028822392225266, + "learning_rate": 1.6233107318015526e-09, + "loss": 0.0132, + "num_tokens": 35503561.0, + "reward": 0.8275146484375, + "reward_std": 0.011869829148054123, + "rewards//mean": 0.8275146484375, + "rewards//std": 0.0182320736348629, + "step": 4874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.975, + "grad_norm": 1.619838833808899, + "kl": 0.38914645463228226, + "learning_rate": 1.5978609997542302e-09, + "loss": 0.0156, + "num_tokens": 35510801.0, + "reward": 0.8538818359375, + "reward_std": 0.025746382772922516, + "rewards//mean": 0.8538818359375, + "rewards//std": 0.03773615509271622, + "step": 4875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9752, + "grad_norm": 1.8487824201583862, + "kl": 0.3506835885345936, + "learning_rate": 1.5726120240288631e-09, + "loss": 0.014, + "num_tokens": 35518009.0, + "reward": 0.86151123046875, + "reward_std": 0.016414908692240715, + "rewards//mean": 0.86151123046875, + "rewards//std": 0.021439852192997932, + "step": 4876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.9754, + "grad_norm": 1.576078176498413, + "kl": 0.30351653322577477, + "learning_rate": 1.5475638147957603e-09, + "loss": 0.0127, + "num_tokens": 35525307.0, + "reward": 0.8641357421875, + "reward_std": 0.014363760128617287, + "rewards//mean": 0.8641357421875, + "rewards//std": 0.027713004499673843, + "step": 4877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9756, + "grad_norm": 2.1528677940368652, + "kl": 0.3127799276262522, + "learning_rate": 1.522716382144351e-09, + "loss": 0.0125, + "num_tokens": 35532635.0, + "reward": 0.8614501953125, + "reward_std": 0.01502231601625681, + "rewards//mean": 0.8614501953125, + "rewards//std": 0.016818977892398834, + "step": 4878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9758, + "grad_norm": 1.6361627578735352, + "kl": 0.32756296172738075, + "learning_rate": 1.498069736083185e-09, + "loss": 0.0131, + "num_tokens": 35539971.0, + "reward": 0.83489990234375, + "reward_std": 0.013261337764561176, + "rewards//mean": 0.83489990234375, + "rewards//std": 0.020917359739542007, + "step": 4879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.976, + "grad_norm": 1.5438940525054932, + "kl": 0.32328738272190094, + "learning_rate": 1.4736238865398765e-09, + "loss": 0.0129, + "num_tokens": 35547219.0, + "reward": 0.85919189453125, + "reward_std": 0.013360263779759407, + "rewards//mean": 0.85919189453125, + "rewards//std": 0.021837208420038223, + "step": 4880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9762, + "grad_norm": 1.3633015155792236, + "kl": 0.3315174914896488, + "learning_rate": 1.4493788433612708e-09, + "loss": 0.0133, + "num_tokens": 35554443.0, + "reward": 0.850341796875, + "reward_std": 0.016781767830252647, + "rewards//mean": 0.850341796875, + "rewards//std": 0.03279947116971016, + "step": 4881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9764, + "grad_norm": 1.4831078052520752, + "kl": 0.37404085509479046, + "learning_rate": 1.4253346163132785e-09, + "loss": 0.015, + "num_tokens": 35561739.0, + "reward": 0.87603759765625, + "reward_std": 0.014742021448910236, + "rewards//mean": 0.87603759765625, + "rewards//std": 0.022854037582874298, + "step": 4882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9766, + "grad_norm": 1.5051507949829102, + "kl": 0.36110315285623074, + "learning_rate": 1.4014912150808745e-09, + "loss": 0.0144, + "num_tokens": 35568947.0, + "reward": 0.84600830078125, + "reward_std": 0.01836758479475975, + "rewards//mean": 0.84600830078125, + "rewards//std": 0.024218197911977768, + "step": 4883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9768, + "grad_norm": 1.564754605293274, + "kl": 0.3346183020621538, + "learning_rate": 1.377848649268154e-09, + "loss": 0.0134, + "num_tokens": 35576331.0, + "reward": 0.85211181640625, + "reward_std": 0.018391408026218414, + "rewards//mean": 0.85211181640625, + "rewards//std": 0.03366668522357941, + "step": 4884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.977, + "grad_norm": 1.6736866235733032, + "kl": 0.42126012220978737, + "learning_rate": 1.3544069283983327e-09, + "loss": 0.0169, + "num_tokens": 35583611.0, + "reward": 0.831298828125, + "reward_std": 0.017114033922553062, + "rewards//mean": 0.831298828125, + "rewards//std": 0.025352440774440765, + "step": 4885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9772, + "grad_norm": 1.6092556715011597, + "kl": 0.3821617662906647, + "learning_rate": 1.3311660619138576e-09, + "loss": 0.0153, + "num_tokens": 35590891.0, + "reward": 0.794677734375, + "reward_std": 0.014312348328530788, + "rewards//mean": 0.794677734375, + "rewards//std": 0.019634833559393883, + "step": 4886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9774, + "grad_norm": 1.4437180757522583, + "kl": 0.33752287551760674, + "learning_rate": 1.308126059176018e-09, + "loss": 0.0135, + "num_tokens": 35598187.0, + "reward": 0.81903076171875, + "reward_std": 0.012896996922791004, + "rewards//mean": 0.81903076171875, + "rewards//std": 0.01957528106868267, + "step": 4887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.9776, + "grad_norm": 1.815627098083496, + "kl": 0.49160879105329514, + "learning_rate": 1.2852869294653346e-09, + "loss": 0.0219, + "num_tokens": 35605477.0, + "reward": 0.8680419921875, + "reward_std": 0.015412973240017891, + "rewards//mean": 0.8680419921875, + "rewards//std": 0.01926868036389351, + "step": 4888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9778, + "grad_norm": 1.4393154382705688, + "kl": 0.26227068342268467, + "learning_rate": 1.2626486819814486e-09, + "loss": 0.0105, + "num_tokens": 35612701.0, + "reward": 0.874755859375, + "reward_std": 0.017067044973373413, + "rewards//mean": 0.874755859375, + "rewards//std": 0.019046248868107796, + "step": 4889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.978, + "grad_norm": 1.488392949104309, + "kl": 0.35248049534857273, + "learning_rate": 1.2402113258430658e-09, + "loss": 0.0141, + "num_tokens": 35620029.0, + "reward": 0.79693603515625, + "reward_std": 0.011855371296405792, + "rewards//mean": 0.79693603515625, + "rewards//std": 0.01972856931388378, + "step": 4890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9782, + "grad_norm": 1.6444729566574097, + "kl": 0.3413447942584753, + "learning_rate": 1.217974870087901e-09, + "loss": 0.0137, + "num_tokens": 35627373.0, + "reward": 0.8603515625, + "reward_std": 0.016246125102043152, + "rewards//mean": 0.8603515625, + "rewards//std": 0.023198600858449936, + "step": 4891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9784, + "grad_norm": 1.4134465456008911, + "kl": 0.3032511379569769, + "learning_rate": 1.1959393236727898e-09, + "loss": 0.0121, + "num_tokens": 35634573.0, + "reward": 0.8839111328125, + "reward_std": 0.01216820441186428, + "rewards//mean": 0.8839111328125, + "rewards//std": 0.018075305968523026, + "step": 4892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.984375, + "epoch": 0.9786, + "grad_norm": 1.3221920728683472, + "kl": 0.31086331233382225, + "learning_rate": 1.1741046954736877e-09, + "loss": 0.0117, + "num_tokens": 35641916.0, + "reward": 0.86334228515625, + "reward_std": 0.016991181299090385, + "rewards//mean": 0.86334228515625, + "rewards//std": 0.027272727340459824, + "step": 4893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9788, + "grad_norm": 1.49241042137146, + "kl": 0.32009601779282093, + "learning_rate": 1.1524709942855592e-09, + "loss": 0.0128, + "num_tokens": 35649196.0, + "reward": 0.812255859375, + "reward_std": 0.014876000583171844, + "rewards//mean": 0.812255859375, + "rewards//std": 0.030614785850048065, + "step": 4894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.979, + "grad_norm": 1.6513659954071045, + "kl": 0.4529429990798235, + "learning_rate": 1.131038228822434e-09, + "loss": 0.0181, + "num_tokens": 35656420.0, + "reward": 0.8421630859375, + "reward_std": 0.01708853431046009, + "rewards//mean": 0.8421630859375, + "rewards//std": 0.025796078145503998, + "step": 4895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9792, + "grad_norm": 1.5651764869689941, + "kl": 0.30712660774588585, + "learning_rate": 1.1098064077174617e-09, + "loss": 0.0123, + "num_tokens": 35663628.0, + "reward": 0.8330078125, + "reward_std": 0.009892605245113373, + "rewards//mean": 0.8330078125, + "rewards//std": 0.01268159318715334, + "step": 4896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9794, + "grad_norm": 1.768189549446106, + "kl": 0.348291365429759, + "learning_rate": 1.0887755395228016e-09, + "loss": 0.0139, + "num_tokens": 35670956.0, + "reward": 0.8333740234375, + "reward_std": 0.013517772778868675, + "rewards//mean": 0.8333740234375, + "rewards//std": 0.02851209044456482, + "step": 4897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9796, + "grad_norm": 1.6398584842681885, + "kl": 0.3963230215013027, + "learning_rate": 1.0679456327097324e-09, + "loss": 0.0159, + "num_tokens": 35678260.0, + "reward": 0.8712158203125, + "reward_std": 0.01915648579597473, + "rewards//mean": 0.8712158203125, + "rewards//std": 0.027559634298086166, + "step": 4898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9798, + "grad_norm": 1.5351966619491577, + "kl": 0.33961725421249866, + "learning_rate": 1.0473166956684322e-09, + "loss": 0.0136, + "num_tokens": 35685596.0, + "reward": 0.865478515625, + "reward_std": 0.01284283958375454, + "rewards//mean": 0.865478515625, + "rewards//std": 0.026501474902033806, + "step": 4899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.98, + "grad_norm": 1.465986967086792, + "kl": 0.3657778464257717, + "learning_rate": 1.0268887367083645e-09, + "loss": 0.0146, + "num_tokens": 35692932.0, + "reward": 0.78558349609375, + "reward_std": 0.014685399830341339, + "rewards//mean": 0.78558349609375, + "rewards//std": 0.017595233395695686, + "step": 4900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9802, + "grad_norm": 1.4840291738510132, + "kl": 0.3689253181219101, + "learning_rate": 1.0066617640578368e-09, + "loss": 0.0148, + "num_tokens": 35700156.0, + "reward": 0.8707275390625, + "reward_std": 0.01355360634624958, + "rewards//mean": 0.8707275390625, + "rewards//std": 0.021335544064641, + "step": 4901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.9804, + "grad_norm": 1.1934826374053955, + "kl": 0.30744518898427486, + "learning_rate": 9.866357858642205e-10, + "loss": -0.0002, + "num_tokens": 35707384.0, + "reward": 0.8699951171875, + "reward_std": 0.012915176339447498, + "rewards//mean": 0.8699951171875, + "rewards//std": 0.01735059730708599, + "step": 4902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9806, + "grad_norm": 1.650517463684082, + "kl": 0.3924860246479511, + "learning_rate": 9.668108101940632e-10, + "loss": 0.0157, + "num_tokens": 35714704.0, + "reward": 0.8214111328125, + "reward_std": 0.01737486943602562, + "rewards//mean": 0.8214111328125, + "rewards//std": 0.02303587831556797, + "step": 4903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9808, + "grad_norm": 1.441894292831421, + "kl": 0.39231092296540737, + "learning_rate": 9.471868450328101e-10, + "loss": 0.0157, + "num_tokens": 35722040.0, + "reward": 0.85809326171875, + "reward_std": 0.013881836086511612, + "rewards//mean": 0.85809326171875, + "rewards//std": 0.020517682656645775, + "step": 4904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.981, + "grad_norm": 1.6240513324737549, + "kl": 0.3761884719133377, + "learning_rate": 9.277638982850833e-10, + "loss": 0.015, + "num_tokens": 35729312.0, + "reward": 0.7962646484375, + "reward_std": 0.015251717530190945, + "rewards//mean": 0.7962646484375, + "rewards//std": 0.024510597810149193, + "step": 4905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.671875, + "epoch": 0.9812, + "grad_norm": 1.6023316383361816, + "kl": 0.37099383771419525, + "learning_rate": 9.085419777743464e-10, + "loss": 0.0155, + "num_tokens": 35736547.0, + "reward": 0.81793212890625, + "reward_std": 0.01635604165494442, + "rewards//mean": 0.81793212890625, + "rewards//std": 0.021716952323913574, + "step": 4906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9814, + "grad_norm": 1.5741690397262573, + "kl": 0.35278251580893993, + "learning_rate": 8.895210912431838e-10, + "loss": 0.0141, + "num_tokens": 35743787.0, + "reward": 0.86932373046875, + "reward_std": 0.0205998532474041, + "rewards//mean": 0.86932373046875, + "rewards//std": 0.03093004785478115, + "step": 4907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9816, + "grad_norm": 1.421380877494812, + "kl": 0.3359085749834776, + "learning_rate": 8.707012463532448e-10, + "loss": 0.0134, + "num_tokens": 35751035.0, + "reward": 0.8634033203125, + "reward_std": 0.014388307929039001, + "rewards//mean": 0.8634033203125, + "rewards//std": 0.021121619269251823, + "step": 4908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9818, + "grad_norm": 1.5139524936676025, + "kl": 0.3426791988313198, + "learning_rate": 8.520824506851876e-10, + "loss": 0.0137, + "num_tokens": 35758323.0, + "reward": 0.8594970703125, + "reward_std": 0.013325007632374763, + "rewards//mean": 0.8594970703125, + "rewards//std": 0.017798494547605515, + "step": 4909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.982, + "grad_norm": 1.3788906335830688, + "kl": 0.3366003017872572, + "learning_rate": 8.336647117385687e-10, + "loss": 0.0135, + "num_tokens": 35765619.0, + "reward": 0.86065673828125, + "reward_std": 0.013799364678561687, + "rewards//mean": 0.86065673828125, + "rewards//std": 0.018843993544578552, + "step": 4910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9822, + "grad_norm": 1.8101286888122559, + "kl": 0.4048751648515463, + "learning_rate": 8.154480369321759e-10, + "loss": 0.0162, + "num_tokens": 35773003.0, + "reward": 0.86456298828125, + "reward_std": 0.01959386095404625, + "rewards//mean": 0.86456298828125, + "rewards//std": 0.03611159697175026, + "step": 4911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9824, + "grad_norm": 1.88957941532135, + "kl": 0.338626217097044, + "learning_rate": 7.974324336035843e-10, + "loss": 0.0135, + "num_tokens": 35780315.0, + "reward": 0.8233642578125, + "reward_std": 0.012682097032666206, + "rewards//mean": 0.8233642578125, + "rewards//std": 0.016462381929159164, + "step": 4912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9826, + "grad_norm": 1.6707555055618286, + "kl": 0.37593916803598404, + "learning_rate": 7.79617909009489e-10, + "loss": 0.015, + "num_tokens": 35787659.0, + "reward": 0.85772705078125, + "reward_std": 0.016048669815063477, + "rewards//mean": 0.85772705078125, + "rewards//std": 0.02313382923603058, + "step": 4913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9828, + "grad_norm": 1.6673663854599, + "kl": 0.3526548929512501, + "learning_rate": 7.620044703256501e-10, + "loss": 0.0141, + "num_tokens": 35794867.0, + "reward": 0.8448486328125, + "reward_std": 0.015979493036866188, + "rewards//mean": 0.8448486328125, + "rewards//std": 0.02380121313035488, + "step": 4914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.71875, + "epoch": 0.983, + "grad_norm": 1.6906325817108154, + "kl": 0.347359674051404, + "learning_rate": 7.445921246466702e-10, + "loss": -0.0077, + "num_tokens": 35802145.0, + "reward": 0.89031982421875, + "reward_std": 0.022195957601070404, + "rewards//mean": 0.89031982421875, + "rewards//std": 0.03229665383696556, + "step": 4915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9832, + "grad_norm": 1.551815390586853, + "kl": 0.38830458000302315, + "learning_rate": 7.273808789862723e-10, + "loss": 0.0155, + "num_tokens": 35809433.0, + "reward": 0.8685302734375, + "reward_std": 0.019586831331253052, + "rewards//mean": 0.8685302734375, + "rewards//std": 0.02645888738334179, + "step": 4916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9834, + "grad_norm": 1.4443800449371338, + "kl": 0.38262927532196045, + "learning_rate": 7.103707402771886e-10, + "loss": 0.0153, + "num_tokens": 35816729.0, + "reward": 0.8043212890625, + "reward_std": 0.012423822656273842, + "rewards//mean": 0.8043212890625, + "rewards//std": 0.021673431620001793, + "step": 4917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9836, + "grad_norm": 1.3209503889083862, + "kl": 0.31966519355773926, + "learning_rate": 6.935617153710494e-10, + "loss": 0.0128, + "num_tokens": 35823961.0, + "reward": 0.87579345703125, + "reward_std": 0.01548282615840435, + "rewards//mean": 0.87579345703125, + "rewards//std": 0.019490031525492668, + "step": 4918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9838, + "grad_norm": 1.6582190990447998, + "kl": 0.41344910115003586, + "learning_rate": 6.769538110384943e-10, + "loss": 0.0165, + "num_tokens": 35831233.0, + "reward": 0.834228515625, + "reward_std": 0.019052131101489067, + "rewards//mean": 0.834228515625, + "rewards//std": 0.02134866639971733, + "step": 4919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.984, + "grad_norm": 1.5148040056228638, + "kl": 0.36154856346547604, + "learning_rate": 6.605470339692831e-10, + "loss": 0.0145, + "num_tokens": 35838505.0, + "reward": 0.8443603515625, + "reward_std": 0.013174253515899181, + "rewards//mean": 0.8443603515625, + "rewards//std": 0.01473498810082674, + "step": 4920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9842, + "grad_norm": 1.4602357149124146, + "kl": 0.3280716501176357, + "learning_rate": 6.443413907720186e-10, + "loss": 0.0131, + "num_tokens": 35845761.0, + "reward": 0.86981201171875, + "reward_std": 0.014248033985495567, + "rewards//mean": 0.86981201171875, + "rewards//std": 0.023715436458587646, + "step": 4921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.84375, + "epoch": 0.9844, + "grad_norm": 1.7693934440612793, + "kl": 0.39357607811689377, + "learning_rate": 6.283368879742567e-10, + "loss": 0.0132, + "num_tokens": 35853063.0, + "reward": 0.8323974609375, + "reward_std": 0.015042723156511784, + "rewards//mean": 0.8323974609375, + "rewards//std": 0.019743623211979866, + "step": 4922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.96875, + "epoch": 0.9846, + "grad_norm": 1.736224889755249, + "kl": 0.38356440514326096, + "learning_rate": 6.125335320227298e-10, + "loss": 0.0155, + "num_tokens": 35860373.0, + "reward": 0.76397705078125, + "reward_std": 0.015214234590530396, + "rewards//mean": 0.76397705078125, + "rewards//std": 0.021707888692617416, + "step": 4923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9848, + "grad_norm": 1.3882968425750732, + "kl": 0.3600369356572628, + "learning_rate": 5.969313292830125e-10, + "loss": 0.0144, + "num_tokens": 35867613.0, + "reward": 0.8260498046875, + "reward_std": 0.012699716724455357, + "rewards//mean": 0.8260498046875, + "rewards//std": 0.018108775839209557, + "step": 4924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.985, + "grad_norm": 1.335463285446167, + "kl": 0.35797204449772835, + "learning_rate": 5.815302860395776e-10, + "loss": 0.0143, + "num_tokens": 35874893.0, + "reward": 0.79046630859375, + "reward_std": 0.009389033541083336, + "rewards//mean": 0.79046630859375, + "rewards//std": 0.012764728628098965, + "step": 4925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9852, + "grad_norm": 1.524540901184082, + "kl": 0.3533009458333254, + "learning_rate": 5.663304084960186e-10, + "loss": 0.0141, + "num_tokens": 35882157.0, + "reward": 0.78546142578125, + "reward_std": 0.01428314670920372, + "rewards//mean": 0.78546142578125, + "rewards//std": 0.020461535081267357, + "step": 4926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9854, + "grad_norm": 1.5188511610031128, + "kl": 0.2636149823665619, + "learning_rate": 5.51331702774882e-10, + "loss": 0.0105, + "num_tokens": 35889549.0, + "reward": 0.81927490234375, + "reward_std": 0.01621650904417038, + "rewards//mean": 0.81927490234375, + "rewards//std": 0.02585725486278534, + "step": 4927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9856, + "grad_norm": 1.528095006942749, + "kl": 0.34828596748411655, + "learning_rate": 5.365341749175578e-10, + "loss": 0.0139, + "num_tokens": 35896813.0, + "reward": 0.8662109375, + "reward_std": 0.017877478152513504, + "rewards//mean": 0.8662109375, + "rewards//std": 0.0308914203196764, + "step": 4928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9858, + "grad_norm": 1.4896503686904907, + "kl": 0.3516658581793308, + "learning_rate": 5.219378308845556e-10, + "loss": 0.0141, + "num_tokens": 35904293.0, + "reward": 0.87158203125, + "reward_std": 0.02152361162006855, + "rewards//mean": 0.87158203125, + "rewards//std": 0.02749391831457615, + "step": 4929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.986, + "grad_norm": 1.5440232753753662, + "kl": 0.37030784972012043, + "learning_rate": 5.075426765552837e-10, + "loss": 0.0148, + "num_tokens": 35911509.0, + "reward": 0.865966796875, + "reward_std": 0.00979132391512394, + "rewards//mean": 0.865966796875, + "rewards//std": 0.012999890372157097, + "step": 4930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9862, + "grad_norm": 1.7732912302017212, + "kl": 0.32938553392887115, + "learning_rate": 4.933487177280482e-10, + "loss": 0.0132, + "num_tokens": 35918789.0, + "reward": 0.8077392578125, + "reward_std": 0.01365977805107832, + "rewards//mean": 0.8077392578125, + "rewards//std": 0.01826193928718567, + "step": 4931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9864, + "grad_norm": 1.501515507698059, + "kl": 0.3885760270059109, + "learning_rate": 4.793559601202757e-10, + "loss": 0.0155, + "num_tokens": 35926037.0, + "reward": 0.83404541015625, + "reward_std": 0.012971030548214912, + "rewards//mean": 0.83404541015625, + "rewards//std": 0.014665350317955017, + "step": 4932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9866, + "grad_norm": 1.6637002229690552, + "kl": 0.4570925086736679, + "learning_rate": 4.6556440936812437e-10, + "loss": 0.0183, + "num_tokens": 35933333.0, + "reward": 0.83367919921875, + "reward_std": 0.016335122287273407, + "rewards//mean": 0.83367919921875, + "rewards//std": 0.022018080577254295, + "step": 4933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9868, + "grad_norm": 1.5848854780197144, + "kl": 0.3366064019501209, + "learning_rate": 4.519740710269282e-10, + "loss": 0.0135, + "num_tokens": 35940605.0, + "reward": 0.85418701171875, + "reward_std": 0.01435003150254488, + "rewards//mean": 0.85418701171875, + "rewards//std": 0.021236959844827652, + "step": 4934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.8125, + "epoch": 0.987, + "grad_norm": 1.5103498697280884, + "kl": 0.3860991094261408, + "learning_rate": 4.3858495057080836e-10, + "loss": 0.0083, + "num_tokens": 35947849.0, + "reward": 0.83258056640625, + "reward_std": 0.014800931327044964, + "rewards//mean": 0.83258056640625, + "rewards//std": 0.02671077847480774, + "step": 4935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9872, + "grad_norm": 1.4702281951904297, + "kl": 0.3935575783252716, + "learning_rate": 4.2539705339295073e-10, + "loss": 0.0157, + "num_tokens": 35955145.0, + "reward": 0.8231201171875, + "reward_std": 0.017062487080693245, + "rewards//mean": 0.8231201171875, + "rewards//std": 0.022265993058681488, + "step": 4936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9874, + "grad_norm": 1.415228247642517, + "kl": 0.3618629090487957, + "learning_rate": 4.1241038480543945e-10, + "loss": 0.0145, + "num_tokens": 35962457.0, + "reward": 0.8485107421875, + "reward_std": 0.01690743863582611, + "rewards//mean": 0.8485107421875, + "rewards//std": 0.018142182379961014, + "step": 4937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.9876, + "grad_norm": 1.62523353099823, + "kl": 0.3040515724569559, + "learning_rate": 3.996249500392568e-10, + "loss": 0.016, + "num_tokens": 35969787.0, + "reward": 0.816162109375, + "reward_std": 0.014701067470014095, + "rewards//mean": 0.816162109375, + "rewards//std": 0.019071664661169052, + "step": 4938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9878, + "grad_norm": 1.4462467432022095, + "kl": 0.3289750572293997, + "learning_rate": 3.870407542443943e-10, + "loss": 0.0132, + "num_tokens": 35977107.0, + "reward": 0.834228515625, + "reward_std": 0.011918775737285614, + "rewards//mean": 0.834228515625, + "rewards//std": 0.02862786129117012, + "step": 4939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.988, + "grad_norm": 1.6390811204910278, + "kl": 0.3589901328086853, + "learning_rate": 3.746578024897418e-10, + "loss": 0.0144, + "num_tokens": 35984395.0, + "reward": 0.8236083984375, + "reward_std": 0.016193339601159096, + "rewards//mean": 0.8236083984375, + "rewards//std": 0.027902444824576378, + "step": 4940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9882, + "grad_norm": 1.4231598377227783, + "kl": 0.3658863566815853, + "learning_rate": 3.6247609976319817e-10, + "loss": 0.0146, + "num_tokens": 35991675.0, + "reward": 0.83624267578125, + "reward_std": 0.016094576567411423, + "rewards//mean": 0.83624267578125, + "rewards//std": 0.021444087848067284, + "step": 4941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9884, + "grad_norm": 1.7162764072418213, + "kl": 0.3504890725016594, + "learning_rate": 3.5049565097156063e-10, + "loss": 0.014, + "num_tokens": 35998915.0, + "reward": 0.87225341796875, + "reward_std": 0.01899198815226555, + "rewards//mean": 0.87225341796875, + "rewards//std": 0.020945565775036812, + "step": 4942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9886, + "grad_norm": 1.4146583080291748, + "kl": 0.3125611748546362, + "learning_rate": 3.387164609405246e-10, + "loss": 0.0125, + "num_tokens": 36006171.0, + "reward": 0.8724365234375, + "reward_std": 0.01256505586206913, + "rewards//mean": 0.8724365234375, + "rewards//std": 0.023253023624420166, + "step": 4943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9888, + "grad_norm": 1.7504431009292603, + "kl": 0.3458398450165987, + "learning_rate": 3.2713853441468375e-10, + "loss": 0.0138, + "num_tokens": 36013443.0, + "reward": 0.85595703125, + "reward_std": 0.014408886432647705, + "rewards//mean": 0.85595703125, + "rewards//std": 0.020021138712763786, + "step": 4944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.989, + "grad_norm": 1.7222075462341309, + "kl": 0.32947627641260624, + "learning_rate": 3.1576187605775186e-10, + "loss": 0.0132, + "num_tokens": 36020731.0, + "reward": 0.83447265625, + "reward_std": 0.012921841815114021, + "rewards//mean": 0.83447265625, + "rewards//std": 0.017696566879749298, + "step": 4945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9892, + "grad_norm": 1.4415017366409302, + "kl": 0.3028903156518936, + "learning_rate": 3.0458649045211894e-10, + "loss": 0.0121, + "num_tokens": 36028027.0, + "reward": 0.802490234375, + "reward_std": 0.015005173161625862, + "rewards//mean": 0.802490234375, + "rewards//std": 0.02767282724380493, + "step": 4946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9894, + "grad_norm": 1.7140281200408936, + "kl": 0.30649276822805405, + "learning_rate": 2.936123820993508e-10, + "loss": 0.0123, + "num_tokens": 36035331.0, + "reward": 0.8690185546875, + "reward_std": 0.01795235462486744, + "rewards//mean": 0.8690185546875, + "rewards//std": 0.024139704182744026, + "step": 4947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9896, + "grad_norm": 1.7300121784210205, + "kl": 0.31846003606915474, + "learning_rate": 2.828395554196894e-10, + "loss": 0.0127, + "num_tokens": 36042539.0, + "reward": 0.84552001953125, + "reward_std": 0.01832585223019123, + "rewards//mean": 0.84552001953125, + "rewards//std": 0.029991798102855682, + "step": 4948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9898, + "grad_norm": 1.6224092245101929, + "kl": 0.3698316775262356, + "learning_rate": 2.7226801475255247e-10, + "loss": 0.0148, + "num_tokens": 36049739.0, + "reward": 0.87628173828125, + "reward_std": 0.014833712950348854, + "rewards//mean": 0.87628173828125, + "rewards//std": 0.0198821984231472, + "step": 4949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.99, + "grad_norm": 1.5110578536987305, + "kl": 0.3520733639597893, + "learning_rate": 2.6189776435608933e-10, + "loss": 0.0141, + "num_tokens": 36056955.0, + "reward": 0.8843994140625, + "reward_std": 0.01624944992363453, + "rewards//mean": 0.8843994140625, + "rewards//std": 0.030700454488396645, + "step": 4950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9902, + "grad_norm": 1.9053874015808105, + "kl": 0.32217373326420784, + "learning_rate": 2.517288084074587e-10, + "loss": 0.0129, + "num_tokens": 36064203.0, + "reward": 0.8978271484375, + "reward_std": 0.014509275555610657, + "rewards//mean": 0.8978271484375, + "rewards//std": 0.020766014233231544, + "step": 4951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9904, + "grad_norm": 1.6235235929489136, + "kl": 0.3555965796113014, + "learning_rate": 2.417611510026618e-10, + "loss": 0.0142, + "num_tokens": 36071443.0, + "reward": 0.774658203125, + "reward_std": 0.01312037743628025, + "rewards//mean": 0.774658203125, + "rewards//std": 0.0270532239228487, + "step": 4952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9906, + "grad_norm": 1.6987266540527344, + "kl": 0.3353637680411339, + "learning_rate": 2.3199479615670926e-10, + "loss": 0.0134, + "num_tokens": 36078739.0, + "reward": 0.8018798828125, + "reward_std": 0.010770524851977825, + "rewards//mean": 0.8018798828125, + "rewards//std": 0.019418932497501373, + "step": 4953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9908, + "grad_norm": 1.4360694885253906, + "kl": 0.3531855158507824, + "learning_rate": 2.2242974780350977e-10, + "loss": 0.0141, + "num_tokens": 36085979.0, + "reward": 0.84796142578125, + "reward_std": 0.016989244148135185, + "rewards//mean": 0.84796142578125, + "rewards//std": 0.02145537920296192, + "step": 4954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.991, + "grad_norm": 1.4363220930099487, + "kl": 0.40883135981857777, + "learning_rate": 2.130660097958148e-10, + "loss": 0.0164, + "num_tokens": 36093331.0, + "reward": 0.8145751953125, + "reward_std": 0.012151921167969704, + "rewards//mean": 0.8145751953125, + "rewards//std": 0.01431393064558506, + "step": 4955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9912, + "grad_norm": 1.4969900846481323, + "kl": 0.3198392763733864, + "learning_rate": 2.0390358590538504e-10, + "loss": 0.0128, + "num_tokens": 36100587.0, + "reward": 0.84613037109375, + "reward_std": 0.014582432806491852, + "rewards//mean": 0.84613037109375, + "rewards//std": 0.02571048028767109, + "step": 4956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.9914, + "grad_norm": 1.5264203548431396, + "kl": 0.32920248806476593, + "learning_rate": 1.9494247982282387e-10, + "loss": 0.0153, + "num_tokens": 36107877.0, + "reward": 0.86236572265625, + "reward_std": 0.01645466685295105, + "rewards//mean": 0.86236572265625, + "rewards//std": 0.025758715346455574, + "step": 4957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9916, + "grad_norm": 1.5021122694015503, + "kl": 0.41761723533272743, + "learning_rate": 1.8618269515763284e-10, + "loss": 0.0167, + "num_tokens": 36115221.0, + "reward": 0.8582763671875, + "reward_std": 0.019081858918070793, + "rewards//mean": 0.8582763671875, + "rewards//std": 0.021412033587694168, + "step": 4958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.9918, + "grad_norm": 1.548029899597168, + "kl": 0.40022701770067215, + "learning_rate": 1.7762423543832282e-10, + "loss": 0.0134, + "num_tokens": 36122550.0, + "reward": 0.82586669921875, + "reward_std": 0.014030765742063522, + "rewards//mean": 0.82586669921875, + "rewards//std": 0.02140451967716217, + "step": 4959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.992, + "grad_norm": 1.4229713678359985, + "kl": 0.32292686589062214, + "learning_rate": 1.692671041121918e-10, + "loss": 0.0129, + "num_tokens": 36129846.0, + "reward": 0.8682861328125, + "reward_std": 0.015984468162059784, + "rewards//mean": 0.8682861328125, + "rewards//std": 0.024895427748560905, + "step": 4960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9922, + "grad_norm": 1.9634400606155396, + "kl": 0.36319613084197044, + "learning_rate": 1.6111130454543597e-10, + "loss": 0.0145, + "num_tokens": 36137150.0, + "reward": 0.87786865234375, + "reward_std": 0.016457267105579376, + "rewards//mean": 0.87786865234375, + "rewards//std": 0.02806708961725235, + "step": 4961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.9924, + "grad_norm": 1.4365711212158203, + "kl": 0.360240213572979, + "learning_rate": 1.531568400233163e-10, + "loss": 0.0186, + "num_tokens": 36144423.0, + "reward": 0.84967041015625, + "reward_std": 0.013666604645550251, + "rewards//mean": 0.84967041015625, + "rewards//std": 0.018492886796593666, + "step": 4962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9926, + "grad_norm": 1.704257845878601, + "kl": 0.37568104825913906, + "learning_rate": 1.4540371374988092e-10, + "loss": 0.015, + "num_tokens": 36151711.0, + "reward": 0.77130126953125, + "reward_std": 0.012296844273805618, + "rewards//mean": 0.77130126953125, + "rewards//std": 0.01789296790957451, + "step": 4963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9928, + "grad_norm": 2.1631500720977783, + "kl": 0.4052448607981205, + "learning_rate": 1.3785192884802065e-10, + "loss": 0.0162, + "num_tokens": 36159111.0, + "reward": 0.831298828125, + "reward_std": 0.01564479060471058, + "rewards//mean": 0.831298828125, + "rewards//std": 0.018465137109160423, + "step": 4964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.993, + "grad_norm": 1.8277498483657837, + "kl": 0.48601012490689754, + "learning_rate": 1.3050148835958009e-10, + "loss": 0.0194, + "num_tokens": 36166375.0, + "reward": 0.85723876953125, + "reward_std": 0.012746473774313927, + "rewards//mean": 0.85723876953125, + "rewards//std": 0.022648442536592484, + "step": 4965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9932, + "grad_norm": 1.5922942161560059, + "kl": 0.36833876743912697, + "learning_rate": 1.2335239524541297e-10, + "loss": 0.0147, + "num_tokens": 36173671.0, + "reward": 0.85272216796875, + "reward_std": 0.016722403466701508, + "rewards//mean": 0.85272216796875, + "rewards//std": 0.023413510993123055, + "step": 4966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9934, + "grad_norm": 1.391300916671753, + "kl": 0.32725678011775017, + "learning_rate": 1.1640465238516028e-10, + "loss": 0.0131, + "num_tokens": 36181055.0, + "reward": 0.8687744140625, + "reward_std": 0.01158067211508751, + "rewards//mean": 0.8687744140625, + "rewards//std": 0.014743204228579998, + "step": 4967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9936, + "grad_norm": 1.4886157512664795, + "kl": 0.32198560796678066, + "learning_rate": 1.0965826257725019e-10, + "loss": 0.0129, + "num_tokens": 36188319.0, + "reward": 0.87744140625, + "reward_std": 0.01630101539194584, + "rewards//mean": 0.87744140625, + "rewards//std": 0.023370232433080673, + "step": 4968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9938, + "grad_norm": 1.3896520137786865, + "kl": 0.2973788809031248, + "learning_rate": 1.0311322853928661e-10, + "loss": 0.0119, + "num_tokens": 36195639.0, + "reward": 0.86834716796875, + "reward_std": 0.01476279553025961, + "rewards//mean": 0.86834716796875, + "rewards//std": 0.02339281141757965, + "step": 4969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.994, + "grad_norm": 1.6063538789749146, + "kl": 0.3164714351296425, + "learning_rate": 9.676955290749412e-11, + "loss": 0.0127, + "num_tokens": 36202943.0, + "reward": 0.78466796875, + "reward_std": 0.015464487485587597, + "rewards//mean": 0.78466796875, + "rewards//std": 0.027148181572556496, + "step": 4970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9942, + "grad_norm": 1.8534350395202637, + "kl": 0.37422879599034786, + "learning_rate": 9.06272382371065e-11, + "loss": 0.015, + "num_tokens": 36210223.0, + "reward": 0.8282470703125, + "reward_std": 0.009423469193279743, + "rewards//mean": 0.8282470703125, + "rewards//std": 0.015184272080659866, + "step": 4971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.9375, + "epoch": 0.9944, + "grad_norm": 1.4383752346038818, + "kl": 0.3199087493121624, + "learning_rate": 8.468628700231129e-11, + "loss": 0.0113, + "num_tokens": 36217483.0, + "reward": 0.76397705078125, + "reward_std": 0.01381351612508297, + "rewards//mean": 0.76397705078125, + "rewards//std": 0.022116858512163162, + "step": 4972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9946, + "grad_norm": 1.4886505603790283, + "kl": 0.3595810681581497, + "learning_rate": 7.89467015961387e-11, + "loss": 0.0144, + "num_tokens": 36224763.0, + "reward": 0.8316650390625, + "reward_std": 0.011827974580228329, + "rewards//mean": 0.8316650390625, + "rewards//std": 0.021403547376394272, + "step": 4973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9948, + "grad_norm": 1.8299192190170288, + "kl": 0.2801384311169386, + "learning_rate": 7.340848433040614e-11, + "loss": 0.0112, + "num_tokens": 36232123.0, + "reward": 0.8743896484375, + "reward_std": 0.011738014407455921, + "rewards//mean": 0.8743896484375, + "rewards//std": 0.015659356489777565, + "step": 4974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.995, + "grad_norm": 1.5976238250732422, + "kl": 0.3487021401524544, + "learning_rate": 6.807163743594025e-11, + "loss": 0.0139, + "num_tokens": 36239411.0, + "reward": 0.8448486328125, + "reward_std": 0.018834587186574936, + "rewards//mean": 0.8448486328125, + "rewards//std": 0.027660517022013664, + "step": 4975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9952, + "grad_norm": 1.7341468334197998, + "kl": 0.33699605613946915, + "learning_rate": 6.293616306246586e-11, + "loss": 0.0135, + "num_tokens": 36246611.0, + "reward": 0.8179931640625, + "reward_std": 0.015005189925432205, + "rewards//mean": 0.8179931640625, + "rewards//std": 0.020760180428624153, + "step": 4976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9954, + "grad_norm": 1.873314619064331, + "kl": 0.3247050903737545, + "learning_rate": 5.800206327855051e-11, + "loss": 0.013, + "num_tokens": 36253891.0, + "reward": 0.819580078125, + "reward_std": 0.018773134797811508, + "rewards//mean": 0.819580078125, + "rewards//std": 0.031802188605070114, + "step": 4977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9956, + "grad_norm": 1.4468228816986084, + "kl": 0.36095694452524185, + "learning_rate": 5.3269340071548927e-11, + "loss": 0.0144, + "num_tokens": 36261259.0, + "reward": 0.8717041015625, + "reward_std": 0.014329320751130581, + "rewards//mean": 0.8717041015625, + "rewards//std": 0.02905900403857231, + "step": 4978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9958, + "grad_norm": 1.5934624671936035, + "kl": 0.36454967968165874, + "learning_rate": 4.873799534788059e-11, + "loss": 0.0146, + "num_tokens": 36268659.0, + "reward": 0.8353271484375, + "reward_std": 0.015010848641395569, + "rewards//mean": 0.8353271484375, + "rewards//std": 0.020812615752220154, + "step": 4979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.996, + "grad_norm": 1.570415735244751, + "kl": 0.3538685105741024, + "learning_rate": 4.440803093280765e-11, + "loss": 0.0142, + "num_tokens": 36275891.0, + "reward": 0.8541259765625, + "reward_std": 0.016606919467449188, + "rewards//mean": 0.8541259765625, + "rewards//std": 0.028622308745980263, + "step": 4980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.890625, + "epoch": 0.9962, + "grad_norm": 1.6205183267593384, + "kl": 0.37368748523294926, + "learning_rate": 4.0279448570323946e-11, + "loss": 0.0148, + "num_tokens": 36283140.0, + "reward": 0.87347412109375, + "reward_std": 0.01761632040143013, + "rewards//mean": 0.87347412109375, + "rewards//std": 0.020334633067250252, + "step": 4981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9964, + "grad_norm": 1.4659837484359741, + "kl": 0.3459746651351452, + "learning_rate": 3.6352249923543576e-11, + "loss": 0.0138, + "num_tokens": 36290372.0, + "reward": 0.79052734375, + "reward_std": 0.0162075012922287, + "rewards//mean": 0.79052734375, + "rewards//std": 0.026333006098866463, + "step": 4982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9966, + "grad_norm": 1.486013650894165, + "kl": 0.3460343834012747, + "learning_rate": 3.262643657425679e-11, + "loss": 0.0138, + "num_tokens": 36297652.0, + "reward": 0.8182373046875, + "reward_std": 0.012839418835937977, + "rewards//mean": 0.8182373046875, + "rewards//std": 0.017579417675733566, + "step": 4983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9968, + "grad_norm": 1.9241646528244019, + "kl": 0.3338615894317627, + "learning_rate": 2.9102010023263067e-11, + "loss": 0.0134, + "num_tokens": 36304892.0, + "reward": 0.86114501953125, + "reward_std": 0.019137132912874222, + "rewards//mean": 0.86114501953125, + "rewards//std": 0.023897293955087662, + "step": 4984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.997, + "grad_norm": 1.6967860460281372, + "kl": 0.32243289425969124, + "learning_rate": 2.57789716902046e-11, + "loss": 0.0129, + "num_tokens": 36312236.0, + "reward": 0.8477783203125, + "reward_std": 0.017030581831932068, + "rewards//mean": 0.8477783203125, + "rewards//std": 0.018597062677145004, + "step": 4985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.828125, + "epoch": 0.9972, + "grad_norm": 1.7944608926773071, + "kl": 0.3653903305530548, + "learning_rate": 2.2657322913566258e-11, + "loss": 0.0049, + "num_tokens": 36319521.0, + "reward": 0.8653564453125, + "reward_std": 0.015485413372516632, + "rewards//mean": 0.8653564453125, + "rewards//std": 0.028056098148226738, + "step": 4986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9974, + "grad_norm": 1.7547532320022583, + "kl": 0.34843731485307217, + "learning_rate": 1.973706495078664e-11, + "loss": 0.0139, + "num_tokens": 36326881.0, + "reward": 0.84625244140625, + "reward_std": 0.015173893421888351, + "rewards//mean": 0.84625244140625, + "rewards//std": 0.02570812590420246, + "step": 4987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.9976, + "grad_norm": 1.7118479013442993, + "kl": 0.33948297053575516, + "learning_rate": 1.7018198978091537e-11, + "loss": 0.0123, + "num_tokens": 36334075.0, + "reward": 0.82061767578125, + "reward_std": 0.01578173041343689, + "rewards//mean": 0.82061767578125, + "rewards//std": 0.02263105846941471, + "step": 4988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.859375, + "epoch": 0.9978, + "grad_norm": 1.6948556900024414, + "kl": 0.4326485004276037, + "learning_rate": 1.4500726090715953e-11, + "loss": 0.014, + "num_tokens": 36341314.0, + "reward": 0.84954833984375, + "reward_std": 0.02043428272008896, + "rewards//mean": 0.84954833984375, + "rewards//std": 0.026584668084979057, + "step": 4989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.998, + "grad_norm": 1.593395709991455, + "kl": 0.3503384720534086, + "learning_rate": 1.2184647302626582e-11, + "loss": 0.0155, + "num_tokens": 36348668.0, + "reward": 0.82452392578125, + "reward_std": 0.015002339147031307, + "rewards//mean": 0.82452392578125, + "rewards//std": 0.028084881603717804, + "step": 4990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9982, + "grad_norm": 1.654030203819275, + "kl": 0.34983208030462265, + "learning_rate": 1.0069963546743831e-11, + "loss": 0.014, + "num_tokens": 36355932.0, + "reward": 0.8538818359375, + "reward_std": 0.01564771868288517, + "rewards//mean": 0.8538818359375, + "rewards//std": 0.021973086521029472, + "step": 4991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "epoch": 0.9984, + "grad_norm": 1.4698388576507568, + "kl": 0.3522527404129505, + "learning_rate": 8.156675674941826e-12, + "loss": 0.0141, + "num_tokens": 36363348.0, + "reward": 0.85467529296875, + "reward_std": 0.015195854939520359, + "rewards//mean": 0.85467529296875, + "rewards//std": 0.022044189274311066, + "step": 4992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.609375, + "epoch": 0.9986, + "grad_norm": 1.4374734163284302, + "kl": 0.3715679906308651, + "learning_rate": 6.444784457770858e-12, + "loss": -0.0128, + "num_tokens": 36370523.0, + "reward": 0.81134033203125, + "reward_std": 0.013619422912597656, + "rewards//mean": 0.81134033203125, + "rewards//std": 0.023413510993123055, + "step": 4993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9988, + "grad_norm": 1.4618210792541504, + "kl": 0.3092677406966686, + "learning_rate": 4.9342905849014686e-12, + "loss": 0.0124, + "num_tokens": 36377979.0, + "reward": 0.79180908203125, + "reward_std": 0.010933930054306984, + "rewards//mean": 0.79180908203125, + "rewards//std": 0.01825147122144699, + "step": 4994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.999, + "grad_norm": 1.41682767868042, + "kl": 0.3410138636827469, + "learning_rate": 3.6251946647358753e-12, + "loss": 0.0136, + "num_tokens": 36385267.0, + "reward": 0.85723876953125, + "reward_std": 0.0174893606454134, + "rewards//mean": 0.85723876953125, + "rewards//std": 0.02044525183737278, + "step": 4995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9992, + "grad_norm": 1.4384037256240845, + "kl": 0.34193285927176476, + "learning_rate": 2.517497224463483e-12, + "loss": 0.0137, + "num_tokens": 36392603.0, + "reward": 0.84271240234375, + "reward_std": 0.0171145461499691, + "rewards//mean": 0.84271240234375, + "rewards//std": 0.025221863761544228, + "step": 4996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9994, + "grad_norm": 1.987500786781311, + "kl": 0.5015513692051172, + "learning_rate": 1.6111987103939462e-12, + "loss": 0.0201, + "num_tokens": 36399947.0, + "reward": 0.877197265625, + "reward_std": 0.016206169500947, + "rewards//mean": 0.877197265625, + "rewards//std": 0.02867857925593853, + "step": 4997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.90625, + "epoch": 0.9996, + "grad_norm": 1.6876543760299683, + "kl": 0.33894727006554604, + "learning_rate": 9.062994875685959e-13, + "loss": 0.0164, + "num_tokens": 36407341.0, + "reward": 0.83087158203125, + "reward_std": 0.013601981103420258, + "rewards//mean": 0.83087158203125, + "rewards//std": 0.023772813379764557, + "step": 4998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 0.9998, + "grad_norm": 1.6161924600601196, + "kl": 0.3700113818049431, + "learning_rate": 4.027998398714594e-13, + "loss": 0.0148, + "num_tokens": 36414581.0, + "reward": 0.847412109375, + "reward_std": 0.018122514709830284, + "rewards//mean": 0.847412109375, + "rewards//std": 0.02826172299683094, + "step": 4999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0, + "epoch": 1.0, + "grad_norm": 1.6101901531219482, + "kl": 0.41173384711146355, + "learning_rate": 1.0069997008477216e-13, + "loss": 0.0165, + "num_tokens": 36421749.0, + "reward": 0.82647705078125, + "reward_std": 0.011516369879245758, + "rewards//mean": 0.82647705078125, + "rewards//std": 0.018956124782562256, + "step": 5000 + } + ], + "logging_steps": 1, + "max_steps": 5000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}