{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998343548119927, "eval_steps": 300, "global_step": 3018, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 896.1125213623047, "epoch": 0.001656451880072884, "grad_norm": 0.19080834090709686, "kl": 4.379749298095703e-05, "learning_rate": 3.311258278145696e-07, "loss": 0.0, "reward": 0.22500000521540642, "reward_std": 0.160313368588686, "rewards/accuracy_reward": 0.22500000521540642, "rewards/format_reward": 0.0, "step": 5 }, { "completion_length": 876.5979309082031, "epoch": 0.003312903760145768, "grad_norm": 0.11577171832323074, "kl": 8.071660995483399e-05, "learning_rate": 6.622516556291392e-07, "loss": 0.0, "reward": 0.21250000782310963, "reward_std": 0.15867876783013343, "rewards/accuracy_reward": 0.21250000782310963, "rewards/format_reward": 0.0, "step": 10 }, { "completion_length": 923.4271087646484, "epoch": 0.0049693556402186515, "grad_norm": 0.09625789523124695, "kl": 8.138418197631836e-05, "learning_rate": 9.933774834437087e-07, "loss": 0.0, "reward": 0.16458333786576987, "reward_std": 0.11284543424844742, "rewards/accuracy_reward": 0.16458333786576987, "rewards/format_reward": 0.0, "step": 15 }, { "completion_length": 886.6583526611328, "epoch": 0.006625807520291536, "grad_norm": 0.1537444144487381, "kl": 8.507966995239258e-05, "learning_rate": 1.3245033112582784e-06, "loss": 0.0, "reward": 0.21875000447034837, "reward_std": 0.17023502737283708, "rewards/accuracy_reward": 0.21875000447034837, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 887.618765258789, "epoch": 0.008282259400364419, "grad_norm": 0.1724800020456314, "kl": 0.00013091564178466797, "learning_rate": 1.655629139072848e-06, "loss": 0.0, "reward": 0.19375000428408384, "reward_std": 0.155801273137331, "rewards/accuracy_reward": 0.19375000428408384, "rewards/format_reward": 0.0, "step": 25 }, { "completion_length": 851.8562683105469, "epoch": 0.009938711280437303, "grad_norm": 0.12672410905361176, "kl": 0.0004146099090576172, "learning_rate": 1.9867549668874175e-06, "loss": 0.0, "reward": 0.26458334233611824, "reward_std": 0.1894800379872322, "rewards/accuracy_reward": 0.26458334233611824, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 863.7062683105469, "epoch": 0.011595163160510187, "grad_norm": 0.09456944465637207, "kl": 0.0009840011596679687, "learning_rate": 2.317880794701987e-06, "loss": 0.0, "reward": 0.29583334401249883, "reward_std": 0.168301273137331, "rewards/accuracy_reward": 0.29583334401249883, "rewards/format_reward": 0.0, "step": 35 }, { "completion_length": 820.3333557128906, "epoch": 0.013251615040583071, "grad_norm": 0.12397999316453934, "kl": 0.0029320716857910156, "learning_rate": 2.6490066225165567e-06, "loss": 0.0001, "reward": 0.3625000076368451, "reward_std": 0.15163460485637187, "rewards/accuracy_reward": 0.3625000076368451, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 791.4562683105469, "epoch": 0.014908066920655955, "grad_norm": 0.17849470674991608, "kl": 0.0056793212890625, "learning_rate": 2.980132450331126e-06, "loss": 0.0002, "reward": 0.41666668131947515, "reward_std": 0.17663460336625575, "rewards/accuracy_reward": 0.41666668131947515, "rewards/format_reward": 0.0, "step": 45 }, { "completion_length": 735.3562744140625, "epoch": 0.016564518800728838, "grad_norm": 0.16775797307491302, "kl": 0.005182647705078125, "learning_rate": 3.311258278145696e-06, "loss": 0.0002, "reward": 0.48333334624767305, "reward_std": 0.16477919071912767, "rewards/accuracy_reward": 0.48333334624767305, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 794.0062683105468, "epoch": 0.018220970680801724, "grad_norm": 0.1746249496936798, "kl": 0.004477310180664063, "learning_rate": 3.642384105960265e-06, "loss": 0.0002, "reward": 0.4125000102445483, "reward_std": 0.18367876596748828, "rewards/accuracy_reward": 0.4125000102445483, "rewards/format_reward": 0.0, "step": 55 }, { "completion_length": 774.7583465576172, "epoch": 0.019877422560874606, "grad_norm": 0.1647450178861618, "kl": 0.004723358154296875, "learning_rate": 3.973509933774835e-06, "loss": 0.0002, "reward": 0.4291666761040688, "reward_std": 0.17792377509176732, "rewards/accuracy_reward": 0.4291666761040688, "rewards/format_reward": 0.0, "step": 60 }, { "completion_length": 765.7687713623047, "epoch": 0.021533874440947492, "grad_norm": 0.13205137848854065, "kl": 0.005755615234375, "learning_rate": 4.304635761589404e-06, "loss": 0.0002, "reward": 0.47500001043081286, "reward_std": 0.23179128989577294, "rewards/accuracy_reward": 0.47500001043081286, "rewards/format_reward": 0.0, "step": 65 }, { "completion_length": 814.6562713623047, "epoch": 0.023190326321020374, "grad_norm": 0.1245804950594902, "kl": 0.006722259521484375, "learning_rate": 4.635761589403974e-06, "loss": 0.0003, "reward": 0.4208333469927311, "reward_std": 0.21770296134054662, "rewards/accuracy_reward": 0.4208333469927311, "rewards/format_reward": 0.0, "step": 70 }, { "completion_length": 788.5708557128906, "epoch": 0.024846778201093257, "grad_norm": 0.14767543971538544, "kl": 0.0077606201171875, "learning_rate": 4.966887417218543e-06, "loss": 0.0003, "reward": 0.3979166761040688, "reward_std": 0.2048575308173895, "rewards/accuracy_reward": 0.3979166761040688, "rewards/format_reward": 0.0, "step": 75 }, { "completion_length": 764.1229400634766, "epoch": 0.026503230081166142, "grad_norm": 0.10707354545593262, "kl": 0.01027374267578125, "learning_rate": 5.2980132450331135e-06, "loss": 0.0004, "reward": 0.41041667833924295, "reward_std": 0.1785683583468199, "rewards/accuracy_reward": 0.41041667833924295, "rewards/format_reward": 0.0, "step": 80 }, { "completion_length": 778.3375213623046, "epoch": 0.028159681961239025, "grad_norm": 0.11660794168710709, "kl": 0.01074371337890625, "learning_rate": 5.629139072847682e-06, "loss": 0.0004, "reward": 0.4437500134110451, "reward_std": 0.17504628300666808, "rewards/accuracy_reward": 0.4437500134110451, "rewards/format_reward": 0.0, "step": 85 }, { "completion_length": 736.3229385375977, "epoch": 0.02981613384131191, "grad_norm": 0.20283590257167816, "kl": 0.012200927734375, "learning_rate": 5.960264900662252e-06, "loss": 0.0005, "reward": 0.48541667982935904, "reward_std": 0.21448003873229027, "rewards/accuracy_reward": 0.48541667982935904, "rewards/format_reward": 0.0, "step": 90 }, { "completion_length": 754.0000183105469, "epoch": 0.031472585721384796, "grad_norm": 0.11152322590351105, "kl": 0.01071624755859375, "learning_rate": 6.291390728476822e-06, "loss": 0.0004, "reward": 0.4541666811332107, "reward_std": 0.12311252430081368, "rewards/accuracy_reward": 0.4541666811332107, "rewards/format_reward": 0.0, "step": 95 }, { "completion_length": 803.4104370117187, "epoch": 0.033129037601457675, "grad_norm": 0.14146840572357178, "kl": 0.014569091796875, "learning_rate": 6.622516556291392e-06, "loss": 0.0006, "reward": 0.4458333425223827, "reward_std": 0.19459044486284255, "rewards/accuracy_reward": 0.4458333425223827, "rewards/format_reward": 0.0, "step": 100 }, { "completion_length": 754.060433959961, "epoch": 0.03478548948153056, "grad_norm": 0.13774582743644714, "kl": 0.0156219482421875, "learning_rate": 6.953642384105961e-06, "loss": 0.0006, "reward": 0.466666679084301, "reward_std": 0.1647791914641857, "rewards/accuracy_reward": 0.466666679084301, "rewards/format_reward": 0.0, "step": 105 }, { "completion_length": 817.8791870117187, "epoch": 0.03644194136160345, "grad_norm": 0.14425694942474365, "kl": 0.0191192626953125, "learning_rate": 7.28476821192053e-06, "loss": 0.0008, "reward": 0.3666666800156236, "reward_std": 0.20069086626172067, "rewards/accuracy_reward": 0.3666666800156236, "rewards/format_reward": 0.0, "step": 110 }, { "completion_length": 778.3312744140625, "epoch": 0.038098393241676326, "grad_norm": 0.18174272775650024, "kl": 0.03355712890625, "learning_rate": 7.6158940397351e-06, "loss": 0.0013, "reward": 0.48125001043081284, "reward_std": 0.19910254292190074, "rewards/accuracy_reward": 0.48125001043081284, "rewards/format_reward": 0.0, "step": 115 }, { "completion_length": 715.7771026611329, "epoch": 0.03975484512174921, "grad_norm": 0.24900782108306885, "kl": 0.026123046875, "learning_rate": 7.94701986754967e-06, "loss": 0.001, "reward": 0.49583334252238276, "reward_std": 0.22440169639885427, "rewards/accuracy_reward": 0.49583334252238276, "rewards/format_reward": 0.0, "step": 120 }, { "completion_length": 627.0104324340821, "epoch": 0.0414112970018221, "grad_norm": 0.21013814210891724, "kl": 0.032183837890625, "learning_rate": 8.278145695364238e-06, "loss": 0.0013, "reward": 0.5375000104308129, "reward_std": 0.18883545249700545, "rewards/accuracy_reward": 0.5375000104308129, "rewards/format_reward": 0.0, "step": 125 }, { "completion_length": 676.9125244140625, "epoch": 0.043067748881894984, "grad_norm": 0.1772216260433197, "kl": 0.030029296875, "learning_rate": 8.609271523178809e-06, "loss": 0.0012, "reward": 0.49166668206453323, "reward_std": 0.2577350240200758, "rewards/accuracy_reward": 0.49166668206453323, "rewards/format_reward": 0.0, "step": 130 }, { "completion_length": 727.4396072387696, "epoch": 0.04472420076196786, "grad_norm": 0.1660698652267456, "kl": 0.03292236328125, "learning_rate": 8.940397350993379e-06, "loss": 0.0013, "reward": 0.49166667759418486, "reward_std": 0.2151246231049299, "rewards/accuracy_reward": 0.49166667759418486, "rewards/format_reward": 0.0, "step": 135 }, { "completion_length": 761.6021057128906, "epoch": 0.04638065264204075, "grad_norm": 0.13266272842884064, "kl": 0.040228271484375, "learning_rate": 9.271523178807948e-06, "loss": 0.0016, "reward": 0.4645833460614085, "reward_std": 0.186901693046093, "rewards/accuracy_reward": 0.4645833460614085, "rewards/format_reward": 0.0, "step": 140 }, { "completion_length": 765.5750274658203, "epoch": 0.048037104522113634, "grad_norm": 0.18851597607135773, "kl": 0.059521484375, "learning_rate": 9.602649006622518e-06, "loss": 0.0024, "reward": 0.4187500115483999, "reward_std": 0.1820904441177845, "rewards/accuracy_reward": 0.4187500115483999, "rewards/format_reward": 0.0, "step": 145 }, { "completion_length": 768.2916870117188, "epoch": 0.04969355640218651, "grad_norm": 0.1305186152458191, "kl": 0.0878662109375, "learning_rate": 9.933774834437086e-06, "loss": 0.0035, "reward": 0.49583334773778914, "reward_std": 0.16864670366048812, "rewards/accuracy_reward": 0.49583334773778914, "rewards/format_reward": 0.0, "step": 150 }, { "completion_length": 743.2250152587891, "epoch": 0.0513500082822594, "grad_norm": 0.19374318420886993, "kl": 0.092041015625, "learning_rate": 1.0264900662251655e-05, "loss": 0.0037, "reward": 0.5270833477377892, "reward_std": 0.19746793769299983, "rewards/accuracy_reward": 0.5270833477377892, "rewards/format_reward": 0.0, "step": 155 }, { "completion_length": 739.9854400634765, "epoch": 0.053006460162332285, "grad_norm": 0.23001034557819366, "kl": 0.09957275390625, "learning_rate": 1.0596026490066227e-05, "loss": 0.004, "reward": 0.479166679084301, "reward_std": 0.21864670403301717, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "step": 160 }, { "completion_length": 769.5062683105468, "epoch": 0.05466291204240517, "grad_norm": 0.280485600233078, "kl": 0.08792724609375, "learning_rate": 1.0927152317880796e-05, "loss": 0.0035, "reward": 0.48750001192092896, "reward_std": 0.2125462803989649, "rewards/accuracy_reward": 0.48750001192092896, "rewards/format_reward": 0.0, "step": 165 }, { "completion_length": 800.6750213623047, "epoch": 0.05631936392247805, "grad_norm": 0.17463065683841705, "kl": 0.06080322265625, "learning_rate": 1.1258278145695364e-05, "loss": 0.0024, "reward": 0.39583334475755694, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.39583334475755694, "rewards/format_reward": 0.0, "step": 170 }, { "completion_length": 850.7000091552734, "epoch": 0.057975815802550935, "grad_norm": 0.16773326694965363, "kl": 0.081640625, "learning_rate": 1.1589403973509934e-05, "loss": 0.0033, "reward": 0.3354166751727462, "reward_std": 0.2346687860786915, "rewards/accuracy_reward": 0.3354166751727462, "rewards/format_reward": 0.0, "step": 175 }, { "completion_length": 745.3333511352539, "epoch": 0.05963226768262382, "grad_norm": 0.21802081167697906, "kl": 0.08717041015625, "learning_rate": 1.1920529801324505e-05, "loss": 0.0035, "reward": 0.4479166783392429, "reward_std": 0.22985753417015076, "rewards/accuracy_reward": 0.4479166783392429, "rewards/format_reward": 0.0, "step": 180 }, { "completion_length": 570.7750198364258, "epoch": 0.06128871956269671, "grad_norm": 0.19431446492671967, "kl": 0.05576171875, "learning_rate": 1.2251655629139075e-05, "loss": 0.0022, "reward": 0.5312500119209289, "reward_std": 0.22152419947087765, "rewards/accuracy_reward": 0.5312500119209289, "rewards/format_reward": 0.0, "step": 185 }, { "completion_length": 610.850016784668, "epoch": 0.06294517144276959, "grad_norm": 0.12707968056201935, "kl": 0.09378662109375, "learning_rate": 1.2582781456953644e-05, "loss": 0.0038, "reward": 0.4541666775941849, "reward_std": 0.20326920710504054, "rewards/accuracy_reward": 0.4541666775941849, "rewards/format_reward": 0.0, "step": 190 }, { "completion_length": 710.0750183105469, "epoch": 0.06460162332284247, "grad_norm": 0.25488951802253723, "kl": 0.1151611328125, "learning_rate": 1.2913907284768212e-05, "loss": 0.0046, "reward": 0.4770833466202021, "reward_std": 0.2083796139806509, "rewards/accuracy_reward": 0.4770833466202021, "rewards/format_reward": 0.0, "step": 195 }, { "completion_length": 742.4000274658204, "epoch": 0.06625807520291535, "grad_norm": 0.9343157410621643, "kl": 0.2155517578125, "learning_rate": 1.3245033112582784e-05, "loss": 0.0086, "reward": 0.48125001043081284, "reward_std": 0.21671294569969177, "rewards/accuracy_reward": 0.48125001043081284, "rewards/format_reward": 0.0, "step": 200 }, { "completion_length": 709.556265258789, "epoch": 0.06791452708298824, "grad_norm": 0.12223739922046661, "kl": 0.17430419921875, "learning_rate": 1.3576158940397353e-05, "loss": 0.007, "reward": 0.5541666787117719, "reward_std": 0.21065880581736565, "rewards/accuracy_reward": 0.5541666787117719, "rewards/format_reward": 0.0, "step": 205 }, { "completion_length": 698.1041870117188, "epoch": 0.06957097896306112, "grad_norm": 0.2804203927516937, "kl": 0.11883544921875, "learning_rate": 1.3907284768211921e-05, "loss": 0.0048, "reward": 0.6041666805744171, "reward_std": 0.2256908670067787, "rewards/accuracy_reward": 0.6041666805744171, "rewards/format_reward": 0.0, "step": 210 }, { "completion_length": 672.1666900634766, "epoch": 0.071227430843134, "grad_norm": 0.1824299842119217, "kl": 0.12828369140625, "learning_rate": 1.423841059602649e-05, "loss": 0.0051, "reward": 0.5604166816920042, "reward_std": 0.20099001750349998, "rewards/accuracy_reward": 0.5604166816920042, "rewards/format_reward": 0.0, "step": 215 }, { "completion_length": 744.2125183105469, "epoch": 0.0728838827232069, "grad_norm": 0.22101767361164093, "kl": 0.1577392578125, "learning_rate": 1.456953642384106e-05, "loss": 0.0063, "reward": 0.500000013411045, "reward_std": 0.22216878421604633, "rewards/accuracy_reward": 0.500000013411045, "rewards/format_reward": 0.0, "step": 220 }, { "completion_length": 769.333349609375, "epoch": 0.07454033460327977, "grad_norm": 0.2826877236366272, "kl": 0.2108642578125, "learning_rate": 1.490066225165563e-05, "loss": 0.0084, "reward": 0.4562500104308128, "reward_std": 0.20614670179784297, "rewards/accuracy_reward": 0.4562500104308128, "rewards/format_reward": 0.0, "step": 225 }, { "completion_length": 705.0396087646484, "epoch": 0.07619678648335265, "grad_norm": 0.23939134180545807, "kl": 0.29638671875, "learning_rate": 1.52317880794702e-05, "loss": 0.0119, "reward": 0.4145833499729633, "reward_std": 0.2587250404059887, "rewards/accuracy_reward": 0.4145833499729633, "rewards/format_reward": 0.0, "step": 230 }, { "completion_length": 647.1750122070313, "epoch": 0.07785323836342554, "grad_norm": 0.16035372018814087, "kl": 0.1832763671875, "learning_rate": 1.5562913907284768e-05, "loss": 0.0073, "reward": 0.3958333432674408, "reward_std": 0.2212250478565693, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "step": 235 }, { "completion_length": 671.2896041870117, "epoch": 0.07950969024349842, "grad_norm": 0.1413412094116211, "kl": 0.1302734375, "learning_rate": 1.589403973509934e-05, "loss": 0.0052, "reward": 0.5125000163912773, "reward_std": 0.2151246216148138, "rewards/accuracy_reward": 0.5125000163912773, "rewards/format_reward": 0.0, "step": 240 }, { "completion_length": 737.310433959961, "epoch": 0.08116614212357132, "grad_norm": 0.15592940151691437, "kl": 0.151263427734375, "learning_rate": 1.6225165562913908e-05, "loss": 0.006, "reward": 0.4750000134110451, "reward_std": 0.22569086514413356, "rewards/accuracy_reward": 0.4750000134110451, "rewards/format_reward": 0.0, "step": 245 }, { "completion_length": 756.185433959961, "epoch": 0.0828225940036442, "grad_norm": 0.1589566171169281, "kl": 0.16707763671875, "learning_rate": 1.6556291390728477e-05, "loss": 0.0067, "reward": 0.46458334773778914, "reward_std": 0.19781337045133113, "rewards/accuracy_reward": 0.46458334773778914, "rewards/format_reward": 0.0, "step": 250 }, { "completion_length": 682.9062652587891, "epoch": 0.08447904588371707, "grad_norm": 0.4016859233379364, "kl": 0.19429931640625, "learning_rate": 1.688741721854305e-05, "loss": 0.0078, "reward": 0.46250001564621923, "reward_std": 0.19330127201974392, "rewards/accuracy_reward": 0.46250001564621923, "rewards/format_reward": 0.0, "step": 255 }, { "completion_length": 614.3875183105469, "epoch": 0.08613549776378997, "grad_norm": 0.23245790600776672, "kl": 0.278564453125, "learning_rate": 1.7218543046357617e-05, "loss": 0.0111, "reward": 0.535416679084301, "reward_std": 0.2442912884056568, "rewards/accuracy_reward": 0.535416679084301, "rewards/format_reward": 0.0, "step": 260 }, { "completion_length": 562.735432434082, "epoch": 0.08779194964386285, "grad_norm": 0.8683246970176697, "kl": 0.541845703125, "learning_rate": 1.754966887417219e-05, "loss": 0.0217, "reward": 0.5312500193715095, "reward_std": 0.2263354528695345, "rewards/accuracy_reward": 0.5312500193715095, "rewards/format_reward": 0.0, "step": 265 }, { "completion_length": 647.112516784668, "epoch": 0.08944840152393572, "grad_norm": 0.7691146731376648, "kl": 0.94287109375, "learning_rate": 1.7880794701986758e-05, "loss": 0.0377, "reward": 0.3375000096857548, "reward_std": 0.19459044188261032, "rewards/accuracy_reward": 0.3375000096857548, "rewards/format_reward": 0.0, "step": 270 }, { "completion_length": 664.1458587646484, "epoch": 0.09110485340400862, "grad_norm": 0.37577712535858154, "kl": 0.41962890625, "learning_rate": 1.8211920529801327e-05, "loss": 0.0168, "reward": 0.445833346247673, "reward_std": 0.20902419947087764, "rewards/accuracy_reward": 0.445833346247673, "rewards/format_reward": 0.0, "step": 275 }, { "completion_length": 671.5562683105469, "epoch": 0.0927613052840815, "grad_norm": 0.1227492094039917, "kl": 0.186767578125, "learning_rate": 1.8543046357615895e-05, "loss": 0.0075, "reward": 0.5729166835546493, "reward_std": 0.18466878533363343, "rewards/accuracy_reward": 0.5729166835546493, "rewards/format_reward": 0.0, "step": 280 }, { "completion_length": 743.0979309082031, "epoch": 0.09441775716415438, "grad_norm": 0.22498369216918945, "kl": 0.14178466796875, "learning_rate": 1.8874172185430467e-05, "loss": 0.0057, "reward": 0.49166668504476546, "reward_std": 0.20198003426194192, "rewards/accuracy_reward": 0.49166668504476546, "rewards/format_reward": 0.0, "step": 285 }, { "completion_length": 715.5729400634766, "epoch": 0.09607420904422727, "grad_norm": 0.17437005043029785, "kl": 0.16676025390625, "learning_rate": 1.9205298013245036e-05, "loss": 0.0067, "reward": 0.550000011920929, "reward_std": 0.21512462347745895, "rewards/accuracy_reward": 0.550000011920929, "rewards/format_reward": 0.0, "step": 290 }, { "completion_length": 651.0041809082031, "epoch": 0.09773066092430015, "grad_norm": 0.15947847068309784, "kl": 0.20322265625, "learning_rate": 1.9536423841059604e-05, "loss": 0.0081, "reward": 0.5145833484828473, "reward_std": 0.1881908643990755, "rewards/accuracy_reward": 0.5145833484828473, "rewards/format_reward": 0.0, "step": 295 }, { "completion_length": 662.0083587646484, "epoch": 0.09938711280437303, "grad_norm": 0.22194314002990723, "kl": 0.3526123046875, "learning_rate": 1.9867549668874173e-05, "loss": 0.0141, "reward": 0.5270833462476731, "reward_std": 0.23913460560142993, "rewards/accuracy_reward": 0.5270833462476731, "rewards/format_reward": 0.0, "step": 300 }, { "epoch": 0.09938711280437303, "eval_completion_length": 747.364599609375, "eval_kl": 0.401953125, "eval_loss": 0.01589573174715042, "eval_reward": 0.4666666746139526, "eval_reward_std": 0.20902419984340667, "eval_rewards/accuracy_reward": 0.4666666746139526, "eval_rewards/format_reward": 0.0, "eval_runtime": 67.0908, "eval_samples_per_second": 1.476, "eval_steps_per_second": 0.03, "step": 300 }, { "completion_length": 758.5333526611328, "epoch": 0.10104356468444592, "grad_norm": 0.1960778534412384, "kl": 0.35498046875, "learning_rate": 1.9999939792264632e-05, "loss": 0.0142, "reward": 0.43125001415610315, "reward_std": 0.2154237784445286, "rewards/accuracy_reward": 0.43125001415610315, "rewards/format_reward": 0.0, "step": 305 }, { "completion_length": 796.8062744140625, "epoch": 0.1027000165645188, "grad_norm": 0.22320789098739624, "kl": 0.40380859375, "learning_rate": 1.999957185872951e-05, "loss": 0.0162, "reward": 0.4312500134110451, "reward_std": 0.19394585601985453, "rewards/accuracy_reward": 0.4312500134110451, "rewards/format_reward": 0.0, "step": 310 }, { "completion_length": 758.7479309082031, "epoch": 0.10435646844459169, "grad_norm": 0.18412695825099945, "kl": 0.148583984375, "learning_rate": 1.9998869452693128e-05, "loss": 0.0059, "reward": 0.45833334475755694, "reward_std": 0.19106836430728436, "rewards/accuracy_reward": 0.45833334475755694, "rewards/format_reward": 0.0, "step": 315 }, { "completion_length": 700.8521057128906, "epoch": 0.10601292032466457, "grad_norm": 0.11393094062805176, "kl": 0.0789306640625, "learning_rate": 1.999783259765003e-05, "loss": 0.0032, "reward": 0.5312500163912773, "reward_std": 0.2311467032879591, "rewards/accuracy_reward": 0.5312500163912773, "rewards/format_reward": 0.0, "step": 320 }, { "completion_length": 767.4062591552735, "epoch": 0.10766937220473745, "grad_norm": 0.2904423773288727, "kl": 0.12275390625, "learning_rate": 1.9996461328281653e-05, "loss": 0.0049, "reward": 0.427083346247673, "reward_std": 0.16542377471923828, "rewards/accuracy_reward": 0.427083346247673, "rewards/format_reward": 0.0, "step": 325 }, { "completion_length": 701.9541900634765, "epoch": 0.10932582408481034, "grad_norm": 0.4198339581489563, "kl": 0.1619873046875, "learning_rate": 1.9994755690455154e-05, "loss": 0.0065, "reward": 0.5916666865348816, "reward_std": 0.18144585974514485, "rewards/accuracy_reward": 0.5916666865348816, "rewards/format_reward": 0.0, "step": 330 }, { "completion_length": 643.3083526611329, "epoch": 0.11098227596488322, "grad_norm": 0.8133944869041443, "kl": 0.303955078125, "learning_rate": 1.9992715741221863e-05, "loss": 0.0122, "reward": 0.5125000178813934, "reward_std": 0.22569086365401744, "rewards/accuracy_reward": 0.5125000178813934, "rewards/format_reward": 0.0, "step": 335 }, { "completion_length": 684.7771057128906, "epoch": 0.1126387278449561, "grad_norm": 0.4868590831756592, "kl": 0.4471435546875, "learning_rate": 1.99903415488154e-05, "loss": 0.0179, "reward": 0.42500001192092896, "reward_std": 0.2458796128630638, "rewards/accuracy_reward": 0.42500001192092896, "rewards/format_reward": 0.0, "step": 340 }, { "completion_length": 657.3896041870117, "epoch": 0.11429517972502899, "grad_norm": 0.387178510427475, "kl": 0.5364013671875, "learning_rate": 1.9987633192649372e-05, "loss": 0.0215, "reward": 0.4937500134110451, "reward_std": 0.24171294569969176, "rewards/accuracy_reward": 0.4937500134110451, "rewards/format_reward": 0.0, "step": 345 }, { "completion_length": 631.7312713623047, "epoch": 0.11595163160510187, "grad_norm": 0.4646587669849396, "kl": 0.42890625, "learning_rate": 1.9984590763314722e-05, "loss": 0.0172, "reward": 0.4833333522081375, "reward_std": 0.2116025399416685, "rewards/accuracy_reward": 0.4833333522081375, "rewards/format_reward": 0.0, "step": 350 }, { "completion_length": 644.5896087646485, "epoch": 0.11760808348517475, "grad_norm": 0.2956700325012207, "kl": 0.39091796875, "learning_rate": 1.9981214362576705e-05, "loss": 0.0156, "reward": 0.4395833447575569, "reward_std": 0.2596687819808722, "rewards/accuracy_reward": 0.4395833447575569, "rewards/format_reward": 0.0, "step": 355 }, { "completion_length": 662.9791839599609, "epoch": 0.11926453536524764, "grad_norm": 0.20021367073059082, "kl": 0.2126953125, "learning_rate": 1.997750410337147e-05, "loss": 0.0085, "reward": 0.4250000089406967, "reward_std": 0.22955838069319726, "rewards/accuracy_reward": 0.4250000089406967, "rewards/format_reward": 0.0, "step": 360 }, { "completion_length": 692.1583557128906, "epoch": 0.12092098724532052, "grad_norm": 0.2789163589477539, "kl": 0.35279541015625, "learning_rate": 1.9973460109802306e-05, "loss": 0.0141, "reward": 0.4416666775941849, "reward_std": 0.2769800368696451, "rewards/accuracy_reward": 0.4416666775941849, "rewards/format_reward": 0.0, "step": 365 }, { "completion_length": 697.0437698364258, "epoch": 0.12257743912539341, "grad_norm": 0.24779824912548065, "kl": 0.47197265625, "learning_rate": 1.9969082517135463e-05, "loss": 0.0189, "reward": 0.4666666813194752, "reward_std": 0.20808046273887157, "rewards/accuracy_reward": 0.4666666813194752, "rewards/format_reward": 0.0, "step": 370 }, { "completion_length": 697.4104370117187, "epoch": 0.1242338910054663, "grad_norm": 0.43209102749824524, "kl": 0.33740234375, "learning_rate": 1.996437147179565e-05, "loss": 0.0135, "reward": 0.5250000149011612, "reward_std": 0.20069086737930775, "rewards/accuracy_reward": 0.5250000149011612, "rewards/format_reward": 0.0, "step": 375 }, { "completion_length": 708.7521026611328, "epoch": 0.12589034288553919, "grad_norm": 0.16372160613536835, "kl": 0.241748046875, "learning_rate": 1.995932713136112e-05, "loss": 0.0097, "reward": 0.4625000134110451, "reward_std": 0.18754628039896487, "rewards/accuracy_reward": 0.4625000134110451, "rewards/format_reward": 0.0, "step": 380 }, { "completion_length": 712.6521026611329, "epoch": 0.12754679476561206, "grad_norm": 0.3833288848400116, "kl": 0.2181884765625, "learning_rate": 1.9953949664558418e-05, "loss": 0.0087, "reward": 0.477083345875144, "reward_std": 0.2000462792813778, "rewards/accuracy_reward": 0.477083345875144, "rewards/format_reward": 0.0, "step": 385 }, { "completion_length": 730.1208587646485, "epoch": 0.12920324664568494, "grad_norm": 0.8248327970504761, "kl": 0.376513671875, "learning_rate": 1.994823925125672e-05, "loss": 0.0151, "reward": 0.42291667871177197, "reward_std": 0.22504628002643584, "rewards/accuracy_reward": 0.42291667871177197, "rewards/format_reward": 0.0, "step": 390 }, { "completion_length": 694.5875244140625, "epoch": 0.13085969852575782, "grad_norm": 0.5915169715881348, "kl": 1.29951171875, "learning_rate": 1.994219608246183e-05, "loss": 0.052, "reward": 0.43750001192092897, "reward_std": 0.29012462347745893, "rewards/accuracy_reward": 0.43750001192092897, "rewards/format_reward": 0.0, "step": 395 }, { "completion_length": 533.525016784668, "epoch": 0.1325161504058307, "grad_norm": 0.26577144861221313, "kl": 0.21474609375, "learning_rate": 1.993582036030978e-05, "loss": 0.0086, "reward": 0.4166666783392429, "reward_std": 0.25902419798076154, "rewards/accuracy_reward": 0.4166666783392429, "rewards/format_reward": 0.0, "step": 400 }, { "completion_length": 484.5666778564453, "epoch": 0.1341726022859036, "grad_norm": 0.23886007070541382, "kl": 0.216748046875, "learning_rate": 1.9929112298060067e-05, "loss": 0.0087, "reward": 0.4604166805744171, "reward_std": 0.25297005511820314, "rewards/accuracy_reward": 0.4604166805744171, "rewards/format_reward": 0.0, "step": 405 }, { "completion_length": 591.591682434082, "epoch": 0.1358290541659765, "grad_norm": 0.5166580677032471, "kl": 0.451708984375, "learning_rate": 1.9922072120088537e-05, "loss": 0.0181, "reward": 0.42916667833924294, "reward_std": 0.2032692089676857, "rewards/accuracy_reward": 0.42916667833924294, "rewards/format_reward": 0.0, "step": 410 }, { "completion_length": 651.060432434082, "epoch": 0.13748550604604937, "grad_norm": 0.272011935710907, "kl": 0.54384765625, "learning_rate": 1.991470006187987e-05, "loss": 0.0218, "reward": 0.3437500111758709, "reward_std": 0.19394585601985453, "rewards/accuracy_reward": 0.3437500111758709, "rewards/format_reward": 0.0, "step": 415 }, { "completion_length": 689.4771087646484, "epoch": 0.13914195792612225, "grad_norm": 0.6360898613929749, "kl": 1.144921875, "learning_rate": 1.9906996370019692e-05, "loss": 0.0458, "reward": 0.39791667815297843, "reward_std": 0.2285683576017618, "rewards/accuracy_reward": 0.39791667815297843, "rewards/format_reward": 0.0, "step": 420 }, { "completion_length": 703.275015258789, "epoch": 0.14079840980619512, "grad_norm": 0.5180982351303101, "kl": 0.7983154296875, "learning_rate": 1.989896130218635e-05, "loss": 0.0319, "reward": 0.44375001415610316, "reward_std": 0.21929128840565681, "rewards/accuracy_reward": 0.44375001415610316, "rewards/format_reward": 0.0, "step": 425 }, { "completion_length": 671.4666900634766, "epoch": 0.142454861686268, "grad_norm": 0.4963127672672272, "kl": 0.740234375, "learning_rate": 1.989059512714227e-05, "loss": 0.0296, "reward": 0.514583346247673, "reward_std": 0.2167129460722208, "rewards/accuracy_reward": 0.514583346247673, "rewards/format_reward": 0.0, "step": 430 }, { "completion_length": 732.5208557128906, "epoch": 0.1441113135663409, "grad_norm": 0.34941744804382324, "kl": 0.7269775390625, "learning_rate": 1.988189812472498e-05, "loss": 0.0291, "reward": 0.45208334773778913, "reward_std": 0.246869632974267, "rewards/accuracy_reward": 0.45208334773778913, "rewards/format_reward": 0.0, "step": 435 }, { "completion_length": 600.2520980834961, "epoch": 0.1457677654464138, "grad_norm": 1.5662246942520142, "kl": 0.49345703125, "learning_rate": 1.9872870585837757e-05, "loss": 0.0198, "reward": 0.5083333499729633, "reward_std": 0.18531336933374404, "rewards/accuracy_reward": 0.5083333499729633, "rewards/format_reward": 0.0, "step": 440 }, { "completion_length": 649.4854415893554, "epoch": 0.14742421732648667, "grad_norm": 3.917665481567383, "kl": 2.084765625, "learning_rate": 1.9863512812439874e-05, "loss": 0.0834, "reward": 0.33958334643393756, "reward_std": 0.27186962924897673, "rewards/accuracy_reward": 0.33958334643393756, "rewards/format_reward": 0.0, "step": 445 }, { "completion_length": 689.2979370117188, "epoch": 0.14908066920655955, "grad_norm": 1.4896368980407715, "kl": 1.8654296875, "learning_rate": 1.9853825117536522e-05, "loss": 0.0746, "reward": 0.3041666751727462, "reward_std": 0.27826920710504055, "rewards/accuracy_reward": 0.3041666751727462, "rewards/format_reward": 0.0, "step": 450 }, { "completion_length": 771.2729431152344, "epoch": 0.15073712108663243, "grad_norm": 2.015925884246826, "kl": 1.638671875, "learning_rate": 1.984380782516833e-05, "loss": 0.0656, "reward": 0.3229166720062494, "reward_std": 0.2478133700788021, "rewards/accuracy_reward": 0.3229166720062494, "rewards/format_reward": 0.0, "step": 455 }, { "completion_length": 733.333349609375, "epoch": 0.1523935729667053, "grad_norm": 0.42906132340431213, "kl": 7.5837890625, "learning_rate": 1.983346127040053e-05, "loss": 0.3034, "reward": 0.40625001341104505, "reward_std": 0.24652419984340668, "rewards/accuracy_reward": 0.40625001341104505, "rewards/format_reward": 0.0, "step": 460 }, { "completion_length": 682.4437713623047, "epoch": 0.1540500248467782, "grad_norm": 10.433393478393555, "kl": 1.748583984375, "learning_rate": 1.9822785799311736e-05, "loss": 0.07, "reward": 0.46458334252238276, "reward_std": 0.2381908643990755, "rewards/accuracy_reward": 0.46458334252238276, "rewards/format_reward": 0.0, "step": 465 }, { "completion_length": 689.8271057128907, "epoch": 0.1557064767268511, "grad_norm": 0.6034363508224487, "kl": 0.5762451171875, "learning_rate": 1.9811781768982392e-05, "loss": 0.023, "reward": 0.48125001415610313, "reward_std": 0.20580126903951168, "rewards/accuracy_reward": 0.48125001415610313, "rewards/format_reward": 0.0, "step": 470 }, { "completion_length": 686.9583572387695, "epoch": 0.15736292860692397, "grad_norm": 8.642258644104004, "kl": 1.34150390625, "learning_rate": 1.9800449547482813e-05, "loss": 0.0537, "reward": 0.48333335146307943, "reward_std": 0.24716878458857536, "rewards/accuracy_reward": 0.48333335146307943, "rewards/format_reward": 0.0, "step": 475 }, { "completion_length": 679.1291961669922, "epoch": 0.15901938048699685, "grad_norm": 0.14480511844158173, "kl": 0.66591796875, "learning_rate": 1.9788789513860875e-05, "loss": 0.0267, "reward": 0.4770833447575569, "reward_std": 0.21413460783660412, "rewards/accuracy_reward": 0.4770833447575569, "rewards/format_reward": 0.0, "step": 480 }, { "completion_length": 652.0937652587891, "epoch": 0.16067583236706973, "grad_norm": 1.0938204526901245, "kl": 0.411962890625, "learning_rate": 1.9776802058129336e-05, "loss": 0.0165, "reward": 0.4458333447575569, "reward_std": 0.21254627853631974, "rewards/accuracy_reward": 0.4458333447575569, "rewards/format_reward": 0.0, "step": 485 }, { "completion_length": 656.5437744140625, "epoch": 0.16233228424714263, "grad_norm": 0.6434494853019714, "kl": 0.73515625, "learning_rate": 1.9764487581252787e-05, "loss": 0.0294, "reward": 0.48125001695007086, "reward_std": 0.235957957431674, "rewards/accuracy_reward": 0.48125001695007086, "rewards/format_reward": 0.0, "step": 490 }, { "completion_length": 678.7646087646484, "epoch": 0.1639887361272155, "grad_norm": 1.7666065692901611, "kl": 0.97236328125, "learning_rate": 1.975184649513426e-05, "loss": 0.0389, "reward": 0.42500001192092896, "reward_std": 0.2593696258962154, "rewards/accuracy_reward": 0.42500001192092896, "rewards/format_reward": 0.0, "step": 495 }, { "completion_length": 689.7437683105469, "epoch": 0.1656451880072884, "grad_norm": 2.206672430038452, "kl": 1.451171875, "learning_rate": 1.9738879222601425e-05, "loss": 0.058, "reward": 0.3937500089406967, "reward_std": 0.2692912872880697, "rewards/accuracy_reward": 0.3937500089406967, "rewards/format_reward": 0.0, "step": 500 }, { "completion_length": 708.7958557128907, "epoch": 0.16730163988736127, "grad_norm": 1.299712061882019, "kl": 1.85439453125, "learning_rate": 1.972558619739246e-05, "loss": 0.0742, "reward": 0.3729166768491268, "reward_std": 0.2333796124905348, "rewards/accuracy_reward": 0.3729166768491268, "rewards/format_reward": 0.0, "step": 505 }, { "completion_length": 702.5687652587891, "epoch": 0.16895809176743415, "grad_norm": 0.628940224647522, "kl": 1.220849609375, "learning_rate": 1.9711967864141542e-05, "loss": 0.0488, "reward": 0.3708333451300859, "reward_std": 0.2519800361245871, "rewards/accuracy_reward": 0.3708333451300859, "rewards/format_reward": 0.0, "step": 510 }, { "completion_length": 602.331265258789, "epoch": 0.17061454364750703, "grad_norm": 1.2457903623580933, "kl": 1.27421875, "learning_rate": 1.9698024678363967e-05, "loss": 0.051, "reward": 0.42708334103226664, "reward_std": 0.2420583751052618, "rewards/accuracy_reward": 0.42708334103226664, "rewards/format_reward": 0.0, "step": 515 }, { "completion_length": 616.0666870117187, "epoch": 0.17227099552757993, "grad_norm": 0.7327155470848083, "kl": 1.5724609375, "learning_rate": 1.968375710644093e-05, "loss": 0.0628, "reward": 0.4604166805744171, "reward_std": 0.2696367222815752, "rewards/accuracy_reward": 0.4604166805744171, "rewards/format_reward": 0.0, "step": 520 }, { "completion_length": 619.8146026611328, "epoch": 0.1739274474076528, "grad_norm": 0.37498822808265686, "kl": 0.619384765625, "learning_rate": 1.9669165625603907e-05, "loss": 0.0248, "reward": 0.41250001192092894, "reward_std": 0.24270296320319176, "rewards/accuracy_reward": 0.41250001192092894, "rewards/format_reward": 0.0, "step": 525 }, { "completion_length": 586.7270980834961, "epoch": 0.1755838992877257, "grad_norm": 0.6590388417243958, "kl": 0.4463623046875, "learning_rate": 1.9654250723918706e-05, "loss": 0.0179, "reward": 0.506250011920929, "reward_std": 0.2552029609680176, "rewards/accuracy_reward": 0.506250011920929, "rewards/format_reward": 0.0, "step": 530 }, { "completion_length": 604.837515258789, "epoch": 0.17724035116779857, "grad_norm": 0.17691288888454437, "kl": 0.4416015625, "learning_rate": 1.9639012900269133e-05, "loss": 0.0177, "reward": 0.4854166850447655, "reward_std": 0.20227918773889542, "rewards/accuracy_reward": 0.4854166850447655, "rewards/format_reward": 0.0, "step": 535 }, { "completion_length": 627.5291809082031, "epoch": 0.17889680304787145, "grad_norm": 0.24128106236457825, "kl": 0.2916015625, "learning_rate": 1.9623452664340305e-05, "loss": 0.0117, "reward": 0.4541666850447655, "reward_std": 0.2295583810657263, "rewards/accuracy_reward": 0.4541666850447655, "rewards/format_reward": 0.0, "step": 540 }, { "completion_length": 663.8896011352539, "epoch": 0.18055325492794436, "grad_norm": 0.8167592883110046, "kl": 0.703564453125, "learning_rate": 1.9607570536601613e-05, "loss": 0.0281, "reward": 0.4645833492279053, "reward_std": 0.22152419574558735, "rewards/accuracy_reward": 0.4645833492279053, "rewards/format_reward": 0.0, "step": 545 }, { "completion_length": 710.7187728881836, "epoch": 0.18220970680801724, "grad_norm": 3.1678805351257324, "kl": 0.996337890625, "learning_rate": 1.9591367048289297e-05, "loss": 0.0399, "reward": 0.422916679084301, "reward_std": 0.2529700543731451, "rewards/accuracy_reward": 0.422916679084301, "rewards/format_reward": 0.0, "step": 550 }, { "completion_length": 711.6521026611329, "epoch": 0.18386615868809011, "grad_norm": 0.49747931957244873, "kl": 0.800634765625, "learning_rate": 1.957484274138869e-05, "loss": 0.032, "reward": 0.445833346247673, "reward_std": 0.23308046385645867, "rewards/accuracy_reward": 0.445833346247673, "rewards/format_reward": 0.0, "step": 555 }, { "completion_length": 656.9083465576172, "epoch": 0.185522610568163, "grad_norm": 5.867295265197754, "kl": 1.53369140625, "learning_rate": 1.9557998168616087e-05, "loss": 0.0613, "reward": 0.4666666775941849, "reward_std": 0.29364670254290104, "rewards/accuracy_reward": 0.4666666775941849, "rewards/format_reward": 0.0, "step": 560 }, { "completion_length": 584.8145965576172, "epoch": 0.18717906244823587, "grad_norm": 0.8317631483078003, "kl": 0.905078125, "learning_rate": 1.954083389340024e-05, "loss": 0.0362, "reward": 0.4604166805744171, "reward_std": 0.24429129026830196, "rewards/accuracy_reward": 0.4604166805744171, "rewards/format_reward": 0.0, "step": 565 }, { "completion_length": 601.2896026611328, "epoch": 0.18883551432830875, "grad_norm": 5.019211769104004, "kl": 0.6514404296875, "learning_rate": 1.9523350489863545e-05, "loss": 0.0261, "reward": 0.497916679084301, "reward_std": 0.2942912891507149, "rewards/accuracy_reward": 0.497916679084301, "rewards/format_reward": 0.0, "step": 570 }, { "completion_length": 668.1104400634765, "epoch": 0.19049196620838166, "grad_norm": 0.37168097496032715, "kl": 0.8615234375, "learning_rate": 1.9505548542802805e-05, "loss": 0.0345, "reward": 0.42708334922790525, "reward_std": 0.24463671930134295, "rewards/accuracy_reward": 0.42708334922790525, "rewards/format_reward": 0.0, "step": 575 }, { "completion_length": 661.7437698364258, "epoch": 0.19214841808845454, "grad_norm": 0.3525070250034332, "kl": 0.52646484375, "learning_rate": 1.9487428647669688e-05, "loss": 0.0211, "reward": 0.456250012665987, "reward_std": 0.23948003835976123, "rewards/accuracy_reward": 0.456250012665987, "rewards/format_reward": 0.0, "step": 580 }, { "completion_length": 648.9750183105468, "epoch": 0.19380486996852742, "grad_norm": 0.21801568567752838, "kl": 0.52060546875, "learning_rate": 1.9468991410550813e-05, "loss": 0.0208, "reward": 0.4604166805744171, "reward_std": 0.21542377397418022, "rewards/accuracy_reward": 0.4604166805744171, "rewards/format_reward": 0.0, "step": 585 }, { "completion_length": 647.545849609375, "epoch": 0.1954613218486003, "grad_norm": 0.39778226613998413, "kl": 0.457568359375, "learning_rate": 1.9450237448147463e-05, "loss": 0.0183, "reward": 0.4083333447575569, "reward_std": 0.2234579548239708, "rewards/accuracy_reward": 0.4083333447575569, "rewards/format_reward": 0.0, "step": 590 }, { "completion_length": 636.2604370117188, "epoch": 0.19711777372867317, "grad_norm": 0.5258747935295105, "kl": 0.439501953125, "learning_rate": 1.9431167387754967e-05, "loss": 0.0176, "reward": 0.4416666775941849, "reward_std": 0.2019800364971161, "rewards/accuracy_reward": 0.4416666775941849, "rewards/format_reward": 0.0, "step": 595 }, { "completion_length": 702.1166839599609, "epoch": 0.19877422560874605, "grad_norm": 0.8898454308509827, "kl": 0.84365234375, "learning_rate": 1.9411781867241718e-05, "loss": 0.0338, "reward": 0.37083334401249884, "reward_std": 0.21254627890884875, "rewards/accuracy_reward": 0.37083334401249884, "rewards/format_reward": 0.0, "step": 600 }, { "epoch": 0.19877422560874605, "eval_completion_length": 659.3083618164062, "eval_kl": 0.92265625, "eval_loss": 0.038530491292476654, "eval_reward": 0.41250001788139345, "eval_reward_std": 0.25936963558197024, "eval_rewards/accuracy_reward": 0.41250001788139345, "eval_rewards/format_reward": 0.0, "eval_runtime": 60.2623, "eval_samples_per_second": 1.643, "eval_steps_per_second": 0.033, "step": 600 }, { "completion_length": 688.4958587646485, "epoch": 0.20043067748881896, "grad_norm": 0.2848971486091614, "kl": 0.76669921875, "learning_rate": 1.9392081535027824e-05, "loss": 0.0307, "reward": 0.43333334382623434, "reward_std": 0.23531336858868598, "rewards/accuracy_reward": 0.43333334382623434, "rewards/format_reward": 0.0, "step": 605 }, { "completion_length": 680.3187683105468, "epoch": 0.20208712936889184, "grad_norm": 0.7399642467498779, "kl": 0.5251953125, "learning_rate": 1.937206705006344e-05, "loss": 0.021, "reward": 0.47500001192092894, "reward_std": 0.21383545286953448, "rewards/accuracy_reward": 0.47500001192092894, "rewards/format_reward": 0.0, "step": 610 }, { "completion_length": 696.2312683105469, "epoch": 0.20374358124896472, "grad_norm": 0.20121276378631592, "kl": 0.3503662109375, "learning_rate": 1.935173908180671e-05, "loss": 0.014, "reward": 0.5208333492279053, "reward_std": 0.20455838032066823, "rewards/accuracy_reward": 0.5208333492279053, "rewards/format_reward": 0.0, "step": 615 }, { "completion_length": 710.6229370117187, "epoch": 0.2054000331290376, "grad_norm": 0.1902662217617035, "kl": 0.3919677734375, "learning_rate": 1.9331098310201392e-05, "loss": 0.0157, "reward": 0.46458334773778914, "reward_std": 0.20837961360812188, "rewards/accuracy_reward": 0.46458334773778914, "rewards/format_reward": 0.0, "step": 620 }, { "completion_length": 680.6083526611328, "epoch": 0.20705648500911047, "grad_norm": 0.245217964053154, "kl": 0.512890625, "learning_rate": 1.9310145425654086e-05, "loss": 0.0205, "reward": 0.5125000178813934, "reward_std": 0.23402419984340667, "rewards/accuracy_reward": 0.5125000178813934, "rewards/format_reward": 0.0, "step": 625 }, { "completion_length": 695.0312744140625, "epoch": 0.20871293688918338, "grad_norm": 1.195257306098938, "kl": 0.46025390625, "learning_rate": 1.9288881129011177e-05, "loss": 0.0184, "reward": 0.4854166805744171, "reward_std": 0.24429128803312777, "rewards/accuracy_reward": 0.4854166805744171, "rewards/format_reward": 0.0, "step": 630 }, { "completion_length": 661.4646026611329, "epoch": 0.21036938876925626, "grad_norm": 0.6811662316322327, "kl": 0.450390625, "learning_rate": 1.926730613153536e-05, "loss": 0.018, "reward": 0.493750012665987, "reward_std": 0.21095795854926108, "rewards/accuracy_reward": 0.493750012665987, "rewards/format_reward": 0.0, "step": 635 }, { "completion_length": 672.5187622070313, "epoch": 0.21202584064932914, "grad_norm": 0.2703233063220978, "kl": 0.303759765625, "learning_rate": 1.9245421154881873e-05, "loss": 0.0121, "reward": 0.5250000163912774, "reward_std": 0.22921294942498208, "rewards/accuracy_reward": 0.5250000163912774, "rewards/format_reward": 0.0, "step": 640 }, { "completion_length": 680.6708557128907, "epoch": 0.21368229252940202, "grad_norm": 0.41743773221969604, "kl": 0.37470703125, "learning_rate": 1.922322693107434e-05, "loss": 0.015, "reward": 0.4645833410322666, "reward_std": 0.17152419984340667, "rewards/accuracy_reward": 0.4645833410322666, "rewards/format_reward": 0.0, "step": 645 }, { "completion_length": 693.1125213623047, "epoch": 0.2153387444094749, "grad_norm": 0.7818052768707275, "kl": 0.73203125, "learning_rate": 1.9200724202480305e-05, "loss": 0.0293, "reward": 0.4562500111758709, "reward_std": 0.2333796124905348, "rewards/accuracy_reward": 0.4562500111758709, "rewards/format_reward": 0.0, "step": 650 }, { "completion_length": 734.1791839599609, "epoch": 0.21699519628954778, "grad_norm": 0.6706784963607788, "kl": 0.6401611328125, "learning_rate": 1.9177913721786384e-05, "loss": 0.0256, "reward": 0.39166668355464934, "reward_std": 0.23531337268650532, "rewards/accuracy_reward": 0.39166668355464934, "rewards/format_reward": 0.0, "step": 655 }, { "completion_length": 693.2187652587891, "epoch": 0.21865164816962068, "grad_norm": 1.0797624588012695, "kl": 0.8861328125, "learning_rate": 1.9154796251973092e-05, "loss": 0.0355, "reward": 0.450000012665987, "reward_std": 0.2830804593861103, "rewards/accuracy_reward": 0.450000012665987, "rewards/format_reward": 0.0, "step": 660 }, { "completion_length": 663.1604309082031, "epoch": 0.22030810004969356, "grad_norm": 3.3238718509674072, "kl": 0.89541015625, "learning_rate": 1.913137256628934e-05, "loss": 0.0358, "reward": 0.4250000074505806, "reward_std": 0.2510362945497036, "rewards/accuracy_reward": 0.4250000074505806, "rewards/format_reward": 0.0, "step": 665 }, { "completion_length": 613.7625183105469, "epoch": 0.22196455192976644, "grad_norm": 0.31066539883613586, "kl": 0.9150390625, "learning_rate": 1.9107643448226536e-05, "loss": 0.0366, "reward": 0.4458333445712924, "reward_std": 0.23402419909834862, "rewards/accuracy_reward": 0.4458333445712924, "rewards/format_reward": 0.0, "step": 670 }, { "completion_length": 559.9062728881836, "epoch": 0.22362100380983932, "grad_norm": 0.2717919945716858, "kl": 0.419970703125, "learning_rate": 1.908360969149242e-05, "loss": 0.0168, "reward": 0.44583334531635044, "reward_std": 0.1836787685751915, "rewards/accuracy_reward": 0.44583334531635044, "rewards/format_reward": 0.0, "step": 675 }, { "completion_length": 584.2187683105469, "epoch": 0.2252774556899122, "grad_norm": 0.19857753813266754, "kl": 0.2474609375, "learning_rate": 1.905927209998447e-05, "loss": 0.0099, "reward": 0.5083333417773247, "reward_std": 0.27122504487633703, "rewards/accuracy_reward": 0.5083333417773247, "rewards/format_reward": 0.0, "step": 680 }, { "completion_length": 596.768766784668, "epoch": 0.2269339075699851, "grad_norm": 0.5171321630477905, "kl": 0.39990234375, "learning_rate": 1.903463148776306e-05, "loss": 0.016, "reward": 0.5500000178813934, "reward_std": 0.21512462422251702, "rewards/accuracy_reward": 0.5500000178813934, "rewards/format_reward": 0.0, "step": 685 }, { "completion_length": 710.6625213623047, "epoch": 0.22859035945005798, "grad_norm": 0.18548190593719482, "kl": 0.44384765625, "learning_rate": 1.900968867902419e-05, "loss": 0.0177, "reward": 0.2895833408460021, "reward_std": 0.24558046124875546, "rewards/accuracy_reward": 0.2895833408460021, "rewards/format_reward": 0.0, "step": 690 }, { "completion_length": 803.2937713623047, "epoch": 0.23024681133013086, "grad_norm": 0.22200892865657806, "kl": 0.2267822265625, "learning_rate": 1.8984444508071952e-05, "loss": 0.0091, "reward": 0.2833333406597376, "reward_std": 0.21864670217037202, "rewards/accuracy_reward": 0.2833333406597376, "rewards/format_reward": 0.0, "step": 695 }, { "completion_length": 790.5250183105469, "epoch": 0.23190326321020374, "grad_norm": 0.40342462062835693, "kl": 0.474267578125, "learning_rate": 1.8958899819290592e-05, "loss": 0.019, "reward": 0.33333334382623436, "reward_std": 0.21160254552960395, "rewards/accuracy_reward": 0.33333334382623436, "rewards/format_reward": 0.0, "step": 700 }, { "completion_length": 755.8937713623047, "epoch": 0.23355971509027662, "grad_norm": 0.24101851880550385, "kl": 0.603369140625, "learning_rate": 1.893305546711629e-05, "loss": 0.0241, "reward": 0.3895833492279053, "reward_std": 0.24429128728806973, "rewards/accuracy_reward": 0.3895833492279053, "rewards/format_reward": 0.0, "step": 705 }, { "completion_length": 716.2125213623046, "epoch": 0.2352161669703495, "grad_norm": 0.20975109934806824, "kl": 0.59482421875, "learning_rate": 1.890691231600856e-05, "loss": 0.0238, "reward": 0.42916667759418486, "reward_std": 0.2112571083009243, "rewards/accuracy_reward": 0.42916667759418486, "rewards/format_reward": 0.0, "step": 710 }, { "completion_length": 677.0979370117187, "epoch": 0.2368726188504224, "grad_norm": 0.26579591631889343, "kl": 0.3972412109375, "learning_rate": 1.8880471240421365e-05, "loss": 0.0159, "reward": 0.38750001359730957, "reward_std": 0.19364670440554618, "rewards/accuracy_reward": 0.38750001359730957, "rewards/format_reward": 0.0, "step": 715 }, { "completion_length": 673.9146057128906, "epoch": 0.23852907073049529, "grad_norm": 0.2846389412879944, "kl": 0.497705078125, "learning_rate": 1.8853733124773837e-05, "loss": 0.0199, "reward": 0.36875001192092893, "reward_std": 0.17504628151655197, "rewards/accuracy_reward": 0.36875001192092893, "rewards/format_reward": 0.0, "step": 720 }, { "completion_length": 676.1208557128906, "epoch": 0.24018552261056816, "grad_norm": 0.934825599193573, "kl": 0.683984375, "learning_rate": 1.8826698863420705e-05, "loss": 0.0274, "reward": 0.38958334531635047, "reward_std": 0.23724712990224361, "rewards/accuracy_reward": 0.38958334531635047, "rewards/format_reward": 0.0, "step": 725 }, { "completion_length": 668.1791870117188, "epoch": 0.24184197449064104, "grad_norm": 0.327308714389801, "kl": 0.660009765625, "learning_rate": 1.8799369360622394e-05, "loss": 0.0264, "reward": 0.3708333414047956, "reward_std": 0.24047005623579026, "rewards/accuracy_reward": 0.3708333414047956, "rewards/format_reward": 0.0, "step": 730 }, { "completion_length": 643.1812683105469, "epoch": 0.24349842637071392, "grad_norm": 0.3906155228614807, "kl": 0.5317138671875, "learning_rate": 1.8771745530514748e-05, "loss": 0.0212, "reward": 0.383333345875144, "reward_std": 0.1779237762093544, "rewards/accuracy_reward": 0.383333345875144, "rewards/format_reward": 0.0, "step": 735 }, { "completion_length": 629.2770980834961, "epoch": 0.24515487825078683, "grad_norm": 0.26094552874565125, "kl": 0.566552734375, "learning_rate": 1.8743828297078485e-05, "loss": 0.0227, "reward": 0.435416679084301, "reward_std": 0.20743587352335452, "rewards/accuracy_reward": 0.435416679084301, "rewards/format_reward": 0.0, "step": 740 }, { "completion_length": 645.8604370117188, "epoch": 0.2468113301308597, "grad_norm": 0.4099646210670471, "kl": 0.59619140625, "learning_rate": 1.871561859410828e-05, "loss": 0.0238, "reward": 0.400000012665987, "reward_std": 0.21031336784362792, "rewards/accuracy_reward": 0.400000012665987, "rewards/format_reward": 0.0, "step": 745 }, { "completion_length": 648.1687667846679, "epoch": 0.2484677820109326, "grad_norm": 0.2718549370765686, "kl": 0.5390625, "learning_rate": 1.8687117365181514e-05, "loss": 0.0216, "reward": 0.3895833432674408, "reward_std": 0.20485753193497658, "rewards/accuracy_reward": 0.3895833432674408, "rewards/format_reward": 0.0, "step": 750 }, { "completion_length": 668.1000213623047, "epoch": 0.25012423389100547, "grad_norm": 0.22385920584201813, "kl": 0.2530029296875, "learning_rate": 1.8658325563626737e-05, "loss": 0.0101, "reward": 0.46250001192092893, "reward_std": 0.1933012716472149, "rewards/accuracy_reward": 0.46250001192092893, "rewards/format_reward": 0.0, "step": 755 }, { "completion_length": 664.1125213623047, "epoch": 0.25178068577107837, "grad_norm": 0.26623448729515076, "kl": 0.9230712890625, "learning_rate": 1.8629244152491773e-05, "loss": 0.0368, "reward": 0.48958334922790525, "reward_std": 0.23243587650358677, "rewards/accuracy_reward": 0.48958334922790525, "rewards/format_reward": 0.0, "step": 760 }, { "completion_length": 699.6666778564453, "epoch": 0.2534371376511512, "grad_norm": 0.20795650780200958, "kl": 0.29345703125, "learning_rate": 1.8599874104511503e-05, "loss": 0.0117, "reward": 0.4458333447575569, "reward_std": 0.19940169602632524, "rewards/accuracy_reward": 0.4458333447575569, "rewards/format_reward": 0.0, "step": 765 }, { "completion_length": 705.4354370117187, "epoch": 0.25509358953122413, "grad_norm": 0.37470635771751404, "kl": 0.3725830078125, "learning_rate": 1.8570216402075326e-05, "loss": 0.0149, "reward": 0.4791666826233268, "reward_std": 0.20550212040543556, "rewards/accuracy_reward": 0.4791666826233268, "rewards/format_reward": 0.0, "step": 770 }, { "completion_length": 738.8562744140625, "epoch": 0.256750041411297, "grad_norm": 0.2921293079853058, "kl": 0.54072265625, "learning_rate": 1.8540272037194304e-05, "loss": 0.0216, "reward": 0.42708334922790525, "reward_std": 0.19523502960801126, "rewards/accuracy_reward": 0.42708334922790525, "rewards/format_reward": 0.0, "step": 775 }, { "completion_length": 750.8854400634766, "epoch": 0.2584064932913699, "grad_norm": 0.20843365788459778, "kl": 0.500830078125, "learning_rate": 1.8510042011467978e-05, "loss": 0.0201, "reward": 0.3937500104308128, "reward_std": 0.21671294420957565, "rewards/accuracy_reward": 0.3937500104308128, "rewards/format_reward": 0.0, "step": 780 }, { "completion_length": 703.3854370117188, "epoch": 0.2600629451714428, "grad_norm": 0.20130056142807007, "kl": 0.370947265625, "learning_rate": 1.847952733605088e-05, "loss": 0.0148, "reward": 0.48750001564621925, "reward_std": 0.18496793657541274, "rewards/accuracy_reward": 0.48750001564621925, "rewards/format_reward": 0.0, "step": 785 }, { "completion_length": 727.4521057128907, "epoch": 0.26171939705151565, "grad_norm": 1.448848843574524, "kl": 0.75234375, "learning_rate": 1.8448729031618687e-05, "loss": 0.0301, "reward": 0.48333335146307943, "reward_std": 0.2567912880331278, "rewards/accuracy_reward": 0.48333335146307943, "rewards/format_reward": 0.0, "step": 790 }, { "completion_length": 693.9416900634766, "epoch": 0.26337584893158855, "grad_norm": 0.6484307050704956, "kl": 0.5817626953125, "learning_rate": 1.8417648128334093e-05, "loss": 0.0233, "reward": 0.3875000115483999, "reward_std": 0.2330804578959942, "rewards/accuracy_reward": 0.3875000115483999, "rewards/format_reward": 0.0, "step": 795 }, { "completion_length": 657.4375213623047, "epoch": 0.2650323008116614, "grad_norm": 0.39433908462524414, "kl": 0.365625, "learning_rate": 1.838628566581236e-05, "loss": 0.0146, "reward": 0.4437500137835741, "reward_std": 0.21542377546429634, "rewards/accuracy_reward": 0.4437500137835741, "rewards/format_reward": 0.0, "step": 800 }, { "completion_length": 671.0646057128906, "epoch": 0.2666887526917343, "grad_norm": 0.4095575213432312, "kl": 0.736669921875, "learning_rate": 1.8354642693086525e-05, "loss": 0.0295, "reward": 0.5229166820645332, "reward_std": 0.24300211779773234, "rewards/accuracy_reward": 0.5229166820645332, "rewards/format_reward": 0.0, "step": 805 }, { "completion_length": 635.4541900634765, "epoch": 0.2683452045718072, "grad_norm": 0.53558349609375, "kl": 0.479345703125, "learning_rate": 1.8322720268572333e-05, "loss": 0.0192, "reward": 0.4812500074505806, "reward_std": 0.20485753305256366, "rewards/accuracy_reward": 0.4812500074505806, "rewards/format_reward": 0.0, "step": 810 }, { "completion_length": 598.5416854858398, "epoch": 0.27000165645188007, "grad_norm": 0.29555371403694153, "kl": 0.3069091796875, "learning_rate": 1.8290519460032805e-05, "loss": 0.0123, "reward": 0.4812500089406967, "reward_std": 0.197813368961215, "rewards/accuracy_reward": 0.4812500089406967, "rewards/format_reward": 0.0, "step": 815 }, { "completion_length": 615.3604370117188, "epoch": 0.271658108331953, "grad_norm": 0.3225797414779663, "kl": 0.366455078125, "learning_rate": 1.8258041344542567e-05, "loss": 0.0147, "reward": 0.49375001192092893, "reward_std": 0.20485753044486046, "rewards/accuracy_reward": 0.49375001192092893, "rewards/format_reward": 0.0, "step": 820 }, { "completion_length": 658.5166900634765, "epoch": 0.2733145602120258, "grad_norm": 0.30557191371917725, "kl": 0.508984375, "learning_rate": 1.8225287008451774e-05, "loss": 0.0204, "reward": 0.41875000968575476, "reward_std": 0.2192912895232439, "rewards/accuracy_reward": 0.41875000968575476, "rewards/format_reward": 0.0, "step": 825 }, { "completion_length": 613.7437683105469, "epoch": 0.27497101209209873, "grad_norm": 0.7485781908035278, "kl": 0.598681640625, "learning_rate": 1.8192257547349805e-05, "loss": 0.024, "reward": 0.4562500067055225, "reward_std": 0.2548575304448605, "rewards/accuracy_reward": 0.4562500067055225, "rewards/format_reward": 0.0, "step": 830 }, { "completion_length": 669.1021026611328, "epoch": 0.2766274639721716, "grad_norm": 1.4298166036605835, "kl": 1.2833984375, "learning_rate": 1.815895406602861e-05, "loss": 0.0513, "reward": 0.32083334065973756, "reward_std": 0.24106836020946504, "rewards/accuracy_reward": 0.32083334065973756, "rewards/format_reward": 0.0, "step": 835 }, { "completion_length": 674.0125183105469, "epoch": 0.2782839158522445, "grad_norm": 0.4115672707557678, "kl": 0.687353515625, "learning_rate": 1.8125377678445755e-05, "loss": 0.0275, "reward": 0.3708333453163505, "reward_std": 0.22921294309198856, "rewards/accuracy_reward": 0.3708333453163505, "rewards/format_reward": 0.0, "step": 840 }, { "completion_length": 703.8771026611328, "epoch": 0.2799403677323174, "grad_norm": 0.25243300199508667, "kl": 1.035546875, "learning_rate": 1.8091529507687148e-05, "loss": 0.0414, "reward": 0.3937500111758709, "reward_std": 0.23982547372579574, "rewards/accuracy_reward": 0.3937500111758709, "rewards/format_reward": 0.0, "step": 845 }, { "completion_length": 636.6708541870117, "epoch": 0.28159681961239025, "grad_norm": 0.40690305829048157, "kl": 0.2364990234375, "learning_rate": 1.8057410685929505e-05, "loss": 0.0095, "reward": 0.4812500156462193, "reward_std": 0.21929128803312778, "rewards/accuracy_reward": 0.4812500156462193, "rewards/format_reward": 0.0, "step": 850 }, { "completion_length": 662.7271026611328, "epoch": 0.28325327149246315, "grad_norm": 0.3789184093475342, "kl": 0.259228515625, "learning_rate": 1.802302235440245e-05, "loss": 0.0104, "reward": 0.479166679084301, "reward_std": 0.21864670217037202, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "step": 855 }, { "completion_length": 734.9291809082031, "epoch": 0.284909723372536, "grad_norm": 0.3198982775211334, "kl": 0.384521484375, "learning_rate": 1.7988365663350352e-05, "loss": 0.0154, "reward": 0.4458333447575569, "reward_std": 0.24364670254290105, "rewards/accuracy_reward": 0.4458333447575569, "rewards/format_reward": 0.0, "step": 860 }, { "completion_length": 747.6208526611329, "epoch": 0.2865661752526089, "grad_norm": 0.5380960702896118, "kl": 0.534619140625, "learning_rate": 1.795344177199386e-05, "loss": 0.0214, "reward": 0.41458334103226663, "reward_std": 0.2625925559550524, "rewards/accuracy_reward": 0.41458334103226663, "rewards/format_reward": 0.0, "step": 865 }, { "completion_length": 785.6625213623047, "epoch": 0.2882226271326818, "grad_norm": 0.2527111768722534, "kl": 0.5081787109375, "learning_rate": 1.7918251848491118e-05, "loss": 0.0204, "reward": 0.44375001043081286, "reward_std": 0.26576920635998247, "rewards/accuracy_reward": 0.44375001043081286, "rewards/format_reward": 0.0, "step": 870 }, { "completion_length": 754.852099609375, "epoch": 0.28987907901275467, "grad_norm": 0.27882009744644165, "kl": 0.3931640625, "learning_rate": 1.7882797069898693e-05, "loss": 0.0157, "reward": 0.450000011920929, "reward_std": 0.22345795333385468, "rewards/accuracy_reward": 0.450000011920929, "rewards/format_reward": 0.0, "step": 875 }, { "completion_length": 751.118765258789, "epoch": 0.2915355308928276, "grad_norm": 0.2522261440753937, "kl": 0.3256591796875, "learning_rate": 1.7847078622132202e-05, "loss": 0.013, "reward": 0.4562500149011612, "reward_std": 0.2048575308173895, "rewards/accuracy_reward": 0.4562500149011612, "rewards/format_reward": 0.0, "step": 880 }, { "completion_length": 808.8958618164063, "epoch": 0.29319198277290043, "grad_norm": 0.671545684337616, "kl": 0.5681640625, "learning_rate": 1.781109769992666e-05, "loss": 0.0227, "reward": 0.37916667610406873, "reward_std": 0.19012462124228477, "rewards/accuracy_reward": 0.37916667610406873, "rewards/format_reward": 0.0, "step": 885 }, { "completion_length": 784.058349609375, "epoch": 0.29484843465297333, "grad_norm": 0.5918344855308533, "kl": 0.693212890625, "learning_rate": 1.7774855506796497e-05, "loss": 0.0277, "reward": 0.39583334550261495, "reward_std": 0.22826920710504056, "rewards/accuracy_reward": 0.39583334550261495, "rewards/format_reward": 0.0, "step": 890 }, { "completion_length": 795.4021057128906, "epoch": 0.29650488653304624, "grad_norm": 0.412251353263855, "kl": 0.76201171875, "learning_rate": 1.773835325499531e-05, "loss": 0.0305, "reward": 0.3854166783392429, "reward_std": 0.2039137937128544, "rewards/accuracy_reward": 0.3854166783392429, "rewards/format_reward": 0.0, "step": 895 }, { "completion_length": 778.9271057128906, "epoch": 0.2981613384131191, "grad_norm": 0.44891592860221863, "kl": 0.75380859375, "learning_rate": 1.770159216547532e-05, "loss": 0.0302, "reward": 0.38541667386889455, "reward_std": 0.23724713250994683, "rewards/accuracy_reward": 0.38541667386889455, "rewards/format_reward": 0.0, "step": 900 }, { "epoch": 0.2981613384131191, "eval_completion_length": 746.9437744140625, "eval_kl": 0.80546875, "eval_loss": 0.03196508809924126, "eval_reward": 0.40625001192092897, "eval_reward_std": 0.2096687823534012, "eval_rewards/accuracy_reward": 0.40625001192092897, "eval_rewards/format_reward": 0.0, "eval_runtime": 63.3629, "eval_samples_per_second": 1.562, "eval_steps_per_second": 0.032, "step": 900 }, { "completion_length": 736.6875274658203, "epoch": 0.299817790293192, "grad_norm": 0.27564573287963867, "kl": 0.70947265625, "learning_rate": 1.7664573467846532e-05, "loss": 0.0284, "reward": 0.4166666764765978, "reward_std": 0.2199358742684126, "rewards/accuracy_reward": 0.4166666764765978, "rewards/format_reward": 0.0, "step": 905 }, { "completion_length": 718.3896026611328, "epoch": 0.30147424217326485, "grad_norm": 0.42780938744544983, "kl": 0.5337890625, "learning_rate": 1.76272984003356e-05, "loss": 0.0214, "reward": 0.4187500124797225, "reward_std": 0.1856125235557556, "rewards/accuracy_reward": 0.4187500124797225, "rewards/format_reward": 0.0, "step": 910 }, { "completion_length": 688.7000213623047, "epoch": 0.30313069405333776, "grad_norm": 0.6437339186668396, "kl": 0.69990234375, "learning_rate": 1.7589768209744404e-05, "loss": 0.028, "reward": 0.46458334624767306, "reward_std": 0.25039171427488327, "rewards/accuracy_reward": 0.46458334624767306, "rewards/format_reward": 0.0, "step": 915 }, { "completion_length": 781.716683959961, "epoch": 0.3047871459334106, "grad_norm": 0.7020159959793091, "kl": 1.4259765625, "learning_rate": 1.7551984151408363e-05, "loss": 0.057, "reward": 0.3416666749864817, "reward_std": 0.20773502811789513, "rewards/accuracy_reward": 0.3416666749864817, "rewards/format_reward": 0.0, "step": 920 }, { "completion_length": 657.5958526611328, "epoch": 0.3064435978134835, "grad_norm": 0.7334643006324768, "kl": 0.385009765625, "learning_rate": 1.7513947489154443e-05, "loss": 0.0154, "reward": 0.4708333447575569, "reward_std": 0.24433757215738297, "rewards/accuracy_reward": 0.4708333447575569, "rewards/format_reward": 0.0, "step": 925 }, { "completion_length": 676.4812728881836, "epoch": 0.3081000496935564, "grad_norm": 0.23671846091747284, "kl": 0.3033203125, "learning_rate": 1.7475659495258864e-05, "loss": 0.0121, "reward": 0.49583334773778914, "reward_std": 0.24012462459504605, "rewards/accuracy_reward": 0.49583334773778914, "rewards/format_reward": 0.0, "step": 930 }, { "completion_length": 651.4791900634766, "epoch": 0.3097565015736293, "grad_norm": 0.3459617495536804, "kl": 0.48291015625, "learning_rate": 1.7437121450404576e-05, "loss": 0.0193, "reward": 0.4666666805744171, "reward_std": 0.2173575323075056, "rewards/accuracy_reward": 0.4666666805744171, "rewards/format_reward": 0.0, "step": 935 }, { "completion_length": 674.1687667846679, "epoch": 0.3114129534537022, "grad_norm": 0.5430594682693481, "kl": 0.457470703125, "learning_rate": 1.739833464363838e-05, "loss": 0.0183, "reward": 0.4708333469927311, "reward_std": 0.23531336896121502, "rewards/accuracy_reward": 0.4708333469927311, "rewards/format_reward": 0.0, "step": 940 }, { "completion_length": 731.504183959961, "epoch": 0.31306940533377503, "grad_norm": 0.6707265973091125, "kl": 0.668359375, "learning_rate": 1.7359300372327865e-05, "loss": 0.0267, "reward": 0.40833334624767303, "reward_std": 0.22087961100041867, "rewards/accuracy_reward": 0.40833334624767303, "rewards/format_reward": 0.0, "step": 945 }, { "completion_length": 721.0833526611328, "epoch": 0.31472585721384794, "grad_norm": 0.2909613847732544, "kl": 0.77763671875, "learning_rate": 1.7320019942117954e-05, "loss": 0.0311, "reward": 0.4333333492279053, "reward_std": 0.2340242002159357, "rewards/accuracy_reward": 0.4333333492279053, "rewards/format_reward": 0.0, "step": 950 }, { "completion_length": 744.758349609375, "epoch": 0.31638230909392084, "grad_norm": 0.4231288731098175, "kl": 0.6888671875, "learning_rate": 1.7280494666887283e-05, "loss": 0.0275, "reward": 0.4000000096857548, "reward_std": 0.23179128728806972, "rewards/accuracy_reward": 0.4000000096857548, "rewards/format_reward": 0.0, "step": 955 }, { "completion_length": 691.8166900634766, "epoch": 0.3180387609739937, "grad_norm": 0.5277166962623596, "kl": 0.568896484375, "learning_rate": 1.7240725868704218e-05, "loss": 0.0228, "reward": 0.41875001341104506, "reward_std": 0.29205838181078436, "rewards/accuracy_reward": 0.41875001341104506, "rewards/format_reward": 0.0, "step": 960 }, { "completion_length": 708.1083526611328, "epoch": 0.3196952128540666, "grad_norm": 0.48094213008880615, "kl": 0.5755859375, "learning_rate": 1.720071487778265e-05, "loss": 0.023, "reward": 0.4270833469927311, "reward_std": 0.20614670254290104, "rewards/accuracy_reward": 0.4270833469927311, "rewards/format_reward": 0.0, "step": 965 }, { "completion_length": 704.1312683105468, "epoch": 0.32135166473413945, "grad_norm": 0.38625073432922363, "kl": 0.4560546875, "learning_rate": 1.71604630324375e-05, "loss": 0.0182, "reward": 0.4000000078231096, "reward_std": 0.21864670366048813, "rewards/accuracy_reward": 0.4000000078231096, "rewards/format_reward": 0.0, "step": 970 }, { "completion_length": 728.3500244140625, "epoch": 0.32300811661421236, "grad_norm": 0.5861239433288574, "kl": 0.8328125, "learning_rate": 1.711997167903995e-05, "loss": 0.0333, "reward": 0.4083333443850279, "reward_std": 0.236947974935174, "rewards/accuracy_reward": 0.4083333443850279, "rewards/format_reward": 0.0, "step": 975 }, { "completion_length": 684.3646026611328, "epoch": 0.32466456849428527, "grad_norm": 1.3483315706253052, "kl": 1.72607421875, "learning_rate": 1.7079242171972417e-05, "loss": 0.069, "reward": 0.4666666820645332, "reward_std": 0.21160254217684268, "rewards/accuracy_reward": 0.4666666820645332, "rewards/format_reward": 0.0, "step": 980 }, { "completion_length": 666.629183959961, "epoch": 0.3263210203743581, "grad_norm": 0.24399296939373016, "kl": 0.83662109375, "learning_rate": 1.7038275873583233e-05, "loss": 0.0335, "reward": 0.4875000104308128, "reward_std": 0.24716878235340117, "rewards/accuracy_reward": 0.4875000104308128, "rewards/format_reward": 0.0, "step": 985 }, { "completion_length": 666.2250183105468, "epoch": 0.327977472254431, "grad_norm": 0.5504988431930542, "kl": 0.447412109375, "learning_rate": 1.6997074154141097e-05, "loss": 0.0179, "reward": 0.4958333529531956, "reward_std": 0.19587961435317994, "rewards/accuracy_reward": 0.4958333529531956, "rewards/format_reward": 0.0, "step": 990 }, { "completion_length": 666.4583526611328, "epoch": 0.3296339241345039, "grad_norm": 0.8427032232284546, "kl": 0.6099609375, "learning_rate": 1.695563839178923e-05, "loss": 0.0244, "reward": 0.46875001639127734, "reward_std": 0.22504628002643584, "rewards/accuracy_reward": 0.46875001639127734, "rewards/format_reward": 0.0, "step": 995 }, { "completion_length": 682.5854339599609, "epoch": 0.3312903760145768, "grad_norm": 0.3408009111881256, "kl": 0.74482421875, "learning_rate": 1.6913969972499272e-05, "loss": 0.0298, "reward": 0.4145833432674408, "reward_std": 0.23690169379115106, "rewards/accuracy_reward": 0.4145833432674408, "rewards/format_reward": 0.0, "step": 1000 }, { "completion_length": 631.1000122070312, "epoch": 0.33294682789464963, "grad_norm": 0.6009291410446167, "kl": 0.4625, "learning_rate": 1.6872070290024944e-05, "loss": 0.0185, "reward": 0.450000011920929, "reward_std": 0.26160254068672656, "rewards/accuracy_reward": 0.450000011920929, "rewards/format_reward": 0.0, "step": 1005 }, { "completion_length": 663.2291839599609, "epoch": 0.33460327977472254, "grad_norm": 0.41487398743629456, "kl": 0.52548828125, "learning_rate": 1.682994074585541e-05, "loss": 0.021, "reward": 0.43750000819563867, "reward_std": 0.24141379520297052, "rewards/accuracy_reward": 0.43750000819563867, "rewards/format_reward": 0.0, "step": 1010 }, { "completion_length": 650.4541870117188, "epoch": 0.33625973165479545, "grad_norm": 0.5349622964859009, "kl": 0.663330078125, "learning_rate": 1.6787582749168403e-05, "loss": 0.0265, "reward": 0.412500012665987, "reward_std": 0.2606588002294302, "rewards/accuracy_reward": 0.412500012665987, "rewards/format_reward": 0.0, "step": 1015 }, { "completion_length": 652.7500183105469, "epoch": 0.3379161835348683, "grad_norm": 0.13883282244205475, "kl": 0.3160400390625, "learning_rate": 1.674499771678309e-05, "loss": 0.0126, "reward": 0.5104166731238365, "reward_std": 0.1820904441177845, "rewards/accuracy_reward": 0.5104166731238365, "rewards/format_reward": 0.0, "step": 1020 }, { "completion_length": 672.3291961669922, "epoch": 0.3395726354149412, "grad_norm": 0.21802331507205963, "kl": 0.34443359375, "learning_rate": 1.6702187073112688e-05, "loss": 0.0138, "reward": 0.43333334773778914, "reward_std": 0.23660254329442978, "rewards/accuracy_reward": 0.43333334773778914, "rewards/format_reward": 0.0, "step": 1025 }, { "completion_length": 714.1812774658204, "epoch": 0.34122908729501406, "grad_norm": 0.45314645767211914, "kl": 0.484619140625, "learning_rate": 1.665915225011681e-05, "loss": 0.0194, "reward": 0.39375001676380633, "reward_std": 0.20966878533363342, "rewards/accuracy_reward": 0.39375001676380633, "rewards/format_reward": 0.0, "step": 1030 }, { "completion_length": 714.2666870117188, "epoch": 0.34288553917508696, "grad_norm": 0.5284367203712463, "kl": 0.52314453125, "learning_rate": 1.6615894687253583e-05, "loss": 0.0209, "reward": 0.47083334624767303, "reward_std": 0.2279237736016512, "rewards/accuracy_reward": 0.47083334624767303, "rewards/format_reward": 0.0, "step": 1035 }, { "completion_length": 787.770849609375, "epoch": 0.34454199105515987, "grad_norm": 0.9596872925758362, "kl": 0.79892578125, "learning_rate": 1.6572415831431466e-05, "loss": 0.0319, "reward": 0.3125000109896064, "reward_std": 0.2077350240200758, "rewards/accuracy_reward": 0.3125000109896064, "rewards/format_reward": 0.0, "step": 1040 }, { "completion_length": 711.8437713623047, "epoch": 0.3461984429352327, "grad_norm": 0.396168053150177, "kl": 0.417529296875, "learning_rate": 1.65287171369609e-05, "loss": 0.0167, "reward": 0.35833334252238275, "reward_std": 0.254558377712965, "rewards/accuracy_reward": 0.35833334252238275, "rewards/format_reward": 0.0, "step": 1045 }, { "completion_length": 694.8271057128907, "epoch": 0.3478548948153056, "grad_norm": 0.3313078284263611, "kl": 0.43134765625, "learning_rate": 1.6484800065505627e-05, "loss": 0.0173, "reward": 0.37083334419876335, "reward_std": 0.22921294681727886, "rewards/accuracy_reward": 0.37083334419876335, "rewards/format_reward": 0.0, "step": 1050 }, { "completion_length": 680.5146026611328, "epoch": 0.3495113466953785, "grad_norm": 0.43609166145324707, "kl": 0.43310546875, "learning_rate": 1.6440666086033818e-05, "loss": 0.0173, "reward": 0.4041666768491268, "reward_std": 0.21993587389588357, "rewards/accuracy_reward": 0.4041666768491268, "rewards/format_reward": 0.0, "step": 1055 }, { "completion_length": 680.5312683105469, "epoch": 0.3511677985754514, "grad_norm": 0.5309097766876221, "kl": 0.45859375, "learning_rate": 1.6396316674768914e-05, "loss": 0.0183, "reward": 0.42708334773778917, "reward_std": 0.24558045640587806, "rewards/accuracy_reward": 0.42708334773778917, "rewards/format_reward": 0.0, "step": 1060 }, { "completion_length": 670.9166900634766, "epoch": 0.3528242504555243, "grad_norm": 0.3254081904888153, "kl": 0.417822265625, "learning_rate": 1.6351753315140285e-05, "loss": 0.0167, "reward": 0.3916666734963655, "reward_std": 0.1721687823534012, "rewards/accuracy_reward": 0.3916666734963655, "rewards/format_reward": 0.0, "step": 1065 }, { "completion_length": 689.9750152587891, "epoch": 0.35448070233559714, "grad_norm": 0.5018993616104126, "kl": 0.573681640625, "learning_rate": 1.630697749773359e-05, "loss": 0.023, "reward": 0.4291666798293591, "reward_std": 0.17663460560142993, "rewards/accuracy_reward": 0.4291666798293591, "rewards/format_reward": 0.0, "step": 1070 }, { "completion_length": 675.8416870117187, "epoch": 0.35613715421567005, "grad_norm": 0.5006002187728882, "kl": 0.480029296875, "learning_rate": 1.626199072024091e-05, "loss": 0.0192, "reward": 0.4666666850447655, "reward_std": 0.2067912880331278, "rewards/accuracy_reward": 0.4666666850447655, "rewards/format_reward": 0.0, "step": 1075 }, { "completion_length": 680.4812744140625, "epoch": 0.3577936060957429, "grad_norm": 0.176527738571167, "kl": 0.363330078125, "learning_rate": 1.621679448741067e-05, "loss": 0.0145, "reward": 0.45833334177732465, "reward_std": 0.2330804578959942, "rewards/accuracy_reward": 0.45833334177732465, "rewards/format_reward": 0.0, "step": 1080 }, { "completion_length": 716.1146026611328, "epoch": 0.3594500579758158, "grad_norm": 0.6546955704689026, "kl": 0.305029296875, "learning_rate": 1.6171390310997303e-05, "loss": 0.0122, "reward": 0.5104166850447655, "reward_std": 0.2070904441177845, "rewards/accuracy_reward": 0.5104166850447655, "rewards/format_reward": 0.0, "step": 1085 }, { "completion_length": 767.052099609375, "epoch": 0.3611065098558887, "grad_norm": 0.42851394414901733, "kl": 0.3079345703125, "learning_rate": 1.6125779709710668e-05, "loss": 0.0123, "reward": 0.3937500096857548, "reward_std": 0.20356836095452308, "rewards/accuracy_reward": 0.3937500096857548, "rewards/format_reward": 0.0, "step": 1090 }, { "completion_length": 739.2958587646484, "epoch": 0.36276296173596156, "grad_norm": 0.343398779630661, "kl": 0.33525390625, "learning_rate": 1.6079964209165276e-05, "loss": 0.0134, "reward": 0.410416679084301, "reward_std": 0.20872504338622094, "rewards/accuracy_reward": 0.410416679084301, "rewards/format_reward": 0.0, "step": 1095 }, { "completion_length": 738.2833526611328, "epoch": 0.36441941361603447, "grad_norm": 0.2588794529438019, "kl": 0.349267578125, "learning_rate": 1.603394534182925e-05, "loss": 0.014, "reward": 0.458333345875144, "reward_std": 0.24235752858221532, "rewards/accuracy_reward": 0.458333345875144, "rewards/format_reward": 0.0, "step": 1100 }, { "completion_length": 724.3396057128906, "epoch": 0.3660758654961073, "grad_norm": 0.285118043422699, "kl": 0.3353515625, "learning_rate": 1.598772464697305e-05, "loss": 0.0134, "reward": 0.4354166828095913, "reward_std": 0.24523502513766288, "rewards/accuracy_reward": 0.4354166828095913, "rewards/format_reward": 0.0, "step": 1105 }, { "completion_length": 684.5375213623047, "epoch": 0.36773231737618023, "grad_norm": 0.39234358072280884, "kl": 0.3454833984375, "learning_rate": 1.5941303670618018e-05, "loss": 0.0138, "reward": 0.5041666857898235, "reward_std": 0.24235753305256366, "rewards/accuracy_reward": 0.5041666857898235, "rewards/format_reward": 0.0, "step": 1110 }, { "completion_length": 673.3500198364258, "epoch": 0.3693887692562531, "grad_norm": 0.24206091463565826, "kl": 0.32705078125, "learning_rate": 1.5894683965484632e-05, "loss": 0.0131, "reward": 0.48750001192092896, "reward_std": 0.2138354502618313, "rewards/accuracy_reward": 0.48750001192092896, "rewards/format_reward": 0.0, "step": 1115 }, { "completion_length": 692.7229431152343, "epoch": 0.371045221136326, "grad_norm": 0.30689141154289246, "kl": 0.4552734375, "learning_rate": 1.5847867090940602e-05, "loss": 0.0182, "reward": 0.433333345502615, "reward_std": 0.23273502960801123, "rewards/accuracy_reward": 0.433333345502615, "rewards/format_reward": 0.0, "step": 1120 }, { "completion_length": 706.5958526611328, "epoch": 0.3727016730163989, "grad_norm": 0.40047401189804077, "kl": 0.406640625, "learning_rate": 1.5800854612948678e-05, "loss": 0.0163, "reward": 0.427083346247673, "reward_std": 0.22152419686317443, "rewards/accuracy_reward": 0.427083346247673, "rewards/format_reward": 0.0, "step": 1125 }, { "completion_length": 706.827099609375, "epoch": 0.37435812489647174, "grad_norm": 0.24229349195957184, "kl": 0.40498046875, "learning_rate": 1.57536481040143e-05, "loss": 0.0162, "reward": 0.4354166805744171, "reward_std": 0.22633545175194741, "rewards/accuracy_reward": 0.4354166805744171, "rewards/format_reward": 0.0, "step": 1130 }, { "completion_length": 720.9083557128906, "epoch": 0.37601457677654465, "grad_norm": 0.28101521730422974, "kl": 0.4875, "learning_rate": 1.5706249143132982e-05, "loss": 0.0195, "reward": 0.435416679084301, "reward_std": 0.20520296543836594, "rewards/accuracy_reward": 0.435416679084301, "rewards/format_reward": 0.0, "step": 1135 }, { "completion_length": 757.6208465576171, "epoch": 0.3776710286566175, "grad_norm": 0.4630211293697357, "kl": 0.435009765625, "learning_rate": 1.5658659315737505e-05, "loss": 0.0174, "reward": 0.3562500111758709, "reward_std": 0.23948003351688385, "rewards/accuracy_reward": 0.3562500111758709, "rewards/format_reward": 0.0, "step": 1140 }, { "completion_length": 686.3125152587891, "epoch": 0.3793274805366904, "grad_norm": 0.20541873574256897, "kl": 0.3150390625, "learning_rate": 1.5610880213644883e-05, "loss": 0.0126, "reward": 0.4291666798293591, "reward_std": 0.15644585862755775, "rewards/accuracy_reward": 0.4291666798293591, "rewards/format_reward": 0.0, "step": 1145 }, { "completion_length": 658.9958480834961, "epoch": 0.3809839324167633, "grad_norm": 0.4020118713378906, "kl": 0.31796875, "learning_rate": 1.5562913435003113e-05, "loss": 0.0127, "reward": 0.5562500175088644, "reward_std": 0.2157692089676857, "rewards/accuracy_reward": 0.5562500175088644, "rewards/format_reward": 0.0, "step": 1150 }, { "completion_length": 687.1750183105469, "epoch": 0.38264038429683617, "grad_norm": 0.39713943004608154, "kl": 0.3373046875, "learning_rate": 1.5514760584237733e-05, "loss": 0.0135, "reward": 0.4416666828095913, "reward_std": 0.17440169416368007, "rewards/accuracy_reward": 0.4416666828095913, "rewards/format_reward": 0.0, "step": 1155 }, { "completion_length": 672.8979400634765, "epoch": 0.3842968361769091, "grad_norm": 0.31331461668014526, "kl": 0.383935546875, "learning_rate": 1.5466423271998144e-05, "loss": 0.0154, "reward": 0.4833333417773247, "reward_std": 0.22345795519649983, "rewards/accuracy_reward": 0.4833333417773247, "rewards/format_reward": 0.0, "step": 1160 }, { "completion_length": 689.0312683105469, "epoch": 0.3859532880569819, "grad_norm": 0.5943039059638977, "kl": 0.3049560546875, "learning_rate": 1.5417903115103746e-05, "loss": 0.0122, "reward": 0.44375001415610316, "reward_std": 0.21190169490873814, "rewards/accuracy_reward": 0.44375001415610316, "rewards/format_reward": 0.0, "step": 1165 }, { "completion_length": 684.7646057128907, "epoch": 0.38760973993705483, "grad_norm": 0.2976776957511902, "kl": 0.2733642578125, "learning_rate": 1.536920173648984e-05, "loss": 0.0109, "reward": 0.450000011920929, "reward_std": 0.2279237773269415, "rewards/accuracy_reward": 0.450000011920929, "rewards/format_reward": 0.0, "step": 1170 }, { "completion_length": 701.6416839599609, "epoch": 0.38926619181712774, "grad_norm": 0.4693623185157776, "kl": 0.3373046875, "learning_rate": 1.5320320765153367e-05, "loss": 0.0135, "reward": 0.4312500089406967, "reward_std": 0.23466878272593023, "rewards/accuracy_reward": 0.4312500089406967, "rewards/format_reward": 0.0, "step": 1175 }, { "completion_length": 707.841683959961, "epoch": 0.3909226436972006, "grad_norm": 0.23049846291542053, "kl": 0.411181640625, "learning_rate": 1.5271261836098403e-05, "loss": 0.0165, "reward": 0.4020833453163505, "reward_std": 0.20709044188261033, "rewards/accuracy_reward": 0.4020833453163505, "rewards/format_reward": 0.0, "step": 1180 }, { "completion_length": 662.3458465576172, "epoch": 0.3925790955772735, "grad_norm": 0.4061271846294403, "kl": 0.285546875, "learning_rate": 1.5222026590281473e-05, "loss": 0.0114, "reward": 0.5437500134110451, "reward_std": 0.2613033875823021, "rewards/accuracy_reward": 0.5437500134110451, "rewards/format_reward": 0.0, "step": 1185 }, { "completion_length": 682.847933959961, "epoch": 0.39423554745734635, "grad_norm": 0.3948563039302826, "kl": 0.39638671875, "learning_rate": 1.5172616674556673e-05, "loss": 0.0158, "reward": 0.3812500089406967, "reward_std": 0.2253917146474123, "rewards/accuracy_reward": 0.3812500089406967, "rewards/format_reward": 0.0, "step": 1190 }, { "completion_length": 694.9166961669922, "epoch": 0.39589199933741925, "grad_norm": 0.2184487283229828, "kl": 0.351904296875, "learning_rate": 1.5123033741620564e-05, "loss": 0.0141, "reward": 0.4000000089406967, "reward_std": 0.21606836281716824, "rewards/accuracy_reward": 0.4000000089406967, "rewards/format_reward": 0.0, "step": 1195 }, { "completion_length": 669.9291854858399, "epoch": 0.3975484512174921, "grad_norm": 0.28960826992988586, "kl": 0.3259765625, "learning_rate": 1.5073279449956916e-05, "loss": 0.013, "reward": 0.47500001490116117, "reward_std": 0.25808046013116837, "rewards/accuracy_reward": 0.47500001490116117, "rewards/format_reward": 0.0, "step": 1200 }, { "epoch": 0.3975484512174921, "eval_completion_length": 673.3979370117188, "eval_kl": 0.318359375, "eval_loss": 0.012691713869571686, "eval_reward": 0.4395833432674408, "eval_reward_std": 0.20614669919013978, "eval_rewards/accuracy_reward": 0.4395833432674408, "eval_rewards/format_reward": 0.0, "eval_runtime": 61.7647, "eval_samples_per_second": 1.603, "eval_steps_per_second": 0.032, "step": 1200 }, { "completion_length": 709.1104339599609, "epoch": 0.399204903097565, "grad_norm": 0.585544764995575, "kl": 0.33935546875, "learning_rate": 1.5023355463781221e-05, "loss": 0.0136, "reward": 0.414583345502615, "reward_std": 0.25004628039896487, "rewards/accuracy_reward": 0.414583345502615, "rewards/format_reward": 0.0, "step": 1205 }, { "completion_length": 677.2625244140625, "epoch": 0.4008613549776379, "grad_norm": 0.24460500478744507, "kl": 0.382373046875, "learning_rate": 1.4973263452985023e-05, "loss": 0.0153, "reward": 0.4437500134110451, "reward_std": 0.21448003463447093, "rewards/accuracy_reward": 0.4437500134110451, "rewards/format_reward": 0.0, "step": 1210 }, { "completion_length": 665.2854385375977, "epoch": 0.40251780685771077, "grad_norm": 0.7513113021850586, "kl": 0.37060546875, "learning_rate": 1.4923005093080074e-05, "loss": 0.0148, "reward": 0.45416667833924296, "reward_std": 0.24141378998756408, "rewards/accuracy_reward": 0.45416667833924296, "rewards/format_reward": 0.0, "step": 1215 }, { "completion_length": 649.739599609375, "epoch": 0.4041742587377837, "grad_norm": 0.950705885887146, "kl": 0.486767578125, "learning_rate": 1.4872582065142285e-05, "loss": 0.0195, "reward": 0.4395833469927311, "reward_std": 0.21319086253643035, "rewards/accuracy_reward": 0.4395833469927311, "rewards/format_reward": 0.0, "step": 1220 }, { "completion_length": 658.6437713623047, "epoch": 0.4058307106178565, "grad_norm": 0.6829798817634583, "kl": 0.4041015625, "learning_rate": 1.482199605575549e-05, "loss": 0.0161, "reward": 0.4875000139698386, "reward_std": 0.2366025395691395, "rewards/accuracy_reward": 0.4875000139698386, "rewards/format_reward": 0.0, "step": 1225 }, { "completion_length": 650.9583557128906, "epoch": 0.40748716249792943, "grad_norm": 0.44855284690856934, "kl": 0.37470703125, "learning_rate": 1.4771248756955042e-05, "loss": 0.015, "reward": 0.46250001043081285, "reward_std": 0.21864670068025588, "rewards/accuracy_reward": 0.46250001043081285, "rewards/format_reward": 0.0, "step": 1230 }, { "completion_length": 635.1833511352539, "epoch": 0.40914361437800234, "grad_norm": 1.2026690244674683, "kl": 0.39873046875, "learning_rate": 1.472034186617121e-05, "loss": 0.016, "reward": 0.5208333492279053, "reward_std": 0.22792377695441246, "rewards/accuracy_reward": 0.5208333492279053, "rewards/format_reward": 0.0, "step": 1235 }, { "completion_length": 660.3354370117188, "epoch": 0.4108000662580752, "grad_norm": 0.23393508791923523, "kl": 0.312255859375, "learning_rate": 1.4669277086172406e-05, "loss": 0.0125, "reward": 0.4250000149011612, "reward_std": 0.2327350229024887, "rewards/accuracy_reward": 0.4250000149011612, "rewards/format_reward": 0.0, "step": 1240 }, { "completion_length": 658.3583435058594, "epoch": 0.4124565181381481, "grad_norm": 0.160048246383667, "kl": 0.314697265625, "learning_rate": 1.461805612500823e-05, "loss": 0.0126, "reward": 0.4145833425223827, "reward_std": 0.23595795184373855, "rewards/accuracy_reward": 0.4145833425223827, "rewards/format_reward": 0.0, "step": 1245 }, { "completion_length": 656.5708557128906, "epoch": 0.41411297001822095, "grad_norm": 0.3508008122444153, "kl": 0.3119140625, "learning_rate": 1.4566680695952333e-05, "loss": 0.0125, "reward": 0.44166667833924295, "reward_std": 0.26641379557549955, "rewards/accuracy_reward": 0.44166667833924295, "rewards/format_reward": 0.0, "step": 1250 }, { "completion_length": 681.8521041870117, "epoch": 0.41576942189829386, "grad_norm": 0.2417812943458557, "kl": 0.377197265625, "learning_rate": 1.4515152517445117e-05, "loss": 0.0151, "reward": 0.35416667386889455, "reward_std": 0.21254627965390682, "rewards/accuracy_reward": 0.35416667386889455, "rewards/format_reward": 0.0, "step": 1255 }, { "completion_length": 659.9708526611328, "epoch": 0.41742587377836676, "grad_norm": 0.3389175236225128, "kl": 0.28935546875, "learning_rate": 1.4463473313036241e-05, "loss": 0.0116, "reward": 0.4250000186264515, "reward_std": 0.24141379483044148, "rewards/accuracy_reward": 0.4250000186264515, "rewards/format_reward": 0.0, "step": 1260 }, { "completion_length": 644.7312728881836, "epoch": 0.4190823256584396, "grad_norm": 0.1789284348487854, "kl": 0.2664306640625, "learning_rate": 1.4411644811327e-05, "loss": 0.0107, "reward": 0.4812500111758709, "reward_std": 0.23948003575205803, "rewards/accuracy_reward": 0.4812500111758709, "rewards/format_reward": 0.0, "step": 1265 }, { "completion_length": 662.2854370117187, "epoch": 0.4207387775385125, "grad_norm": 0.4415893852710724, "kl": 0.368505859375, "learning_rate": 1.4359668745912472e-05, "loss": 0.0147, "reward": 0.4395833477377892, "reward_std": 0.2048575334250927, "rewards/accuracy_reward": 0.4395833477377892, "rewards/format_reward": 0.0, "step": 1270 }, { "completion_length": 662.0062683105468, "epoch": 0.42239522941858537, "grad_norm": 0.21580536663532257, "kl": 0.31416015625, "learning_rate": 1.4307546855323549e-05, "loss": 0.0126, "reward": 0.4104166766628623, "reward_std": 0.17727919183671476, "rewards/accuracy_reward": 0.4104166766628623, "rewards/format_reward": 0.0, "step": 1275 }, { "completion_length": 602.8979385375976, "epoch": 0.4240516812986583, "grad_norm": 0.26672840118408203, "kl": 0.2587890625, "learning_rate": 1.4255280882968787e-05, "loss": 0.0104, "reward": 0.5458333522081376, "reward_std": 0.20198003761470318, "rewards/accuracy_reward": 0.5458333522081376, "rewards/format_reward": 0.0, "step": 1280 }, { "completion_length": 631.1666870117188, "epoch": 0.4257081331787312, "grad_norm": 0.20725172758102417, "kl": 0.2576171875, "learning_rate": 1.4202872577076087e-05, "loss": 0.0103, "reward": 0.5166666835546494, "reward_std": 0.220879615470767, "rewards/accuracy_reward": 0.5166666835546494, "rewards/format_reward": 0.0, "step": 1285 }, { "completion_length": 674.3062713623046, "epoch": 0.42736458505880404, "grad_norm": 0.1765548288822174, "kl": 0.314013671875, "learning_rate": 1.415032369063422e-05, "loss": 0.0126, "reward": 0.40833334624767303, "reward_std": 0.20069086477160453, "rewards/accuracy_reward": 0.40833334624767303, "rewards/format_reward": 0.0, "step": 1290 }, { "completion_length": 671.6271087646485, "epoch": 0.42902103693887694, "grad_norm": 0.2126505821943283, "kl": 0.253173828125, "learning_rate": 1.4097635981334183e-05, "loss": 0.0101, "reward": 0.4166666746139526, "reward_std": 0.21031336933374406, "rewards/accuracy_reward": 0.4166666746139526, "rewards/format_reward": 0.0, "step": 1295 }, { "completion_length": 668.1354339599609, "epoch": 0.4306774888189498, "grad_norm": 0.30363258719444275, "kl": 0.266357421875, "learning_rate": 1.4044811211510419e-05, "loss": 0.0107, "reward": 0.4895833432674408, "reward_std": 0.2731588024646044, "rewards/accuracy_reward": 0.4895833432674408, "rewards/format_reward": 0.0, "step": 1300 }, { "completion_length": 682.6979400634766, "epoch": 0.4323339406990227, "grad_norm": 0.30442872643470764, "kl": 0.3134765625, "learning_rate": 1.3991851148081873e-05, "loss": 0.0125, "reward": 0.40833334214985373, "reward_std": 0.216068359836936, "rewards/accuracy_reward": 0.40833334214985373, "rewards/format_reward": 0.0, "step": 1305 }, { "completion_length": 672.7625244140625, "epoch": 0.43399039257909555, "grad_norm": 0.34132835268974304, "kl": 0.314892578125, "learning_rate": 1.3938757562492873e-05, "loss": 0.0126, "reward": 0.47708334773778915, "reward_std": 0.24781336970627307, "rewards/accuracy_reward": 0.47708334773778915, "rewards/format_reward": 0.0, "step": 1310 }, { "completion_length": 670.9979370117187, "epoch": 0.43564684445916846, "grad_norm": 0.4142015278339386, "kl": 0.3630859375, "learning_rate": 1.388553223065389e-05, "loss": 0.0145, "reward": 0.46250001192092893, "reward_std": 0.2830804567784071, "rewards/accuracy_reward": 0.46250001192092893, "rewards/format_reward": 0.0, "step": 1315 }, { "completion_length": 682.8625244140625, "epoch": 0.43730329633924137, "grad_norm": 0.3019844591617584, "kl": 0.3751708984375, "learning_rate": 1.3832176932882136e-05, "loss": 0.015, "reward": 0.49583334624767306, "reward_std": 0.23625710979104042, "rewards/accuracy_reward": 0.49583334624767306, "rewards/format_reward": 0.0, "step": 1320 }, { "completion_length": 691.8646026611328, "epoch": 0.4389597482193142, "grad_norm": 0.36983171105384827, "kl": 0.330908203125, "learning_rate": 1.3778693453842006e-05, "loss": 0.0132, "reward": 0.502083346620202, "reward_std": 0.20004628151655196, "rewards/accuracy_reward": 0.502083346620202, "rewards/format_reward": 0.0, "step": 1325 }, { "completion_length": 734.3437713623047, "epoch": 0.4406162000993871, "grad_norm": 0.3177379071712494, "kl": 0.2937255859375, "learning_rate": 1.3725083582485397e-05, "loss": 0.0117, "reward": 0.4312500111758709, "reward_std": 0.21929128840565681, "rewards/accuracy_reward": 0.4312500111758709, "rewards/format_reward": 0.0, "step": 1330 }, { "completion_length": 702.8979370117188, "epoch": 0.44227265197946, "grad_norm": 0.23854368925094604, "kl": 0.231103515625, "learning_rate": 1.3671349111991857e-05, "loss": 0.0092, "reward": 0.47083334624767303, "reward_std": 0.21864670068025588, "rewards/accuracy_reward": 0.47083334624767303, "rewards/format_reward": 0.0, "step": 1335 }, { "completion_length": 732.0812744140625, "epoch": 0.4439291038595329, "grad_norm": 0.48066446185112, "kl": 0.326123046875, "learning_rate": 1.3617491839708614e-05, "loss": 0.0131, "reward": 0.39375001229345796, "reward_std": 0.2131908643990755, "rewards/accuracy_reward": 0.39375001229345796, "rewards/format_reward": 0.0, "step": 1340 }, { "completion_length": 670.1020965576172, "epoch": 0.4455855557396058, "grad_norm": 0.40385717153549194, "kl": 0.308251953125, "learning_rate": 1.356351356709045e-05, "loss": 0.0123, "reward": 0.5458333492279053, "reward_std": 0.2151246227324009, "rewards/accuracy_reward": 0.5458333492279053, "rewards/format_reward": 0.0, "step": 1345 }, { "completion_length": 715.9937683105469, "epoch": 0.44724200761967864, "grad_norm": 0.19422303140163422, "kl": 0.2736328125, "learning_rate": 1.3509416099639456e-05, "loss": 0.0109, "reward": 0.42916667424142363, "reward_std": 0.21606836132705212, "rewards/accuracy_reward": 0.42916667424142363, "rewards/format_reward": 0.0, "step": 1350 }, { "completion_length": 735.5812683105469, "epoch": 0.44889845949975155, "grad_norm": 0.27902907133102417, "kl": 0.34892578125, "learning_rate": 1.3455201246844629e-05, "loss": 0.014, "reward": 0.4479166783392429, "reward_std": 0.22410253696143628, "rewards/accuracy_reward": 0.4479166783392429, "rewards/format_reward": 0.0, "step": 1355 }, { "completion_length": 702.3312744140625, "epoch": 0.4505549113798244, "grad_norm": 0.5592268109321594, "kl": 0.336376953125, "learning_rate": 1.3400870822121348e-05, "loss": 0.0135, "reward": 0.46875001192092897, "reward_std": 0.22023502811789514, "rewards/accuracy_reward": 0.46875001192092897, "rewards/format_reward": 0.0, "step": 1360 }, { "completion_length": 682.2604354858398, "epoch": 0.4522113632598973, "grad_norm": 0.2038479447364807, "kl": 0.378076171875, "learning_rate": 1.334642664275072e-05, "loss": 0.0151, "reward": 0.4979166816920042, "reward_std": 0.19910254292190074, "rewards/accuracy_reward": 0.4979166816920042, "rewards/format_reward": 0.0, "step": 1365 }, { "completion_length": 680.8312713623047, "epoch": 0.4538678151399702, "grad_norm": 0.47576379776000977, "kl": 0.455126953125, "learning_rate": 1.3291870529818809e-05, "loss": 0.0182, "reward": 0.4562500096857548, "reward_std": 0.19652419872581958, "rewards/accuracy_reward": 0.4562500096857548, "rewards/format_reward": 0.0, "step": 1370 }, { "completion_length": 668.5958557128906, "epoch": 0.45552426702004306, "grad_norm": 0.3894989788532257, "kl": 0.56162109375, "learning_rate": 1.3237204308155689e-05, "loss": 0.0225, "reward": 0.40000001043081285, "reward_std": 0.1971687838435173, "rewards/accuracy_reward": 0.40000001043081285, "rewards/format_reward": 0.0, "step": 1375 }, { "completion_length": 629.2166870117187, "epoch": 0.45718071890011597, "grad_norm": 0.22087852656841278, "kl": 0.35419921875, "learning_rate": 1.3182429806274442e-05, "loss": 0.0142, "reward": 0.4729166831821203, "reward_std": 0.18337961323559285, "rewards/accuracy_reward": 0.4729166831821203, "rewards/format_reward": 0.0, "step": 1380 }, { "completion_length": 662.1562683105469, "epoch": 0.4588371707801888, "grad_norm": 0.5765883922576904, "kl": 0.39814453125, "learning_rate": 1.3127548856309966e-05, "loss": 0.0159, "reward": 0.46041668131947516, "reward_std": 0.24523502588272095, "rewards/accuracy_reward": 0.46041668131947516, "rewards/format_reward": 0.0, "step": 1385 }, { "completion_length": 595.0854293823243, "epoch": 0.4604936226602617, "grad_norm": 0.20864719152450562, "kl": 0.2517333984375, "learning_rate": 1.3072563293957725e-05, "loss": 0.0101, "reward": 0.5312500163912773, "reward_std": 0.18432335332036018, "rewards/accuracy_reward": 0.5312500163912773, "rewards/format_reward": 0.0, "step": 1390 }, { "completion_length": 644.4937683105469, "epoch": 0.4621500745403346, "grad_norm": 0.35174861550331116, "kl": 0.26015625, "learning_rate": 1.3017474958412316e-05, "loss": 0.0104, "reward": 0.477083345502615, "reward_std": 0.200046281889081, "rewards/accuracy_reward": 0.477083345502615, "rewards/format_reward": 0.0, "step": 1395 }, { "completion_length": 650.4437652587891, "epoch": 0.4638065264204075, "grad_norm": 0.22749535739421844, "kl": 0.248095703125, "learning_rate": 1.2962285692305964e-05, "loss": 0.0099, "reward": 0.5000000119209289, "reward_std": 0.2032692063599825, "rewards/accuracy_reward": 0.5000000119209289, "rewards/format_reward": 0.0, "step": 1400 }, { "completion_length": 705.6166870117188, "epoch": 0.4654629783004804, "grad_norm": 0.19232533872127533, "kl": 0.2246826171875, "learning_rate": 1.29069973416469e-05, "loss": 0.009, "reward": 0.4562500171363354, "reward_std": 0.20932335406541824, "rewards/accuracy_reward": 0.4562500171363354, "rewards/format_reward": 0.0, "step": 1405 }, { "completion_length": 701.7500213623047, "epoch": 0.46711943018055324, "grad_norm": 0.280925452709198, "kl": 0.2745849609375, "learning_rate": 1.2851611755757587e-05, "loss": 0.011, "reward": 0.4979166805744171, "reward_std": 0.2167129445821047, "rewards/accuracy_reward": 0.4979166805744171, "rewards/format_reward": 0.0, "step": 1410 }, { "completion_length": 676.7312713623047, "epoch": 0.46877588206062615, "grad_norm": 1.1114954948425293, "kl": 0.38310546875, "learning_rate": 1.279613078721289e-05, "loss": 0.0153, "reward": 0.502083346247673, "reward_std": 0.19394585825502872, "rewards/accuracy_reward": 0.502083346247673, "rewards/format_reward": 0.0, "step": 1415 }, { "completion_length": 701.3937713623047, "epoch": 0.470432333940699, "grad_norm": 0.19857953488826752, "kl": 0.3532470703125, "learning_rate": 1.2740556291778096e-05, "loss": 0.0141, "reward": 0.512500011920929, "reward_std": 0.2256908655166626, "rewards/accuracy_reward": 0.512500011920929, "rewards/format_reward": 0.0, "step": 1420 }, { "completion_length": 680.070849609375, "epoch": 0.4720887858207719, "grad_norm": 0.3523332178592682, "kl": 0.317919921875, "learning_rate": 1.2684890128346834e-05, "loss": 0.0127, "reward": 0.4520833473652601, "reward_std": 0.16929129101336002, "rewards/accuracy_reward": 0.4520833473652601, "rewards/format_reward": 0.0, "step": 1425 }, { "completion_length": 654.2500183105469, "epoch": 0.4737452377008448, "grad_norm": 0.2034429907798767, "kl": 0.288818359375, "learning_rate": 1.2629134158878919e-05, "loss": 0.0116, "reward": 0.5041666835546493, "reward_std": 0.20292377583682536, "rewards/accuracy_reward": 0.5041666835546493, "rewards/format_reward": 0.0, "step": 1430 }, { "completion_length": 637.9000213623046, "epoch": 0.47540168958091766, "grad_norm": 0.29249683022499084, "kl": 0.302978515625, "learning_rate": 1.2573290248338059e-05, "loss": 0.0121, "reward": 0.4604166805744171, "reward_std": 0.2022791899740696, "rewards/accuracy_reward": 0.4604166805744171, "rewards/format_reward": 0.0, "step": 1435 }, { "completion_length": 659.5791870117188, "epoch": 0.47705814146099057, "grad_norm": 0.31368008255958557, "kl": 0.3239501953125, "learning_rate": 1.2517360264629463e-05, "loss": 0.013, "reward": 0.4666666788980365, "reward_std": 0.2019800368696451, "rewards/accuracy_reward": 0.4666666788980365, "rewards/format_reward": 0.0, "step": 1440 }, { "completion_length": 642.7541900634766, "epoch": 0.4787145933410634, "grad_norm": 0.21325795352458954, "kl": 0.287646484375, "learning_rate": 1.2461346078537386e-05, "loss": 0.0115, "reward": 0.5145833469927311, "reward_std": 0.22891378737986087, "rewards/accuracy_reward": 0.5145833469927311, "rewards/format_reward": 0.0, "step": 1445 }, { "completion_length": 637.3958526611328, "epoch": 0.48037104522113633, "grad_norm": 0.21760497987270355, "kl": 0.24052734375, "learning_rate": 1.2405249563662539e-05, "loss": 0.0096, "reward": 0.5687500178813935, "reward_std": 0.24558046236634254, "rewards/accuracy_reward": 0.5687500178813935, "rewards/format_reward": 0.0, "step": 1450 }, { "completion_length": 719.0791870117188, "epoch": 0.48202749710120923, "grad_norm": 0.4115123152732849, "kl": 0.313916015625, "learning_rate": 1.2349072596359415e-05, "loss": 0.0126, "reward": 0.39583334475755694, "reward_std": 0.22826920822262764, "rewards/accuracy_reward": 0.39583334475755694, "rewards/format_reward": 0.0, "step": 1455 }, { "completion_length": 682.4354370117187, "epoch": 0.4836839489812821, "grad_norm": 0.25766491889953613, "kl": 0.3433349609375, "learning_rate": 1.2292817055673543e-05, "loss": 0.0137, "reward": 0.43125000949949027, "reward_std": 0.16765668392181396, "rewards/accuracy_reward": 0.43125000949949027, "rewards/format_reward": 0.0, "step": 1460 }, { "completion_length": 705.8770935058594, "epoch": 0.485340400861355, "grad_norm": 0.540080189704895, "kl": 0.327978515625, "learning_rate": 1.2236484823278627e-05, "loss": 0.0131, "reward": 0.44791667982935907, "reward_std": 0.23466878645122052, "rewards/accuracy_reward": 0.44791667982935907, "rewards/format_reward": 0.0, "step": 1465 }, { "completion_length": 759.4854400634765, "epoch": 0.48699685274142784, "grad_norm": 0.45100829005241394, "kl": 0.39150390625, "learning_rate": 1.2180077783413601e-05, "loss": 0.0157, "reward": 0.41666667759418485, "reward_std": 0.25584754943847654, "rewards/accuracy_reward": 0.41666667759418485, "rewards/format_reward": 0.0, "step": 1470 }, { "completion_length": 714.1479339599609, "epoch": 0.48865330462150075, "grad_norm": 0.38617846369743347, "kl": 0.458251953125, "learning_rate": 1.2123597822819627e-05, "loss": 0.0183, "reward": 0.43125001043081285, "reward_std": 0.19042377285659312, "rewards/accuracy_reward": 0.43125001043081285, "rewards/format_reward": 0.0, "step": 1475 }, { "completion_length": 736.893765258789, "epoch": 0.49030975650157366, "grad_norm": 0.21284358203411102, "kl": 0.40634765625, "learning_rate": 1.2067046830676947e-05, "loss": 0.0162, "reward": 0.4291666761040688, "reward_std": 0.1827350277453661, "rewards/accuracy_reward": 0.4291666761040688, "rewards/format_reward": 0.0, "step": 1480 }, { "completion_length": 720.0125213623047, "epoch": 0.4919662083816465, "grad_norm": 0.44137057662010193, "kl": 0.368017578125, "learning_rate": 1.2010426698541728e-05, "loss": 0.0147, "reward": 0.4625000149011612, "reward_std": 0.22440169379115105, "rewards/accuracy_reward": 0.4625000149011612, "rewards/format_reward": 0.0, "step": 1485 }, { "completion_length": 708.1625213623047, "epoch": 0.4936226602617194, "grad_norm": 0.5844438076019287, "kl": 0.382421875, "learning_rate": 1.1953739320282778e-05, "loss": 0.0153, "reward": 0.4666666768491268, "reward_std": 0.24235752858221532, "rewards/accuracy_reward": 0.4666666768491268, "rewards/format_reward": 0.0, "step": 1490 }, { "completion_length": 744.9166839599609, "epoch": 0.49527911214179227, "grad_norm": 0.30175694823265076, "kl": 0.460205078125, "learning_rate": 1.1896986592018196e-05, "loss": 0.0184, "reward": 0.4041666816920042, "reward_std": 0.18179129250347614, "rewards/accuracy_reward": 0.4041666816920042, "rewards/format_reward": 0.0, "step": 1495 }, { "completion_length": 697.6354309082031, "epoch": 0.4969355640218652, "grad_norm": 0.8834903836250305, "kl": 0.40322265625, "learning_rate": 1.1840170412051957e-05, "loss": 0.0161, "reward": 0.42916667833924294, "reward_std": 0.2225142154842615, "rewards/accuracy_reward": 0.42916667833924294, "rewards/format_reward": 0.0, "step": 1500 }, { "epoch": 0.4969355640218652, "eval_completion_length": 717.1500244140625, "eval_kl": 0.428125, "eval_loss": 0.015960458666086197, "eval_reward": 0.44791668057441714, "eval_reward_std": 0.20966877937316894, "eval_rewards/accuracy_reward": 0.44791668057441714, "eval_rewards/format_reward": 0.0, "eval_runtime": 61.3074, "eval_samples_per_second": 1.615, "eval_steps_per_second": 0.033, "step": 1500 }, { "completion_length": 720.0083587646484, "epoch": 0.498592015901938, "grad_norm": 0.37546849250793457, "kl": 0.399560546875, "learning_rate": 1.1783292680810403e-05, "loss": 0.016, "reward": 0.44375001043081286, "reward_std": 0.22504627853631973, "rewards/accuracy_reward": 0.44375001043081286, "rewards/format_reward": 0.0, "step": 1505 }, { "completion_length": 699.6812713623046, "epoch": 0.5002484677820109, "grad_norm": 0.6259009838104248, "kl": 0.414208984375, "learning_rate": 1.1726355300778693e-05, "loss": 0.0166, "reward": 0.45625001192092896, "reward_std": 0.1904237762093544, "rewards/accuracy_reward": 0.45625001192092896, "rewards/format_reward": 0.0, "step": 1510 }, { "completion_length": 680.4125213623047, "epoch": 0.5019049196620838, "grad_norm": 0.36564525961875916, "kl": 0.349169921875, "learning_rate": 1.1669360176437159e-05, "loss": 0.014, "reward": 0.40416667647659776, "reward_std": 0.19716878719627856, "rewards/accuracy_reward": 0.40416667647659776, "rewards/format_reward": 0.0, "step": 1515 }, { "completion_length": 690.5979370117187, "epoch": 0.5035613715421567, "grad_norm": 0.2909294068813324, "kl": 0.33076171875, "learning_rate": 1.1612309214197599e-05, "loss": 0.0132, "reward": 0.43333334699273107, "reward_std": 0.21031337194144725, "rewards/accuracy_reward": 0.43333334699273107, "rewards/format_reward": 0.0, "step": 1520 }, { "completion_length": 707.164599609375, "epoch": 0.5052178234222295, "grad_norm": 0.3798951506614685, "kl": 0.3591064453125, "learning_rate": 1.1555204322339521e-05, "loss": 0.0144, "reward": 0.4166666816920042, "reward_std": 0.18754627779126168, "rewards/accuracy_reward": 0.4166666816920042, "rewards/format_reward": 0.0, "step": 1525 }, { "completion_length": 688.660433959961, "epoch": 0.5068742753023024, "grad_norm": 0.3599902093410492, "kl": 0.346826171875, "learning_rate": 1.1498047410946307e-05, "loss": 0.0139, "reward": 0.4812500115483999, "reward_std": 0.21705838181078435, "rewards/accuracy_reward": 0.4812500115483999, "rewards/format_reward": 0.0, "step": 1530 }, { "completion_length": 703.9375259399415, "epoch": 0.5085307271823754, "grad_norm": 0.42666754126548767, "kl": 0.325, "learning_rate": 1.1440840391841317e-05, "loss": 0.013, "reward": 0.4166666783392429, "reward_std": 0.21477918922901154, "rewards/accuracy_reward": 0.4166666783392429, "rewards/format_reward": 0.0, "step": 1535 }, { "completion_length": 694.2187683105469, "epoch": 0.5101871790624483, "grad_norm": 0.4676518440246582, "kl": 0.384765625, "learning_rate": 1.1383585178523955e-05, "loss": 0.0154, "reward": 0.3979166761040688, "reward_std": 0.1978133711963892, "rewards/accuracy_reward": 0.3979166761040688, "rewards/format_reward": 0.0, "step": 1540 }, { "completion_length": 687.7062713623047, "epoch": 0.5118436309425212, "grad_norm": 0.3685455620288849, "kl": 0.4453125, "learning_rate": 1.1326283686105656e-05, "loss": 0.0178, "reward": 0.47916668131947515, "reward_std": 0.18531337045133114, "rewards/accuracy_reward": 0.47916668131947515, "rewards/format_reward": 0.0, "step": 1545 }, { "completion_length": 678.2166931152344, "epoch": 0.513500082822594, "grad_norm": 0.22115883231163025, "kl": 0.243115234375, "learning_rate": 1.126893783124583e-05, "loss": 0.0097, "reward": 0.47083334922790526, "reward_std": 0.20808046236634253, "rewards/accuracy_reward": 0.47083334922790526, "rewards/format_reward": 0.0, "step": 1550 }, { "completion_length": 677.8250244140625, "epoch": 0.5151565347026669, "grad_norm": 0.2383933812379837, "kl": 0.264697265625, "learning_rate": 1.1211549532087749e-05, "loss": 0.0106, "reward": 0.4458333469927311, "reward_std": 0.20679129138588906, "rewards/accuracy_reward": 0.4458333469927311, "rewards/format_reward": 0.0, "step": 1555 }, { "completion_length": 673.0104370117188, "epoch": 0.5168129865827398, "grad_norm": 0.2102532535791397, "kl": 0.201416015625, "learning_rate": 1.1154120708194398e-05, "loss": 0.0081, "reward": 0.45000001043081284, "reward_std": 0.21125711165368558, "rewards/accuracy_reward": 0.45000001043081284, "rewards/format_reward": 0.0, "step": 1560 }, { "completion_length": 683.3708465576171, "epoch": 0.5184694384628127, "grad_norm": 0.273816853761673, "kl": 0.225, "learning_rate": 1.1096653280484255e-05, "loss": 0.009, "reward": 0.46250001192092893, "reward_std": 0.2090241976082325, "rewards/accuracy_reward": 0.46250001192092893, "rewards/format_reward": 0.0, "step": 1565 }, { "completion_length": 641.0146057128907, "epoch": 0.5201258903428856, "grad_norm": 0.19903257489204407, "kl": 0.2029296875, "learning_rate": 1.1039149171167046e-05, "loss": 0.0081, "reward": 0.4854166805744171, "reward_std": 0.2372471284121275, "rewards/accuracy_reward": 0.4854166805744171, "rewards/format_reward": 0.0, "step": 1570 }, { "completion_length": 674.7729339599609, "epoch": 0.5217823422229584, "grad_norm": 0.25549396872520447, "kl": 0.2138671875, "learning_rate": 1.0981610303679449e-05, "loss": 0.0086, "reward": 0.4395833432674408, "reward_std": 0.2215242013335228, "rewards/accuracy_reward": 0.4395833432674408, "rewards/format_reward": 0.0, "step": 1575 }, { "completion_length": 685.847933959961, "epoch": 0.5234387941030313, "grad_norm": 0.28113311529159546, "kl": 0.250830078125, "learning_rate": 1.0924038602620757e-05, "loss": 0.01, "reward": 0.3895833436399698, "reward_std": 0.23466878421604634, "rewards/accuracy_reward": 0.3895833436399698, "rewards/format_reward": 0.0, "step": 1580 }, { "completion_length": 671.772932434082, "epoch": 0.5250952459831042, "grad_norm": 0.28916826844215393, "kl": 0.2568359375, "learning_rate": 1.08664359936885e-05, "loss": 0.0103, "reward": 0.4333333447575569, "reward_std": 0.21606836020946502, "rewards/accuracy_reward": 0.4333333447575569, "rewards/format_reward": 0.0, "step": 1585 }, { "completion_length": 683.6187713623046, "epoch": 0.5267516978631771, "grad_norm": 0.3163332939147949, "kl": 0.260400390625, "learning_rate": 1.0808804403614044e-05, "loss": 0.0104, "reward": 0.46875001564621926, "reward_std": 0.2548575311899185, "rewards/accuracy_reward": 0.46875001564621926, "rewards/format_reward": 0.0, "step": 1590 }, { "completion_length": 671.8937744140625, "epoch": 0.52840814974325, "grad_norm": 0.371674120426178, "kl": 0.27998046875, "learning_rate": 1.0751145760098125e-05, "loss": 0.0112, "reward": 0.46875001639127734, "reward_std": 0.20837961174547673, "rewards/accuracy_reward": 0.46875001639127734, "rewards/format_reward": 0.0, "step": 1595 }, { "completion_length": 659.8500244140625, "epoch": 0.5300646016233228, "grad_norm": 0.2995961010456085, "kl": 0.249169921875, "learning_rate": 1.0693461991746389e-05, "loss": 0.01, "reward": 0.4666666805744171, "reward_std": 0.18849001936614512, "rewards/accuracy_reward": 0.4666666805744171, "rewards/format_reward": 0.0, "step": 1600 }, { "completion_length": 659.0187728881835, "epoch": 0.5317210535033957, "grad_norm": 0.20874351263046265, "kl": 0.234130859375, "learning_rate": 1.0635755028004872e-05, "loss": 0.0094, "reward": 0.4395833443850279, "reward_std": 0.260957957431674, "rewards/accuracy_reward": 0.4395833443850279, "rewards/format_reward": 0.0, "step": 1605 }, { "completion_length": 642.754182434082, "epoch": 0.5333775053834686, "grad_norm": 0.25365403294563293, "kl": 0.2177001953125, "learning_rate": 1.0578026799095464e-05, "loss": 0.0087, "reward": 0.49791668355464935, "reward_std": 0.2433475513011217, "rewards/accuracy_reward": 0.49791668355464935, "rewards/format_reward": 0.0, "step": 1610 }, { "completion_length": 705.3583557128907, "epoch": 0.5350339572635415, "grad_norm": 0.3721843659877777, "kl": 0.29697265625, "learning_rate": 1.0520279235951347e-05, "loss": 0.0119, "reward": 0.43750000894069674, "reward_std": 0.26418088302016257, "rewards/accuracy_reward": 0.43750000894069674, "rewards/format_reward": 0.0, "step": 1615 }, { "completion_length": 700.8541870117188, "epoch": 0.5366904091436144, "grad_norm": 0.34244218468666077, "kl": 0.3037109375, "learning_rate": 1.046251427015241e-05, "loss": 0.0122, "reward": 0.49583334680646657, "reward_std": 0.18977918922901155, "rewards/accuracy_reward": 0.49583334680646657, "rewards/format_reward": 0.0, "step": 1620 }, { "completion_length": 719.1437637329102, "epoch": 0.5383468610236872, "grad_norm": 0.4916062355041504, "kl": 0.3265869140625, "learning_rate": 1.0404733833860639e-05, "loss": 0.0131, "reward": 0.41666667982935907, "reward_std": 0.20292377509176732, "rewards/accuracy_reward": 0.41666667982935907, "rewards/format_reward": 0.0, "step": 1625 }, { "completion_length": 690.9146026611328, "epoch": 0.5400033129037601, "grad_norm": 0.2534123957157135, "kl": 0.260791015625, "learning_rate": 1.0346939859755481e-05, "loss": 0.0104, "reward": 0.4708333468064666, "reward_std": 0.19811252355575562, "rewards/accuracy_reward": 0.4708333468064666, "rewards/format_reward": 0.0, "step": 1630 }, { "completion_length": 668.445849609375, "epoch": 0.541659764783833, "grad_norm": 0.28708308935165405, "kl": 0.250732421875, "learning_rate": 1.028913428096921e-05, "loss": 0.01, "reward": 0.4479166753590107, "reward_std": 0.20133545100688935, "rewards/accuracy_reward": 0.4479166753590107, "rewards/format_reward": 0.0, "step": 1635 }, { "completion_length": 639.433349609375, "epoch": 0.543316216663906, "grad_norm": 0.20855924487113953, "kl": 0.234912109375, "learning_rate": 1.023131903102226e-05, "loss": 0.0094, "reward": 0.5208333447575569, "reward_std": 0.21606836281716824, "rewards/accuracy_reward": 0.5208333447575569, "rewards/format_reward": 0.0, "step": 1640 }, { "completion_length": 691.8666809082031, "epoch": 0.5449726685439787, "grad_norm": 0.22074873745441437, "kl": 0.265380859375, "learning_rate": 1.0173496043758555e-05, "loss": 0.0106, "reward": 0.4979166805744171, "reward_std": 0.23948003798723222, "rewards/accuracy_reward": 0.4979166805744171, "rewards/format_reward": 0.0, "step": 1645 }, { "completion_length": 682.6000244140625, "epoch": 0.5466291204240517, "grad_norm": 0.4427434504032135, "kl": 0.267919921875, "learning_rate": 1.0115667253280817e-05, "loss": 0.0107, "reward": 0.4916666842997074, "reward_std": 0.18625710979104043, "rewards/accuracy_reward": 0.4916666842997074, "rewards/format_reward": 0.0, "step": 1650 }, { "completion_length": 684.0750122070312, "epoch": 0.5482855723041246, "grad_norm": 0.1968732625246048, "kl": 0.228076171875, "learning_rate": 1.0057834593885884e-05, "loss": 0.0091, "reward": 0.506250011920929, "reward_std": 0.1843233522027731, "rewards/accuracy_reward": 0.506250011920929, "rewards/format_reward": 0.0, "step": 1655 }, { "completion_length": 685.7916870117188, "epoch": 0.5499420241841975, "grad_norm": 0.19811367988586426, "kl": 0.252978515625, "learning_rate": 1e-05, "loss": 0.0101, "reward": 0.47500001490116117, "reward_std": 0.1958796139806509, "rewards/accuracy_reward": 0.47500001490116117, "rewards/format_reward": 0.0, "step": 1660 }, { "completion_length": 730.6146057128906, "epoch": 0.5515984760642704, "grad_norm": 0.21075910329818726, "kl": 0.25458984375, "learning_rate": 9.942165406114118e-06, "loss": 0.0102, "reward": 0.47916667722165585, "reward_std": 0.2186467032879591, "rewards/accuracy_reward": 0.47916667722165585, "rewards/format_reward": 0.0, "step": 1665 }, { "completion_length": 720.410433959961, "epoch": 0.5532549279443432, "grad_norm": 0.2317790687084198, "kl": 0.2525390625, "learning_rate": 9.884332746719186e-06, "loss": 0.0101, "reward": 0.4791666841134429, "reward_std": 0.19587961211800575, "rewards/accuracy_reward": 0.4791666841134429, "rewards/format_reward": 0.0, "step": 1670 }, { "completion_length": 720.6916854858398, "epoch": 0.5549113798244161, "grad_norm": 0.17310546338558197, "kl": 0.259228515625, "learning_rate": 9.826503956241447e-06, "loss": 0.0104, "reward": 0.5166666781529784, "reward_std": 0.19235753118991852, "rewards/accuracy_reward": 0.5166666781529784, "rewards/format_reward": 0.0, "step": 1675 }, { "completion_length": 728.5937683105469, "epoch": 0.556567831704489, "grad_norm": 0.280504435300827, "kl": 0.290869140625, "learning_rate": 9.768680968977743e-06, "loss": 0.0116, "reward": 0.42500000949949024, "reward_std": 0.21160253845155239, "rewards/accuracy_reward": 0.42500000949949024, "rewards/format_reward": 0.0, "step": 1680 }, { "completion_length": 685.6833465576171, "epoch": 0.5582242835845619, "grad_norm": 0.302950382232666, "kl": 0.274560546875, "learning_rate": 9.710865719030795e-06, "loss": 0.011, "reward": 0.45833334550261495, "reward_std": 0.22440169267356397, "rewards/accuracy_reward": 0.45833334550261495, "rewards/format_reward": 0.0, "step": 1685 }, { "completion_length": 703.9687652587891, "epoch": 0.5598807354646348, "grad_norm": 0.32050901651382446, "kl": 0.298828125, "learning_rate": 9.653060140244524e-06, "loss": 0.012, "reward": 0.47291667815297844, "reward_std": 0.22633545324206353, "rewards/accuracy_reward": 0.47291667815297844, "rewards/format_reward": 0.0, "step": 1690 }, { "completion_length": 683.5437713623047, "epoch": 0.5615371873447076, "grad_norm": 0.3828408122062683, "kl": 0.287841796875, "learning_rate": 9.595266166139366e-06, "loss": 0.0115, "reward": 0.485416679084301, "reward_std": 0.2494479738175869, "rewards/accuracy_reward": 0.485416679084301, "rewards/format_reward": 0.0, "step": 1695 }, { "completion_length": 680.5375244140625, "epoch": 0.5631936392247805, "grad_norm": 0.9023665189743042, "kl": 0.385009765625, "learning_rate": 9.537485729847594e-06, "loss": 0.0154, "reward": 0.4104166828095913, "reward_std": 0.24042377881705762, "rewards/accuracy_reward": 0.4104166828095913, "rewards/format_reward": 0.0, "step": 1700 }, { "completion_length": 696.5208526611328, "epoch": 0.5648500911048534, "grad_norm": 0.3717682957649231, "kl": 0.426904296875, "learning_rate": 9.479720764048655e-06, "loss": 0.0171, "reward": 0.37916667871177195, "reward_std": 0.22921294532716274, "rewards/accuracy_reward": 0.37916667871177195, "rewards/format_reward": 0.0, "step": 1705 }, { "completion_length": 685.3937683105469, "epoch": 0.5665065429849263, "grad_norm": 0.3070675730705261, "kl": 0.32724609375, "learning_rate": 9.421973200904538e-06, "loss": 0.0131, "reward": 0.41875001415610313, "reward_std": 0.23819086477160453, "rewards/accuracy_reward": 0.41875001415610313, "rewards/format_reward": 0.0, "step": 1710 }, { "completion_length": 640.4771087646484, "epoch": 0.5681629948649992, "grad_norm": 0.21792180836200714, "kl": 0.33271484375, "learning_rate": 9.36424497199513e-06, "loss": 0.0133, "reward": 0.4666666805744171, "reward_std": 0.2010362982749939, "rewards/accuracy_reward": 0.4666666805744171, "rewards/format_reward": 0.0, "step": 1715 }, { "completion_length": 717.6104370117188, "epoch": 0.569819446745072, "grad_norm": 0.3205876052379608, "kl": 0.36552734375, "learning_rate": 9.306538008253611e-06, "loss": 0.0146, "reward": 0.38750001043081284, "reward_std": 0.22663460597395896, "rewards/accuracy_reward": 0.38750001043081284, "rewards/format_reward": 0.0, "step": 1720 }, { "completion_length": 744.0396026611328, "epoch": 0.5714758986251449, "grad_norm": 0.31855928897857666, "kl": 0.363232421875, "learning_rate": 9.248854239901877e-06, "loss": 0.0145, "reward": 0.43333334680646657, "reward_std": 0.2651246223598719, "rewards/accuracy_reward": 0.43333334680646657, "rewards/format_reward": 0.0, "step": 1725 }, { "completion_length": 723.5583587646485, "epoch": 0.5731323505052178, "grad_norm": 0.7780797481536865, "kl": 0.373974609375, "learning_rate": 9.19119559638596e-06, "loss": 0.015, "reward": 0.479166678711772, "reward_std": 0.18849002048373223, "rewards/accuracy_reward": 0.479166678711772, "rewards/format_reward": 0.0, "step": 1730 }, { "completion_length": 712.6416885375977, "epoch": 0.5747888023852907, "grad_norm": 0.7579002976417542, "kl": 0.339208984375, "learning_rate": 9.133564006311503e-06, "loss": 0.0136, "reward": 0.5020833477377892, "reward_std": 0.19042377583682538, "rewards/accuracy_reward": 0.5020833477377892, "rewards/format_reward": 0.0, "step": 1735 }, { "completion_length": 755.8354339599609, "epoch": 0.5764452542653636, "grad_norm": 0.30858829617500305, "kl": 0.4185546875, "learning_rate": 9.075961397379247e-06, "loss": 0.0168, "reward": 0.41875001341104506, "reward_std": 0.22246793881058693, "rewards/accuracy_reward": 0.41875001341104506, "rewards/format_reward": 0.0, "step": 1740 }, { "completion_length": 788.7104400634765, "epoch": 0.5781017061454364, "grad_norm": 0.7897486686706543, "kl": 0.42119140625, "learning_rate": 9.018389696320556e-06, "loss": 0.0169, "reward": 0.3895833410322666, "reward_std": 0.23466878272593023, "rewards/accuracy_reward": 0.3895833410322666, "rewards/format_reward": 0.0, "step": 1745 }, { "completion_length": 812.4021057128906, "epoch": 0.5797581580255093, "grad_norm": 0.7141085863113403, "kl": 0.540234375, "learning_rate": 8.960850828832958e-06, "loss": 0.0216, "reward": 0.3583333417773247, "reward_std": 0.1827350292354822, "rewards/accuracy_reward": 0.3583333417773247, "rewards/format_reward": 0.0, "step": 1750 }, { "completion_length": 802.1375152587891, "epoch": 0.5814146099055822, "grad_norm": 0.5715038180351257, "kl": 0.48818359375, "learning_rate": 8.903346719515748e-06, "loss": 0.0195, "reward": 0.3375000089406967, "reward_std": 0.21864670403301717, "rewards/accuracy_reward": 0.3375000089406967, "rewards/format_reward": 0.0, "step": 1755 }, { "completion_length": 786.660433959961, "epoch": 0.5830710617856552, "grad_norm": 0.4674881100654602, "kl": 0.476708984375, "learning_rate": 8.845879291805605e-06, "loss": 0.0191, "reward": 0.3625000108033419, "reward_std": 0.21864670403301717, "rewards/accuracy_reward": 0.3625000108033419, "rewards/format_reward": 0.0, "step": 1760 }, { "completion_length": 750.9021026611329, "epoch": 0.5847275136657281, "grad_norm": 0.3583502173423767, "kl": 0.395068359375, "learning_rate": 8.788450467912254e-06, "loss": 0.0158, "reward": 0.4479166749864817, "reward_std": 0.1820904415100813, "rewards/accuracy_reward": 0.4479166749864817, "rewards/format_reward": 0.0, "step": 1765 }, { "completion_length": 778.5708526611328, "epoch": 0.5863839655458009, "grad_norm": 0.6018885374069214, "kl": 0.55849609375, "learning_rate": 8.731062168754174e-06, "loss": 0.0223, "reward": 0.3875000156462193, "reward_std": 0.24622504748404025, "rewards/accuracy_reward": 0.3875000156462193, "rewards/format_reward": 0.0, "step": 1770 }, { "completion_length": 746.4541900634765, "epoch": 0.5880404174258738, "grad_norm": 0.5292181968688965, "kl": 0.456591796875, "learning_rate": 8.673716313894349e-06, "loss": 0.0183, "reward": 0.4708333432674408, "reward_std": 0.2410683624446392, "rewards/accuracy_reward": 0.4708333432674408, "rewards/format_reward": 0.0, "step": 1775 }, { "completion_length": 743.9604370117188, "epoch": 0.5896968693059467, "grad_norm": 0.21024447679519653, "kl": 0.348486328125, "learning_rate": 8.616414821476048e-06, "loss": 0.0139, "reward": 0.447916678711772, "reward_std": 0.1917129471898079, "rewards/accuracy_reward": 0.447916678711772, "rewards/format_reward": 0.0, "step": 1780 }, { "completion_length": 747.0312683105469, "epoch": 0.5913533211860196, "grad_norm": 0.28301718831062317, "kl": 0.39404296875, "learning_rate": 8.559159608158688e-06, "loss": 0.0158, "reward": 0.414583345875144, "reward_std": 0.20614670403301716, "rewards/accuracy_reward": 0.414583345875144, "rewards/format_reward": 0.0, "step": 1785 }, { "completion_length": 731.0166900634765, "epoch": 0.5930097730660925, "grad_norm": 0.5452842116355896, "kl": 0.403662109375, "learning_rate": 8.501952589053694e-06, "loss": 0.0161, "reward": 0.42916667759418486, "reward_std": 0.24141379334032537, "rewards/accuracy_reward": 0.42916667759418486, "rewards/format_reward": 0.0, "step": 1790 }, { "completion_length": 752.0458541870117, "epoch": 0.5946662249461653, "grad_norm": 1.0961339473724365, "kl": 0.415478515625, "learning_rate": 8.444795677660479e-06, "loss": 0.0166, "reward": 0.4104166775941849, "reward_std": 0.2083796165883541, "rewards/accuracy_reward": 0.4104166775941849, "rewards/format_reward": 0.0, "step": 1795 }, { "completion_length": 777.9250183105469, "epoch": 0.5963226768262382, "grad_norm": 0.4235822558403015, "kl": 0.332470703125, "learning_rate": 8.387690785802403e-06, "loss": 0.0133, "reward": 0.38750000949949026, "reward_std": 0.187546281889081, "rewards/accuracy_reward": 0.38750000949949026, "rewards/format_reward": 0.0, "step": 1800 }, { "epoch": 0.5963226768262382, "eval_completion_length": 728.2041748046875, "eval_kl": 0.2736328125, "eval_loss": 0.010752188973128796, "eval_reward": 0.4791666865348816, "eval_reward_std": 0.21735753417015075, "eval_rewards/accuracy_reward": 0.4791666865348816, "eval_rewards/format_reward": 0.0, "eval_runtime": 63.2521, "eval_samples_per_second": 1.565, "eval_steps_per_second": 0.032, "step": 1800 }, { "completion_length": 732.3104400634766, "epoch": 0.5979791287063111, "grad_norm": 0.33254268765449524, "kl": 0.297998046875, "learning_rate": 8.330639823562843e-06, "loss": 0.0119, "reward": 0.4416666805744171, "reward_std": 0.22311252318322658, "rewards/accuracy_reward": 0.4416666805744171, "rewards/format_reward": 0.0, "step": 1805 }, { "completion_length": 734.577099609375, "epoch": 0.599635580586384, "grad_norm": 0.4225768744945526, "kl": 0.341064453125, "learning_rate": 8.273644699221309e-06, "loss": 0.0136, "reward": 0.39791668131947516, "reward_std": 0.20743587538599967, "rewards/accuracy_reward": 0.39791668131947516, "rewards/format_reward": 0.0, "step": 1810 }, { "completion_length": 761.0875213623046, "epoch": 0.6012920324664568, "grad_norm": 0.5745430588722229, "kl": 0.377880859375, "learning_rate": 8.2167073191896e-06, "loss": 0.0151, "reward": 0.4020833482965827, "reward_std": 0.21929128915071489, "rewards/accuracy_reward": 0.4020833482965827, "rewards/format_reward": 0.0, "step": 1815 }, { "completion_length": 723.6250183105469, "epoch": 0.6029484843465297, "grad_norm": 0.5707725286483765, "kl": 0.498828125, "learning_rate": 8.159829587948048e-06, "loss": 0.02, "reward": 0.48958334922790525, "reward_std": 0.20356835909187793, "rewards/accuracy_reward": 0.48958334922790525, "rewards/format_reward": 0.0, "step": 1820 }, { "completion_length": 739.9666870117187, "epoch": 0.6046049362266026, "grad_norm": 0.583268940448761, "kl": 0.51259765625, "learning_rate": 8.103013407981805e-06, "loss": 0.0205, "reward": 0.43333334252238276, "reward_std": 0.23179129101336002, "rewards/accuracy_reward": 0.43333334252238276, "rewards/format_reward": 0.0, "step": 1825 }, { "completion_length": 723.4916931152344, "epoch": 0.6062613881066755, "grad_norm": 0.41526469588279724, "kl": 0.44990234375, "learning_rate": 8.046260679717225e-06, "loss": 0.018, "reward": 0.450000011920929, "reward_std": 0.23050211742520332, "rewards/accuracy_reward": 0.450000011920929, "rewards/format_reward": 0.0, "step": 1830 }, { "completion_length": 737.7604370117188, "epoch": 0.6079178399867484, "grad_norm": 0.5477588176727295, "kl": 0.40693359375, "learning_rate": 7.989573301458274e-06, "loss": 0.0163, "reward": 0.46250001043081285, "reward_std": 0.21959044076502324, "rewards/accuracy_reward": 0.46250001043081285, "rewards/format_reward": 0.0, "step": 1835 }, { "completion_length": 787.0145965576172, "epoch": 0.6095742918668212, "grad_norm": 0.40717196464538574, "kl": 0.46953125, "learning_rate": 7.932953169323057e-06, "loss": 0.0188, "reward": 0.3687500135973096, "reward_std": 0.2048575323075056, "rewards/accuracy_reward": 0.3687500135973096, "rewards/format_reward": 0.0, "step": 1840 }, { "completion_length": 759.6125213623047, "epoch": 0.6112307437468941, "grad_norm": 0.6435567140579224, "kl": 0.3642578125, "learning_rate": 7.876402177180377e-06, "loss": 0.0146, "reward": 0.42291668206453326, "reward_std": 0.20709044262766838, "rewards/accuracy_reward": 0.42291668206453326, "rewards/format_reward": 0.0, "step": 1845 }, { "completion_length": 764.6812713623046, "epoch": 0.612887195626967, "grad_norm": 0.23377174139022827, "kl": 0.44404296875, "learning_rate": 7.8199222165864e-06, "loss": 0.0178, "reward": 0.3541666757315397, "reward_std": 0.17216878645122052, "rewards/accuracy_reward": 0.3541666757315397, "rewards/format_reward": 0.0, "step": 1850 }, { "completion_length": 734.722933959961, "epoch": 0.6145436475070399, "grad_norm": 0.37620407342910767, "kl": 0.443408203125, "learning_rate": 7.763515176721378e-06, "loss": 0.0177, "reward": 0.42083334028720853, "reward_std": 0.20644585862755777, "rewards/accuracy_reward": 0.42083334028720853, "rewards/format_reward": 0.0, "step": 1855 }, { "completion_length": 727.708349609375, "epoch": 0.6162000993871128, "grad_norm": 0.8196117281913757, "kl": 0.46259765625, "learning_rate": 7.70718294432646e-06, "loss": 0.0185, "reward": 0.4229166772216558, "reward_std": 0.20485753007233143, "rewards/accuracy_reward": 0.4229166772216558, "rewards/format_reward": 0.0, "step": 1860 }, { "completion_length": 763.6937805175781, "epoch": 0.6178565512671856, "grad_norm": 0.6530346870422363, "kl": 0.53818359375, "learning_rate": 7.65092740364059e-06, "loss": 0.0215, "reward": 0.42916668206453323, "reward_std": 0.23531336821615695, "rewards/accuracy_reward": 0.42916668206453323, "rewards/format_reward": 0.0, "step": 1865 }, { "completion_length": 734.4854385375977, "epoch": 0.6195130031472585, "grad_norm": 0.3041071891784668, "kl": 0.45185546875, "learning_rate": 7.594750436337467e-06, "loss": 0.0181, "reward": 0.42500001545995475, "reward_std": 0.17216878719627857, "rewards/accuracy_reward": 0.42500001545995475, "rewards/format_reward": 0.0, "step": 1870 }, { "completion_length": 752.8083526611329, "epoch": 0.6211694550273315, "grad_norm": 0.26719197630882263, "kl": 0.410986328125, "learning_rate": 7.538653921462613e-06, "loss": 0.0165, "reward": 0.36041667833924296, "reward_std": 0.22633544802665712, "rewards/accuracy_reward": 0.36041667833924296, "rewards/format_reward": 0.0, "step": 1875 }, { "completion_length": 735.6979370117188, "epoch": 0.6228259069074044, "grad_norm": 0.5672621130943298, "kl": 0.35576171875, "learning_rate": 7.482639735370536e-06, "loss": 0.0142, "reward": 0.4520833492279053, "reward_std": 0.21319086626172065, "rewards/accuracy_reward": 0.4520833492279053, "rewards/format_reward": 0.0, "step": 1880 }, { "completion_length": 743.7021057128907, "epoch": 0.6244823587874773, "grad_norm": 0.4073526859283447, "kl": 0.334326171875, "learning_rate": 7.4267097516619426e-06, "loss": 0.0134, "reward": 0.4520833479240537, "reward_std": 0.19652419798076154, "rewards/accuracy_reward": 0.4520833479240537, "rewards/format_reward": 0.0, "step": 1885 }, { "completion_length": 749.9062622070312, "epoch": 0.6261388106675501, "grad_norm": 0.33473727107048035, "kl": 0.392724609375, "learning_rate": 7.37086584112108e-06, "loss": 0.0157, "reward": 0.42916668206453323, "reward_std": 0.20069086402654648, "rewards/accuracy_reward": 0.42916668206453323, "rewards/format_reward": 0.0, "step": 1890 }, { "completion_length": 787.1646087646484, "epoch": 0.627795262547623, "grad_norm": 0.39572182297706604, "kl": 0.44306640625, "learning_rate": 7.315109871653168e-06, "loss": 0.0177, "reward": 0.40208334028720855, "reward_std": 0.23595795445144177, "rewards/accuracy_reward": 0.40208334028720855, "rewards/format_reward": 0.0, "step": 1895 }, { "completion_length": 740.4646057128906, "epoch": 0.6294517144276959, "grad_norm": 0.36331722140312195, "kl": 0.5134765625, "learning_rate": 7.2594437082219074e-06, "loss": 0.0205, "reward": 0.47083334624767303, "reward_std": 0.24012462347745894, "rewards/accuracy_reward": 0.47083334624767303, "rewards/format_reward": 0.0, "step": 1900 }, { "completion_length": 785.1271026611328, "epoch": 0.6311081663077688, "grad_norm": 0.8698108792304993, "kl": 0.526025390625, "learning_rate": 7.203869212787112e-06, "loss": 0.021, "reward": 0.414583346247673, "reward_std": 0.2333796139806509, "rewards/accuracy_reward": 0.414583346247673, "rewards/format_reward": 0.0, "step": 1905 }, { "completion_length": 752.597933959961, "epoch": 0.6327646181878417, "grad_norm": 0.36271390318870544, "kl": 0.472412109375, "learning_rate": 7.148388244242414e-06, "loss": 0.0189, "reward": 0.43333334252238276, "reward_std": 0.1695904430001974, "rewards/accuracy_reward": 0.43333334252238276, "rewards/format_reward": 0.0, "step": 1910 }, { "completion_length": 727.0104370117188, "epoch": 0.6344210700679145, "grad_norm": 0.31013360619544983, "kl": 0.269775390625, "learning_rate": 7.093002658353103e-06, "loss": 0.0108, "reward": 0.5312500186264515, "reward_std": 0.23501421920955182, "rewards/accuracy_reward": 0.5312500186264515, "rewards/format_reward": 0.0, "step": 1915 }, { "completion_length": 712.2583465576172, "epoch": 0.6360775219479874, "grad_norm": 0.2759442925453186, "kl": 0.31630859375, "learning_rate": 7.037714307694038e-06, "loss": 0.0127, "reward": 0.5187500193715096, "reward_std": 0.19910254068672656, "rewards/accuracy_reward": 0.5187500193715096, "rewards/format_reward": 0.0, "step": 1920 }, { "completion_length": 750.2437774658204, "epoch": 0.6377339738280603, "grad_norm": 0.3513622283935547, "kl": 0.356201171875, "learning_rate": 6.982525041587687e-06, "loss": 0.0142, "reward": 0.46041668206453323, "reward_std": 0.19652419947087765, "rewards/accuracy_reward": 0.46041668206453323, "rewards/format_reward": 0.0, "step": 1925 }, { "completion_length": 752.7187744140625, "epoch": 0.6393904257081332, "grad_norm": 0.5992259979248047, "kl": 0.425, "learning_rate": 6.927436706042276e-06, "loss": 0.017, "reward": 0.39791667424142363, "reward_std": 0.2333796124905348, "rewards/accuracy_reward": 0.39791667424142363, "rewards/format_reward": 0.0, "step": 1930 }, { "completion_length": 720.0104339599609, "epoch": 0.6410468775882061, "grad_norm": 0.7501030564308167, "kl": 0.432666015625, "learning_rate": 6.8724511436900356e-06, "loss": 0.0173, "reward": 0.4416666842997074, "reward_std": 0.21512462049722672, "rewards/accuracy_reward": 0.4416666842997074, "rewards/format_reward": 0.0, "step": 1935 }, { "completion_length": 765.1562683105469, "epoch": 0.6427033294682789, "grad_norm": 0.45305824279785156, "kl": 0.46982421875, "learning_rate": 6.8175701937255645e-06, "loss": 0.0188, "reward": 0.45833334475755694, "reward_std": 0.20902419909834863, "rewards/accuracy_reward": 0.45833334475755694, "rewards/format_reward": 0.0, "step": 1940 }, { "completion_length": 697.9291839599609, "epoch": 0.6443597813483518, "grad_norm": 0.7772151827812195, "kl": 0.42802734375, "learning_rate": 6.762795691844315e-06, "loss": 0.0171, "reward": 0.43333334624767306, "reward_std": 0.17792377844452859, "rewards/accuracy_reward": 0.43333334624767306, "rewards/format_reward": 0.0, "step": 1945 }, { "completion_length": 705.7145965576171, "epoch": 0.6460162332284247, "grad_norm": 0.3005319833755493, "kl": 0.417724609375, "learning_rate": 6.708129470181197e-06, "loss": 0.0167, "reward": 0.5083333440124989, "reward_std": 0.26418088302016257, "rewards/accuracy_reward": 0.5083333440124989, "rewards/format_reward": 0.0, "step": 1950 }, { "completion_length": 741.6041839599609, "epoch": 0.6476726851084976, "grad_norm": 0.37007224559783936, "kl": 0.42861328125, "learning_rate": 6.653573357249281e-06, "loss": 0.0172, "reward": 0.39791667945683, "reward_std": 0.19042377769947053, "rewards/accuracy_reward": 0.39791667945683, "rewards/format_reward": 0.0, "step": 1955 }, { "completion_length": 732.3145965576172, "epoch": 0.6493291369885705, "grad_norm": 0.33476710319519043, "kl": 0.272119140625, "learning_rate": 6.5991291778786556e-06, "loss": 0.0109, "reward": 0.49166668131947516, "reward_std": 0.23179128989577294, "rewards/accuracy_reward": 0.49166668131947516, "rewards/format_reward": 0.0, "step": 1960 }, { "completion_length": 724.1000183105468, "epoch": 0.6509855888686433, "grad_norm": 0.21214179694652557, "kl": 0.25830078125, "learning_rate": 6.5447987531553726e-06, "loss": 0.0103, "reward": 0.4479166816920042, "reward_std": 0.2144800338894129, "rewards/accuracy_reward": 0.4479166816920042, "rewards/format_reward": 0.0, "step": 1965 }, { "completion_length": 712.2146057128906, "epoch": 0.6526420407487162, "grad_norm": 0.3199763298034668, "kl": 0.282177734375, "learning_rate": 6.490583900360543e-06, "loss": 0.0113, "reward": 0.46875001639127734, "reward_std": 0.20580127313733101, "rewards/accuracy_reward": 0.46875001639127734, "rewards/format_reward": 0.0, "step": 1970 }, { "completion_length": 750.9541839599609, "epoch": 0.6542984926287891, "grad_norm": 0.2559630870819092, "kl": 0.322802734375, "learning_rate": 6.43648643290955e-06, "loss": 0.0129, "reward": 0.4312500134110451, "reward_std": 0.20133544839918613, "rewards/accuracy_reward": 0.4312500134110451, "rewards/format_reward": 0.0, "step": 1975 }, { "completion_length": 743.2896026611328, "epoch": 0.655954944508862, "grad_norm": 0.3454988896846771, "kl": 0.313671875, "learning_rate": 6.38250816029139e-06, "loss": 0.0126, "reward": 0.47708334773778915, "reward_std": 0.22762462235987185, "rewards/accuracy_reward": 0.47708334773778915, "rewards/format_reward": 0.0, "step": 1980 }, { "completion_length": 758.4833526611328, "epoch": 0.657611396388935, "grad_norm": 0.30943936109542847, "kl": 0.312890625, "learning_rate": 6.3286508880081466e-06, "loss": 0.0125, "reward": 0.4604166805744171, "reward_std": 0.19910254143178463, "rewards/accuracy_reward": 0.4604166805744171, "rewards/format_reward": 0.0, "step": 1985 }, { "completion_length": 758.243765258789, "epoch": 0.6592678482690078, "grad_norm": 0.24653878808021545, "kl": 0.317578125, "learning_rate": 6.274916417514605e-06, "loss": 0.0127, "reward": 0.4937500089406967, "reward_std": 0.18337961323559285, "rewards/accuracy_reward": 0.4937500089406967, "rewards/format_reward": 0.0, "step": 1990 }, { "completion_length": 777.2271026611328, "epoch": 0.6609243001490807, "grad_norm": 0.3619004487991333, "kl": 0.46044921875, "learning_rate": 6.221306546157997e-06, "loss": 0.0184, "reward": 0.41458334438502786, "reward_std": 0.225046281889081, "rewards/accuracy_reward": 0.41458334438502786, "rewards/format_reward": 0.0, "step": 1995 }, { "completion_length": 763.9604370117188, "epoch": 0.6625807520291536, "grad_norm": 0.7756472229957581, "kl": 0.389892578125, "learning_rate": 6.167823067117868e-06, "loss": 0.0156, "reward": 0.47500001490116117, "reward_std": 0.2616025425493717, "rewards/accuracy_reward": 0.47500001490116117, "rewards/format_reward": 0.0, "step": 2000 }, { "completion_length": 760.6291900634766, "epoch": 0.6642372039092265, "grad_norm": 0.4779156744480133, "kl": 0.502783203125, "learning_rate": 6.114467769346113e-06, "loss": 0.0201, "reward": 0.48333334773778913, "reward_std": 0.2340242013335228, "rewards/accuracy_reward": 0.48333334773778913, "rewards/format_reward": 0.0, "step": 2005 }, { "completion_length": 811.664599609375, "epoch": 0.6658936557892993, "grad_norm": 0.942679762840271, "kl": 0.5095703125, "learning_rate": 6.061242437507131e-06, "loss": 0.0204, "reward": 0.4187500115483999, "reward_std": 0.2237571097910404, "rewards/accuracy_reward": 0.4187500115483999, "rewards/format_reward": 0.0, "step": 2010 }, { "completion_length": 790.5479370117188, "epoch": 0.6675501076693722, "grad_norm": 0.3575034737586975, "kl": 0.51806640625, "learning_rate": 6.008148851918131e-06, "loss": 0.0207, "reward": 0.42916667759418486, "reward_std": 0.2340241987258196, "rewards/accuracy_reward": 0.42916667759418486, "rewards/format_reward": 0.0, "step": 2015 }, { "completion_length": 788.5791870117188, "epoch": 0.6692065595494451, "grad_norm": 0.8670142292976379, "kl": 0.5390625, "learning_rate": 5.955188788489583e-06, "loss": 0.0216, "reward": 0.4770833499729633, "reward_std": 0.24429128579795362, "rewards/accuracy_reward": 0.4770833499729633, "rewards/format_reward": 0.0, "step": 2020 }, { "completion_length": 813.5000213623047, "epoch": 0.670863011429518, "grad_norm": 0.7403881549835205, "kl": 0.51845703125, "learning_rate": 5.902364018665822e-06, "loss": 0.0208, "reward": 0.387500011920929, "reward_std": 0.23213672153651715, "rewards/accuracy_reward": 0.387500011920929, "rewards/format_reward": 0.0, "step": 2025 }, { "completion_length": 771.6791809082031, "epoch": 0.6725194633095909, "grad_norm": 0.33360540866851807, "kl": 0.37666015625, "learning_rate": 5.849676309365786e-06, "loss": 0.0151, "reward": 0.46041668131947516, "reward_std": 0.2430021196603775, "rewards/accuracy_reward": 0.46041668131947516, "rewards/format_reward": 0.0, "step": 2030 }, { "completion_length": 783.3562744140625, "epoch": 0.6741759151896637, "grad_norm": 0.454338937997818, "kl": 0.39423828125, "learning_rate": 5.7971274229239136e-06, "loss": 0.0158, "reward": 0.44166667815297844, "reward_std": 0.23883544765412806, "rewards/accuracy_reward": 0.44166667815297844, "rewards/format_reward": 0.0, "step": 2035 }, { "completion_length": 778.9416900634766, "epoch": 0.6758323670697366, "grad_norm": 0.22541843354701996, "kl": 0.377490234375, "learning_rate": 5.744719117031217e-06, "loss": 0.0151, "reward": 0.45000001303851606, "reward_std": 0.20326921120285987, "rewards/accuracy_reward": 0.45000001303851606, "rewards/format_reward": 0.0, "step": 2040 }, { "completion_length": 762.0771057128907, "epoch": 0.6774888189498095, "grad_norm": 0.35068559646606445, "kl": 0.3775390625, "learning_rate": 5.692453144676451e-06, "loss": 0.0151, "reward": 0.48541667982935904, "reward_std": 0.2096687864512205, "rewards/accuracy_reward": 0.48541667982935904, "rewards/format_reward": 0.0, "step": 2045 }, { "completion_length": 760.2083557128906, "epoch": 0.6791452708298824, "grad_norm": 0.3603161573410034, "kl": 0.3955078125, "learning_rate": 5.6403312540875325e-06, "loss": 0.0158, "reward": 0.43333334773778914, "reward_std": 0.22440169453620912, "rewards/accuracy_reward": 0.43333334773778914, "rewards/format_reward": 0.0, "step": 2050 }, { "completion_length": 758.1187683105469, "epoch": 0.6808017227099553, "grad_norm": 0.42675259709358215, "kl": 0.34716796875, "learning_rate": 5.588355188673002e-06, "loss": 0.0139, "reward": 0.4625000134110451, "reward_std": 0.22698003463447095, "rewards/accuracy_reward": 0.4625000134110451, "rewards/format_reward": 0.0, "step": 2055 }, { "completion_length": 778.1958557128906, "epoch": 0.6824581745900281, "grad_norm": 0.30794984102249146, "kl": 0.380517578125, "learning_rate": 5.536526686963762e-06, "loss": 0.0152, "reward": 0.4479166831821203, "reward_std": 0.1881908655166626, "rewards/accuracy_reward": 0.4479166831821203, "rewards/format_reward": 0.0, "step": 2060 }, { "completion_length": 746.5708511352539, "epoch": 0.684114626470101, "grad_norm": 0.24928486347198486, "kl": 0.360546875, "learning_rate": 5.484847482554887e-06, "loss": 0.0144, "reward": 0.49583334624767306, "reward_std": 0.1805021185427904, "rewards/accuracy_reward": 0.49583334624767306, "rewards/format_reward": 0.0, "step": 2065 }, { "completion_length": 778.4854339599609, "epoch": 0.6857710783501739, "grad_norm": 0.267110675573349, "kl": 0.4107421875, "learning_rate": 5.433319304047666e-06, "loss": 0.0164, "reward": 0.5000000163912773, "reward_std": 0.18625710904598236, "rewards/accuracy_reward": 0.5000000163912773, "rewards/format_reward": 0.0, "step": 2070 }, { "completion_length": 792.5541839599609, "epoch": 0.6874275302302468, "grad_norm": 0.5058810114860535, "kl": 0.42265625, "learning_rate": 5.38194387499177e-06, "loss": 0.0169, "reward": 0.48333334773778913, "reward_std": 0.25198003388941287, "rewards/accuracy_reward": 0.48333334773778913, "rewards/format_reward": 0.0, "step": 2075 }, { "completion_length": 809.6625244140625, "epoch": 0.6890839821103197, "grad_norm": 0.4862288236618042, "kl": 0.51103515625, "learning_rate": 5.330722913827594e-06, "loss": 0.0204, "reward": 0.42083334624767305, "reward_std": 0.21383544914424418, "rewards/accuracy_reward": 0.42083334624767305, "rewards/format_reward": 0.0, "step": 2080 }, { "completion_length": 781.1896057128906, "epoch": 0.6907404339903925, "grad_norm": 0.27606523036956787, "kl": 0.453515625, "learning_rate": 5.279658133828793e-06, "loss": 0.0181, "reward": 0.47916668429970743, "reward_std": 0.206791290640831, "rewards/accuracy_reward": 0.47916668429970743, "rewards/format_reward": 0.0, "step": 2085 }, { "completion_length": 794.6625213623047, "epoch": 0.6923968858704654, "grad_norm": 0.26152464747428894, "kl": 0.39599609375, "learning_rate": 5.228751243044961e-06, "loss": 0.0158, "reward": 0.4250000163912773, "reward_std": 0.23660254143178464, "rewards/accuracy_reward": 0.4250000163912773, "rewards/format_reward": 0.0, "step": 2090 }, { "completion_length": 800.8250183105469, "epoch": 0.6940533377505383, "grad_norm": 0.5088229179382324, "kl": 0.445849609375, "learning_rate": 5.178003944244511e-06, "loss": 0.0178, "reward": 0.45416668206453326, "reward_std": 0.21641379669308664, "rewards/accuracy_reward": 0.45416668206453326, "rewards/format_reward": 0.0, "step": 2095 }, { "completion_length": 832.2625213623047, "epoch": 0.6957097896306113, "grad_norm": 0.31478533148765564, "kl": 0.4666015625, "learning_rate": 5.127417934857718e-06, "loss": 0.0187, "reward": 0.39583334475755694, "reward_std": 0.22826920486986638, "rewards/accuracy_reward": 0.39583334475755694, "rewards/format_reward": 0.0, "step": 2100 }, { "epoch": 0.6957097896306113, "eval_completion_length": 825.1125244140625, "eval_kl": 0.437109375, "eval_loss": 0.016952065750956535, "eval_reward": 0.4520833432674408, "eval_reward_std": 0.20709043741226196, "eval_rewards/accuracy_reward": 0.4520833432674408, "eval_rewards/format_reward": 0.0, "eval_runtime": 68.7441, "eval_samples_per_second": 1.44, "eval_steps_per_second": 0.029, "step": 2100 }, { "completion_length": 831.1479339599609, "epoch": 0.6973662415106842, "grad_norm": 0.5540557503700256, "kl": 0.459814453125, "learning_rate": 5.076994906919927e-06, "loss": 0.0184, "reward": 0.416666679084301, "reward_std": 0.24012462124228479, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "step": 2105 }, { "completion_length": 835.7458587646485, "epoch": 0.699022693390757, "grad_norm": 0.3873566687107086, "kl": 0.35390625, "learning_rate": 5.026736547014981e-06, "loss": 0.0142, "reward": 0.39166667833924296, "reward_std": 0.19330127127468585, "rewards/accuracy_reward": 0.39166667833924296, "rewards/format_reward": 0.0, "step": 2110 }, { "completion_length": 813.2625183105469, "epoch": 0.7006791452708299, "grad_norm": 0.43175235390663147, "kl": 0.281005859375, "learning_rate": 4.976644536218783e-06, "loss": 0.0112, "reward": 0.46458334773778914, "reward_std": 0.27539171017706393, "rewards/accuracy_reward": 0.46458334773778914, "rewards/format_reward": 0.0, "step": 2115 }, { "completion_length": 793.8333465576172, "epoch": 0.7023355971509028, "grad_norm": 0.491457998752594, "kl": 0.2587890625, "learning_rate": 4.926720550043089e-06, "loss": 0.0104, "reward": 0.49375001192092893, "reward_std": 0.22856836020946503, "rewards/accuracy_reward": 0.49375001192092893, "rewards/format_reward": 0.0, "step": 2120 }, { "completion_length": 798.045849609375, "epoch": 0.7039920490309757, "grad_norm": 0.34596359729766846, "kl": 0.274462890625, "learning_rate": 4.8769662583794405e-06, "loss": 0.011, "reward": 0.5083333477377892, "reward_std": 0.17792377509176732, "rewards/accuracy_reward": 0.5083333477377892, "rewards/format_reward": 0.0, "step": 2125 }, { "completion_length": 786.7146057128906, "epoch": 0.7056485009110486, "grad_norm": 0.36578240990638733, "kl": 0.35576171875, "learning_rate": 4.827383325443331e-06, "loss": 0.0142, "reward": 0.4604166805744171, "reward_std": 0.286303386464715, "rewards/accuracy_reward": 0.4604166805744171, "rewards/format_reward": 0.0, "step": 2130 }, { "completion_length": 782.6958587646484, "epoch": 0.7073049527911214, "grad_norm": 0.5711299777030945, "kl": 0.380078125, "learning_rate": 4.777973409718527e-06, "loss": 0.0152, "reward": 0.4270833436399698, "reward_std": 0.21929129473865033, "rewards/accuracy_reward": 0.4270833436399698, "rewards/format_reward": 0.0, "step": 2135 }, { "completion_length": 789.4583526611328, "epoch": 0.7089614046711943, "grad_norm": 0.30322369933128357, "kl": 0.344970703125, "learning_rate": 4.728738163901597e-06, "loss": 0.0138, "reward": 0.42708334848284724, "reward_std": 0.20039171166718006, "rewards/accuracy_reward": 0.42708334848284724, "rewards/format_reward": 0.0, "step": 2140 }, { "completion_length": 766.5791839599609, "epoch": 0.7106178565512672, "grad_norm": 0.2763904333114624, "kl": 0.335107421875, "learning_rate": 4.679679234846636e-06, "loss": 0.0134, "reward": 0.4562500104308128, "reward_std": 0.2167129497975111, "rewards/accuracy_reward": 0.4562500104308128, "rewards/format_reward": 0.0, "step": 2145 }, { "completion_length": 776.5479339599609, "epoch": 0.7122743084313401, "grad_norm": 0.30575233697891235, "kl": 0.355419921875, "learning_rate": 4.630798263510162e-06, "loss": 0.0142, "reward": 0.4937500111758709, "reward_std": 0.21929128877818585, "rewards/accuracy_reward": 0.4937500111758709, "rewards/format_reward": 0.0, "step": 2150 }, { "completion_length": 809.695849609375, "epoch": 0.713930760311413, "grad_norm": 0.2517027258872986, "kl": 0.385595703125, "learning_rate": 4.58209688489626e-06, "loss": 0.0154, "reward": 0.4520833469927311, "reward_std": 0.21576920747756959, "rewards/accuracy_reward": 0.4520833469927311, "rewards/format_reward": 0.0, "step": 2155 }, { "completion_length": 852.525015258789, "epoch": 0.7155872121914858, "grad_norm": 0.2506435811519623, "kl": 0.43876953125, "learning_rate": 4.533576728001858e-06, "loss": 0.0176, "reward": 0.3500000074505806, "reward_std": 0.2173575319349766, "rewards/accuracy_reward": 0.3500000074505806, "rewards/format_reward": 0.0, "step": 2160 }, { "completion_length": 816.933349609375, "epoch": 0.7172436640715587, "grad_norm": 0.22968539595603943, "kl": 0.441064453125, "learning_rate": 4.485239415762268e-06, "loss": 0.0176, "reward": 0.4416666775941849, "reward_std": 0.22921294383704663, "rewards/accuracy_reward": 0.4416666775941849, "rewards/format_reward": 0.0, "step": 2165 }, { "completion_length": 821.4541839599609, "epoch": 0.7189001159516316, "grad_norm": 0.25325411558151245, "kl": 0.433837890625, "learning_rate": 4.437086564996891e-06, "loss": 0.0173, "reward": 0.39791667498648164, "reward_std": 0.1869016945362091, "rewards/accuracy_reward": 0.39791667498648164, "rewards/format_reward": 0.0, "step": 2170 }, { "completion_length": 825.7000244140625, "epoch": 0.7205565678317045, "grad_norm": 0.46654900908470154, "kl": 0.494384765625, "learning_rate": 4.389119786355119e-06, "loss": 0.0198, "reward": 0.42916667740792036, "reward_std": 0.2340241987258196, "rewards/accuracy_reward": 0.42916667740792036, "rewards/format_reward": 0.0, "step": 2175 }, { "completion_length": 855.8416870117187, "epoch": 0.7222130197117774, "grad_norm": 0.49702784419059753, "kl": 0.535546875, "learning_rate": 4.341340684262498e-06, "loss": 0.0214, "reward": 0.34791667498648166, "reward_std": 0.20614670105278493, "rewards/accuracy_reward": 0.34791667498648166, "rewards/format_reward": 0.0, "step": 2180 }, { "completion_length": 802.0312652587891, "epoch": 0.7238694715918502, "grad_norm": 0.32142379879951477, "kl": 0.408203125, "learning_rate": 4.2937508568670194e-06, "loss": 0.0163, "reward": 0.445833345502615, "reward_std": 0.17921294569969176, "rewards/accuracy_reward": 0.445833345502615, "rewards/format_reward": 0.0, "step": 2185 }, { "completion_length": 813.6291870117187, "epoch": 0.7255259234719231, "grad_norm": 0.30990809202194214, "kl": 0.47890625, "learning_rate": 4.246351895985702e-06, "loss": 0.0192, "reward": 0.4145833447575569, "reward_std": 0.20614670477807523, "rewards/accuracy_reward": 0.4145833447575569, "rewards/format_reward": 0.0, "step": 2190 }, { "completion_length": 828.1479309082031, "epoch": 0.727182375351996, "grad_norm": 0.5752753019332886, "kl": 0.44189453125, "learning_rate": 4.1991453870513265e-06, "loss": 0.0177, "reward": 0.3916666775941849, "reward_std": 0.19811252020299436, "rewards/accuracy_reward": 0.3916666775941849, "rewards/format_reward": 0.0, "step": 2195 }, { "completion_length": 838.1312713623047, "epoch": 0.7288388272320689, "grad_norm": 0.4127509295940399, "kl": 0.531640625, "learning_rate": 4.152132909059402e-06, "loss": 0.0213, "reward": 0.42708334103226664, "reward_std": 0.25391379222273824, "rewards/accuracy_reward": 0.42708334103226664, "rewards/format_reward": 0.0, "step": 2200 }, { "completion_length": 811.5666870117187, "epoch": 0.7304952791121417, "grad_norm": 0.7732940912246704, "kl": 0.53388671875, "learning_rate": 4.105316034515372e-06, "loss": 0.0214, "reward": 0.42083334773778913, "reward_std": 0.22087961472570897, "rewards/accuracy_reward": 0.42083334773778913, "rewards/format_reward": 0.0, "step": 2205 }, { "completion_length": 844.933349609375, "epoch": 0.7321517309922146, "grad_norm": 0.6802802681922913, "kl": 0.4306640625, "learning_rate": 4.058696329381987e-06, "loss": 0.0172, "reward": 0.48958334773778917, "reward_std": 0.23690169341862202, "rewards/accuracy_reward": 0.48958334773778917, "rewards/format_reward": 0.0, "step": 2210 }, { "completion_length": 826.3791900634766, "epoch": 0.7338081828722876, "grad_norm": 0.3347947895526886, "kl": 0.4943359375, "learning_rate": 4.012275353026952e-06, "loss": 0.0198, "reward": 0.41250001098960637, "reward_std": 0.2269800379872322, "rewards/accuracy_reward": 0.41250001098960637, "rewards/format_reward": 0.0, "step": 2215 }, { "completion_length": 761.1687713623047, "epoch": 0.7354646347523605, "grad_norm": 0.31574389338493347, "kl": 0.391748046875, "learning_rate": 3.966054658170754e-06, "loss": 0.0157, "reward": 0.43958334550261496, "reward_std": 0.21894585750997067, "rewards/accuracy_reward": 0.43958334550261496, "rewards/format_reward": 0.0, "step": 2220 }, { "completion_length": 770.4687683105469, "epoch": 0.7371210866324334, "grad_norm": 0.2964814007282257, "kl": 0.3833984375, "learning_rate": 3.9200357908347274e-06, "loss": 0.0153, "reward": 0.43125001303851607, "reward_std": 0.21190169639885426, "rewards/accuracy_reward": 0.43125001303851607, "rewards/format_reward": 0.0, "step": 2225 }, { "completion_length": 789.3646057128906, "epoch": 0.7387775385125062, "grad_norm": 0.33185550570487976, "kl": 0.441015625, "learning_rate": 3.874220290289337e-06, "loss": 0.0176, "reward": 0.412500013038516, "reward_std": 0.25292377918958664, "rewards/accuracy_reward": 0.412500013038516, "rewards/format_reward": 0.0, "step": 2230 }, { "completion_length": 803.4687744140625, "epoch": 0.7404339903925791, "grad_norm": 0.3957579731941223, "kl": 0.430322265625, "learning_rate": 3.828609689002701e-06, "loss": 0.0172, "reward": 0.40208334736526014, "reward_std": 0.22856836058199406, "rewards/accuracy_reward": 0.40208334736526014, "rewards/format_reward": 0.0, "step": 2235 }, { "completion_length": 839.6166870117188, "epoch": 0.742090442272652, "grad_norm": 1.2343977689743042, "kl": 0.468310546875, "learning_rate": 3.7832055125893318e-06, "loss": 0.0187, "reward": 0.3812500124797225, "reward_std": 0.2131908643990755, "rewards/accuracy_reward": 0.3812500124797225, "rewards/format_reward": 0.0, "step": 2240 }, { "completion_length": 842.9271118164063, "epoch": 0.7437468941527249, "grad_norm": 0.4856538474559784, "kl": 0.464599609375, "learning_rate": 3.738009279759092e-06, "loss": 0.0186, "reward": 0.40833334550261496, "reward_std": 0.24493587128818034, "rewards/accuracy_reward": 0.40833334550261496, "rewards/format_reward": 0.0, "step": 2245 }, { "completion_length": 856.6333557128906, "epoch": 0.7454033460327978, "grad_norm": 0.32759642601013184, "kl": 0.47216796875, "learning_rate": 3.6930225022664136e-06, "loss": 0.0189, "reward": 0.39375000931322574, "reward_std": 0.21095795407891274, "rewards/accuracy_reward": 0.39375000931322574, "rewards/format_reward": 0.0, "step": 2250 }, { "completion_length": 827.320849609375, "epoch": 0.7470597979128706, "grad_norm": 1.8044863939285278, "kl": 0.47880859375, "learning_rate": 3.6482466848597164e-06, "loss": 0.0192, "reward": 0.42708334550261495, "reward_std": 0.27668088041245936, "rewards/accuracy_reward": 0.42708334550261495, "rewards/format_reward": 0.0, "step": 2255 }, { "completion_length": 790.2062683105469, "epoch": 0.7487162497929435, "grad_norm": 0.48375847935676575, "kl": 0.4705078125, "learning_rate": 3.6036833252310887e-06, "loss": 0.0188, "reward": 0.3791666768491268, "reward_std": 0.2090241987258196, "rewards/accuracy_reward": 0.3791666768491268, "rewards/format_reward": 0.0, "step": 2260 }, { "completion_length": 790.3687744140625, "epoch": 0.7503727016730164, "grad_norm": 0.42819076776504517, "kl": 0.5126953125, "learning_rate": 3.5593339139661885e-06, "loss": 0.0205, "reward": 0.4354166805744171, "reward_std": 0.29558046348392963, "rewards/accuracy_reward": 0.4354166805744171, "rewards/format_reward": 0.0, "step": 2265 }, { "completion_length": 788.3791870117187, "epoch": 0.7520291535530893, "grad_norm": 0.3388981521129608, "kl": 0.441650390625, "learning_rate": 3.515199934494373e-06, "loss": 0.0177, "reward": 0.47916667759418485, "reward_std": 0.25198003426194193, "rewards/accuracy_reward": 0.47916667759418485, "rewards/format_reward": 0.0, "step": 2270 }, { "completion_length": 828.795849609375, "epoch": 0.7536856054331622, "grad_norm": 0.5436757802963257, "kl": 0.480810546875, "learning_rate": 3.4712828630391015e-06, "loss": 0.0192, "reward": 0.45000001341104506, "reward_std": 0.23179128840565683, "rewards/accuracy_reward": 0.45000001341104506, "rewards/format_reward": 0.0, "step": 2275 }, { "completion_length": 876.9645965576171, "epoch": 0.755342057313235, "grad_norm": 0.40211012959480286, "kl": 0.462158203125, "learning_rate": 3.427584168568535e-06, "loss": 0.0185, "reward": 0.4041666775941849, "reward_std": 0.24845795333385468, "rewards/accuracy_reward": 0.4041666775941849, "rewards/format_reward": 0.0, "step": 2280 }, { "completion_length": 925.714599609375, "epoch": 0.7569985091933079, "grad_norm": 0.30408617854118347, "kl": 0.457763671875, "learning_rate": 3.384105312746421e-06, "loss": 0.0183, "reward": 0.3395833423361182, "reward_std": 0.23759256042540072, "rewards/accuracy_reward": 0.3395833423361182, "rewards/format_reward": 0.0, "step": 2285 }, { "completion_length": 898.845849609375, "epoch": 0.7586549610733808, "grad_norm": 0.47999709844589233, "kl": 0.446240234375, "learning_rate": 3.3408477498831917e-06, "loss": 0.0178, "reward": 0.37291667591780425, "reward_std": 0.22633545026183127, "rewards/accuracy_reward": 0.37291667591780425, "rewards/format_reward": 0.0, "step": 2290 }, { "completion_length": 843.5166839599609, "epoch": 0.7603114129534537, "grad_norm": 0.5802361369132996, "kl": 0.399169921875, "learning_rate": 3.2978129268873162e-06, "loss": 0.016, "reward": 0.4375000074505806, "reward_std": 0.2427029635757208, "rewards/accuracy_reward": 0.4375000074505806, "rewards/format_reward": 0.0, "step": 2295 }, { "completion_length": 790.1041900634766, "epoch": 0.7619678648335266, "grad_norm": 0.8189281225204468, "kl": 0.38359375, "learning_rate": 3.2550022832169125e-06, "loss": 0.0153, "reward": 0.48541667610406875, "reward_std": 0.20966878794133664, "rewards/accuracy_reward": 0.48541667610406875, "rewards/format_reward": 0.0, "step": 2300 }, { "completion_length": 798.7021057128907, "epoch": 0.7636243167135994, "grad_norm": 0.5261828303337097, "kl": 0.414306640625, "learning_rate": 3.2124172508315996e-06, "loss": 0.0166, "reward": 0.4666666816920042, "reward_std": 0.2186467032879591, "rewards/accuracy_reward": 0.4666666816920042, "rewards/format_reward": 0.0, "step": 2305 }, { "completion_length": 808.5458526611328, "epoch": 0.7652807685936723, "grad_norm": 0.3806306719779968, "kl": 0.47734375, "learning_rate": 3.170059254144593e-06, "loss": 0.0191, "reward": 0.418750011920929, "reward_std": 0.18209044300019742, "rewards/accuracy_reward": 0.418750011920929, "rewards/format_reward": 0.0, "step": 2310 }, { "completion_length": 789.3541900634766, "epoch": 0.7669372204737452, "grad_norm": 0.372968852519989, "kl": 0.36953125, "learning_rate": 3.127929709975057e-06, "loss": 0.0148, "reward": 0.48333334624767305, "reward_std": 0.2427029687911272, "rewards/accuracy_reward": 0.48333334624767305, "rewards/format_reward": 0.0, "step": 2315 }, { "completion_length": 811.9687683105469, "epoch": 0.7685936723538181, "grad_norm": 0.3055851459503174, "kl": 0.372314453125, "learning_rate": 3.086030027500728e-06, "loss": 0.0149, "reward": 0.48125001303851606, "reward_std": 0.2369016956537962, "rewards/accuracy_reward": 0.48125001303851606, "rewards/format_reward": 0.0, "step": 2320 }, { "completion_length": 815.4937713623046, "epoch": 0.770250124233891, "grad_norm": 0.530989944934845, "kl": 0.384716796875, "learning_rate": 3.0443616082107753e-06, "loss": 0.0154, "reward": 0.4895833469927311, "reward_std": 0.19523502849042415, "rewards/accuracy_reward": 0.4895833469927311, "rewards/format_reward": 0.0, "step": 2325 }, { "completion_length": 861.0104400634766, "epoch": 0.7719065761139638, "grad_norm": 0.550189197063446, "kl": 0.516796875, "learning_rate": 3.002925845858905e-06, "loss": 0.0207, "reward": 0.389583345502615, "reward_std": 0.1833796128630638, "rewards/accuracy_reward": 0.389583345502615, "rewards/format_reward": 0.0, "step": 2330 }, { "completion_length": 822.1021026611328, "epoch": 0.7735630279940368, "grad_norm": 0.21661648154258728, "kl": 0.3982421875, "learning_rate": 2.9617241264167707e-06, "loss": 0.0159, "reward": 0.43958334177732467, "reward_std": 0.20966878421604634, "rewards/accuracy_reward": 0.43958334177732467, "rewards/format_reward": 0.0, "step": 2335 }, { "completion_length": 811.6041870117188, "epoch": 0.7752194798741097, "grad_norm": 0.3415497839450836, "kl": 0.448779296875, "learning_rate": 2.920757828027586e-06, "loss": 0.018, "reward": 0.5041666805744172, "reward_std": 0.20421294458210468, "rewards/accuracy_reward": 0.5041666805744172, "rewards/format_reward": 0.0, "step": 2340 }, { "completion_length": 864.837515258789, "epoch": 0.7768759317541826, "grad_norm": 0.35626089572906494, "kl": 0.417626953125, "learning_rate": 2.8800283209600498e-06, "loss": 0.0167, "reward": 0.3895833406597376, "reward_std": 0.20391379594802855, "rewards/accuracy_reward": 0.3895833406597376, "rewards/format_reward": 0.0, "step": 2345 }, { "completion_length": 822.8375213623046, "epoch": 0.7785323836342555, "grad_norm": 0.5177320837974548, "kl": 0.34208984375, "learning_rate": 2.839536967562504e-06, "loss": 0.0137, "reward": 0.3916666768491268, "reward_std": 0.20902419947087764, "rewards/accuracy_reward": 0.3916666768491268, "rewards/format_reward": 0.0, "step": 2350 }, { "completion_length": 803.8708587646485, "epoch": 0.7801888355143283, "grad_norm": 0.23999100923538208, "kl": 0.38974609375, "learning_rate": 2.7992851222173534e-06, "loss": 0.0156, "reward": 0.5020833484828472, "reward_std": 0.23337961062788964, "rewards/accuracy_reward": 0.5020833484828472, "rewards/format_reward": 0.0, "step": 2355 }, { "completion_length": 803.5187713623047, "epoch": 0.7818452873944012, "grad_norm": 0.3911328911781311, "kl": 0.390478515625, "learning_rate": 2.759274131295787e-06, "loss": 0.0156, "reward": 0.41875001341104506, "reward_std": 0.23982546888291836, "rewards/accuracy_reward": 0.41875001341104506, "rewards/format_reward": 0.0, "step": 2360 }, { "completion_length": 828.5166870117188, "epoch": 0.7835017392744741, "grad_norm": 0.7431268095970154, "kl": 0.40712890625, "learning_rate": 2.7195053331127195e-06, "loss": 0.0163, "reward": 0.4270833469927311, "reward_std": 0.2106125231832266, "rewards/accuracy_reward": 0.4270833469927311, "rewards/format_reward": 0.0, "step": 2365 }, { "completion_length": 830.2500213623047, "epoch": 0.785158191154547, "grad_norm": 0.35880258679389954, "kl": 0.421875, "learning_rate": 2.679980057882049e-06, "loss": 0.0169, "reward": 0.45208334624767305, "reward_std": 0.222813368961215, "rewards/accuracy_reward": 0.45208334624767305, "rewards/format_reward": 0.0, "step": 2370 }, { "completion_length": 795.7812713623047, "epoch": 0.7868146430346199, "grad_norm": 0.27409225702285767, "kl": 0.42607421875, "learning_rate": 2.6406996276721384e-06, "loss": 0.017, "reward": 0.45208334624767305, "reward_std": 0.23595795184373855, "rewards/accuracy_reward": 0.45208334624767305, "rewards/format_reward": 0.0, "step": 2375 }, { "completion_length": 812.9750244140625, "epoch": 0.7884710949146927, "grad_norm": 1.1168698072433472, "kl": 0.417431640625, "learning_rate": 2.60166535636162e-06, "loss": 0.0167, "reward": 0.4895833507180214, "reward_std": 0.2333796113729477, "rewards/accuracy_reward": 0.4895833507180214, "rewards/format_reward": 0.0, "step": 2380 }, { "completion_length": 787.9687713623047, "epoch": 0.7901275467947656, "grad_norm": 0.5705488920211792, "kl": 0.41181640625, "learning_rate": 2.562878549595428e-06, "loss": 0.0165, "reward": 0.44375001192092894, "reward_std": 0.1942912895232439, "rewards/accuracy_reward": 0.44375001192092894, "rewards/format_reward": 0.0, "step": 2385 }, { "completion_length": 856.5291870117187, "epoch": 0.7917839986748385, "grad_norm": 0.28524014353752136, "kl": 0.39423828125, "learning_rate": 2.5243405047411353e-06, "loss": 0.0158, "reward": 0.42916668206453323, "reward_std": 0.2366025395691395, "rewards/accuracy_reward": 0.42916668206453323, "rewards/format_reward": 0.0, "step": 2390 }, { "completion_length": 859.1187713623046, "epoch": 0.7934404505549114, "grad_norm": 0.4310338497161865, "kl": 0.425, "learning_rate": 2.48605251084556e-06, "loss": 0.017, "reward": 0.41041667461395265, "reward_std": 0.22186963371932505, "rewards/accuracy_reward": 0.41041667461395265, "rewards/format_reward": 0.0, "step": 2395 }, { "completion_length": 845.1729370117188, "epoch": 0.7950969024349842, "grad_norm": 0.21333444118499756, "kl": 0.39482421875, "learning_rate": 2.448015848591638e-06, "loss": 0.0158, "reward": 0.4395833441987634, "reward_std": 0.1865562628954649, "rewards/accuracy_reward": 0.4395833441987634, "rewards/format_reward": 0.0, "step": 2400 }, { "epoch": 0.7950969024349842, "eval_completion_length": 829.9437622070312, "eval_kl": 0.368359375, "eval_loss": 0.014470603317022324, "eval_reward": 0.4645833432674408, "eval_reward_std": 0.23819086253643035, "eval_rewards/accuracy_reward": 0.4645833432674408, "eval_rewards/format_reward": 0.0, "eval_runtime": 68.8838, "eval_samples_per_second": 1.437, "eval_steps_per_second": 0.029, "step": 2400 }, { "completion_length": 842.677099609375, "epoch": 0.7967533543150571, "grad_norm": 0.30256763100624084, "kl": 0.3810546875, "learning_rate": 2.4102317902556017e-06, "loss": 0.0152, "reward": 0.4750000089406967, "reward_std": 0.2388354554772377, "rewards/accuracy_reward": 0.4750000089406967, "rewards/format_reward": 0.0, "step": 2405 }, { "completion_length": 854.7916809082031, "epoch": 0.79840980619513, "grad_norm": 0.333625853061676, "kl": 0.4546875, "learning_rate": 2.3727015996644043e-06, "loss": 0.0182, "reward": 0.416666678711772, "reward_std": 0.2388354495167732, "rewards/accuracy_reward": 0.416666678711772, "rewards/format_reward": 0.0, "step": 2410 }, { "completion_length": 854.1729431152344, "epoch": 0.8000662580752029, "grad_norm": 3.99680233001709, "kl": 0.55419921875, "learning_rate": 2.3354265321534674e-06, "loss": 0.0222, "reward": 0.404166678711772, "reward_std": 0.2616025425493717, "rewards/accuracy_reward": 0.404166678711772, "rewards/format_reward": 0.0, "step": 2415 }, { "completion_length": 864.052099609375, "epoch": 0.8017227099552758, "grad_norm": 0.5783562064170837, "kl": 0.4556640625, "learning_rate": 2.298407834524682e-06, "loss": 0.0182, "reward": 0.40833334289491174, "reward_std": 0.21477919071912766, "rewards/accuracy_reward": 0.40833334289491174, "rewards/format_reward": 0.0, "step": 2420 }, { "completion_length": 883.464599609375, "epoch": 0.8033791618353486, "grad_norm": 0.7375392317771912, "kl": 0.499365234375, "learning_rate": 2.261646745004693e-06, "loss": 0.02, "reward": 0.43750001713633535, "reward_std": 0.23273502700030804, "rewards/accuracy_reward": 0.43750001713633535, "rewards/format_reward": 0.0, "step": 2425 }, { "completion_length": 896.4521026611328, "epoch": 0.8050356137154215, "grad_norm": 0.44074785709381104, "kl": 0.404296875, "learning_rate": 2.2251444932035094e-06, "loss": 0.0162, "reward": 0.44791668057441714, "reward_std": 0.2657692074775696, "rewards/accuracy_reward": 0.44791668057441714, "rewards/format_reward": 0.0, "step": 2430 }, { "completion_length": 871.6958526611328, "epoch": 0.8066920655954944, "grad_norm": 0.6988074779510498, "kl": 0.45419921875, "learning_rate": 2.1889023000733435e-06, "loss": 0.0182, "reward": 0.44583334438502786, "reward_std": 0.24459044262766838, "rewards/accuracy_reward": 0.44583334438502786, "rewards/format_reward": 0.0, "step": 2435 }, { "completion_length": 871.8625213623047, "epoch": 0.8083485174755674, "grad_norm": 0.5102762579917908, "kl": 0.50673828125, "learning_rate": 2.1529213778677993e-06, "loss": 0.0203, "reward": 0.36666667424142363, "reward_std": 0.20550212152302266, "rewards/accuracy_reward": 0.36666667424142363, "rewards/format_reward": 0.0, "step": 2440 }, { "completion_length": 824.4229400634765, "epoch": 0.8100049693556403, "grad_norm": 0.32286399602890015, "kl": 0.371533203125, "learning_rate": 2.117202930101312e-06, "loss": 0.0149, "reward": 0.4541666805744171, "reward_std": 0.2305021181702614, "rewards/accuracy_reward": 0.4541666805744171, "rewards/format_reward": 0.0, "step": 2445 }, { "completion_length": 796.9916839599609, "epoch": 0.811661421235713, "grad_norm": 0.35136878490448, "kl": 0.34365234375, "learning_rate": 2.081748151508883e-06, "loss": 0.0137, "reward": 0.4916666835546494, "reward_std": 0.20550211891531944, "rewards/accuracy_reward": 0.4916666835546494, "rewards/format_reward": 0.0, "step": 2450 }, { "completion_length": 817.9791870117188, "epoch": 0.813317873115786, "grad_norm": 0.5059108138084412, "kl": 0.36103515625, "learning_rate": 2.0465582280061424e-06, "loss": 0.0144, "reward": 0.5208333462476731, "reward_std": 0.20326921120285987, "rewards/accuracy_reward": 0.5208333462476731, "rewards/format_reward": 0.0, "step": 2455 }, { "completion_length": 812.785433959961, "epoch": 0.8149743249958589, "grad_norm": 0.47841915488243103, "kl": 0.38623046875, "learning_rate": 2.0116343366496493e-06, "loss": 0.0155, "reward": 0.45833334848284724, "reward_std": 0.21254628002643586, "rewards/accuracy_reward": 0.45833334848284724, "rewards/format_reward": 0.0, "step": 2460 }, { "completion_length": 808.4604370117188, "epoch": 0.8166307768759318, "grad_norm": 0.5747477412223816, "kl": 0.4443359375, "learning_rate": 1.976977645597552e-06, "loss": 0.0178, "reward": 0.4770833447575569, "reward_std": 0.253568359464407, "rewards/accuracy_reward": 0.4770833447575569, "rewards/format_reward": 0.0, "step": 2465 }, { "completion_length": 836.9333557128906, "epoch": 0.8182872287560047, "grad_norm": 0.7831066846847534, "kl": 0.424072265625, "learning_rate": 1.942589314070494e-06, "loss": 0.017, "reward": 0.4541666802018881, "reward_std": 0.2282692078500986, "rewards/accuracy_reward": 0.4541666802018881, "rewards/format_reward": 0.0, "step": 2470 }, { "completion_length": 841.3833526611328, "epoch": 0.8199436806360775, "grad_norm": 0.5068976283073425, "kl": 0.437353515625, "learning_rate": 1.908470492312854e-06, "loss": 0.0175, "reward": 0.4625000096857548, "reward_std": 0.1994016945362091, "rewards/accuracy_reward": 0.4625000096857548, "rewards/format_reward": 0.0, "step": 2475 }, { "completion_length": 821.5791839599609, "epoch": 0.8216001325161504, "grad_norm": 0.46918413043022156, "kl": 0.433251953125, "learning_rate": 1.8746223215542482e-06, "loss": 0.0173, "reward": 0.42500001229345796, "reward_std": 0.20421294569969178, "rewards/accuracy_reward": 0.42500001229345796, "rewards/format_reward": 0.0, "step": 2480 }, { "completion_length": 813.2646057128907, "epoch": 0.8232565843962233, "grad_norm": 0.6446995139122009, "kl": 0.40888671875, "learning_rate": 1.8410459339713894e-06, "loss": 0.0164, "reward": 0.4437500134110451, "reward_std": 0.2635362960398197, "rewards/accuracy_reward": 0.4437500134110451, "rewards/format_reward": 0.0, "step": 2485 }, { "completion_length": 820.8312713623047, "epoch": 0.8249130362762962, "grad_norm": 0.5229965448379517, "kl": 0.388134765625, "learning_rate": 1.8077424526501964e-06, "loss": 0.0155, "reward": 0.5041666805744172, "reward_std": 0.23308045975863934, "rewards/accuracy_reward": 0.5041666805744172, "rewards/format_reward": 0.0, "step": 2490 }, { "completion_length": 808.5896026611329, "epoch": 0.8265694881563691, "grad_norm": 0.7357300519943237, "kl": 0.363134765625, "learning_rate": 1.7747129915482287e-06, "loss": 0.0145, "reward": 0.5083333447575569, "reward_std": 0.2567912913858891, "rewards/accuracy_reward": 0.5083333447575569, "rewards/format_reward": 0.0, "step": 2495 }, { "completion_length": 846.4687713623047, "epoch": 0.8282259400364419, "grad_norm": 0.6610658168792725, "kl": 0.474609375, "learning_rate": 1.7419586554574364e-06, "loss": 0.019, "reward": 0.40208334252238276, "reward_std": 0.21671294160187243, "rewards/accuracy_reward": 0.40208334252238276, "rewards/format_reward": 0.0, "step": 2500 }, { "completion_length": 816.4500152587891, "epoch": 0.8298823919165148, "grad_norm": 0.4140505790710449, "kl": 0.452001953125, "learning_rate": 1.7094805399671955e-06, "loss": 0.0181, "reward": 0.45625001564621925, "reward_std": 0.26224712617695334, "rewards/accuracy_reward": 0.45625001564621925, "rewards/format_reward": 0.0, "step": 2505 }, { "completion_length": 846.3458526611328, "epoch": 0.8315388437965877, "grad_norm": 0.49973127245903015, "kl": 0.465087890625, "learning_rate": 1.6772797314276712e-06, "loss": 0.0186, "reward": 0.42291667610406875, "reward_std": 0.22985753193497657, "rewards/accuracy_reward": 0.42291667610406875, "rewards/format_reward": 0.0, "step": 2510 }, { "completion_length": 858.4104309082031, "epoch": 0.8331952956766606, "grad_norm": 0.47388219833374023, "kl": 0.391650390625, "learning_rate": 1.6453573069134787e-06, "loss": 0.0157, "reward": 0.4458333469927311, "reward_std": 0.2410683583468199, "rewards/accuracy_reward": 0.4458333469927311, "rewards/format_reward": 0.0, "step": 2515 }, { "completion_length": 843.2666839599609, "epoch": 0.8348517475567335, "grad_norm": 0.4863141179084778, "kl": 0.3888671875, "learning_rate": 1.6137143341876439e-06, "loss": 0.0156, "reward": 0.5041666820645332, "reward_std": 0.24141379445791245, "rewards/accuracy_reward": 0.5041666820645332, "rewards/format_reward": 0.0, "step": 2520 }, { "completion_length": 844.4750213623047, "epoch": 0.8365081994368063, "grad_norm": 0.6674135327339172, "kl": 0.368994140625, "learning_rate": 1.5823518716659103e-06, "loss": 0.0148, "reward": 0.43750001341104505, "reward_std": 0.2225142192095518, "rewards/accuracy_reward": 0.43750001341104505, "rewards/format_reward": 0.0, "step": 2525 }, { "completion_length": 816.466683959961, "epoch": 0.8381646513168792, "grad_norm": 0.32421231269836426, "kl": 0.393359375, "learning_rate": 1.5512709683813165e-06, "loss": 0.0157, "reward": 0.4770833492279053, "reward_std": 0.17633545361459255, "rewards/accuracy_reward": 0.4770833492279053, "rewards/format_reward": 0.0, "step": 2530 }, { "completion_length": 776.0666900634766, "epoch": 0.8398211031969521, "grad_norm": 0.40859735012054443, "kl": 0.385302734375, "learning_rate": 1.520472663949122e-06, "loss": 0.0154, "reward": 0.4541666753590107, "reward_std": 0.16349001862108709, "rewards/accuracy_reward": 0.4541666753590107, "rewards/format_reward": 0.0, "step": 2535 }, { "completion_length": 793.7625122070312, "epoch": 0.841477555077025, "grad_norm": 0.47426044940948486, "kl": 0.40693359375, "learning_rate": 1.4899579885320237e-06, "loss": 0.0163, "reward": 0.43541668355464935, "reward_std": 0.18724712617695333, "rewards/accuracy_reward": 0.43541668355464935, "rewards/format_reward": 0.0, "step": 2540 }, { "completion_length": 797.7500213623047, "epoch": 0.843134006957098, "grad_norm": 0.2779918313026428, "kl": 0.4111328125, "learning_rate": 1.4597279628057005e-06, "loss": 0.0164, "reward": 0.4416666813194752, "reward_std": 0.21606836132705212, "rewards/accuracy_reward": 0.4416666813194752, "rewards/format_reward": 0.0, "step": 2545 }, { "completion_length": 804.8687713623046, "epoch": 0.8447904588371707, "grad_norm": 0.20720548927783966, "kl": 0.36943359375, "learning_rate": 1.4297835979246777e-06, "loss": 0.0148, "reward": 0.479166679084301, "reward_std": 0.1994016941636801, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "step": 2550 }, { "completion_length": 827.7520935058594, "epoch": 0.8464469107172437, "grad_norm": 0.40864264965057373, "kl": 0.40283203125, "learning_rate": 1.4001258954884955e-06, "loss": 0.0161, "reward": 0.3729166768491268, "reward_std": 0.18432335332036018, "rewards/accuracy_reward": 0.3729166768491268, "rewards/format_reward": 0.0, "step": 2555 }, { "completion_length": 849.7896026611328, "epoch": 0.8481033625973166, "grad_norm": 1.3422250747680664, "kl": 0.4154296875, "learning_rate": 1.370755847508226e-06, "loss": 0.0166, "reward": 0.4187500163912773, "reward_std": 0.21800211630761623, "rewards/accuracy_reward": 0.4187500163912773, "rewards/format_reward": 0.0, "step": 2560 }, { "completion_length": 848.8979339599609, "epoch": 0.8497598144773895, "grad_norm": 0.3789568543434143, "kl": 0.429296875, "learning_rate": 1.3416744363732637e-06, "loss": 0.0172, "reward": 0.4062500074505806, "reward_std": 0.22152419649064542, "rewards/accuracy_reward": 0.4062500074505806, "rewards/format_reward": 0.0, "step": 2565 }, { "completion_length": 866.0646057128906, "epoch": 0.8514162663574624, "grad_norm": 0.5348113179206848, "kl": 0.45419921875, "learning_rate": 1.3128826348184886e-06, "loss": 0.0182, "reward": 0.4083333447575569, "reward_std": 0.24141379185020923, "rewards/accuracy_reward": 0.4083333447575569, "rewards/format_reward": 0.0, "step": 2570 }, { "completion_length": 833.3354309082031, "epoch": 0.8530727182375352, "grad_norm": 0.37174373865127563, "kl": 0.426513671875, "learning_rate": 1.2843814058917249e-06, "loss": 0.0171, "reward": 0.47500001937150954, "reward_std": 0.19235753268003464, "rewards/accuracy_reward": 0.47500001937150954, "rewards/format_reward": 0.0, "step": 2575 }, { "completion_length": 851.4729370117187, "epoch": 0.8547291701176081, "grad_norm": 0.6994616985321045, "kl": 0.39619140625, "learning_rate": 1.256171702921516e-06, "loss": 0.0159, "reward": 0.481250011920929, "reward_std": 0.22058046236634254, "rewards/accuracy_reward": 0.481250011920929, "rewards/format_reward": 0.0, "step": 2580 }, { "completion_length": 861.0166961669922, "epoch": 0.856385621997681, "grad_norm": 0.2743810713291168, "kl": 0.358642578125, "learning_rate": 1.2282544694852561e-06, "loss": 0.0143, "reward": 0.4979166775941849, "reward_std": 0.20837961360812188, "rewards/accuracy_reward": 0.4979166775941849, "rewards/format_reward": 0.0, "step": 2585 }, { "completion_length": 843.2666900634765, "epoch": 0.8580420738777539, "grad_norm": 0.6943274736404419, "kl": 0.400390625, "learning_rate": 1.200630639377609e-06, "loss": 0.016, "reward": 0.44166667964309453, "reward_std": 0.2708796102553606, "rewards/accuracy_reward": 0.44166667964309453, "rewards/format_reward": 0.0, "step": 2590 }, { "completion_length": 857.3750213623047, "epoch": 0.8596985257578267, "grad_norm": 0.2415345013141632, "kl": 0.476171875, "learning_rate": 1.1733011365792947e-06, "loss": 0.019, "reward": 0.385416678711772, "reward_std": 0.22152420058846473, "rewards/accuracy_reward": 0.385416678711772, "rewards/format_reward": 0.0, "step": 2595 }, { "completion_length": 837.4333526611329, "epoch": 0.8613549776378996, "grad_norm": 0.27433979511260986, "kl": 0.39541015625, "learning_rate": 1.1462668752261652e-06, "loss": 0.0158, "reward": 0.4583333432674408, "reward_std": 0.25103629752993584, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "step": 2600 }, { "completion_length": 871.4375213623047, "epoch": 0.8630114295179725, "grad_norm": 0.4250912070274353, "kl": 0.367236328125, "learning_rate": 1.1195287595786352e-06, "loss": 0.0147, "reward": 0.4541666772216558, "reward_std": 0.1888354517519474, "rewards/accuracy_reward": 0.4541666772216558, "rewards/format_reward": 0.0, "step": 2605 }, { "completion_length": 838.5187713623047, "epoch": 0.8646678813980454, "grad_norm": 0.33398547768592834, "kl": 0.378662109375, "learning_rate": 1.0930876839914418e-06, "loss": 0.0151, "reward": 0.4979166753590107, "reward_std": 0.2433475524187088, "rewards/accuracy_reward": 0.4979166753590107, "rewards/format_reward": 0.0, "step": 2610 }, { "completion_length": 837.4687652587891, "epoch": 0.8663243332781183, "grad_norm": 0.6561420559883118, "kl": 0.38388671875, "learning_rate": 1.0669445328837146e-06, "loss": 0.0153, "reward": 0.48958335146307946, "reward_std": 0.22985753193497657, "rewards/accuracy_reward": 0.48958335146307946, "rewards/format_reward": 0.0, "step": 2615 }, { "completion_length": 874.7979370117188, "epoch": 0.8679807851581911, "grad_norm": 0.44199249148368835, "kl": 0.40126953125, "learning_rate": 1.04110018070941e-06, "loss": 0.0161, "reward": 0.40833334550261496, "reward_std": 0.2366025410592556, "rewards/accuracy_reward": 0.40833334550261496, "rewards/format_reward": 0.0, "step": 2620 }, { "completion_length": 843.5708557128906, "epoch": 0.869637237038264, "grad_norm": 0.42768463492393494, "kl": 0.401220703125, "learning_rate": 1.0155554919280496e-06, "loss": 0.0161, "reward": 0.43750001192092897, "reward_std": 0.23050211742520332, "rewards/accuracy_reward": 0.43750001192092897, "rewards/format_reward": 0.0, "step": 2625 }, { "completion_length": 840.8146026611328, "epoch": 0.8712936889183369, "grad_norm": 0.5562691688537598, "kl": 0.39638671875, "learning_rate": 9.903113209758098e-07, "loss": 0.0158, "reward": 0.43125001043081285, "reward_std": 0.22281336933374404, "rewards/accuracy_reward": 0.43125001043081285, "rewards/format_reward": 0.0, "step": 2630 }, { "completion_length": 853.3166900634766, "epoch": 0.8729501407984098, "grad_norm": 0.32051053643226624, "kl": 0.427294921875, "learning_rate": 9.65368512236944e-07, "loss": 0.0171, "reward": 0.4166666816920042, "reward_std": 0.22698003463447095, "rewards/accuracy_reward": 0.4166666816920042, "rewards/format_reward": 0.0, "step": 2635 }, { "completion_length": 847.4896057128906, "epoch": 0.8746065926784827, "grad_norm": 0.34779927134513855, "kl": 0.431982421875, "learning_rate": 9.407279000155311e-07, "loss": 0.0173, "reward": 0.4395833432674408, "reward_std": 0.2372471246868372, "rewards/accuracy_reward": 0.4395833432674408, "rewards/format_reward": 0.0, "step": 2640 }, { "completion_length": 834.4166839599609, "epoch": 0.8762630445585555, "grad_norm": 0.625748872756958, "kl": 0.396826171875, "learning_rate": 9.163903085075843e-07, "loss": 0.0159, "reward": 0.464583345502615, "reward_std": 0.22633545175194741, "rewards/accuracy_reward": 0.464583345502615, "rewards/format_reward": 0.0, "step": 2645 }, { "completion_length": 841.9229431152344, "epoch": 0.8779194964386284, "grad_norm": 0.4750681221485138, "kl": 0.49716796875, "learning_rate": 8.923565517734633e-07, "loss": 0.0199, "reward": 0.45833334773778917, "reward_std": 0.20550211593508722, "rewards/accuracy_reward": 0.45833334773778917, "rewards/format_reward": 0.0, "step": 2650 }, { "completion_length": 829.5375183105468, "epoch": 0.8795759483187013, "grad_norm": 0.45479315519332886, "kl": 0.4185546875, "learning_rate": 8.686274337106626e-07, "loss": 0.0167, "reward": 0.5125000178813934, "reward_std": 0.22440169416368008, "rewards/accuracy_reward": 0.5125000178813934, "rewards/format_reward": 0.0, "step": 2655 }, { "completion_length": 860.8896026611328, "epoch": 0.8812324001987742, "grad_norm": 0.36250945925712585, "kl": 0.4498046875, "learning_rate": 8.452037480269082e-07, "loss": 0.018, "reward": 0.433333345502615, "reward_std": 0.22569086477160455, "rewards/accuracy_reward": 0.433333345502615, "rewards/format_reward": 0.0, "step": 2660 }, { "completion_length": 841.8750183105469, "epoch": 0.8828888520788472, "grad_norm": 0.3981919288635254, "kl": 0.427490234375, "learning_rate": 8.220862782136186e-07, "loss": 0.0171, "reward": 0.4354166805744171, "reward_std": 0.19042377769947053, "rewards/accuracy_reward": 0.4354166805744171, "rewards/format_reward": 0.0, "step": 2665 }, { "completion_length": 842.5083587646484, "epoch": 0.88454530395892, "grad_norm": 0.3860984146595001, "kl": 0.452197265625, "learning_rate": 7.992757975196974e-07, "loss": 0.0181, "reward": 0.49583334773778914, "reward_std": 0.2891808860003948, "rewards/accuracy_reward": 0.49583334773778914, "rewards/format_reward": 0.0, "step": 2670 }, { "completion_length": 847.6229309082031, "epoch": 0.8862017558389929, "grad_norm": 0.5426933169364929, "kl": 0.45400390625, "learning_rate": 7.767730689256614e-07, "loss": 0.0182, "reward": 0.40625001341104505, "reward_std": 0.19265668466687202, "rewards/accuracy_reward": 0.40625001341104505, "rewards/format_reward": 0.0, "step": 2675 }, { "completion_length": 839.7500213623047, "epoch": 0.8878582077190658, "grad_norm": 0.7710992693901062, "kl": 0.478466796875, "learning_rate": 7.545788451181313e-07, "loss": 0.0191, "reward": 0.4437500134110451, "reward_std": 0.23209044486284255, "rewards/accuracy_reward": 0.4437500134110451, "rewards/format_reward": 0.0, "step": 2680 }, { "completion_length": 853.0354400634766, "epoch": 0.8895146595991387, "grad_norm": 0.8055346012115479, "kl": 0.4396484375, "learning_rate": 7.326938684646423e-07, "loss": 0.0176, "reward": 0.43333334699273107, "reward_std": 0.2564458556473255, "rewards/accuracy_reward": 0.43333334699273107, "rewards/format_reward": 0.0, "step": 2685 }, { "completion_length": 871.5000152587891, "epoch": 0.8911711114792116, "grad_norm": 0.372955322265625, "kl": 0.48623046875, "learning_rate": 7.11118870988825e-07, "loss": 0.0195, "reward": 0.38958334233611824, "reward_std": 0.21929129101336003, "rewards/accuracy_reward": 0.38958334233611824, "rewards/format_reward": 0.0, "step": 2690 }, { "completion_length": 841.6375152587891, "epoch": 0.8928275633592844, "grad_norm": 0.28008803725242615, "kl": 0.4203125, "learning_rate": 6.898545743459162e-07, "loss": 0.0168, "reward": 0.44166667610406873, "reward_std": 0.1910683646798134, "rewards/accuracy_reward": 0.44166667610406873, "rewards/format_reward": 0.0, "step": 2695 }, { "completion_length": 863.3125183105469, "epoch": 0.8944840152393573, "grad_norm": 0.3400105834007263, "kl": 0.48291015625, "learning_rate": 6.689016897986123e-07, "loss": 0.0193, "reward": 0.3729166783392429, "reward_std": 0.21413460671901702, "rewards/accuracy_reward": 0.3729166783392429, "rewards/format_reward": 0.0, "step": 2700 }, { "epoch": 0.8944840152393573, "eval_completion_length": 849.39794921875, "eval_kl": 0.44140625, "eval_loss": 0.017924753949046135, "eval_reward": 0.45000000596046447, "eval_reward_std": 0.24012462198734283, "eval_rewards/accuracy_reward": 0.45000000596046447, "eval_rewards/format_reward": 0.0, "eval_runtime": 69.0286, "eval_samples_per_second": 1.434, "eval_steps_per_second": 0.029, "step": 2700 }, { "completion_length": 849.3729370117187, "epoch": 0.8961404671194302, "grad_norm": 0.3317373991012573, "kl": 0.439697265625, "learning_rate": 6.48260918193292e-07, "loss": 0.0176, "reward": 0.4708333432674408, "reward_std": 0.24141379445791245, "rewards/accuracy_reward": 0.4708333432674408, "rewards/format_reward": 0.0, "step": 2705 }, { "completion_length": 878.320849609375, "epoch": 0.8977969189995031, "grad_norm": 0.4570317566394806, "kl": 0.4861328125, "learning_rate": 6.279329499365649e-07, "loss": 0.0194, "reward": 0.3750000085681677, "reward_std": 0.22182335332036018, "rewards/accuracy_reward": 0.3750000085681677, "rewards/format_reward": 0.0, "step": 2710 }, { "completion_length": 845.7666900634765, "epoch": 0.899453370879576, "grad_norm": 0.8007364869117737, "kl": 0.404150390625, "learning_rate": 6.079184649721792e-07, "loss": 0.0162, "reward": 0.47708334773778915, "reward_std": 0.2529700558632612, "rewards/accuracy_reward": 0.47708334773778915, "rewards/format_reward": 0.0, "step": 2715 }, { "completion_length": 837.2937683105469, "epoch": 0.9011098227596488, "grad_norm": 0.6531445980072021, "kl": 0.419287109375, "learning_rate": 5.88218132758287e-07, "loss": 0.0168, "reward": 0.4375000074505806, "reward_std": 0.21606836318969727, "rewards/accuracy_reward": 0.4375000074505806, "rewards/format_reward": 0.0, "step": 2720 }, { "completion_length": 833.6979400634766, "epoch": 0.9027662746397217, "grad_norm": 0.5401316285133362, "kl": 0.3904296875, "learning_rate": 5.688326122450338e-07, "loss": 0.0156, "reward": 0.4270833469927311, "reward_std": 0.21929129026830196, "rewards/accuracy_reward": 0.4270833469927311, "rewards/format_reward": 0.0, "step": 2725 }, { "completion_length": 846.6208557128906, "epoch": 0.9044227265197946, "grad_norm": 0.3286316692829132, "kl": 0.43271484375, "learning_rate": 5.497625518525374e-07, "loss": 0.0173, "reward": 0.41666668057441714, "reward_std": 0.23660254068672656, "rewards/accuracy_reward": 0.41666668057441714, "rewards/format_reward": 0.0, "step": 2730 }, { "completion_length": 860.7958557128907, "epoch": 0.9060791783998675, "grad_norm": 0.449181467294693, "kl": 0.50908203125, "learning_rate": 5.310085894491878e-07, "loss": 0.0204, "reward": 0.3979166753590107, "reward_std": 0.2670583836734295, "rewards/accuracy_reward": 0.3979166753590107, "rewards/format_reward": 0.0, "step": 2735 }, { "completion_length": 862.4791839599609, "epoch": 0.9077356302799404, "grad_norm": 0.7487555742263794, "kl": 0.4298828125, "learning_rate": 5.125713523303133e-07, "loss": 0.0172, "reward": 0.4458333432674408, "reward_std": 0.24880339279770852, "rewards/accuracy_reward": 0.4458333432674408, "rewards/format_reward": 0.0, "step": 2740 }, { "completion_length": 865.7083587646484, "epoch": 0.9093920821600132, "grad_norm": 0.4296983778476715, "kl": 0.46826171875, "learning_rate": 4.944514571971981e-07, "loss": 0.0187, "reward": 0.48541668355464934, "reward_std": 0.26705837696790696, "rewards/accuracy_reward": 0.48541668355464934, "rewards/format_reward": 0.0, "step": 2745 }, { "completion_length": 874.9812774658203, "epoch": 0.9110485340400861, "grad_norm": 0.5879957675933838, "kl": 0.472412109375, "learning_rate": 4.7664951013645875e-07, "loss": 0.0189, "reward": 0.38958334662020205, "reward_std": 0.242058377712965, "rewards/accuracy_reward": 0.38958334662020205, "rewards/format_reward": 0.0, "step": 2750 }, { "completion_length": 850.6875183105469, "epoch": 0.912704985920159, "grad_norm": 0.5066933035850525, "kl": 0.423046875, "learning_rate": 4.591661065997599e-07, "loss": 0.0169, "reward": 0.4562500134110451, "reward_std": 0.2417129475623369, "rewards/accuracy_reward": 0.4562500134110451, "rewards/format_reward": 0.0, "step": 2755 }, { "completion_length": 859.1125183105469, "epoch": 0.9143614378002319, "grad_norm": 0.3260541558265686, "kl": 0.4240234375, "learning_rate": 4.420018313839147e-07, "loss": 0.017, "reward": 0.43333334624767306, "reward_std": 0.22345795445144176, "rewards/accuracy_reward": 0.43333334624767306, "rewards/format_reward": 0.0, "step": 2760 }, { "completion_length": 854.2416900634765, "epoch": 0.9160178896803048, "grad_norm": 0.6293469071388245, "kl": 0.44599609375, "learning_rate": 4.2515725861130954e-07, "loss": 0.0178, "reward": 0.4500000089406967, "reward_std": 0.24012462086975575, "rewards/accuracy_reward": 0.4500000089406967, "rewards/format_reward": 0.0, "step": 2765 }, { "completion_length": 842.1521026611329, "epoch": 0.9176743415603776, "grad_norm": 0.43350356817245483, "kl": 0.44091796875, "learning_rate": 4.086329517107046e-07, "loss": 0.0176, "reward": 0.4687500178813934, "reward_std": 0.24076920934021473, "rewards/accuracy_reward": 0.4687500178813934, "rewards/format_reward": 0.0, "step": 2770 }, { "completion_length": 872.2354400634765, "epoch": 0.9193307934404505, "grad_norm": 0.3854561150074005, "kl": 0.45693359375, "learning_rate": 3.924294633983905e-07, "loss": 0.0183, "reward": 0.40833334214985373, "reward_std": 0.27861464135348796, "rewards/accuracy_reward": 0.40833334214985373, "rewards/format_reward": 0.0, "step": 2775 }, { "completion_length": 860.4583618164063, "epoch": 0.9209872453205235, "grad_norm": 0.33282986283302307, "kl": 0.396484375, "learning_rate": 3.7654733565969826e-07, "loss": 0.0159, "reward": 0.38750001043081284, "reward_std": 0.19716878570616245, "rewards/accuracy_reward": 0.38750001043081284, "rewards/format_reward": 0.0, "step": 2780 }, { "completion_length": 856.3916839599609, "epoch": 0.9226436972005964, "grad_norm": 0.6708009839057922, "kl": 0.4916015625, "learning_rate": 3.6098709973087065e-07, "loss": 0.0197, "reward": 0.41041668355464933, "reward_std": 0.23819086849689483, "rewards/accuracy_reward": 0.41041668355464933, "rewards/format_reward": 0.0, "step": 2785 }, { "completion_length": 826.0333557128906, "epoch": 0.9243001490806692, "grad_norm": 0.46027451753616333, "kl": 0.3951171875, "learning_rate": 3.457492760812975e-07, "loss": 0.0158, "reward": 0.4666666753590107, "reward_std": 0.2506908606737852, "rewards/accuracy_reward": 0.4666666753590107, "rewards/format_reward": 0.0, "step": 2790 }, { "completion_length": 834.8875244140625, "epoch": 0.9259566009607421, "grad_norm": 0.5461171269416809, "kl": 0.432763671875, "learning_rate": 3.308343743960951e-07, "loss": 0.0173, "reward": 0.47291667982935903, "reward_std": 0.22058046162128447, "rewards/accuracy_reward": 0.47291667982935903, "rewards/format_reward": 0.0, "step": 2795 }, { "completion_length": 852.3146026611328, "epoch": 0.927613052840815, "grad_norm": 0.5871738791465759, "kl": 0.464306640625, "learning_rate": 3.1624289355907334e-07, "loss": 0.0186, "reward": 0.40000001173466443, "reward_std": 0.2147791888564825, "rewards/accuracy_reward": 0.40000001173466443, "rewards/format_reward": 0.0, "step": 2800 }, { "completion_length": 869.3729400634766, "epoch": 0.9292695047208879, "grad_norm": 0.5766992568969727, "kl": 0.46728515625, "learning_rate": 3.019753216360355e-07, "loss": 0.0187, "reward": 0.3979166768491268, "reward_std": 0.23114670626819134, "rewards/accuracy_reward": 0.3979166768491268, "rewards/format_reward": 0.0, "step": 2805 }, { "completion_length": 839.2729339599609, "epoch": 0.9309259566009608, "grad_norm": 0.541927695274353, "kl": 0.440771484375, "learning_rate": 2.8803213585846036e-07, "loss": 0.0176, "reward": 0.4604166805744171, "reward_std": 0.27762461788952353, "rewards/accuracy_reward": 0.4604166805744171, "rewards/format_reward": 0.0, "step": 2810 }, { "completion_length": 855.3354431152344, "epoch": 0.9325824084810336, "grad_norm": 0.32520902156829834, "kl": 0.43955078125, "learning_rate": 2.744138026075405e-07, "loss": 0.0176, "reward": 0.3687500078231096, "reward_std": 0.22633544988930226, "rewards/accuracy_reward": 0.3687500078231096, "rewards/format_reward": 0.0, "step": 2815 }, { "completion_length": 849.6708587646484, "epoch": 0.9342388603611065, "grad_norm": 0.3022269308567047, "kl": 0.453466796875, "learning_rate": 2.6112077739857465e-07, "loss": 0.0181, "reward": 0.450000012665987, "reward_std": 0.20808046013116838, "rewards/accuracy_reward": 0.450000012665987, "rewards/format_reward": 0.0, "step": 2820 }, { "completion_length": 835.9937713623046, "epoch": 0.9358953122411794, "grad_norm": 0.4162655770778656, "kl": 0.37392578125, "learning_rate": 2.481535048657402e-07, "loss": 0.015, "reward": 0.48333334624767305, "reward_std": 0.2292129471898079, "rewards/accuracy_reward": 0.48333334624767305, "rewards/format_reward": 0.0, "step": 2825 }, { "completion_length": 832.3375274658204, "epoch": 0.9375517641212523, "grad_norm": 0.2978215217590332, "kl": 0.40634765625, "learning_rate": 2.3551241874721353e-07, "loss": 0.0162, "reward": 0.4666666805744171, "reward_std": 0.26418088003993034, "rewards/accuracy_reward": 0.4666666805744171, "rewards/format_reward": 0.0, "step": 2830 }, { "completion_length": 826.2104370117188, "epoch": 0.9392082160013252, "grad_norm": 0.45683208107948303, "kl": 0.41142578125, "learning_rate": 2.2319794187066978e-07, "loss": 0.0165, "reward": 0.5041666820645332, "reward_std": 0.23308045640587807, "rewards/accuracy_reward": 0.5041666820645332, "rewards/format_reward": 0.0, "step": 2835 }, { "completion_length": 847.3062744140625, "epoch": 0.940864667881398, "grad_norm": 0.3496578633785248, "kl": 0.47001953125, "learning_rate": 2.1121048613912843e-07, "loss": 0.0188, "reward": 0.39375000521540643, "reward_std": 0.20356836020946503, "rewards/accuracy_reward": 0.39375000521540643, "rewards/format_reward": 0.0, "step": 2840 }, { "completion_length": 828.4916839599609, "epoch": 0.9425211197614709, "grad_norm": 0.38677114248275757, "kl": 0.4427734375, "learning_rate": 1.9955045251718763e-07, "loss": 0.0177, "reward": 0.41250001192092894, "reward_std": 0.26864670030772686, "rewards/accuracy_reward": 0.41250001192092894, "rewards/format_reward": 0.0, "step": 2845 }, { "completion_length": 817.0625274658203, "epoch": 0.9441775716415438, "grad_norm": 0.5749364495277405, "kl": 0.3998046875, "learning_rate": 1.8821823101760949e-07, "loss": 0.016, "reward": 0.46875001192092897, "reward_std": 0.27702631801366806, "rewards/accuracy_reward": 0.46875001192092897, "rewards/format_reward": 0.0, "step": 2850 }, { "completion_length": 831.6000274658203, "epoch": 0.9458340235216167, "grad_norm": 0.7381191849708557, "kl": 0.41953125, "learning_rate": 1.772142006882671e-07, "loss": 0.0168, "reward": 0.3958333443850279, "reward_std": 0.25421294420957563, "rewards/accuracy_reward": 0.3958333443850279, "rewards/format_reward": 0.0, "step": 2855 }, { "completion_length": 851.7771026611329, "epoch": 0.9474904754016896, "grad_norm": 0.43567535281181335, "kl": 0.4125, "learning_rate": 1.665387295994747e-07, "loss": 0.0165, "reward": 0.41875001341104506, "reward_std": 0.264480035379529, "rewards/accuracy_reward": 0.41875001341104506, "rewards/format_reward": 0.0, "step": 2860 }, { "completion_length": 824.6000183105468, "epoch": 0.9491469272817624, "grad_norm": 0.4709452688694, "kl": 0.39912109375, "learning_rate": 1.561921748316708e-07, "loss": 0.016, "reward": 0.4666666712611914, "reward_std": 0.2401246264576912, "rewards/accuracy_reward": 0.4666666712611914, "rewards/format_reward": 0.0, "step": 2865 }, { "completion_length": 858.0937652587891, "epoch": 0.9508033791618353, "grad_norm": 0.9191097021102905, "kl": 0.46552734375, "learning_rate": 1.4617488246348012e-07, "loss": 0.0186, "reward": 0.3937500089406967, "reward_std": 0.23466878719627857, "rewards/accuracy_reward": 0.3937500089406967, "rewards/format_reward": 0.0, "step": 2870 }, { "completion_length": 836.1500183105469, "epoch": 0.9524598310419082, "grad_norm": 0.5584227442741394, "kl": 0.5505859375, "learning_rate": 1.3648718756012813e-07, "loss": 0.022, "reward": 0.39375001303851603, "reward_std": 0.24300211891531945, "rewards/accuracy_reward": 0.39375001303851603, "rewards/format_reward": 0.0, "step": 2875 }, { "completion_length": 854.8437713623047, "epoch": 0.9541162829219811, "grad_norm": 0.8235767483711243, "kl": 0.44990234375, "learning_rate": 1.271294141622459e-07, "loss": 0.018, "reward": 0.4083333447575569, "reward_std": 0.25679129250347615, "rewards/accuracy_reward": 0.4083333447575569, "rewards/format_reward": 0.0, "step": 2880 }, { "completion_length": 846.1604370117187, "epoch": 0.955772734802054, "grad_norm": 0.6451326608657837, "kl": 0.458984375, "learning_rate": 1.1810187527502182e-07, "loss": 0.0183, "reward": 0.4229166805744171, "reward_std": 0.24300211742520333, "rewards/accuracy_reward": 0.4229166805744171, "rewards/format_reward": 0.0, "step": 2885 }, { "completion_length": 836.4083526611328, "epoch": 0.9574291866821268, "grad_norm": 0.38889092206954956, "kl": 0.453076171875, "learning_rate": 1.094048728577346e-07, "loss": 0.0181, "reward": 0.4500000156462193, "reward_std": 0.2125462803989649, "rewards/accuracy_reward": 0.4500000156462193, "rewards/format_reward": 0.0, "step": 2890 }, { "completion_length": 833.8458557128906, "epoch": 0.9590856385621997, "grad_norm": 0.5073965787887573, "kl": 0.44609375, "learning_rate": 1.0103869781365239e-07, "loss": 0.0178, "reward": 0.41666667684912684, "reward_std": 0.17792377546429633, "rewards/accuracy_reward": 0.41666667684912684, "rewards/format_reward": 0.0, "step": 2895 }, { "completion_length": 842.3333526611328, "epoch": 0.9607420904422727, "grad_norm": 0.5080061554908752, "kl": 0.419677734375, "learning_rate": 9.300362998030832e-08, "loss": 0.0168, "reward": 0.4104166768491268, "reward_std": 0.24429128617048262, "rewards/accuracy_reward": 0.4104166768491268, "rewards/format_reward": 0.0, "step": 2900 }, { "completion_length": 851.3291931152344, "epoch": 0.9623985423223456, "grad_norm": 0.7902911305427551, "kl": 0.46533203125, "learning_rate": 8.529993812013249e-08, "loss": 0.0186, "reward": 0.454166679084301, "reward_std": 0.22698003686964513, "rewards/accuracy_reward": 0.454166679084301, "rewards/format_reward": 0.0, "step": 2905 }, { "completion_length": 841.2291809082031, "epoch": 0.9640549942024185, "grad_norm": 0.8123273849487305, "kl": 0.4431640625, "learning_rate": 7.792787991146356e-08, "loss": 0.0177, "reward": 0.43333334401249884, "reward_std": 0.21864670254290103, "rewards/accuracy_reward": 0.43333334401249884, "rewards/format_reward": 0.0, "step": 2910 }, { "completion_length": 859.2583435058593, "epoch": 0.9657114460824913, "grad_norm": 0.4602167308330536, "kl": 0.41357421875, "learning_rate": 7.088770193993455e-08, "loss": 0.0166, "reward": 0.41250001043081286, "reward_std": 0.1792129475623369, "rewards/accuracy_reward": 0.41250001043081286, "rewards/format_reward": 0.0, "step": 2915 }, { "completion_length": 845.3479370117187, "epoch": 0.9673678979625642, "grad_norm": 0.374661922454834, "kl": 0.39228515625, "learning_rate": 6.417963969022389e-08, "loss": 0.0157, "reward": 0.4250000111758709, "reward_std": 0.2616025399416685, "rewards/accuracy_reward": 0.4250000111758709, "rewards/format_reward": 0.0, "step": 2920 }, { "completion_length": 856.5791839599609, "epoch": 0.9690243498426371, "grad_norm": 0.3867492079734802, "kl": 0.44404296875, "learning_rate": 5.78039175381695e-08, "loss": 0.0178, "reward": 0.40208334308117627, "reward_std": 0.1952350303530693, "rewards/accuracy_reward": 0.40208334308117627, "rewards/format_reward": 0.0, "step": 2925 }, { "completion_length": 832.2479400634766, "epoch": 0.97068080172271, "grad_norm": 0.3468852937221527, "kl": 0.379833984375, "learning_rate": 5.176074874327919e-08, "loss": 0.0152, "reward": 0.48750001192092896, "reward_std": 0.2353133711963892, "rewards/accuracy_reward": 0.48750001192092896, "rewards/format_reward": 0.0, "step": 2930 }, { "completion_length": 838.5354370117187, "epoch": 0.9723372536027829, "grad_norm": 0.28038203716278076, "kl": 0.4005859375, "learning_rate": 4.605033544158311e-08, "loss": 0.016, "reward": 0.458333345875144, "reward_std": 0.2580804605036974, "rewards/accuracy_reward": 0.458333345875144, "rewards/format_reward": 0.0, "step": 2935 }, { "completion_length": 846.7125274658204, "epoch": 0.9739937054828557, "grad_norm": 0.8813409805297852, "kl": 0.44443359375, "learning_rate": 4.067286863888131e-08, "loss": 0.0178, "reward": 0.45833335220813753, "reward_std": 0.24587960839271544, "rewards/accuracy_reward": 0.45833335220813753, "rewards/format_reward": 0.0, "step": 2940 }, { "completion_length": 822.5021026611328, "epoch": 0.9756501573629286, "grad_norm": 0.4285522401332855, "kl": 0.351513671875, "learning_rate": 3.562852820435447e-08, "loss": 0.014, "reward": 0.5291666835546494, "reward_std": 0.22921294905245304, "rewards/accuracy_reward": 0.5291666835546494, "rewards/format_reward": 0.0, "step": 2945 }, { "completion_length": 854.6208526611329, "epoch": 0.9773066092430015, "grad_norm": 0.3708440065383911, "kl": 0.454833984375, "learning_rate": 3.091748286453866e-08, "loss": 0.0182, "reward": 0.4104166768491268, "reward_std": 0.24781336896121503, "rewards/accuracy_reward": 0.4104166768491268, "rewards/format_reward": 0.0, "step": 2950 }, { "completion_length": 857.6333557128906, "epoch": 0.9789630611230744, "grad_norm": 0.8107637166976929, "kl": 0.429736328125, "learning_rate": 2.6539890197695428e-08, "loss": 0.0172, "reward": 0.4062500104308128, "reward_std": 0.225391710922122, "rewards/accuracy_reward": 0.4062500104308128, "rewards/format_reward": 0.0, "step": 2955 }, { "completion_length": 823.1729400634765, "epoch": 0.9806195130031473, "grad_norm": 0.4021890461444855, "kl": 0.371728515625, "learning_rate": 2.2495896628529355e-08, "loss": 0.0149, "reward": 0.46666667610406876, "reward_std": 0.23531337231397628, "rewards/accuracy_reward": 0.46666667610406876, "rewards/format_reward": 0.0, "step": 2960 }, { "completion_length": 824.6479339599609, "epoch": 0.9822759648832201, "grad_norm": 0.4021756947040558, "kl": 0.41806640625, "learning_rate": 1.878563742329642e-08, "loss": 0.0167, "reward": 0.45416668355464934, "reward_std": 0.24012462310492994, "rewards/accuracy_reward": 0.45416668355464934, "rewards/format_reward": 0.0, "step": 2965 }, { "completion_length": 823.7062683105469, "epoch": 0.983932416763293, "grad_norm": 0.4632764458656311, "kl": 0.397119140625, "learning_rate": 1.5409236685277608e-08, "loss": 0.0159, "reward": 0.5229166820645332, "reward_std": 0.20872504711151124, "rewards/accuracy_reward": 0.5229166820645332, "rewards/format_reward": 0.0, "step": 2970 }, { "completion_length": 840.0520965576172, "epoch": 0.9855888686433659, "grad_norm": 0.42559853196144104, "kl": 0.42001953125, "learning_rate": 1.2366807350628895e-08, "loss": 0.0168, "reward": 0.43750001266598704, "reward_std": 0.20103629976511, "rewards/accuracy_reward": 0.43750001266598704, "rewards/format_reward": 0.0, "step": 2975 }, { "completion_length": 849.2771087646485, "epoch": 0.9872453205234388, "grad_norm": 0.4889445900917053, "kl": 0.394140625, "learning_rate": 9.658451184600959e-09, "loss": 0.0158, "reward": 0.4770833447575569, "reward_std": 0.20133545324206353, "rewards/accuracy_reward": 0.4770833447575569, "rewards/format_reward": 0.0, "step": 2980 }, { "completion_length": 822.970849609375, "epoch": 0.9889017724035116, "grad_norm": 0.3275405764579773, "kl": 0.368408203125, "learning_rate": 7.284258778139652e-09, "loss": 0.0147, "reward": 0.47916667982935907, "reward_std": 0.23660254180431367, "rewards/accuracy_reward": 0.47916667982935907, "rewards/format_reward": 0.0, "step": 2985 }, { "completion_length": 834.8687622070313, "epoch": 0.9905582242835845, "grad_norm": 0.47362110018730164, "kl": 0.3921875, "learning_rate": 5.2443095448506674e-09, "loss": 0.0157, "reward": 0.4687500111758709, "reward_std": 0.19910254031419755, "rewards/accuracy_reward": 0.4687500111758709, "rewards/format_reward": 0.0, "step": 2990 }, { "completion_length": 831.3062683105469, "epoch": 0.9922146761636574, "grad_norm": 0.4880805015563965, "kl": 0.389794921875, "learning_rate": 3.538671718349429e-09, "loss": 0.0156, "reward": 0.46458334699273107, "reward_std": 0.2529700580984354, "rewards/accuracy_reward": 0.46458334699273107, "rewards/format_reward": 0.0, "step": 2995 }, { "completion_length": 834.5312713623047, "epoch": 0.9938711280437303, "grad_norm": 0.3579634726047516, "kl": 0.389111328125, "learning_rate": 2.167402349972925e-09, "loss": 0.0156, "reward": 0.45833334028720857, "reward_std": 0.2256908666342497, "rewards/accuracy_reward": 0.45833334028720857, "rewards/format_reward": 0.0, "step": 3000 }, { "epoch": 0.9938711280437303, "eval_completion_length": 800.9666870117187, "eval_kl": 0.482421875, "eval_loss": 0.019216088578104973, "eval_reward": 0.44375001192092894, "eval_reward_std": 0.2285683572292328, "eval_rewards/accuracy_reward": 0.44375001192092894, "eval_rewards/format_reward": 0.0, "eval_runtime": 68.7847, "eval_samples_per_second": 1.439, "eval_steps_per_second": 0.029, "step": 3000 }, { "completion_length": 828.9583557128906, "epoch": 0.9955275799238033, "grad_norm": 0.48756203055381775, "kl": 0.47802734375, "learning_rate": 1.1305473068745632e-09, "loss": 0.0191, "reward": 0.4250000104308128, "reward_std": 0.193646702170372, "rewards/accuracy_reward": 0.4250000104308128, "rewards/format_reward": 0.0, "step": 3005 }, { "completion_length": 834.8437713623047, "epoch": 0.997184031803876, "grad_norm": 0.38578832149505615, "kl": 0.44384765625, "learning_rate": 4.2814127048873553e-10, "loss": 0.0177, "reward": 0.42291667610406875, "reward_std": 0.2526246216148138, "rewards/accuracy_reward": 0.42291667610406875, "rewards/format_reward": 0.0, "step": 3010 }, { "completion_length": 841.8979339599609, "epoch": 0.998840483683949, "grad_norm": 0.4027821719646454, "kl": 0.45146484375, "learning_rate": 6.020773537174229e-11, "loss": 0.018, "reward": 0.4354166820645332, "reward_std": 0.24171294532716275, "rewards/accuracy_reward": 0.4354166820645332, "rewards/format_reward": 0.0, "step": 3015 }, { "completion_length": 841.6944630940756, "epoch": 0.9998343548119927, "kl": 0.4324544270833333, "reward": 0.4618055646618207, "reward_std": 0.2800035278002421, "rewards/accuracy_reward": 0.4618055646618207, "rewards/format_reward": 0.0, "step": 3018, "total_flos": 0.0, "train_loss": 0.017366865699724354, "train_runtime": 93681.4322, "train_samples_per_second": 0.773, "train_steps_per_second": 0.032 } ], "logging_steps": 5, "max_steps": 3018, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }