Kadins's picture
Model save
cdfdae9 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9985185185185185,
"eval_steps": 500,
"global_step": 337,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 1801.2277526855469,
"epoch": 0.002962962962962963,
"grad_norm": 0.17165035705260304,
"kl": 0.0,
"learning_rate": 2.941176470588235e-08,
"loss": 0.0224,
"reward": 0.5656742379069328,
"reward_std": 0.2888262942433357,
"rewards/exp_len_reward": 0.5656742379069328,
"step": 1
},
{
"completion_length": 2185.982208251953,
"epoch": 0.005925925925925926,
"grad_norm": 0.15400600491691718,
"kl": 0.0,
"learning_rate": 5.88235294117647e-08,
"loss": 0.0623,
"reward": 0.3711318001151085,
"reward_std": 0.24151454865932465,
"rewards/exp_len_reward": 0.3711318001151085,
"step": 2
},
{
"completion_length": 2091.6697387695312,
"epoch": 0.008888888888888889,
"grad_norm": 0.12985062585096363,
"kl": 0.0001380443572998047,
"learning_rate": 8.823529411764706e-08,
"loss": -0.0138,
"reward": 0.45638658851385117,
"reward_std": 0.27105605974793434,
"rewards/exp_len_reward": 0.45638658851385117,
"step": 3
},
{
"completion_length": 2074.7500915527344,
"epoch": 0.011851851851851851,
"grad_norm": 0.15982273028009736,
"kl": 0.0001361370086669922,
"learning_rate": 1.176470588235294e-07,
"loss": 0.0803,
"reward": 0.5727517828345299,
"reward_std": 0.2747129164636135,
"rewards/exp_len_reward": 0.5727517828345299,
"step": 4
},
{
"completion_length": 1972.2679138183594,
"epoch": 0.014814814814814815,
"grad_norm": 0.14384158083081833,
"kl": 0.00012791156768798828,
"learning_rate": 1.4705882352941175e-07,
"loss": -0.0259,
"reward": 0.49002690985798836,
"reward_std": 0.19663079921156168,
"rewards/exp_len_reward": 0.49002690985798836,
"step": 5
},
{
"completion_length": 1804.8885192871094,
"epoch": 0.017777777777777778,
"grad_norm": 0.16133143911721617,
"kl": 0.00011050701141357422,
"learning_rate": 1.764705882352941e-07,
"loss": 0.0338,
"reward": 0.5603420734405518,
"reward_std": 0.3139008916914463,
"rewards/exp_len_reward": 0.5603420734405518,
"step": 6
},
{
"completion_length": 1797.9956359863281,
"epoch": 0.02074074074074074,
"grad_norm": 0.1746647382731447,
"kl": 0.00010704994201660156,
"learning_rate": 2.0588235294117645e-07,
"loss": 0.0104,
"reward": 0.5684476867318153,
"reward_std": 0.2409583255648613,
"rewards/exp_len_reward": 0.5684476867318153,
"step": 7
},
{
"completion_length": 2083.8929748535156,
"epoch": 0.023703703703703703,
"grad_norm": 0.1420875124823215,
"kl": 0.00012505054473876953,
"learning_rate": 2.352941176470588e-07,
"loss": 0.0083,
"reward": 0.4780779331922531,
"reward_std": 0.27319788932800293,
"rewards/exp_len_reward": 0.4780779331922531,
"step": 8
},
{
"completion_length": 2139.6027221679688,
"epoch": 0.02666666666666667,
"grad_norm": 0.13758669498682333,
"kl": 0.0001379251480102539,
"learning_rate": 2.6470588235294114e-07,
"loss": 0.037,
"reward": 0.3995143547654152,
"reward_std": 0.2271974515169859,
"rewards/exp_len_reward": 0.3995143547654152,
"step": 9
},
{
"completion_length": 1866.290283203125,
"epoch": 0.02962962962962963,
"grad_norm": 0.18570234734655267,
"kl": 0.00011456012725830078,
"learning_rate": 2.941176470588235e-07,
"loss": 0.1181,
"reward": 0.46772801876068115,
"reward_std": 0.2898196689784527,
"rewards/exp_len_reward": 0.46772801876068115,
"step": 10
},
{
"completion_length": 2405.1697387695312,
"epoch": 0.03259259259259259,
"grad_norm": 0.13468269595632437,
"kl": 0.0001614093780517578,
"learning_rate": 3.2352941176470586e-07,
"loss": -0.0412,
"reward": 0.4437425658106804,
"reward_std": 0.2463722713291645,
"rewards/exp_len_reward": 0.4437425658106804,
"step": 11
},
{
"completion_length": 1774.8750915527344,
"epoch": 0.035555555555555556,
"grad_norm": 0.1541607968056605,
"kl": 0.00010025501251220703,
"learning_rate": 3.529411764705882e-07,
"loss": 0.0198,
"reward": 0.4462169408798218,
"reward_std": 0.25852715596556664,
"rewards/exp_len_reward": 0.4462169408798218,
"step": 12
},
{
"completion_length": 1651.9152374267578,
"epoch": 0.03851851851851852,
"grad_norm": 0.17738313728076366,
"kl": 9.834766387939453e-05,
"learning_rate": 3.8235294117647053e-07,
"loss": 0.0837,
"reward": 0.5214016065001488,
"reward_std": 0.1839424017816782,
"rewards/exp_len_reward": 0.5214016065001488,
"step": 13
},
{
"completion_length": 1320.7232666015625,
"epoch": 0.04148148148148148,
"grad_norm": 0.16613281883713185,
"kl": 9.1552734375e-05,
"learning_rate": 4.117647058823529e-07,
"loss": 0.0156,
"reward": 0.5551810637116432,
"reward_std": 0.28313471004366875,
"rewards/exp_len_reward": 0.5551810637116432,
"step": 14
},
{
"completion_length": 1841.2857971191406,
"epoch": 0.044444444444444446,
"grad_norm": 0.146202640780426,
"kl": 0.00010335445404052734,
"learning_rate": 4.4117647058823526e-07,
"loss": 0.0368,
"reward": 0.5522215813398361,
"reward_std": 0.337805338203907,
"rewards/exp_len_reward": 0.5522215813398361,
"step": 15
},
{
"completion_length": 1566.0045166015625,
"epoch": 0.047407407407407405,
"grad_norm": 0.18526362140745664,
"kl": 0.00010216236114501953,
"learning_rate": 4.705882352941176e-07,
"loss": 0.0547,
"reward": 0.5769367516040802,
"reward_std": 0.29369358718395233,
"rewards/exp_len_reward": 0.5769367516040802,
"step": 16
},
{
"completion_length": 1614.1385040283203,
"epoch": 0.05037037037037037,
"grad_norm": 0.1437351588405587,
"kl": 0.00010508298873901367,
"learning_rate": 5e-07,
"loss": -0.02,
"reward": 0.41402483731508255,
"reward_std": 0.35327037796378136,
"rewards/exp_len_reward": 0.41402483731508255,
"step": 17
},
{
"completion_length": 1919.7411499023438,
"epoch": 0.05333333333333334,
"grad_norm": 0.16214726757721953,
"kl": 0.0001310110092163086,
"learning_rate": 5.294117647058823e-07,
"loss": -0.0029,
"reward": 0.5019121393561363,
"reward_std": 0.3405100703239441,
"rewards/exp_len_reward": 0.5019121393561363,
"step": 18
},
{
"completion_length": 2193.102783203125,
"epoch": 0.056296296296296296,
"grad_norm": 0.17578873894389108,
"kl": 0.0001233816146850586,
"learning_rate": 5.588235294117647e-07,
"loss": 0.0736,
"reward": 0.5333621501922607,
"reward_std": 0.332721009850502,
"rewards/exp_len_reward": 0.5333621501922607,
"step": 19
},
{
"completion_length": 1631.3750610351562,
"epoch": 0.05925925925925926,
"grad_norm": 0.16308413233344032,
"kl": 8.952617645263672e-05,
"learning_rate": 5.88235294117647e-07,
"loss": 0.039,
"reward": 0.5893106684088707,
"reward_std": 0.20388014614582062,
"rewards/exp_len_reward": 0.5893106684088707,
"step": 20
},
{
"completion_length": 1881.4911499023438,
"epoch": 0.06222222222222222,
"grad_norm": 0.185982449259781,
"kl": 0.00013494491577148438,
"learning_rate": 6.176470588235294e-07,
"loss": 0.032,
"reward": 0.5047426372766495,
"reward_std": 0.3071717321872711,
"rewards/exp_len_reward": 0.5047426372766495,
"step": 21
},
{
"completion_length": 2141.5001220703125,
"epoch": 0.06518518518518518,
"grad_norm": 0.12297826000738939,
"kl": 0.00014662742614746094,
"learning_rate": 6.470588235294117e-07,
"loss": 0.01,
"reward": 0.4561139643192291,
"reward_std": 0.26503079757094383,
"rewards/exp_len_reward": 0.4561139643192291,
"step": 22
},
{
"completion_length": 1731.0938110351562,
"epoch": 0.06814814814814815,
"grad_norm": 0.16752422299971495,
"kl": 0.00011110305786132812,
"learning_rate": 6.764705882352941e-07,
"loss": 0.0578,
"reward": 0.5439532399177551,
"reward_std": 0.2806715965270996,
"rewards/exp_len_reward": 0.5439532399177551,
"step": 23
},
{
"completion_length": 1966.1787109375,
"epoch": 0.07111111111111111,
"grad_norm": 0.21835387563377565,
"kl": 0.00012505054473876953,
"learning_rate": 7.058823529411765e-07,
"loss": 0.0838,
"reward": 0.5456492155790329,
"reward_std": 0.21898872777819633,
"rewards/exp_len_reward": 0.5456492155790329,
"step": 24
},
{
"completion_length": 1735.8259582519531,
"epoch": 0.07407407407407407,
"grad_norm": 0.16254233382968727,
"kl": 0.0001264810562133789,
"learning_rate": 7.352941176470589e-07,
"loss": -0.0254,
"reward": 0.5086465999484062,
"reward_std": 0.28585580736398697,
"rewards/exp_len_reward": 0.5086465999484062,
"step": 25
},
{
"completion_length": 1944.1384887695312,
"epoch": 0.07703703703703704,
"grad_norm": 0.1278452103013438,
"kl": 0.00012123584747314453,
"learning_rate": 7.647058823529411e-07,
"loss": -0.0574,
"reward": 0.521670825779438,
"reward_std": 0.2907986231148243,
"rewards/exp_len_reward": 0.521670825779438,
"step": 26
},
{
"completion_length": 2062.15185546875,
"epoch": 0.08,
"grad_norm": 0.14646903593402805,
"kl": 0.00015497207641601562,
"learning_rate": 7.941176470588235e-07,
"loss": 0.0199,
"reward": 0.5175457671284676,
"reward_std": 0.21266279742121696,
"rewards/exp_len_reward": 0.5175457671284676,
"step": 27
},
{
"completion_length": 1839.6786804199219,
"epoch": 0.08296296296296296,
"grad_norm": 0.14287471559793075,
"kl": 0.00013077259063720703,
"learning_rate": 8.235294117647058e-07,
"loss": 0.0364,
"reward": 0.40920490026474,
"reward_std": 0.3160223700106144,
"rewards/exp_len_reward": 0.40920490026474,
"step": 28
},
{
"completion_length": 1802.4777526855469,
"epoch": 0.08592592592592592,
"grad_norm": 0.18816138571640492,
"kl": 0.0001322031021118164,
"learning_rate": 8.529411764705882e-07,
"loss": 0.077,
"reward": 0.5730812773108482,
"reward_std": 0.25979165360331535,
"rewards/exp_len_reward": 0.5730812773108482,
"step": 29
},
{
"completion_length": 2252.700958251953,
"epoch": 0.08888888888888889,
"grad_norm": 0.1563414689939465,
"kl": 0.00017523765563964844,
"learning_rate": 8.823529411764705e-07,
"loss": 0.0581,
"reward": 0.4275534115731716,
"reward_std": 0.3341764882206917,
"rewards/exp_len_reward": 0.4275534115731716,
"step": 30
},
{
"completion_length": 2195.9510192871094,
"epoch": 0.09185185185185185,
"grad_norm": 0.14534598607811303,
"kl": 0.000186920166015625,
"learning_rate": 9.117647058823529e-07,
"loss": 0.0581,
"reward": 0.369928453117609,
"reward_std": 0.2257540374994278,
"rewards/exp_len_reward": 0.369928453117609,
"step": 31
},
{
"completion_length": 2284.1072387695312,
"epoch": 0.09481481481481481,
"grad_norm": 0.13504634261216167,
"kl": 0.0002334117889404297,
"learning_rate": 9.411764705882352e-07,
"loss": 0.0254,
"reward": 0.5308222621679306,
"reward_std": 0.2156723290681839,
"rewards/exp_len_reward": 0.5308222621679306,
"step": 32
},
{
"completion_length": 1720.0759735107422,
"epoch": 0.09777777777777778,
"grad_norm": 0.2262086823960172,
"kl": 0.00020706653594970703,
"learning_rate": 9.705882352941176e-07,
"loss": 0.1291,
"reward": 0.6482533067464828,
"reward_std": 0.23323534801602364,
"rewards/exp_len_reward": 0.6482533067464828,
"step": 33
},
{
"completion_length": 2521.1072387695312,
"epoch": 0.10074074074074074,
"grad_norm": 0.12083975758187475,
"kl": 0.00023365020751953125,
"learning_rate": 1e-06,
"loss": -0.0182,
"reward": 0.37041275948286057,
"reward_std": 0.27552830427885056,
"rewards/exp_len_reward": 0.37041275948286057,
"step": 34
},
{
"completion_length": 2528.1741943359375,
"epoch": 0.1037037037037037,
"grad_norm": 0.12437434921657496,
"kl": 0.0003223419189453125,
"learning_rate": 9.99975812381176e-07,
"loss": 0.015,
"reward": 0.331495076417923,
"reward_std": 0.2619011141359806,
"rewards/exp_len_reward": 0.331495076417923,
"step": 35
},
{
"completion_length": 1851.3170776367188,
"epoch": 0.10666666666666667,
"grad_norm": 0.15925522443930307,
"kl": 0.00037288665771484375,
"learning_rate": 9.999032521248854e-07,
"loss": -0.026,
"reward": 0.4777600094676018,
"reward_std": 0.31441882997751236,
"rewards/exp_len_reward": 0.4777600094676018,
"step": 36
},
{
"completion_length": 1999.2813110351562,
"epoch": 0.10962962962962963,
"grad_norm": 0.1217664172693966,
"kl": 0.0003540515899658203,
"learning_rate": 9.997823270313945e-07,
"loss": -0.0106,
"reward": 0.5654740855097771,
"reward_std": 0.2559405229985714,
"rewards/exp_len_reward": 0.5654740855097771,
"step": 37
},
{
"completion_length": 1974.83935546875,
"epoch": 0.11259259259259259,
"grad_norm": 0.1707475640038693,
"kl": 0.0003287792205810547,
"learning_rate": 9.996130501002146e-07,
"loss": 0.0946,
"reward": 0.4990657716989517,
"reward_std": 0.22302044555544853,
"rewards/exp_len_reward": 0.4990657716989517,
"step": 38
},
{
"completion_length": 2258.2500915527344,
"epoch": 0.11555555555555555,
"grad_norm": 0.13998814580057628,
"kl": 0.00038623809814453125,
"learning_rate": 9.99395439528705e-07,
"loss": 0.0104,
"reward": 0.4020570404827595,
"reward_std": 0.35356171429157257,
"rewards/exp_len_reward": 0.4020570404827595,
"step": 39
},
{
"completion_length": 2405.2233276367188,
"epoch": 0.11851851851851852,
"grad_norm": 0.13044077504292861,
"kl": 0.00035953521728515625,
"learning_rate": 9.991295187101175e-07,
"loss": 0.0227,
"reward": 0.3418276160955429,
"reward_std": 0.30284278094768524,
"rewards/exp_len_reward": 0.3418276160955429,
"step": 40
},
{
"completion_length": 2109.4688720703125,
"epoch": 0.12148148148148148,
"grad_norm": 0.1529163557190866,
"kl": 0.0004558563232421875,
"learning_rate": 9.988153162310798e-07,
"loss": 0.0287,
"reward": 0.3564532473683357,
"reward_std": 0.2458956204354763,
"rewards/exp_len_reward": 0.3564532473683357,
"step": 41
},
{
"completion_length": 2029.4866638183594,
"epoch": 0.12444444444444444,
"grad_norm": 0.17325144566551426,
"kl": 0.000431060791015625,
"learning_rate": 9.98452865868525e-07,
"loss": 0.0635,
"reward": 0.5387638658285141,
"reward_std": 0.20850694179534912,
"rewards/exp_len_reward": 0.5387638658285141,
"step": 42
},
{
"completion_length": 1609.05810546875,
"epoch": 0.1274074074074074,
"grad_norm": 0.18834562491158613,
"kl": 0.0004696846008300781,
"learning_rate": 9.980422065860585e-07,
"loss": 0.0148,
"reward": 0.5655806735157967,
"reward_std": 0.25856464356184006,
"rewards/exp_len_reward": 0.5655806735157967,
"step": 43
},
{
"completion_length": 2234.6385192871094,
"epoch": 0.13037037037037036,
"grad_norm": 0.13146085912536554,
"kl": 0.0006403923034667969,
"learning_rate": 9.975833825297694e-07,
"loss": -0.0197,
"reward": 0.5414331331849098,
"reward_std": 0.2674776539206505,
"rewards/exp_len_reward": 0.5414331331849098,
"step": 44
},
{
"completion_length": 2471.6028442382812,
"epoch": 0.13333333333333333,
"grad_norm": 0.12838849490986115,
"kl": 0.0006742477416992188,
"learning_rate": 9.970764430234865e-07,
"loss": -0.0289,
"reward": 0.4237719103693962,
"reward_std": 0.25649312883615494,
"rewards/exp_len_reward": 0.4237719103693962,
"step": 45
},
{
"completion_length": 2365.5001220703125,
"epoch": 0.1362962962962963,
"grad_norm": 0.14914654023839766,
"kl": 0.0007715225219726562,
"learning_rate": 9.965214425634744e-07,
"loss": 0.0748,
"reward": 0.5114802047610283,
"reward_std": 0.2153051160275936,
"rewards/exp_len_reward": 0.5114802047610283,
"step": 46
},
{
"completion_length": 1474.0982666015625,
"epoch": 0.13925925925925925,
"grad_norm": 0.17420480563840932,
"kl": 0.0010280609130859375,
"learning_rate": 9.959184408125757e-07,
"loss": 0.0243,
"reward": 0.5414484888315201,
"reward_std": 0.23497811146080494,
"rewards/exp_len_reward": 0.5414484888315201,
"step": 47
},
{
"completion_length": 2123.9866943359375,
"epoch": 0.14222222222222222,
"grad_norm": 0.20318427732480254,
"kl": 0.0011425018310546875,
"learning_rate": 9.952675025937969e-07,
"loss": 0.0641,
"reward": 0.3801772743463516,
"reward_std": 0.26251309737563133,
"rewards/exp_len_reward": 0.3801772743463516,
"step": 48
},
{
"completion_length": 2290.9911499023438,
"epoch": 0.1451851851851852,
"grad_norm": 0.16137418781063556,
"kl": 0.0010118484497070312,
"learning_rate": 9.945686978833404e-07,
"loss": 0.0867,
"reward": 0.5667200461030006,
"reward_std": 0.2713882625102997,
"rewards/exp_len_reward": 0.5667200461030006,
"step": 49
},
{
"completion_length": 2380.5492248535156,
"epoch": 0.14814814814814814,
"grad_norm": 0.16965865430432733,
"kl": 0.0011739730834960938,
"learning_rate": 9.938221018030818e-07,
"loss": 0.1294,
"reward": 0.5124416798353195,
"reward_std": 0.25596321001648903,
"rewards/exp_len_reward": 0.5124416798353195,
"step": 50
},
{
"completion_length": 2247.0670776367188,
"epoch": 0.1511111111111111,
"grad_norm": 0.12970462538444932,
"kl": 0.001293182373046875,
"learning_rate": 9.930277946124936e-07,
"loss": 0.0198,
"reward": 0.4916500821709633,
"reward_std": 0.2014484405517578,
"rewards/exp_len_reward": 0.4916500821709633,
"step": 51
},
{
"completion_length": 2249.3125610351562,
"epoch": 0.15407407407407409,
"grad_norm": 0.1622954753109236,
"kl": 0.001811981201171875,
"learning_rate": 9.921858617000186e-07,
"loss": 0.0112,
"reward": 0.6037572771310806,
"reward_std": 0.24519889429211617,
"rewards/exp_len_reward": 0.6037572771310806,
"step": 52
},
{
"completion_length": 2421.6787109375,
"epoch": 0.15703703703703703,
"grad_norm": 0.18286903039083063,
"kl": 0.0019397735595703125,
"learning_rate": 9.912963935738895e-07,
"loss": 0.0844,
"reward": 0.503461018204689,
"reward_std": 0.26762050203979015,
"rewards/exp_len_reward": 0.503461018204689,
"step": 53
},
{
"completion_length": 2319.352813720703,
"epoch": 0.16,
"grad_norm": 0.15710291606441482,
"kl": 0.001514434814453125,
"learning_rate": 9.903594858523993e-07,
"loss": 0.0332,
"reward": 0.560577280819416,
"reward_std": 0.25206807255744934,
"rewards/exp_len_reward": 0.560577280819416,
"step": 54
},
{
"completion_length": 2609.1741638183594,
"epoch": 0.16296296296296298,
"grad_norm": 0.13990344576481042,
"kl": 0.001995086669921875,
"learning_rate": 9.893752392536231e-07,
"loss": 0.0342,
"reward": 0.46819788962602615,
"reward_std": 0.215255219489336,
"rewards/exp_len_reward": 0.46819788962602615,
"step": 55
},
{
"completion_length": 2860.785888671875,
"epoch": 0.16592592592592592,
"grad_norm": 0.12137029587563823,
"kl": 0.00226593017578125,
"learning_rate": 9.883437595845901e-07,
"loss": 0.0001,
"reward": 0.47793612629175186,
"reward_std": 0.21438376046717167,
"rewards/exp_len_reward": 0.47793612629175186,
"step": 56
},
{
"completion_length": 2219.946533203125,
"epoch": 0.1688888888888889,
"grad_norm": 0.16329103001651094,
"kl": 0.003246307373046875,
"learning_rate": 9.872651577299092e-07,
"loss": 0.0075,
"reward": 0.5384046509861946,
"reward_std": 0.22491934522986412,
"rewards/exp_len_reward": 0.5384046509861946,
"step": 57
},
{
"completion_length": 2603.6117553710938,
"epoch": 0.17185185185185184,
"grad_norm": 0.15535812470619684,
"kl": 0.002307891845703125,
"learning_rate": 9.861395496398497e-07,
"loss": 0.0209,
"reward": 0.44340164959430695,
"reward_std": 0.2533542141318321,
"rewards/exp_len_reward": 0.44340164959430695,
"step": 58
},
{
"completion_length": 2278.138458251953,
"epoch": 0.1748148148148148,
"grad_norm": 0.19628078093756599,
"kl": 0.0034637451171875,
"learning_rate": 9.849670563178756e-07,
"loss": 0.0342,
"reward": 0.6375216767191887,
"reward_std": 0.22566119581460953,
"rewards/exp_len_reward": 0.6375216767191887,
"step": 59
},
{
"completion_length": 2012.8616638183594,
"epoch": 0.17777777777777778,
"grad_norm": 0.1834506950176924,
"kl": 0.00319671630859375,
"learning_rate": 9.83747803807638e-07,
"loss": -0.0158,
"reward": 0.6068644598126411,
"reward_std": 0.2145768441259861,
"rewards/exp_len_reward": 0.6068644598126411,
"step": 60
},
{
"completion_length": 2115.3438720703125,
"epoch": 0.18074074074074073,
"grad_norm": 0.15523600283402922,
"kl": 0.003215789794921875,
"learning_rate": 9.82481923179426e-07,
"loss": 0.0184,
"reward": 0.6350785046815872,
"reward_std": 0.18117142282426357,
"rewards/exp_len_reward": 0.6350785046815872,
"step": 61
},
{
"completion_length": 2114.2411193847656,
"epoch": 0.1837037037037037,
"grad_norm": 0.20605772552254983,
"kl": 0.003391265869140625,
"learning_rate": 9.811695505160755e-07,
"loss": 0.1202,
"reward": 0.4864572286605835,
"reward_std": 0.28768010064959526,
"rewards/exp_len_reward": 0.4864572286605835,
"step": 62
},
{
"completion_length": 2333.384063720703,
"epoch": 0.18666666666666668,
"grad_norm": 0.17526874157597688,
"kl": 0.00322723388671875,
"learning_rate": 9.79810826898341e-07,
"loss": 0.0574,
"reward": 0.5434901565313339,
"reward_std": 0.2581823952496052,
"rewards/exp_len_reward": 0.5434901565313339,
"step": 63
},
{
"completion_length": 2263.5625610351562,
"epoch": 0.18962962962962962,
"grad_norm": 0.1678305474956245,
"kl": 0.0043487548828125,
"learning_rate": 9.784058983897284e-07,
"loss": -0.0221,
"reward": 0.5998831987380981,
"reward_std": 0.20506521314382553,
"rewards/exp_len_reward": 0.5998831987380981,
"step": 64
},
{
"completion_length": 2293.4688415527344,
"epoch": 0.1925925925925926,
"grad_norm": 0.12491619040889941,
"kl": 0.003841400146484375,
"learning_rate": 9.769549160207952e-07,
"loss": -0.0029,
"reward": 0.4781326428055763,
"reward_std": 0.24647967144846916,
"rewards/exp_len_reward": 0.4781326428055763,
"step": 65
},
{
"completion_length": 2674.1876220703125,
"epoch": 0.19555555555555557,
"grad_norm": 0.17123713322573883,
"kl": 0.003971099853515625,
"learning_rate": 9.754580357729116e-07,
"loss": 0.0074,
"reward": 0.6343429163098335,
"reward_std": 0.17385547421872616,
"rewards/exp_len_reward": 0.6343429163098335,
"step": 66
},
{
"completion_length": 2128.9553833007812,
"epoch": 0.1985185185185185,
"grad_norm": 0.19102462314359206,
"kl": 0.00537109375,
"learning_rate": 9.739154185614949e-07,
"loss": 0.0521,
"reward": 0.5997566878795624,
"reward_std": 0.25122974812984467,
"rewards/exp_len_reward": 0.5997566878795624,
"step": 67
},
{
"completion_length": 2614.040283203125,
"epoch": 0.20148148148148148,
"grad_norm": 0.13959648308338587,
"kl": 0.005168914794921875,
"learning_rate": 9.723272302187106e-07,
"loss": -0.0082,
"reward": 0.5444743484258652,
"reward_std": 0.22692475281655788,
"rewards/exp_len_reward": 0.5444743484258652,
"step": 68
},
{
"completion_length": 2289.901885986328,
"epoch": 0.20444444444444446,
"grad_norm": 0.19300943423240471,
"kl": 0.004718780517578125,
"learning_rate": 9.706936414756435e-07,
"loss": 0.0497,
"reward": 0.5558685436844826,
"reward_std": 0.2169661819934845,
"rewards/exp_len_reward": 0.5558685436844826,
"step": 69
},
{
"completion_length": 2974.8751220703125,
"epoch": 0.2074074074074074,
"grad_norm": 0.121339310942439,
"kl": 0.004192352294921875,
"learning_rate": 9.69014827943947e-07,
"loss": -0.0285,
"reward": 0.40666233375668526,
"reward_std": 0.2304704710841179,
"rewards/exp_len_reward": 0.40666233375668526,
"step": 70
},
{
"completion_length": 2327.964385986328,
"epoch": 0.21037037037037037,
"grad_norm": 0.14030638320428154,
"kl": 0.0042724609375,
"learning_rate": 9.672909700969612e-07,
"loss": 0.0189,
"reward": 0.5954511985182762,
"reward_std": 0.1756008304655552,
"rewards/exp_len_reward": 0.5954511985182762,
"step": 71
},
{
"completion_length": 2386.6920776367188,
"epoch": 0.21333333333333335,
"grad_norm": 0.15142935641482305,
"kl": 0.0072174072265625,
"learning_rate": 9.65522253250316e-07,
"loss": 0.0134,
"reward": 0.5968082100152969,
"reward_std": 0.24449098855257034,
"rewards/exp_len_reward": 0.5968082100152969,
"step": 72
},
{
"completion_length": 2349.0001220703125,
"epoch": 0.2162962962962963,
"grad_norm": 0.19733194845778884,
"kl": 0.0043792724609375,
"learning_rate": 9.637088675420063e-07,
"loss": 0.0693,
"reward": 0.6912636756896973,
"reward_std": 0.23656904697418213,
"rewards/exp_len_reward": 0.6912636756896973,
"step": 73
},
{
"completion_length": 2502.040283203125,
"epoch": 0.21925925925925926,
"grad_norm": 0.17306578944129042,
"kl": 0.00576019287109375,
"learning_rate": 9.618510079119533e-07,
"loss": 0.0302,
"reward": 0.5406814813613892,
"reward_std": 0.22251397371292114,
"rewards/exp_len_reward": 0.5406814813613892,
"step": 74
},
{
"completion_length": 2595.4509887695312,
"epoch": 0.2222222222222222,
"grad_norm": 0.15937561200793735,
"kl": 0.005767822265625,
"learning_rate": 9.59948874081048e-07,
"loss": -0.0046,
"reward": 0.4833944961428642,
"reward_std": 0.2554270029067993,
"rewards/exp_len_reward": 0.4833944961428642,
"step": 75
},
{
"completion_length": 2212.65185546875,
"epoch": 0.22518518518518518,
"grad_norm": 0.16902423297675423,
"kl": 0.00504302978515625,
"learning_rate": 9.580026705296824e-07,
"loss": 0.0447,
"reward": 0.6849584132432938,
"reward_std": 0.19782762601971626,
"rewards/exp_len_reward": 0.6849584132432938,
"step": 76
},
{
"completion_length": 2508.058074951172,
"epoch": 0.22814814814814816,
"grad_norm": 0.14168432606139286,
"kl": 0.00524139404296875,
"learning_rate": 9.56012606475766e-07,
"loss": -0.02,
"reward": 0.5538096725940704,
"reward_std": 0.1790675725787878,
"rewards/exp_len_reward": 0.5538096725940704,
"step": 77
},
{
"completion_length": 2099.8438720703125,
"epoch": 0.2311111111111111,
"grad_norm": 0.16648937358394447,
"kl": 0.0051116943359375,
"learning_rate": 9.539788958522353e-07,
"loss": 0.0118,
"reward": 0.5786676853895187,
"reward_std": 0.20153935626149178,
"rewards/exp_len_reward": 0.5786676853895187,
"step": 78
},
{
"completion_length": 2088.1697387695312,
"epoch": 0.23407407407407407,
"grad_norm": 0.18793602052290423,
"kl": 0.0059051513671875,
"learning_rate": 9.519017572840562e-07,
"loss": 0.0275,
"reward": 0.6274498999118805,
"reward_std": 0.27273761481046677,
"rewards/exp_len_reward": 0.6274498999118805,
"step": 79
},
{
"completion_length": 2347.9019165039062,
"epoch": 0.23703703703703705,
"grad_norm": 0.1591086590370902,
"kl": 0.005828857421875,
"learning_rate": 9.49781414064722e-07,
"loss": 0.0009,
"reward": 0.5423298478126526,
"reward_std": 0.27614232525229454,
"rewards/exp_len_reward": 0.5423298478126526,
"step": 80
},
{
"completion_length": 2335.3259887695312,
"epoch": 0.24,
"grad_norm": 0.1492302716898933,
"kl": 0.00699615478515625,
"learning_rate": 9.476180941322485e-07,
"loss": -0.0002,
"reward": 0.48675865679979324,
"reward_std": 0.22459113597869873,
"rewards/exp_len_reward": 0.48675865679979324,
"step": 81
},
{
"completion_length": 2585.65185546875,
"epoch": 0.24296296296296296,
"grad_norm": 0.17333826527197865,
"kl": 0.0056304931640625,
"learning_rate": 9.454120300446708e-07,
"loss": 0.0085,
"reward": 0.46361441165208817,
"reward_std": 0.2632403336465359,
"rewards/exp_len_reward": 0.46361441165208817,
"step": 82
},
{
"completion_length": 2431.7322998046875,
"epoch": 0.24592592592592594,
"grad_norm": 0.1602558093279175,
"kl": 0.006927490234375,
"learning_rate": 9.431634589550437e-07,
"loss": -0.0156,
"reward": 0.6010517254471779,
"reward_std": 0.1429296052083373,
"rewards/exp_len_reward": 0.6010517254471779,
"step": 83
},
{
"completion_length": 2337.3037109375,
"epoch": 0.24888888888888888,
"grad_norm": 0.23867947641732404,
"kl": 0.00533294677734375,
"learning_rate": 9.408726225859463e-07,
"loss": 0.0472,
"reward": 0.5745537057518959,
"reward_std": 0.19299127161502838,
"rewards/exp_len_reward": 0.5745537057518959,
"step": 84
},
{
"completion_length": 1958.0089721679688,
"epoch": 0.2518518518518518,
"grad_norm": 0.18352640456742336,
"kl": 0.00751495361328125,
"learning_rate": 9.385397672034984e-07,
"loss": 0.0373,
"reward": 0.5863458216190338,
"reward_std": 0.18559462763369083,
"rewards/exp_len_reward": 0.5863458216190338,
"step": 85
},
{
"completion_length": 2164.3348693847656,
"epoch": 0.2548148148148148,
"grad_norm": 0.1583111043850771,
"kl": 0.0073394775390625,
"learning_rate": 9.361651435908859e-07,
"loss": 0.0039,
"reward": 0.6185845136642456,
"reward_std": 0.23790935426950455,
"rewards/exp_len_reward": 0.6185845136642456,
"step": 86
},
{
"completion_length": 1974.8393859863281,
"epoch": 0.2577777777777778,
"grad_norm": 0.24011600157172625,
"kl": 0.0063018798828125,
"learning_rate": 9.337490070214005e-07,
"loss": 0.0782,
"reward": 0.6514532268047333,
"reward_std": 0.21611288189888,
"rewards/exp_len_reward": 0.6514532268047333,
"step": 87
},
{
"completion_length": 2044.7188415527344,
"epoch": 0.2607407407407407,
"grad_norm": 0.16828102239112172,
"kl": 0.0050811767578125,
"learning_rate": 9.312916172309998e-07,
"loss": 0.0209,
"reward": 0.6684047281742096,
"reward_std": 0.26073355227708817,
"rewards/exp_len_reward": 0.6684047281742096,
"step": 88
},
{
"completion_length": 2310.620635986328,
"epoch": 0.2637037037037037,
"grad_norm": 0.14135906575873353,
"kl": 0.0082855224609375,
"learning_rate": 9.287932383903842e-07,
"loss": -0.0474,
"reward": 0.580662876367569,
"reward_std": 0.16773580014705658,
"rewards/exp_len_reward": 0.580662876367569,
"step": 89
},
{
"completion_length": 2240.1117553710938,
"epoch": 0.26666666666666666,
"grad_norm": 0.1981410777840034,
"kl": 0.00653839111328125,
"learning_rate": 9.262541390765981e-07,
"loss": 0.0485,
"reward": 0.5460301488637924,
"reward_std": 0.20029782131314278,
"rewards/exp_len_reward": 0.5460301488637924,
"step": 90
},
{
"completion_length": 2113.919708251953,
"epoch": 0.2696296296296296,
"grad_norm": 0.17969252200032312,
"kl": 0.00701141357421875,
"learning_rate": 9.236745922441589e-07,
"loss": 0.0784,
"reward": 0.6915992498397827,
"reward_std": 0.19186005368828773,
"rewards/exp_len_reward": 0.6915992498397827,
"step": 91
},
{
"completion_length": 2258.0224609375,
"epoch": 0.2725925925925926,
"grad_norm": 0.19839390162400178,
"kl": 0.00594329833984375,
"learning_rate": 9.210548751957133e-07,
"loss": 0.0849,
"reward": 0.6840342581272125,
"reward_std": 0.24512023478746414,
"rewards/exp_len_reward": 0.6840342581272125,
"step": 92
},
{
"completion_length": 2275.4107971191406,
"epoch": 0.27555555555555555,
"grad_norm": 0.25858993059609997,
"kl": 0.0076751708984375,
"learning_rate": 9.183952695522273e-07,
"loss": 0.0955,
"reward": 0.685549259185791,
"reward_std": 0.23211624845862389,
"rewards/exp_len_reward": 0.685549259185791,
"step": 93
},
{
"completion_length": 2360.0581665039062,
"epoch": 0.2785185185185185,
"grad_norm": 0.19051489587675371,
"kl": 0.00785064697265625,
"learning_rate": 9.156960612227125e-07,
"loss": 0.0796,
"reward": 0.5155021622776985,
"reward_std": 0.2668054960668087,
"rewards/exp_len_reward": 0.5155021622776985,
"step": 94
},
{
"completion_length": 2579.8260192871094,
"epoch": 0.2814814814814815,
"grad_norm": 0.13837089717937884,
"kl": 0.00725555419921875,
"learning_rate": 9.129575403734897e-07,
"loss": 0.0142,
"reward": 0.5060503482818604,
"reward_std": 0.23962176218628883,
"rewards/exp_len_reward": 0.5060503482818604,
"step": 95
},
{
"completion_length": 2322.634002685547,
"epoch": 0.28444444444444444,
"grad_norm": 0.19969897102985795,
"kl": 0.00792694091796875,
"learning_rate": 9.101800013969962e-07,
"loss": 0.0381,
"reward": 0.5826811380684376,
"reward_std": 0.21555687859654427,
"rewards/exp_len_reward": 0.5826811380684376,
"step": 96
},
{
"completion_length": 2316.5670776367188,
"epoch": 0.2874074074074074,
"grad_norm": 0.16955049457906315,
"kl": 0.00732421875,
"learning_rate": 9.07363742880139e-07,
"loss": -0.0193,
"reward": 0.6406743228435516,
"reward_std": 0.2392715960741043,
"rewards/exp_len_reward": 0.6406743228435516,
"step": 97
},
{
"completion_length": 2166.6920471191406,
"epoch": 0.2903703703703704,
"grad_norm": 0.16741896253103236,
"kl": 0.0088043212890625,
"learning_rate": 9.045090675721959e-07,
"loss": 0.0393,
"reward": 0.6196507066488266,
"reward_std": 0.2079339139163494,
"rewards/exp_len_reward": 0.6196507066488266,
"step": 98
},
{
"completion_length": 2266.2010192871094,
"epoch": 0.29333333333333333,
"grad_norm": 0.19630807827817112,
"kl": 0.008056640625,
"learning_rate": 9.016162823522701e-07,
"loss": -0.0373,
"reward": 0.6069852858781815,
"reward_std": 0.2058863341808319,
"rewards/exp_len_reward": 0.6069852858781815,
"step": 99
},
{
"completion_length": 2042.7992248535156,
"epoch": 0.2962962962962963,
"grad_norm": 0.22170963608651797,
"kl": 0.0109100341796875,
"learning_rate": 8.986856981963004e-07,
"loss": 0.0303,
"reward": 0.6604138612747192,
"reward_std": 0.20395291596651077,
"rewards/exp_len_reward": 0.6604138612747192,
"step": 100
},
{
"completion_length": 2354.2366943359375,
"epoch": 0.2992592592592593,
"grad_norm": 0.17072493336044253,
"kl": 0.0085601806640625,
"learning_rate": 8.957176301436312e-07,
"loss": 0.0235,
"reward": 0.5688716098666191,
"reward_std": 0.21174464374780655,
"rewards/exp_len_reward": 0.5688716098666191,
"step": 101
},
{
"completion_length": 2221.0625915527344,
"epoch": 0.3022222222222222,
"grad_norm": 0.20959585853001003,
"kl": 0.00896453857421875,
"learning_rate": 8.927123972631457e-07,
"loss": 0.0744,
"reward": 0.6062222719192505,
"reward_std": 0.22734928503632545,
"rewards/exp_len_reward": 0.6062222719192505,
"step": 102
},
{
"completion_length": 2169.2947692871094,
"epoch": 0.30518518518518517,
"grad_norm": 0.16321546987755797,
"kl": 0.00870513916015625,
"learning_rate": 8.896703226189656e-07,
"loss": 0.0455,
"reward": 0.6819661110639572,
"reward_std": 0.19446462020277977,
"rewards/exp_len_reward": 0.6819661110639572,
"step": 103
},
{
"completion_length": 2295.540252685547,
"epoch": 0.30814814814814817,
"grad_norm": 0.15015615986614694,
"kl": 0.00914764404296875,
"learning_rate": 8.865917332357217e-07,
"loss": 0.0034,
"reward": 0.5188455395400524,
"reward_std": 0.19476320780813694,
"rewards/exp_len_reward": 0.5188455395400524,
"step": 104
},
{
"completion_length": 2407.134033203125,
"epoch": 0.3111111111111111,
"grad_norm": 0.16080385779787676,
"kl": 0.008636474609375,
"learning_rate": 8.834769600633986e-07,
"loss": 0.0888,
"reward": 0.5705942884087563,
"reward_std": 0.2538597658276558,
"rewards/exp_len_reward": 0.5705942884087563,
"step": 105
},
{
"completion_length": 1935.7322387695312,
"epoch": 0.31407407407407406,
"grad_norm": 0.17954312631984032,
"kl": 0.00833892822265625,
"learning_rate": 8.803263379417572e-07,
"loss": -0.0044,
"reward": 0.5462605357170105,
"reward_std": 0.2077418938279152,
"rewards/exp_len_reward": 0.5462605357170105,
"step": 106
},
{
"completion_length": 2878.9197998046875,
"epoch": 0.31703703703703706,
"grad_norm": 0.1997205984738251,
"kl": 0.0114898681640625,
"learning_rate": 8.771402055643391e-07,
"loss": 0.0063,
"reward": 0.5251818224787712,
"reward_std": 0.18359562009572983,
"rewards/exp_len_reward": 0.5251818224787712,
"step": 107
},
{
"completion_length": 2381.1295776367188,
"epoch": 0.32,
"grad_norm": 0.15668192052340332,
"kl": 0.00910186767578125,
"learning_rate": 8.73918905442058e-07,
"loss": -0.0058,
"reward": 0.6536043733358383,
"reward_std": 0.1777793299406767,
"rewards/exp_len_reward": 0.6536043733358383,
"step": 108
},
{
"completion_length": 2140.8616943359375,
"epoch": 0.32296296296296295,
"grad_norm": 0.1908188055608542,
"kl": 0.0096588134765625,
"learning_rate": 8.706627838663782e-07,
"loss": -0.0087,
"reward": 0.5826982110738754,
"reward_std": 0.23715216293931007,
"rewards/exp_len_reward": 0.5826982110738754,
"step": 109
},
{
"completion_length": 2211.8750610351562,
"epoch": 0.32592592592592595,
"grad_norm": 0.17454162730160738,
"kl": 0.0082244873046875,
"learning_rate": 8.673721908720884e-07,
"loss": 0.0936,
"reward": 0.6324276328086853,
"reward_std": 0.19550849869847298,
"rewards/exp_len_reward": 0.6324276328086853,
"step": 110
},
{
"completion_length": 1898.7500915527344,
"epoch": 0.3288888888888889,
"grad_norm": 0.19019621715427518,
"kl": 0.00946807861328125,
"learning_rate": 8.640474801996732e-07,
"loss": 0.0601,
"reward": 0.706642210483551,
"reward_std": 0.13929060846567154,
"rewards/exp_len_reward": 0.706642210483551,
"step": 111
},
{
"completion_length": 2158.0179443359375,
"epoch": 0.33185185185185184,
"grad_norm": 0.1803886062628602,
"kl": 0.008941650390625,
"learning_rate": 8.606890092572861e-07,
"loss": 0.0214,
"reward": 0.5730146244168282,
"reward_std": 0.28394924849271774,
"rewards/exp_len_reward": 0.5730146244168282,
"step": 112
},
{
"completion_length": 1975.2322082519531,
"epoch": 0.3348148148148148,
"grad_norm": 0.26887475456345783,
"kl": 0.012176513671875,
"learning_rate": 8.572971390823266e-07,
"loss": 0.09,
"reward": 0.6306671500205994,
"reward_std": 0.236881572753191,
"rewards/exp_len_reward": 0.6306671500205994,
"step": 113
},
{
"completion_length": 2393.74560546875,
"epoch": 0.3377777777777778,
"grad_norm": 0.1680385692386688,
"kl": 0.00844573974609375,
"learning_rate": 8.538722343026302e-07,
"loss": 0.0391,
"reward": 0.403280146420002,
"reward_std": 0.22043467685580254,
"rewards/exp_len_reward": 0.403280146420002,
"step": 114
},
{
"completion_length": 1869.9822540283203,
"epoch": 0.34074074074074073,
"grad_norm": 0.2100589530247618,
"kl": 0.00864410400390625,
"learning_rate": 8.50414663097269e-07,
"loss": 0.0856,
"reward": 0.7126432359218597,
"reward_std": 0.2112839464098215,
"rewards/exp_len_reward": 0.7126432359218597,
"step": 115
},
{
"completion_length": 2621.9688720703125,
"epoch": 0.3437037037037037,
"grad_norm": 0.1833213887219074,
"kl": 0.0106201171875,
"learning_rate": 8.46924797156974e-07,
"loss": 0.0355,
"reward": 0.4800976812839508,
"reward_std": 0.2597590982913971,
"rewards/exp_len_reward": 0.4800976812839508,
"step": 116
},
{
"completion_length": 2430.9598999023438,
"epoch": 0.3466666666666667,
"grad_norm": 0.1777526950884752,
"kl": 0.0114288330078125,
"learning_rate": 8.434030116441765e-07,
"loss": -0.0294,
"reward": 0.45774422585964203,
"reward_std": 0.14171775989234447,
"rewards/exp_len_reward": 0.45774422585964203,
"step": 117
},
{
"completion_length": 2442.321533203125,
"epoch": 0.3496296296296296,
"grad_norm": 0.1793512093791192,
"kl": 0.0110931396484375,
"learning_rate": 8.39849685152679e-07,
"loss": 0.0494,
"reward": 0.5156404674053192,
"reward_std": 0.23575026541948318,
"rewards/exp_len_reward": 0.5156404674053192,
"step": 118
},
{
"completion_length": 1924.6116638183594,
"epoch": 0.35259259259259257,
"grad_norm": 0.28968627912789613,
"kl": 0.007965087890625,
"learning_rate": 8.36265199666956e-07,
"loss": 0.1686,
"reward": 0.7478295713663101,
"reward_std": 0.1679641492664814,
"rewards/exp_len_reward": 0.7478295713663101,
"step": 119
},
{
"completion_length": 1653.5670471191406,
"epoch": 0.35555555555555557,
"grad_norm": 0.21781918537576359,
"kl": 0.00855255126953125,
"learning_rate": 8.326499405210902e-07,
"loss": 0.0401,
"reward": 0.6878243908286095,
"reward_std": 0.19577785581350327,
"rewards/exp_len_reward": 0.6878243908286095,
"step": 120
},
{
"completion_length": 2593.0269165039062,
"epoch": 0.3585185185185185,
"grad_norm": 0.15518499757489337,
"kl": 0.0120086669921875,
"learning_rate": 8.290042963573488e-07,
"loss": 0.011,
"reward": 0.5845073834061623,
"reward_std": 0.23032733984291553,
"rewards/exp_len_reward": 0.5845073834061623,
"step": 121
},
{
"completion_length": 1934.8036499023438,
"epoch": 0.36148148148148146,
"grad_norm": 0.3199511661475365,
"kl": 0.009521484375,
"learning_rate": 8.25328659084405e-07,
"loss": 0.1144,
"reward": 0.6791598200798035,
"reward_std": 0.17451436072587967,
"rewards/exp_len_reward": 0.6791598200798035,
"step": 122
},
{
"completion_length": 2042.8750915527344,
"epoch": 0.36444444444444446,
"grad_norm": 0.1947938475896873,
"kl": 0.0099334716796875,
"learning_rate": 8.216234238352065e-07,
"loss": 0.0709,
"reward": 0.7000842541456223,
"reward_std": 0.25582029670476913,
"rewards/exp_len_reward": 0.7000842541456223,
"step": 123
},
{
"completion_length": 1863.2098999023438,
"epoch": 0.3674074074074074,
"grad_norm": 0.2066827286740214,
"kl": 0.0110931396484375,
"learning_rate": 8.178889889244996e-07,
"loss": 0.0224,
"reward": 0.5978061109781265,
"reward_std": 0.17980634421110153,
"rewards/exp_len_reward": 0.5978061109781265,
"step": 124
},
{
"completion_length": 1728.6876220703125,
"epoch": 0.37037037037037035,
"grad_norm": 0.1729347976132675,
"kl": 0.0093841552734375,
"learning_rate": 8.141257558060092e-07,
"loss": 0.0214,
"reward": 0.7174255400896072,
"reward_std": 0.1986971478909254,
"rewards/exp_len_reward": 0.7174255400896072,
"step": 125
},
{
"completion_length": 2421.3482666015625,
"epoch": 0.37333333333333335,
"grad_norm": 0.1407316534343894,
"kl": 0.0113677978515625,
"learning_rate": 8.103341290292833e-07,
"loss": 0.0084,
"reward": 0.5087610557675362,
"reward_std": 0.2041846662759781,
"rewards/exp_len_reward": 0.5087610557675362,
"step": 126
},
{
"completion_length": 1954.4152526855469,
"epoch": 0.3762962962962963,
"grad_norm": 0.19211843182621693,
"kl": 0.0105438232421875,
"learning_rate": 8.065145161962021e-07,
"loss": 0.0738,
"reward": 0.6312553137540817,
"reward_std": 0.14810450747609138,
"rewards/exp_len_reward": 0.6312553137540817,
"step": 127
},
{
"completion_length": 2153.8795776367188,
"epoch": 0.37925925925925924,
"grad_norm": 0.170330613639256,
"kl": 0.0113677978515625,
"learning_rate": 8.02667327917163e-07,
"loss": 0.0318,
"reward": 0.6820006817579269,
"reward_std": 0.19498692452907562,
"rewards/exp_len_reward": 0.6820006817579269,
"step": 128
},
{
"completion_length": 2104.5179138183594,
"epoch": 0.38222222222222224,
"grad_norm": 0.2463458601299465,
"kl": 0.0134429931640625,
"learning_rate": 7.987929777669372e-07,
"loss": 0.0701,
"reward": 0.6237293034791946,
"reward_std": 0.22337394580245018,
"rewards/exp_len_reward": 0.6237293034791946,
"step": 129
},
{
"completion_length": 2244.7634887695312,
"epoch": 0.3851851851851852,
"grad_norm": 0.21232997470712528,
"kl": 0.0127105712890625,
"learning_rate": 7.948918822402123e-07,
"loss": -0.01,
"reward": 0.5622997805476189,
"reward_std": 0.21293797343969345,
"rewards/exp_len_reward": 0.5622997805476189,
"step": 130
},
{
"completion_length": 2267.8349609375,
"epoch": 0.38814814814814813,
"grad_norm": 0.18491480276734742,
"kl": 0.014862060546875,
"learning_rate": 7.909644607068174e-07,
"loss": -0.0161,
"reward": 0.5492302775382996,
"reward_std": 0.19176549836993217,
"rewards/exp_len_reward": 0.5492302775382996,
"step": 131
},
{
"completion_length": 2307.584930419922,
"epoch": 0.39111111111111113,
"grad_norm": 0.18853458559326913,
"kl": 0.0153656005859375,
"learning_rate": 7.870111353666414e-07,
"loss": 0.0551,
"reward": 0.5810948982834816,
"reward_std": 0.21469852700829506,
"rewards/exp_len_reward": 0.5810948982834816,
"step": 132
},
{
"completion_length": 2127.2501525878906,
"epoch": 0.3940740740740741,
"grad_norm": 0.2079768750933827,
"kl": 0.0130462646484375,
"learning_rate": 7.830323312042464e-07,
"loss": 0.0753,
"reward": 0.5947326272726059,
"reward_std": 0.2402301263064146,
"rewards/exp_len_reward": 0.5947326272726059,
"step": 133
},
{
"completion_length": 2028.3483276367188,
"epoch": 0.397037037037037,
"grad_norm": 0.21498398205311295,
"kl": 0.01544189453125,
"learning_rate": 7.790284759431809e-07,
"loss": 0.0471,
"reward": 0.6623428612947464,
"reward_std": 0.16073044575750828,
"rewards/exp_len_reward": 0.6623428612947464,
"step": 134
},
{
"completion_length": 2291.884002685547,
"epoch": 0.4,
"grad_norm": 0.1715810834745006,
"kl": 0.01556396484375,
"learning_rate": 7.75e-07,
"loss": 0.0255,
"reward": 0.5680599883198738,
"reward_std": 0.2503676153719425,
"rewards/exp_len_reward": 0.5680599883198738,
"step": 135
},
{
"completion_length": 2042.2545776367188,
"epoch": 0.40296296296296297,
"grad_norm": 0.2520458310084413,
"kl": 0.012939453125,
"learning_rate": 7.709473364379949e-07,
"loss": 0.0913,
"reward": 0.5720359832048416,
"reward_std": 0.2734212428331375,
"rewards/exp_len_reward": 0.5720359832048416,
"step": 136
},
{
"completion_length": 1867.6385192871094,
"epoch": 0.4059259259259259,
"grad_norm": 0.1612993062652069,
"kl": 0.0109100341796875,
"learning_rate": 7.668709209206391e-07,
"loss": 0.0005,
"reward": 0.6729157119989395,
"reward_std": 0.24489626288414001,
"rewards/exp_len_reward": 0.6729157119989395,
"step": 137
},
{
"completion_length": 2194.866180419922,
"epoch": 0.4088888888888889,
"grad_norm": 0.1838415012898094,
"kl": 0.01513671875,
"learning_rate": 7.627711916647531e-07,
"loss": 0.0393,
"reward": 0.616047129034996,
"reward_std": 0.241153996437788,
"rewards/exp_len_reward": 0.616047129034996,
"step": 138
},
{
"completion_length": 2489.272430419922,
"epoch": 0.41185185185185186,
"grad_norm": 0.15970472042816589,
"kl": 0.016876220703125,
"learning_rate": 7.586485893933972e-07,
"loss": -0.0079,
"reward": 0.6617710441350937,
"reward_std": 0.21483541280031204,
"rewards/exp_len_reward": 0.6617710441350937,
"step": 139
},
{
"completion_length": 2220.1385192871094,
"epoch": 0.4148148148148148,
"grad_norm": 0.18462009649943625,
"kl": 0.0157470703125,
"learning_rate": 7.545035572884928e-07,
"loss": 0.0096,
"reward": 0.48423583060503006,
"reward_std": 0.26414601504802704,
"rewards/exp_len_reward": 0.48423583060503006,
"step": 140
},
{
"completion_length": 2394.5001220703125,
"epoch": 0.4177777777777778,
"grad_norm": 0.17730147014895184,
"kl": 0.017822265625,
"learning_rate": 7.503365409431801e-07,
"loss": 0.0339,
"reward": 0.6387054920196533,
"reward_std": 0.17836992628872395,
"rewards/exp_len_reward": 0.6387054920196533,
"step": 141
},
{
"completion_length": 1699.62060546875,
"epoch": 0.42074074074074075,
"grad_norm": 0.27025204836335937,
"kl": 0.012969970703125,
"learning_rate": 7.46147988313917e-07,
"loss": 0.0775,
"reward": 0.6824662685394287,
"reward_std": 0.27711663395166397,
"rewards/exp_len_reward": 0.6824662685394287,
"step": 142
},
{
"completion_length": 2110.464385986328,
"epoch": 0.4237037037037037,
"grad_norm": 0.20686431019260695,
"kl": 0.0187225341796875,
"learning_rate": 7.419383496723229e-07,
"loss": 0.0448,
"reward": 0.6168643683195114,
"reward_std": 0.24179954081773758,
"rewards/exp_len_reward": 0.6168643683195114,
"step": 143
},
{
"completion_length": 2662.9599609375,
"epoch": 0.4266666666666667,
"grad_norm": 0.18577680951510714,
"kl": 0.021453857421875,
"learning_rate": 7.377080775567751e-07,
"loss": 0.0196,
"reward": 0.4251635745167732,
"reward_std": 0.24519287049770355,
"rewards/exp_len_reward": 0.4251635745167732,
"step": 144
},
{
"completion_length": 1741.3394165039062,
"epoch": 0.42962962962962964,
"grad_norm": 0.181100819553374,
"kl": 0.015777587890625,
"learning_rate": 7.334576267237599e-07,
"loss": 0.0253,
"reward": 0.653263047337532,
"reward_std": 0.21376236528158188,
"rewards/exp_len_reward": 0.653263047337532,
"step": 145
},
{
"completion_length": 2283.5670776367188,
"epoch": 0.4325925925925926,
"grad_norm": 0.20288310670933243,
"kl": 0.017974853515625,
"learning_rate": 7.291874540989869e-07,
"loss": 0.063,
"reward": 0.6847885400056839,
"reward_std": 0.22203149646520615,
"rewards/exp_len_reward": 0.6847885400056839,
"step": 146
},
{
"completion_length": 2459.8483276367188,
"epoch": 0.43555555555555553,
"grad_norm": 0.24617751418167974,
"kl": 0.023956298828125,
"learning_rate": 7.248980187282679e-07,
"loss": 0.0514,
"reward": 0.5492689982056618,
"reward_std": 0.29514792189002037,
"rewards/exp_len_reward": 0.5492689982056618,
"step": 147
},
{
"completion_length": 2116.6295776367188,
"epoch": 0.43851851851851853,
"grad_norm": 0.20910355061187821,
"kl": 0.018707275390625,
"learning_rate": 7.205897817281707e-07,
"loss": -0.0376,
"reward": 0.562318354845047,
"reward_std": 0.20038180239498615,
"rewards/exp_len_reward": 0.562318354845047,
"step": 148
},
{
"completion_length": 1637.2054138183594,
"epoch": 0.4414814814814815,
"grad_norm": 0.3052837937915986,
"kl": 0.0164947509765625,
"learning_rate": 7.162632062364482e-07,
"loss": 0.0718,
"reward": 0.6785788387060165,
"reward_std": 0.28277434036135674,
"rewards/exp_len_reward": 0.6785788387060165,
"step": 149
},
{
"completion_length": 2099.5491943359375,
"epoch": 0.4444444444444444,
"grad_norm": 0.21016234434727396,
"kl": 0.02056884765625,
"learning_rate": 7.119187573622503e-07,
"loss": 0.0004,
"reward": 0.5978891626000404,
"reward_std": 0.21479224599897861,
"rewards/exp_len_reward": 0.5978891626000404,
"step": 150
},
{
"completion_length": 1984.9420776367188,
"epoch": 0.4474074074074074,
"grad_norm": 0.17085682034823446,
"kl": 0.017852783203125,
"learning_rate": 7.075569021361258e-07,
"loss": 0.0227,
"reward": 0.5734822899103165,
"reward_std": 0.24261553958058357,
"rewards/exp_len_reward": 0.5734822899103165,
"step": 151
},
{
"completion_length": 2491.6206970214844,
"epoch": 0.45037037037037037,
"grad_norm": 0.20118499173178536,
"kl": 0.0238037109375,
"learning_rate": 7.031781094598147e-07,
"loss": 0.0491,
"reward": 0.4708263725042343,
"reward_std": 0.288173146545887,
"rewards/exp_len_reward": 0.4708263725042343,
"step": 152
},
{
"completion_length": 2146.6563415527344,
"epoch": 0.4533333333333333,
"grad_norm": 0.20235562160674747,
"kl": 0.0205841064453125,
"learning_rate": 6.987828500558422e-07,
"loss": 0.0515,
"reward": 0.615074560046196,
"reward_std": 0.2827143333852291,
"rewards/exp_len_reward": 0.615074560046196,
"step": 153
},
{
"completion_length": 1729.3304443359375,
"epoch": 0.4562962962962963,
"grad_norm": 0.19165529870144216,
"kl": 0.016082763671875,
"learning_rate": 6.943715964169153e-07,
"loss": -0.0319,
"reward": 0.59318608045578,
"reward_std": 0.21333957836031914,
"rewards/exp_len_reward": 0.59318608045578,
"step": 154
},
{
"completion_length": 2026.9911499023438,
"epoch": 0.45925925925925926,
"grad_norm": 0.26603840660660094,
"kl": 0.017608642578125,
"learning_rate": 6.899448227551302e-07,
"loss": 0.1068,
"reward": 0.6835269778966904,
"reward_std": 0.22044039890170097,
"rewards/exp_len_reward": 0.6835269778966904,
"step": 155
},
{
"completion_length": 2220.4911499023438,
"epoch": 0.4622222222222222,
"grad_norm": 0.18158002675146767,
"kl": 0.01971435546875,
"learning_rate": 6.85503004950993e-07,
"loss": 0.0614,
"reward": 0.5589229390025139,
"reward_std": 0.21225098706781864,
"rewards/exp_len_reward": 0.5589229390025139,
"step": 156
},
{
"completion_length": 1817.1831359863281,
"epoch": 0.4651851851851852,
"grad_norm": 0.24149007108576753,
"kl": 0.017913818359375,
"learning_rate": 6.810466205022635e-07,
"loss": 0.0515,
"reward": 0.539856381714344,
"reward_std": 0.26797987148165703,
"rewards/exp_len_reward": 0.539856381714344,
"step": 157
},
{
"completion_length": 2117.3929443359375,
"epoch": 0.46814814814814815,
"grad_norm": 0.25938118733261895,
"kl": 0.02740478515625,
"learning_rate": 6.765761484726232e-07,
"loss": 0.0564,
"reward": 0.5961438491940498,
"reward_std": 0.24437472596764565,
"rewards/exp_len_reward": 0.5961438491940498,
"step": 158
},
{
"completion_length": 2225.0179443359375,
"epoch": 0.4711111111111111,
"grad_norm": 0.1700079832657491,
"kl": 0.0214080810546875,
"learning_rate": 6.720920694401765e-07,
"loss": 0.0528,
"reward": 0.5992478281259537,
"reward_std": 0.28642022609710693,
"rewards/exp_len_reward": 0.5992478281259537,
"step": 159
},
{
"completion_length": 1754.8750915527344,
"epoch": 0.4740740740740741,
"grad_norm": 0.17436341962581006,
"kl": 0.020294189453125,
"learning_rate": 6.675948654457873e-07,
"loss": 0.0133,
"reward": 0.5765155255794525,
"reward_std": 0.18946415930986404,
"rewards/exp_len_reward": 0.5765155255794525,
"step": 160
},
{
"completion_length": 1651.9732971191406,
"epoch": 0.47703703703703704,
"grad_norm": 0.27021245157611135,
"kl": 0.01971435546875,
"learning_rate": 6.6308501994126e-07,
"loss": 0.0449,
"reward": 0.6996115148067474,
"reward_std": 0.19840912148356438,
"rewards/exp_len_reward": 0.6996115148067474,
"step": 161
},
{
"completion_length": 1784.4465026855469,
"epoch": 0.48,
"grad_norm": 0.18201737347411745,
"kl": 0.0213623046875,
"learning_rate": 6.585630177373679e-07,
"loss": 0.0101,
"reward": 0.6633335798978806,
"reward_std": 0.28136105462908745,
"rewards/exp_len_reward": 0.6633335798978806,
"step": 162
},
{
"completion_length": 2037.9107971191406,
"epoch": 0.482962962962963,
"grad_norm": 0.18837562477978834,
"kl": 0.026611328125,
"learning_rate": 6.540293449517364e-07,
"loss": -0.008,
"reward": 0.5584470629692078,
"reward_std": 0.2016864065080881,
"rewards/exp_len_reward": 0.5584470629692078,
"step": 163
},
{
"completion_length": 1733.7366943359375,
"epoch": 0.48592592592592593,
"grad_norm": 0.2840070123751475,
"kl": 0.0224151611328125,
"learning_rate": 6.494844889565838e-07,
"loss": 0.0569,
"reward": 0.6604474782943726,
"reward_std": 0.25152015686035156,
"rewards/exp_len_reward": 0.6604474782943726,
"step": 164
},
{
"completion_length": 2190.232208251953,
"epoch": 0.4888888888888889,
"grad_norm": 0.2130269522908582,
"kl": 0.03131103515625,
"learning_rate": 6.449289383263299e-07,
"loss": 0.0263,
"reward": 0.587119996547699,
"reward_std": 0.18609843030571938,
"rewards/exp_len_reward": 0.587119996547699,
"step": 165
},
{
"completion_length": 1967.9108276367188,
"epoch": 0.4918518518518519,
"grad_norm": 0.23106631062493835,
"kl": 0.026458740234375,
"learning_rate": 6.403631827850733e-07,
"loss": 0.0458,
"reward": 0.6920860558748245,
"reward_std": 0.2177874594926834,
"rewards/exp_len_reward": 0.6920860558748245,
"step": 166
},
{
"completion_length": 1848.1875915527344,
"epoch": 0.4948148148148148,
"grad_norm": 0.2557642214220052,
"kl": 0.025726318359375,
"learning_rate": 6.357877131539459e-07,
"loss": 0.0119,
"reward": 0.5580969974398613,
"reward_std": 0.2476295307278633,
"rewards/exp_len_reward": 0.5580969974398613,
"step": 167
},
{
"completion_length": 1784.1741333007812,
"epoch": 0.49777777777777776,
"grad_norm": 0.2020034406354174,
"kl": 0.02728271484375,
"learning_rate": 6.312030212983492e-07,
"loss": 0.009,
"reward": 0.6462460905313492,
"reward_std": 0.27480896189808846,
"rewards/exp_len_reward": 0.6462460905313492,
"step": 168
},
{
"completion_length": 1966.8393249511719,
"epoch": 0.5007407407407407,
"grad_norm": 0.20624757112600367,
"kl": 0.030059814453125,
"learning_rate": 6.266096000750794e-07,
"loss": 0.0173,
"reward": 0.6448132321238518,
"reward_std": 0.2088510636240244,
"rewards/exp_len_reward": 0.6448132321238518,
"step": 169
},
{
"completion_length": 1901.9420166015625,
"epoch": 0.5037037037037037,
"grad_norm": 0.2570931569292836,
"kl": 0.028076171875,
"learning_rate": 6.220079432793434e-07,
"loss": 0.0486,
"reward": 0.5949899107217789,
"reward_std": 0.22542590275406837,
"rewards/exp_len_reward": 0.5949899107217789,
"step": 170
},
{
"completion_length": 2003.0358276367188,
"epoch": 0.5066666666666667,
"grad_norm": 0.20834827415235962,
"kl": 0.035552978515625,
"learning_rate": 6.173985455916767e-07,
"loss": 0.0339,
"reward": 0.5371049828827381,
"reward_std": 0.2185331992805004,
"rewards/exp_len_reward": 0.5371049828827381,
"step": 171
},
{
"completion_length": 2454.7857971191406,
"epoch": 0.5096296296296297,
"grad_norm": 0.19172881294229105,
"kl": 0.042144775390625,
"learning_rate": 6.127819025247654e-07,
"loss": 0.0363,
"reward": 0.5926978290081024,
"reward_std": 0.2542005889117718,
"rewards/exp_len_reward": 0.5926978290081024,
"step": 172
},
{
"completion_length": 1779.415283203125,
"epoch": 0.5125925925925926,
"grad_norm": 0.2287442270331349,
"kl": 0.03466796875,
"learning_rate": 6.081585103701769e-07,
"loss": 0.0649,
"reward": 0.7005281001329422,
"reward_std": 0.20357034727931023,
"rewards/exp_len_reward": 0.7005281001329422,
"step": 173
},
{
"completion_length": 2120.3482666015625,
"epoch": 0.5155555555555555,
"grad_norm": 0.287867669455274,
"kl": 0.039642333984375,
"learning_rate": 6.0352886614501e-07,
"loss": 0.0649,
"reward": 0.6480746418237686,
"reward_std": 0.23573359474539757,
"rewards/exp_len_reward": 0.6480746418237686,
"step": 174
},
{
"completion_length": 1984.0626220703125,
"epoch": 0.5185185185185185,
"grad_norm": 0.22117625934907972,
"kl": 0.038360595703125,
"learning_rate": 5.988934675384635e-07,
"loss": 0.0294,
"reward": 0.6022319048643112,
"reward_std": 0.24885358661413193,
"rewards/exp_len_reward": 0.6022319048643112,
"step": 175
},
{
"completion_length": 1547.2277526855469,
"epoch": 0.5214814814814814,
"grad_norm": 0.23764266854092705,
"kl": 0.031890869140625,
"learning_rate": 5.942528128583356e-07,
"loss": -0.0127,
"reward": 0.581740252673626,
"reward_std": 0.2612038552761078,
"rewards/exp_len_reward": 0.581740252673626,
"step": 176
},
{
"completion_length": 1882.9866943359375,
"epoch": 0.5244444444444445,
"grad_norm": 0.257891426495327,
"kl": 0.036163330078125,
"learning_rate": 5.896074009774554e-07,
"loss": 0.08,
"reward": 0.6895754784345627,
"reward_std": 0.18230854347348213,
"rewards/exp_len_reward": 0.6895754784345627,
"step": 177
},
{
"completion_length": 1950.5625610351562,
"epoch": 0.5274074074074074,
"grad_norm": 0.23376992182966935,
"kl": 0.033843994140625,
"learning_rate": 5.849577312800529e-07,
"loss": 0.0192,
"reward": 0.6687831208109856,
"reward_std": 0.2672557160258293,
"rewards/exp_len_reward": 0.6687831208109856,
"step": 178
},
{
"completion_length": 1712.3616638183594,
"epoch": 0.5303703703703704,
"grad_norm": 0.22585330405547638,
"kl": 0.036285400390625,
"learning_rate": 5.803043036080764e-07,
"loss": 0.0197,
"reward": 0.6192082017660141,
"reward_std": 0.2819724902510643,
"rewards/exp_len_reward": 0.6192082017660141,
"step": 179
},
{
"completion_length": 1884.2367248535156,
"epoch": 0.5333333333333333,
"grad_norm": 0.41283804620716,
"kl": 0.044189453125,
"learning_rate": 5.756476182074582e-07,
"loss": 0.0724,
"reward": 0.6782168745994568,
"reward_std": 0.1702322345227003,
"rewards/exp_len_reward": 0.6782168745994568,
"step": 180
},
{
"completion_length": 2027.8750915527344,
"epoch": 0.5362962962962963,
"grad_norm": 0.2937087831499392,
"kl": 0.05908203125,
"learning_rate": 5.709881756743379e-07,
"loss": 0.0467,
"reward": 0.5191835761070251,
"reward_std": 0.19608749821782112,
"rewards/exp_len_reward": 0.5191835761070251,
"step": 181
},
{
"completion_length": 1563.6473541259766,
"epoch": 0.5392592592592592,
"grad_norm": 0.3970129789902169,
"kl": 0.04388427734375,
"learning_rate": 5.663264769012486e-07,
"loss": 0.0596,
"reward": 0.7023471593856812,
"reward_std": 0.20404362678527832,
"rewards/exp_len_reward": 0.7023471593856812,
"step": 182
},
{
"completion_length": 1748.5402526855469,
"epoch": 0.5422222222222223,
"grad_norm": 0.3017203217726933,
"kl": 0.05078125,
"learning_rate": 5.616630230232704e-07,
"loss": 0.0113,
"reward": 0.5868410617113113,
"reward_std": 0.2523919604718685,
"rewards/exp_len_reward": 0.5868410617113113,
"step": 183
},
{
"completion_length": 2259.8885192871094,
"epoch": 0.5451851851851852,
"grad_norm": 2.1483873089235024,
"kl": 0.09649658203125,
"learning_rate": 5.569983153641579e-07,
"loss": 0.0481,
"reward": 0.5765419751405716,
"reward_std": 0.23718996345996857,
"rewards/exp_len_reward": 0.5765419751405716,
"step": 184
},
{
"completion_length": 1957.0090026855469,
"epoch": 0.5481481481481482,
"grad_norm": 0.2960652403592979,
"kl": 0.0684814453125,
"learning_rate": 5.523328553824479e-07,
"loss": 0.0223,
"reward": 0.6493587493896484,
"reward_std": 0.19187942519783974,
"rewards/exp_len_reward": 0.6493587493896484,
"step": 185
},
{
"completion_length": 2201.696502685547,
"epoch": 0.5511111111111111,
"grad_norm": 0.3800732482018625,
"kl": 0.0880126953125,
"learning_rate": 5.476671446175522e-07,
"loss": 0.0025,
"reward": 0.6268903613090515,
"reward_std": 0.18401159532368183,
"rewards/exp_len_reward": 0.6268903613090515,
"step": 186
},
{
"completion_length": 1674.7188720703125,
"epoch": 0.554074074074074,
"grad_norm": 0.6923481954334814,
"kl": 0.056396484375,
"learning_rate": 5.43001684635842e-07,
"loss": 0.0794,
"reward": 0.7184347957372665,
"reward_std": 0.18241577968001366,
"rewards/exp_len_reward": 0.7184347957372665,
"step": 187
},
{
"completion_length": 1843.6741638183594,
"epoch": 0.557037037037037,
"grad_norm": 0.6224586398259572,
"kl": 0.07568359375,
"learning_rate": 5.383369769767296e-07,
"loss": 0.0439,
"reward": 0.6129633188247681,
"reward_std": 0.25642842054367065,
"rewards/exp_len_reward": 0.6129633188247681,
"step": 188
},
{
"completion_length": 2510.1161499023438,
"epoch": 0.56,
"grad_norm": 0.4420205774969637,
"kl": 0.1275634765625,
"learning_rate": 5.336735230987514e-07,
"loss": 0.0268,
"reward": 0.4878092482686043,
"reward_std": 0.20435325056314468,
"rewards/exp_len_reward": 0.4878092482686043,
"step": 189
},
{
"completion_length": 2056.4420471191406,
"epoch": 0.562962962962963,
"grad_norm": 0.6305416339191662,
"kl": 0.1219482421875,
"learning_rate": 5.290118243256622e-07,
"loss": -0.0364,
"reward": 0.4746796190738678,
"reward_std": 0.21284806914627552,
"rewards/exp_len_reward": 0.4746796190738678,
"step": 190
},
{
"completion_length": 2051.919708251953,
"epoch": 0.5659259259259259,
"grad_norm": 0.4598638399702875,
"kl": 0.130859375,
"learning_rate": 5.243523817925418e-07,
"loss": 0.0496,
"reward": 0.6630957126617432,
"reward_std": 0.17974085174500942,
"rewards/exp_len_reward": 0.6630957126617432,
"step": 191
},
{
"completion_length": 2432.370635986328,
"epoch": 0.5688888888888889,
"grad_norm": 1.3858309739065304,
"kl": 0.266845703125,
"learning_rate": 5.196956963919237e-07,
"loss": 0.1059,
"reward": 0.5846623033285141,
"reward_std": 0.2576068378984928,
"rewards/exp_len_reward": 0.5846623033285141,
"step": 192
},
{
"completion_length": 2348.6563415527344,
"epoch": 0.5718518518518518,
"grad_norm": 0.446976505403873,
"kl": 0.21044921875,
"learning_rate": 5.150422687199471e-07,
"loss": 0.0607,
"reward": 0.6032482236623764,
"reward_std": 0.25601741299033165,
"rewards/exp_len_reward": 0.6032482236623764,
"step": 193
},
{
"completion_length": 1901.40185546875,
"epoch": 0.5748148148148148,
"grad_norm": 0.5635960519086024,
"kl": 0.1746826171875,
"learning_rate": 5.103925990225448e-07,
"loss": 0.0442,
"reward": 0.5330104827880859,
"reward_std": 0.19548507407307625,
"rewards/exp_len_reward": 0.5330104827880859,
"step": 194
},
{
"completion_length": 1449.602767944336,
"epoch": 0.5777777777777777,
"grad_norm": 0.4543937743090975,
"kl": 0.107879638671875,
"learning_rate": 5.057471871416644e-07,
"loss": 0.0058,
"reward": 0.6681396141648293,
"reward_std": 0.23483269661664963,
"rewards/exp_len_reward": 0.6681396141648293,
"step": 195
},
{
"completion_length": 2431.8617248535156,
"epoch": 0.5807407407407408,
"grad_norm": 1.1028267878445535,
"kl": 0.212646484375,
"learning_rate": 5.011065324615364e-07,
"loss": -0.0174,
"reward": 0.571851409971714,
"reward_std": 0.26459021866321564,
"rewards/exp_len_reward": 0.571851409971714,
"step": 196
},
{
"completion_length": 1839.2723693847656,
"epoch": 0.5837037037037037,
"grad_norm": 0.5633254396853296,
"kl": 0.1365966796875,
"learning_rate": 4.964711338549901e-07,
"loss": 0.0059,
"reward": 0.6712978407740593,
"reward_std": 0.16716519370675087,
"rewards/exp_len_reward": 0.6712978407740593,
"step": 197
},
{
"completion_length": 2141.5313415527344,
"epoch": 0.5866666666666667,
"grad_norm": 0.43139745186321504,
"kl": 0.117919921875,
"learning_rate": 4.918414896298229e-07,
"loss": 0.0106,
"reward": 0.555988572537899,
"reward_std": 0.21953130513429642,
"rewards/exp_len_reward": 0.555988572537899,
"step": 198
},
{
"completion_length": 1696.3572387695312,
"epoch": 0.5896296296296296,
"grad_norm": 0.44555781304916003,
"kl": 0.08502197265625,
"learning_rate": 4.872180974752347e-07,
"loss": 0.011,
"reward": 0.6980317980051041,
"reward_std": 0.1718193106353283,
"rewards/exp_len_reward": 0.6980317980051041,
"step": 199
},
{
"completion_length": 1863.7188415527344,
"epoch": 0.5925925925925926,
"grad_norm": 0.7676065950646701,
"kl": 0.0809326171875,
"learning_rate": 4.826014544083234e-07,
"loss": 0.0557,
"reward": 0.6198651492595673,
"reward_std": 0.21527405828237534,
"rewards/exp_len_reward": 0.6198651492595673,
"step": 200
},
{
"completion_length": 1915.4956359863281,
"epoch": 0.5955555555555555,
"grad_norm": 0.8609900913318874,
"kl": 0.09375,
"learning_rate": 4.779920567206568e-07,
"loss": 0.0506,
"reward": 0.635828509926796,
"reward_std": 0.21409705840051174,
"rewards/exp_len_reward": 0.635828509926796,
"step": 201
},
{
"completion_length": 1836.6920776367188,
"epoch": 0.5985185185185186,
"grad_norm": 0.8749272432326957,
"kl": 0.1002197265625,
"learning_rate": 4.733903999249206e-07,
"loss": 0.0628,
"reward": 0.5262488052248955,
"reward_std": 0.21715521067380905,
"rewards/exp_len_reward": 0.5262488052248955,
"step": 202
},
{
"completion_length": 1819.7635192871094,
"epoch": 0.6014814814814815,
"grad_norm": 0.7926613477880593,
"kl": 0.131591796875,
"learning_rate": 4.687969787016507e-07,
"loss": 0.0605,
"reward": 0.5331440269947052,
"reward_std": 0.19602959603071213,
"rewards/exp_len_reward": 0.5331440269947052,
"step": 203
},
{
"completion_length": 2419.7991943359375,
"epoch": 0.6044444444444445,
"grad_norm": 0.9961905231609627,
"kl": 0.25341796875,
"learning_rate": 4.642122868460542e-07,
"loss": 0.0808,
"reward": 0.6300962418317795,
"reward_std": 0.24434982240200043,
"rewards/exp_len_reward": 0.6300962418317795,
"step": 204
},
{
"completion_length": 2225.384002685547,
"epoch": 0.6074074074074074,
"grad_norm": 0.8274200921128466,
"kl": 0.35498046875,
"learning_rate": 4.596368172149268e-07,
"loss": 0.0202,
"reward": 0.5711120814085007,
"reward_std": 0.2728967033326626,
"rewards/exp_len_reward": 0.5711120814085007,
"step": 205
},
{
"completion_length": 1870.5804748535156,
"epoch": 0.6103703703703703,
"grad_norm": 0.6794454475943669,
"kl": 0.325439453125,
"learning_rate": 4.550710616736702e-07,
"loss": 0.041,
"reward": 0.6079469621181488,
"reward_std": 0.26122210919857025,
"rewards/exp_len_reward": 0.6079469621181488,
"step": 206
},
{
"completion_length": 1911.4643859863281,
"epoch": 0.6133333333333333,
"grad_norm": 1.1683585185836836,
"kl": 0.41748046875,
"learning_rate": 4.505155110434162e-07,
"loss": 0.085,
"reward": 0.5756408721208572,
"reward_std": 0.2867981418967247,
"rewards/exp_len_reward": 0.5756408721208572,
"step": 207
},
{
"completion_length": 2093.513458251953,
"epoch": 0.6162962962962963,
"grad_norm": 0.6479840159540543,
"kl": 0.46435546875,
"learning_rate": 4.459706550482638e-07,
"loss": 0.1024,
"reward": 0.6004444509744644,
"reward_std": 0.26299645751714706,
"rewards/exp_len_reward": 0.6004444509744644,
"step": 208
},
{
"completion_length": 2323.15185546875,
"epoch": 0.6192592592592593,
"grad_norm": 1.3601936776142103,
"kl": 0.5498046875,
"learning_rate": 4.4143698226263207e-07,
"loss": 0.0791,
"reward": 0.5422740504145622,
"reward_std": 0.3434370458126068,
"rewards/exp_len_reward": 0.5422740504145622,
"step": 209
},
{
"completion_length": 1607.0223999023438,
"epoch": 0.6222222222222222,
"grad_norm": 1.293326584722951,
"kl": 0.20556640625,
"learning_rate": 4.3691498005874007e-07,
"loss": 0.0862,
"reward": 0.6710554957389832,
"reward_std": 0.21959074586629868,
"rewards/exp_len_reward": 0.6710554957389832,
"step": 210
},
{
"completion_length": 1978.8706665039062,
"epoch": 0.6251851851851852,
"grad_norm": 0.9582295626587259,
"kl": 0.349609375,
"learning_rate": 4.324051345542128e-07,
"loss": 0.0559,
"reward": 0.5671351253986359,
"reward_std": 0.24335385113954544,
"rewards/exp_len_reward": 0.5671351253986359,
"step": 211
},
{
"completion_length": 2163.9420166015625,
"epoch": 0.6281481481481481,
"grad_norm": 0.5722113261698506,
"kl": 0.343017578125,
"learning_rate": 4.2790793055982354e-07,
"loss": 0.0844,
"reward": 0.5790135860443115,
"reward_std": 0.21604818850755692,
"rewards/exp_len_reward": 0.5790135860443115,
"step": 212
},
{
"completion_length": 1913.8036499023438,
"epoch": 0.6311111111111111,
"grad_norm": 0.5324405845923617,
"kl": 0.32373046875,
"learning_rate": 4.234238515273768e-07,
"loss": 0.0253,
"reward": 0.648313857614994,
"reward_std": 0.17545541189610958,
"rewards/exp_len_reward": 0.648313857614994,
"step": 213
},
{
"completion_length": 2254.8616943359375,
"epoch": 0.6340740740740741,
"grad_norm": 1.4273403179290647,
"kl": 0.43359375,
"learning_rate": 4.189533794977367e-07,
"loss": 0.1264,
"reward": 0.5515103414654732,
"reward_std": 0.2388550043106079,
"rewards/exp_len_reward": 0.5515103414654732,
"step": 214
},
{
"completion_length": 2051.8617553710938,
"epoch": 0.6370370370370371,
"grad_norm": 1.1393120895451125,
"kl": 0.42724609375,
"learning_rate": 4.14496995049007e-07,
"loss": 0.0296,
"reward": 0.5932003408670425,
"reward_std": 0.15118649788200855,
"rewards/exp_len_reward": 0.5932003408670425,
"step": 215
},
{
"completion_length": 2098.0491943359375,
"epoch": 0.64,
"grad_norm": 1.6151914612075735,
"kl": 0.38916015625,
"learning_rate": 4.100551772448697e-07,
"loss": 0.0656,
"reward": 0.568665586411953,
"reward_std": 0.22934136912226677,
"rewards/exp_len_reward": 0.568665586411953,
"step": 216
},
{
"completion_length": 2233.6652221679688,
"epoch": 0.642962962962963,
"grad_norm": 1.394565883893917,
"kl": 0.591796875,
"learning_rate": 4.056284035830846e-07,
"loss": 0.0706,
"reward": 0.48754215240478516,
"reward_std": 0.2107255533337593,
"rewards/exp_len_reward": 0.48754215240478516,
"step": 217
},
{
"completion_length": 2153.3750915527344,
"epoch": 0.6459259259259259,
"grad_norm": 0.7394942057986161,
"kl": 0.56396484375,
"learning_rate": 4.012171499441578e-07,
"loss": 0.067,
"reward": 0.5630225837230682,
"reward_std": 0.2632727436721325,
"rewards/exp_len_reward": 0.5630225837230682,
"step": 218
},
{
"completion_length": 2178.5223693847656,
"epoch": 0.6488888888888888,
"grad_norm": 0.8833824117642055,
"kl": 0.521484375,
"learning_rate": 3.968218905401853e-07,
"loss": 0.0932,
"reward": 0.6287193298339844,
"reward_std": 0.1800774559378624,
"rewards/exp_len_reward": 0.6287193298339844,
"step": 219
},
{
"completion_length": 2182.4912109375,
"epoch": 0.6518518518518519,
"grad_norm": 1.3858833504387356,
"kl": 0.4931640625,
"learning_rate": 3.924430978638742e-07,
"loss": 0.0321,
"reward": 0.5301230028271675,
"reward_std": 0.20102717354893684,
"rewards/exp_len_reward": 0.5301230028271675,
"step": 220
},
{
"completion_length": 2076.0536193847656,
"epoch": 0.6548148148148148,
"grad_norm": 0.7482996746885904,
"kl": 0.423095703125,
"learning_rate": 3.8808124263774955e-07,
"loss": 0.0427,
"reward": 0.49979688227176666,
"reward_std": 0.29293133690953255,
"rewards/exp_len_reward": 0.49979688227176666,
"step": 221
},
{
"completion_length": 2390.9510192871094,
"epoch": 0.6577777777777778,
"grad_norm": 1.1580322595016712,
"kl": 0.43798828125,
"learning_rate": 3.8373679376355195e-07,
"loss": 0.0593,
"reward": 0.49847787618637085,
"reward_std": 0.21575787663459778,
"rewards/exp_len_reward": 0.49847787618637085,
"step": 222
},
{
"completion_length": 2259.2635192871094,
"epoch": 0.6607407407407407,
"grad_norm": 1.5796351783929345,
"kl": 0.482421875,
"learning_rate": 3.794102182718294e-07,
"loss": 0.1097,
"reward": 0.5217412784695625,
"reward_std": 0.26718301698565483,
"rewards/exp_len_reward": 0.5217412784695625,
"step": 223
},
{
"completion_length": 2008.919677734375,
"epoch": 0.6637037037037037,
"grad_norm": 1.528742381770616,
"kl": 0.42919921875,
"learning_rate": 3.751019812717322e-07,
"loss": 0.0972,
"reward": 0.5147194415330887,
"reward_std": 0.25649960711598396,
"rewards/exp_len_reward": 0.5147194415330887,
"step": 224
},
{
"completion_length": 1913.9598999023438,
"epoch": 0.6666666666666666,
"grad_norm": 1.0049224868904445,
"kl": 0.462890625,
"learning_rate": 3.708125459010134e-07,
"loss": 0.0227,
"reward": 0.5306781381368637,
"reward_std": 0.22224940732121468,
"rewards/exp_len_reward": 0.5306781381368637,
"step": 225
},
{
"completion_length": 2082.8170776367188,
"epoch": 0.6696296296296296,
"grad_norm": 1.184822942762592,
"kl": 0.59326171875,
"learning_rate": 3.6654237327624003e-07,
"loss": 0.0692,
"reward": 0.5259907096624374,
"reward_std": 0.1462160311639309,
"rewards/exp_len_reward": 0.5259907096624374,
"step": 226
},
{
"completion_length": 2022.6920776367188,
"epoch": 0.6725925925925926,
"grad_norm": 1.5849715003252363,
"kl": 0.4501953125,
"learning_rate": 3.622919224432248e-07,
"loss": 0.0682,
"reward": 0.563248299062252,
"reward_std": 0.19807949475944042,
"rewards/exp_len_reward": 0.563248299062252,
"step": 227
},
{
"completion_length": 1900.1384582519531,
"epoch": 0.6755555555555556,
"grad_norm": 1.1910228260375635,
"kl": 0.4029541015625,
"learning_rate": 3.580616503276772e-07,
"loss": 0.045,
"reward": 0.5678588896989822,
"reward_std": 0.24164490401744843,
"rewards/exp_len_reward": 0.5678588896989822,
"step": 228
},
{
"completion_length": 2144.4822998046875,
"epoch": 0.6785185185185185,
"grad_norm": 0.6989125543878815,
"kl": 0.500244140625,
"learning_rate": 3.5385201168608303e-07,
"loss": 0.0246,
"reward": 0.5909338667988777,
"reward_std": 0.19285215064883232,
"rewards/exp_len_reward": 0.5909338667988777,
"step": 229
},
{
"completion_length": 2379.0760192871094,
"epoch": 0.6814814814814815,
"grad_norm": 1.3916488255594746,
"kl": 0.61767578125,
"learning_rate": 3.4966345905681984e-07,
"loss": 0.0633,
"reward": 0.5990354269742966,
"reward_std": 0.23781514167785645,
"rewards/exp_len_reward": 0.5990354269742966,
"step": 230
},
{
"completion_length": 1947.0536804199219,
"epoch": 0.6844444444444444,
"grad_norm": 1.1973429701947933,
"kl": 0.43896484375,
"learning_rate": 3.4549644271150723e-07,
"loss": 0.0619,
"reward": 0.6357074603438377,
"reward_std": 0.186561593785882,
"rewards/exp_len_reward": 0.6357074603438377,
"step": 231
},
{
"completion_length": 1959.9375915527344,
"epoch": 0.6874074074074074,
"grad_norm": 1.0316979947324525,
"kl": 0.5341796875,
"learning_rate": 3.413514106066026e-07,
"loss": 0.055,
"reward": 0.593490794301033,
"reward_std": 0.22011198103427887,
"rewards/exp_len_reward": 0.593490794301033,
"step": 232
},
{
"completion_length": 2068.4956665039062,
"epoch": 0.6903703703703704,
"grad_norm": 1.0526407604676598,
"kl": 0.68310546875,
"learning_rate": 3.3722880833524704e-07,
"loss": 0.0815,
"reward": 0.5511805862188339,
"reward_std": 0.29247505962848663,
"rewards/exp_len_reward": 0.5511805862188339,
"step": 233
},
{
"completion_length": 1586.1429138183594,
"epoch": 0.6933333333333334,
"grad_norm": 9.824678025864428,
"kl": 0.67529296875,
"learning_rate": 3.3312907907936097e-07,
"loss": 0.0485,
"reward": 0.6389699578285217,
"reward_std": 0.1864270232617855,
"rewards/exp_len_reward": 0.6389699578285217,
"step": 234
},
{
"completion_length": 1837.5581359863281,
"epoch": 0.6962962962962963,
"grad_norm": 1.9026040821940675,
"kl": 0.7041015625,
"learning_rate": 3.2905266356200506e-07,
"loss": 0.1006,
"reward": 0.5509998500347137,
"reward_std": 0.2821981944143772,
"rewards/exp_len_reward": 0.5509998500347137,
"step": 235
},
{
"completion_length": 2152.6295471191406,
"epoch": 0.6992592592592592,
"grad_norm": 11.662060157681738,
"kl": 0.9638671875,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0456,
"reward": 0.5501847416162491,
"reward_std": 0.1710715489462018,
"rewards/exp_len_reward": 0.5501847416162491,
"step": 236
},
{
"completion_length": 1924.6519165039062,
"epoch": 0.7022222222222222,
"grad_norm": 1.3789345707628888,
"kl": 0.728515625,
"learning_rate": 3.2097152405681904e-07,
"loss": 0.0905,
"reward": 0.5505119562149048,
"reward_std": 0.20416779816150665,
"rewards/exp_len_reward": 0.5505119562149048,
"step": 237
},
{
"completion_length": 2016.696533203125,
"epoch": 0.7051851851851851,
"grad_norm": 2.230914856374322,
"kl": 0.5777587890625,
"learning_rate": 3.1696766879575354e-07,
"loss": 0.0708,
"reward": 0.5033136606216431,
"reward_std": 0.21769768744707108,
"rewards/exp_len_reward": 0.5033136606216431,
"step": 238
},
{
"completion_length": 2177.9866638183594,
"epoch": 0.7081481481481482,
"grad_norm": 1.3770470235796741,
"kl": 0.62841796875,
"learning_rate": 3.1298886463335857e-07,
"loss": 0.0397,
"reward": 0.5401712283492088,
"reward_std": 0.16454584524035454,
"rewards/exp_len_reward": 0.5401712283492088,
"step": 239
},
{
"completion_length": 2085.6607971191406,
"epoch": 0.7111111111111111,
"grad_norm": 1.0539338304475274,
"kl": 0.30908203125,
"learning_rate": 3.090355392931827e-07,
"loss": 0.0564,
"reward": 0.517996683716774,
"reward_std": 0.2007257491350174,
"rewards/exp_len_reward": 0.517996683716774,
"step": 240
},
{
"completion_length": 1860.1116943359375,
"epoch": 0.7140740740740741,
"grad_norm": 3.5014067092047294,
"kl": 0.22900390625,
"learning_rate": 3.051081177597876e-07,
"loss": 0.0691,
"reward": 0.6099975854158401,
"reward_std": 0.2467595972120762,
"rewards/exp_len_reward": 0.6099975854158401,
"step": 241
},
{
"completion_length": 1915.2724304199219,
"epoch": 0.717037037037037,
"grad_norm": 1.3719140200453745,
"kl": 0.228271484375,
"learning_rate": 3.012070222330629e-07,
"loss": 0.0217,
"reward": 0.4658224508166313,
"reward_std": 0.2403927743434906,
"rewards/exp_len_reward": 0.4658224508166313,
"step": 242
},
{
"completion_length": 2076.2188415527344,
"epoch": 0.72,
"grad_norm": 4.382757308947479,
"kl": 0.2587890625,
"learning_rate": 2.97332672082837e-07,
"loss": 0.0487,
"reward": 0.5252245962619781,
"reward_std": 0.25723912566900253,
"rewards/exp_len_reward": 0.5252245962619781,
"step": 243
},
{
"completion_length": 2111.9375610351562,
"epoch": 0.7229629629629629,
"grad_norm": 1.3459524492369142,
"kl": 0.290771484375,
"learning_rate": 2.934854838037978e-07,
"loss": 0.0753,
"reward": 0.591300830245018,
"reward_std": 0.2962731011211872,
"rewards/exp_len_reward": 0.591300830245018,
"step": 244
},
{
"completion_length": 1875.7143859863281,
"epoch": 0.725925925925926,
"grad_norm": 1.0059471369170592,
"kl": 0.2734375,
"learning_rate": 2.8966587097071683e-07,
"loss": 0.0397,
"reward": 0.6712532192468643,
"reward_std": 0.1884814165532589,
"rewards/exp_len_reward": 0.6712532192468643,
"step": 245
},
{
"completion_length": 1451.8304443359375,
"epoch": 0.7288888888888889,
"grad_norm": 3.334470597565311,
"kl": 0.21923828125,
"learning_rate": 2.8587424419399055e-07,
"loss": 0.0743,
"reward": 0.7117358893156052,
"reward_std": 0.25990375503897667,
"rewards/exp_len_reward": 0.7117358893156052,
"step": 246
},
{
"completion_length": 1776.2679443359375,
"epoch": 0.7318518518518519,
"grad_norm": 1.3678103662021623,
"kl": 0.25830078125,
"learning_rate": 2.821110110755004e-07,
"loss": 0.0406,
"reward": 0.5916131287813187,
"reward_std": 0.21059276908636093,
"rewards/exp_len_reward": 0.5916131287813187,
"step": 247
},
{
"completion_length": 1554.9465026855469,
"epoch": 0.7348148148148148,
"grad_norm": 1.574805240114687,
"kl": 0.3394775390625,
"learning_rate": 2.783765761647934e-07,
"loss": 0.0373,
"reward": 0.6932453364133835,
"reward_std": 0.18446229957044125,
"rewards/exp_len_reward": 0.6932453364133835,
"step": 248
},
{
"completion_length": 1849.1875305175781,
"epoch": 0.7377777777777778,
"grad_norm": 2.96754901799222,
"kl": 0.6630859375,
"learning_rate": 2.746713409155951e-07,
"loss": 0.0009,
"reward": 0.5517724305391312,
"reward_std": 0.2094859890639782,
"rewards/exp_len_reward": 0.5517724305391312,
"step": 249
},
{
"completion_length": 1902.3438415527344,
"epoch": 0.7407407407407407,
"grad_norm": 2.620462966652135,
"kl": 0.81787109375,
"learning_rate": 2.709957036426512e-07,
"loss": 0.0541,
"reward": 0.6254279538989067,
"reward_std": 0.22460020706057549,
"rewards/exp_len_reward": 0.6254279538989067,
"step": 250
},
{
"completion_length": 2094.65185546875,
"epoch": 0.7437037037037038,
"grad_norm": 4.270339518374745,
"kl": 0.9814453125,
"learning_rate": 2.6735005947890986e-07,
"loss": 0.0523,
"reward": 0.5432867407798767,
"reward_std": 0.25716196186840534,
"rewards/exp_len_reward": 0.5432867407798767,
"step": 251
},
{
"completion_length": 1923.1384887695312,
"epoch": 0.7466666666666667,
"grad_norm": 4.4007623566003335,
"kl": 0.876953125,
"learning_rate": 2.6373480033304397e-07,
"loss": 0.0662,
"reward": 0.5263698920607567,
"reward_std": 0.23529189638793468,
"rewards/exp_len_reward": 0.5263698920607567,
"step": 252
},
{
"completion_length": 1774.7947082519531,
"epoch": 0.7496296296296296,
"grad_norm": 4.102095204442662,
"kl": 0.74267578125,
"learning_rate": 2.6015031484732103e-07,
"loss": 0.0294,
"reward": 0.5787611454725266,
"reward_std": 0.23660384491086006,
"rewards/exp_len_reward": 0.5787611454725266,
"step": 253
},
{
"completion_length": 2020.52685546875,
"epoch": 0.7525925925925926,
"grad_norm": 3.494895709470538,
"kl": 0.59033203125,
"learning_rate": 2.565969883558236e-07,
"loss": 0.1393,
"reward": 0.5668770894408226,
"reward_std": 0.2578126862645149,
"rewards/exp_len_reward": 0.5668770894408226,
"step": 254
},
{
"completion_length": 2249.1875915527344,
"epoch": 0.7555555555555555,
"grad_norm": 1.3887241916882165,
"kl": 0.81689453125,
"learning_rate": 2.5307520284302606e-07,
"loss": 0.0922,
"reward": 0.598124660551548,
"reward_std": 0.16249966993927956,
"rewards/exp_len_reward": 0.598124660551548,
"step": 255
},
{
"completion_length": 2033.2679443359375,
"epoch": 0.7585185185185185,
"grad_norm": 4.011350770756549,
"kl": 0.75,
"learning_rate": 2.495853369027309e-07,
"loss": 0.0559,
"reward": 0.5327246338129044,
"reward_std": 0.2595828250050545,
"rewards/exp_len_reward": 0.5327246338129044,
"step": 256
},
{
"completion_length": 2089.6384887695312,
"epoch": 0.7614814814814815,
"grad_norm": 2.4045500748983955,
"kl": 0.5302734375,
"learning_rate": 2.4612776569736984e-07,
"loss": 0.1014,
"reward": 0.6376392692327499,
"reward_std": 0.23245985060930252,
"rewards/exp_len_reward": 0.6376392692327499,
"step": 257
},
{
"completion_length": 1895.6607971191406,
"epoch": 0.7644444444444445,
"grad_norm": 1.4906449441401861,
"kl": 0.5322265625,
"learning_rate": 2.4270286091767335e-07,
"loss": 0.0467,
"reward": 0.46556220203638077,
"reward_std": 0.2202283851802349,
"rewards/exp_len_reward": 0.46556220203638077,
"step": 258
},
{
"completion_length": 1196.276840209961,
"epoch": 0.7674074074074074,
"grad_norm": 1.0592729473109168,
"kl": 0.14678955078125,
"learning_rate": 2.39310990742714e-07,
"loss": 0.0688,
"reward": 0.7091374546289444,
"reward_std": 0.20984918251633644,
"rewards/exp_len_reward": 0.7091374546289444,
"step": 259
},
{
"completion_length": 1845.0536193847656,
"epoch": 0.7703703703703704,
"grad_norm": 0.6134870917567257,
"kl": 0.40478515625,
"learning_rate": 2.3595251980032673e-07,
"loss": 0.0446,
"reward": 0.62698695063591,
"reward_std": 0.23867091536521912,
"rewards/exp_len_reward": 0.62698695063591,
"step": 260
},
{
"completion_length": 1902.5268859863281,
"epoch": 0.7733333333333333,
"grad_norm": 3.0873733945089636,
"kl": 0.41748046875,
"learning_rate": 2.3262780912791183e-07,
"loss": 0.0941,
"reward": 0.6978839188814163,
"reward_std": 0.16607779264450073,
"rewards/exp_len_reward": 0.6978839188814163,
"step": 261
},
{
"completion_length": 1507.7098693847656,
"epoch": 0.7762962962962963,
"grad_norm": 0.7472079939240217,
"kl": 0.2447509765625,
"learning_rate": 2.2933721613362188e-07,
"loss": 0.0344,
"reward": 0.7389920055866241,
"reward_std": 0.16782627813518047,
"rewards/exp_len_reward": 0.7389920055866241,
"step": 262
},
{
"completion_length": 1831.7724304199219,
"epoch": 0.7792592592592592,
"grad_norm": 2.303477094889597,
"kl": 0.4873046875,
"learning_rate": 2.2608109455794197e-07,
"loss": 0.0001,
"reward": 0.6041949242353439,
"reward_std": 0.18537207320332527,
"rewards/exp_len_reward": 0.6041949242353439,
"step": 263
},
{
"completion_length": 2072.5447387695312,
"epoch": 0.7822222222222223,
"grad_norm": 1.162365568500677,
"kl": 0.521484375,
"learning_rate": 2.2285979443566093e-07,
"loss": 0.0353,
"reward": 0.47308728843927383,
"reward_std": 0.2114715836942196,
"rewards/exp_len_reward": 0.47308728843927383,
"step": 264
},
{
"completion_length": 2082.0982666015625,
"epoch": 0.7851851851851852,
"grad_norm": 1.1540954056683599,
"kl": 0.4599609375,
"learning_rate": 2.196736620582429e-07,
"loss": 0.0681,
"reward": 0.6188310533761978,
"reward_std": 0.21996454149484634,
"rewards/exp_len_reward": 0.6188310533761978,
"step": 265
},
{
"completion_length": 1896.6295471191406,
"epoch": 0.7881481481481482,
"grad_norm": 1.5229341500025346,
"kl": 0.4105224609375,
"learning_rate": 2.1652303993660146e-07,
"loss": 0.0771,
"reward": 0.619941383600235,
"reward_std": 0.23672576248645782,
"rewards/exp_len_reward": 0.619941383600235,
"step": 266
},
{
"completion_length": 2306.241180419922,
"epoch": 0.7911111111111111,
"grad_norm": 1.5965567483211722,
"kl": 0.603515625,
"learning_rate": 2.1340826676427826e-07,
"loss": 0.0899,
"reward": 0.46971995383501053,
"reward_std": 0.25150875374674797,
"rewards/exp_len_reward": 0.46971995383501053,
"step": 267
},
{
"completion_length": 2109.759063720703,
"epoch": 0.794074074074074,
"grad_norm": 1.0057063473264853,
"kl": 0.5146484375,
"learning_rate": 2.103296773810344e-07,
"loss": 0.0736,
"reward": 0.646474152803421,
"reward_std": 0.1665012501180172,
"rewards/exp_len_reward": 0.646474152803421,
"step": 268
},
{
"completion_length": 1648.3706665039062,
"epoch": 0.797037037037037,
"grad_norm": 1.399927607428829,
"kl": 0.41455078125,
"learning_rate": 2.0728760273685435e-07,
"loss": 0.0669,
"reward": 0.6383180469274521,
"reward_std": 0.24311606958508492,
"rewards/exp_len_reward": 0.6383180469274521,
"step": 269
},
{
"completion_length": 1812.5000915527344,
"epoch": 0.8,
"grad_norm": 2.0638539548725543,
"kl": 0.58056640625,
"learning_rate": 2.0428236985636878e-07,
"loss": 0.0273,
"reward": 0.5707896202802658,
"reward_std": 0.14662024565041065,
"rewards/exp_len_reward": 0.5707896202802658,
"step": 270
},
{
"completion_length": 1855.99560546875,
"epoch": 0.802962962962963,
"grad_norm": 1.077724114851918,
"kl": 0.573486328125,
"learning_rate": 2.0131430180369957e-07,
"loss": 0.0689,
"reward": 0.6811731457710266,
"reward_std": 0.21652160212397575,
"rewards/exp_len_reward": 0.6811731457710266,
"step": 271
},
{
"completion_length": 1906.46435546875,
"epoch": 0.8059259259259259,
"grad_norm": 3.8395631016480345,
"kl": 0.7919921875,
"learning_rate": 1.9838371764772992e-07,
"loss": 0.0734,
"reward": 0.5762921273708344,
"reward_std": 0.21297482959926128,
"rewards/exp_len_reward": 0.5762921273708344,
"step": 272
},
{
"completion_length": 1850.71435546875,
"epoch": 0.8088888888888889,
"grad_norm": 2.5571773473006285,
"kl": 0.69580078125,
"learning_rate": 1.954909324278041e-07,
"loss": 0.0464,
"reward": 0.6064716130495071,
"reward_std": 0.19751618057489395,
"rewards/exp_len_reward": 0.6064716130495071,
"step": 273
},
{
"completion_length": 1855.6205749511719,
"epoch": 0.8118518518518518,
"grad_norm": 1.234393016428857,
"kl": 0.59619140625,
"learning_rate": 1.9263625711986092e-07,
"loss": 0.0481,
"reward": 0.6006206125020981,
"reward_std": 0.24168968573212624,
"rewards/exp_len_reward": 0.6006206125020981,
"step": 274
},
{
"completion_length": 1901.0803833007812,
"epoch": 0.8148148148148148,
"grad_norm": 1.638661716998031,
"kl": 0.62451171875,
"learning_rate": 1.8981999860300385e-07,
"loss": 0.0825,
"reward": 0.5825950875878334,
"reward_std": 0.20379779115319252,
"rewards/exp_len_reward": 0.5825950875878334,
"step": 275
},
{
"completion_length": 2026.5090637207031,
"epoch": 0.8177777777777778,
"grad_norm": 1.448189554080957,
"kl": 0.6962890625,
"learning_rate": 1.8704245962651026e-07,
"loss": 0.0915,
"reward": 0.5586806088685989,
"reward_std": 0.21963583678007126,
"rewards/exp_len_reward": 0.5586806088685989,
"step": 276
},
{
"completion_length": 1590.4777526855469,
"epoch": 0.8207407407407408,
"grad_norm": 0.8289937373611839,
"kl": 0.5087890625,
"learning_rate": 1.8430393877728745e-07,
"loss": 0.0715,
"reward": 0.6978113353252411,
"reward_std": 0.22020583972334862,
"rewards/exp_len_reward": 0.6978113353252411,
"step": 277
},
{
"completion_length": 2163.5938415527344,
"epoch": 0.8237037037037037,
"grad_norm": 1.0094687173541748,
"kl": 0.767578125,
"learning_rate": 1.8160473044777263e-07,
"loss": 0.1185,
"reward": 0.5574893727898598,
"reward_std": 0.25213854014873505,
"rewards/exp_len_reward": 0.5574893727898598,
"step": 278
},
{
"completion_length": 1669.3616638183594,
"epoch": 0.8266666666666667,
"grad_norm": 1.3169001272779028,
"kl": 0.4873046875,
"learning_rate": 1.789451248042867e-07,
"loss": 0.0525,
"reward": 0.6423463597893715,
"reward_std": 0.21679977793246508,
"rewards/exp_len_reward": 0.6423463597893715,
"step": 279
},
{
"completion_length": 2108.071502685547,
"epoch": 0.8296296296296296,
"grad_norm": 1.5089860120727316,
"kl": 0.72509765625,
"learning_rate": 1.763254077558411e-07,
"loss": 0.0624,
"reward": 0.6306832581758499,
"reward_std": 0.23130958899855614,
"rewards/exp_len_reward": 0.6306832581758499,
"step": 280
},
{
"completion_length": 1408.0223693847656,
"epoch": 0.8325925925925926,
"grad_norm": 0.626851236326197,
"kl": 0.3817138671875,
"learning_rate": 1.7374586092340194e-07,
"loss": 0.052,
"reward": 0.7289980947971344,
"reward_std": 0.1484288088977337,
"rewards/exp_len_reward": 0.7289980947971344,
"step": 281
},
{
"completion_length": 1850.3304443359375,
"epoch": 0.8355555555555556,
"grad_norm": 1.2880854913101176,
"kl": 0.61328125,
"learning_rate": 1.712067616096159e-07,
"loss": 0.0664,
"reward": 0.6098030656576157,
"reward_std": 0.27465640753507614,
"rewards/exp_len_reward": 0.6098030656576157,
"step": 282
},
{
"completion_length": 2180.772430419922,
"epoch": 0.8385185185185186,
"grad_norm": 1.3173048932404967,
"kl": 0.7880859375,
"learning_rate": 1.6870838276900018e-07,
"loss": 0.068,
"reward": 0.5274906530976295,
"reward_std": 0.17172403447329998,
"rewards/exp_len_reward": 0.5274906530976295,
"step": 283
},
{
"completion_length": 1963.9598693847656,
"epoch": 0.8414814814814815,
"grad_norm": 2.352592735238779,
"kl": 0.5947265625,
"learning_rate": 1.6625099297859945e-07,
"loss": 0.0435,
"reward": 0.5498563274741173,
"reward_std": 0.22014831006526947,
"rewards/exp_len_reward": 0.5498563274741173,
"step": 284
},
{
"completion_length": 1755.5179748535156,
"epoch": 0.8444444444444444,
"grad_norm": 0.7783642288316969,
"kl": 0.53125,
"learning_rate": 1.638348564091142e-07,
"loss": 0.0537,
"reward": 0.6466532945632935,
"reward_std": 0.21114975400269032,
"rewards/exp_len_reward": 0.6466532945632935,
"step": 285
},
{
"completion_length": 1900.9108276367188,
"epoch": 0.8474074074074074,
"grad_norm": 0.7325869249506614,
"kl": 0.56689453125,
"learning_rate": 1.6146023279650146e-07,
"loss": 0.0552,
"reward": 0.5337693318724632,
"reward_std": 0.2679591439664364,
"rewards/exp_len_reward": 0.5337693318724632,
"step": 286
},
{
"completion_length": 1815.821533203125,
"epoch": 0.8503703703703703,
"grad_norm": 2.5993122855161217,
"kl": 0.48291015625,
"learning_rate": 1.5912737741405364e-07,
"loss": 0.0855,
"reward": 0.6207796633243561,
"reward_std": 0.28707681968808174,
"rewards/exp_len_reward": 0.6207796633243561,
"step": 287
},
{
"completion_length": 2066.6116943359375,
"epoch": 0.8533333333333334,
"grad_norm": 1.6367894996791235,
"kl": 0.658203125,
"learning_rate": 1.5683654104495627e-07,
"loss": 0.0136,
"reward": 0.5434933006763458,
"reward_std": 0.20752229169011116,
"rewards/exp_len_reward": 0.5434933006763458,
"step": 288
},
{
"completion_length": 1812.1653137207031,
"epoch": 0.8562962962962963,
"grad_norm": 1.6394731751923866,
"kl": 0.48291015625,
"learning_rate": 1.5458796995532915e-07,
"loss": 0.0547,
"reward": 0.676224872469902,
"reward_std": 0.20963529124855995,
"rewards/exp_len_reward": 0.676224872469902,
"step": 289
},
{
"completion_length": 1776.732177734375,
"epoch": 0.8592592592592593,
"grad_norm": 1.1773115253318147,
"kl": 0.44091796875,
"learning_rate": 1.5238190586775145e-07,
"loss": 0.0524,
"reward": 0.5728821009397507,
"reward_std": 0.19821078144013882,
"rewards/exp_len_reward": 0.5728821009397507,
"step": 290
},
{
"completion_length": 1930.7991943359375,
"epoch": 0.8622222222222222,
"grad_norm": 1.217553347358791,
"kl": 0.544921875,
"learning_rate": 1.50218585935278e-07,
"loss": 0.0898,
"reward": 0.5744712874293327,
"reward_std": 0.2638898529112339,
"rewards/exp_len_reward": 0.5744712874293327,
"step": 291
},
{
"completion_length": 1568.3929443359375,
"epoch": 0.8651851851851852,
"grad_norm": 2.1855891688795217,
"kl": 0.3900146484375,
"learning_rate": 1.4809824271594384e-07,
"loss": 0.0756,
"reward": 0.6319922655820847,
"reward_std": 0.18087825924158096,
"rewards/exp_len_reward": 0.6319922655820847,
"step": 292
},
{
"completion_length": 1893.5537109375,
"epoch": 0.8681481481481481,
"grad_norm": 0.8745208848244106,
"kl": 0.60498046875,
"learning_rate": 1.4602110414776475e-07,
"loss": 0.0793,
"reward": 0.5885374248027802,
"reward_std": 0.2328047677874565,
"rewards/exp_len_reward": 0.5885374248027802,
"step": 293
},
{
"completion_length": 2152.7456665039062,
"epoch": 0.8711111111111111,
"grad_norm": 3.1078833340758987,
"kl": 0.7197265625,
"learning_rate": 1.4398739352423406e-07,
"loss": 0.0136,
"reward": 0.5296469628810883,
"reward_std": 0.1996788065880537,
"rewards/exp_len_reward": 0.5296469628810883,
"step": 294
},
{
"completion_length": 2409.5358276367188,
"epoch": 0.8740740740740741,
"grad_norm": 3.4314169496830558,
"kl": 0.951171875,
"learning_rate": 1.419973294703174e-07,
"loss": 0.0447,
"reward": 0.3807084336876869,
"reward_std": 0.22457972541451454,
"rewards/exp_len_reward": 0.3807084336876869,
"step": 295
},
{
"completion_length": 2120.7813720703125,
"epoch": 0.8770370370370371,
"grad_norm": 3.439087218546517,
"kl": 0.82958984375,
"learning_rate": 1.400511259189518e-07,
"loss": 0.026,
"reward": 0.5302798449993134,
"reward_std": 0.19236281886696815,
"rewards/exp_len_reward": 0.5302798449993134,
"step": 296
},
{
"completion_length": 1617.3125610351562,
"epoch": 0.88,
"grad_norm": 0.947761415688406,
"kl": 0.3895263671875,
"learning_rate": 1.3814899208804677e-07,
"loss": 0.0739,
"reward": 0.7116686105728149,
"reward_std": 0.17973697930574417,
"rewards/exp_len_reward": 0.7116686105728149,
"step": 297
},
{
"completion_length": 2093.3616943359375,
"epoch": 0.882962962962963,
"grad_norm": 2.663080594359249,
"kl": 0.7080078125,
"learning_rate": 1.3629113245799361e-07,
"loss": 0.0267,
"reward": 0.4907858446240425,
"reward_std": 0.18806980550289154,
"rewards/exp_len_reward": 0.4907858446240425,
"step": 298
},
{
"completion_length": 1681.8840026855469,
"epoch": 0.8859259259259259,
"grad_norm": 1.4501228440522578,
"kl": 0.40087890625,
"learning_rate": 1.3447774674968387e-07,
"loss": 0.0953,
"reward": 0.6455406174063683,
"reward_std": 0.21742986515164375,
"rewards/exp_len_reward": 0.6455406174063683,
"step": 299
},
{
"completion_length": 1697.2188110351562,
"epoch": 0.8888888888888888,
"grad_norm": 0.8931595775633989,
"kl": 0.4423828125,
"learning_rate": 1.3270902990303869e-07,
"loss": 0.0021,
"reward": 0.5877698212862015,
"reward_std": 0.2616008296608925,
"rewards/exp_len_reward": 0.5877698212862015,
"step": 300
},
{
"completion_length": 1755.1697387695312,
"epoch": 0.8918518518518519,
"grad_norm": 1.5838910503748744,
"kl": 0.42138671875,
"learning_rate": 1.3098517205605325e-07,
"loss": 0.0896,
"reward": 0.6868456155061722,
"reward_std": 0.18149937316775322,
"rewards/exp_len_reward": 0.6868456155061722,
"step": 301
},
{
"completion_length": 1789.8304595947266,
"epoch": 0.8948148148148148,
"grad_norm": 0.6954424759579384,
"kl": 0.455291748046875,
"learning_rate": 1.2930635852435634e-07,
"loss": 0.0637,
"reward": 0.6176896244287491,
"reward_std": 0.2546579912304878,
"rewards/exp_len_reward": 0.6176896244287491,
"step": 302
},
{
"completion_length": 2092.6742248535156,
"epoch": 0.8977777777777778,
"grad_norm": 1.055499700957037,
"kl": 0.59423828125,
"learning_rate": 1.276727697812894e-07,
"loss": 0.0759,
"reward": 0.5725482404232025,
"reward_std": 0.2711305655539036,
"rewards/exp_len_reward": 0.5725482404232025,
"step": 303
},
{
"completion_length": 2076.634033203125,
"epoch": 0.9007407407407407,
"grad_norm": 1.0741468220182044,
"kl": 0.5556640625,
"learning_rate": 1.2608458143850493e-07,
"loss": 0.0704,
"reward": 0.6020158976316452,
"reward_std": 0.2659350074827671,
"rewards/exp_len_reward": 0.6020158976316452,
"step": 304
},
{
"completion_length": 2139.4733276367188,
"epoch": 0.9037037037037037,
"grad_norm": 0.9471440274301567,
"kl": 0.590576171875,
"learning_rate": 1.2454196422708843e-07,
"loss": 0.0492,
"reward": 0.5845741108059883,
"reward_std": 0.21030431985855103,
"rewards/exp_len_reward": 0.5845741108059883,
"step": 305
},
{
"completion_length": 2160.6697692871094,
"epoch": 0.9066666666666666,
"grad_norm": 1.0546222999465211,
"kl": 0.73681640625,
"learning_rate": 1.2304508397920499e-07,
"loss": 0.01,
"reward": 0.5044809579849243,
"reward_std": 0.2162732593715191,
"rewards/exp_len_reward": 0.5044809579849243,
"step": 306
},
{
"completion_length": 1802.0626220703125,
"epoch": 0.9096296296296297,
"grad_norm": 0.7273536659582915,
"kl": 0.45458984375,
"learning_rate": 1.2159410161027153e-07,
"loss": 0.061,
"reward": 0.6756877303123474,
"reward_std": 0.17008201032876968,
"rewards/exp_len_reward": 0.6756877303123474,
"step": 307
},
{
"completion_length": 1900.1429138183594,
"epoch": 0.9125925925925926,
"grad_norm": 1.0377925081151909,
"kl": 0.4912109375,
"learning_rate": 1.2018917310165926e-07,
"loss": 0.0756,
"reward": 0.6221350133419037,
"reward_std": 0.21066963486373425,
"rewards/exp_len_reward": 0.6221350133419037,
"step": 308
},
{
"completion_length": 2008.040283203125,
"epoch": 0.9155555555555556,
"grad_norm": 1.1323707827713791,
"kl": 0.5888671875,
"learning_rate": 1.1883044948392453e-07,
"loss": 0.0239,
"reward": 0.6152837574481964,
"reward_std": 0.20816011540591717,
"rewards/exp_len_reward": 0.6152837574481964,
"step": 309
},
{
"completion_length": 1659.3572082519531,
"epoch": 0.9185185185185185,
"grad_norm": 1.0741422911073732,
"kl": 0.3739013671875,
"learning_rate": 1.1751807682057396e-07,
"loss": 0.0697,
"reward": 0.6434877663850784,
"reward_std": 0.22068125009536743,
"rewards/exp_len_reward": 0.6434877663850784,
"step": 310
},
{
"completion_length": 1905.5492248535156,
"epoch": 0.9214814814814815,
"grad_norm": 1.4331895848772223,
"kl": 0.52294921875,
"learning_rate": 1.1625219619236196e-07,
"loss": 0.0179,
"reward": 0.6263534277677536,
"reward_std": 0.17030689865350723,
"rewards/exp_len_reward": 0.6263534277677536,
"step": 311
},
{
"completion_length": 1953.1831665039062,
"epoch": 0.9244444444444444,
"grad_norm": 2.1662592035207973,
"kl": 0.55517578125,
"learning_rate": 1.1503294368212441e-07,
"loss": 0.0041,
"reward": 0.5517635121941566,
"reward_std": 0.16691016405820847,
"rewards/exp_len_reward": 0.5517635121941566,
"step": 312
},
{
"completion_length": 2065.2679443359375,
"epoch": 0.9274074074074075,
"grad_norm": 0.8474773056034575,
"kl": 0.51318359375,
"learning_rate": 1.1386045036015024e-07,
"loss": 0.0518,
"reward": 0.6386523991823196,
"reward_std": 0.28626545891165733,
"rewards/exp_len_reward": 0.6386523991823196,
"step": 313
},
{
"completion_length": 1785.6920471191406,
"epoch": 0.9303703703703704,
"grad_norm": 3.330669113813351,
"kl": 0.39599609375,
"learning_rate": 1.1273484227009072e-07,
"loss": 0.1397,
"reward": 0.6923246830701828,
"reward_std": 0.23065154626965523,
"rewards/exp_len_reward": 0.6923246830701828,
"step": 314
},
{
"completion_length": 2113.290283203125,
"epoch": 0.9333333333333333,
"grad_norm": 2.208786377683472,
"kl": 0.6357421875,
"learning_rate": 1.116562404154099e-07,
"loss": 0.0675,
"reward": 0.5248966738581657,
"reward_std": 0.20559153519570827,
"rewards/exp_len_reward": 0.5248966738581657,
"step": 315
},
{
"completion_length": 1636.8036193847656,
"epoch": 0.9362962962962963,
"grad_norm": 0.8582998929826559,
"kl": 0.359130859375,
"learning_rate": 1.1062476074637685e-07,
"loss": 0.0267,
"reward": 0.5902325585484505,
"reward_std": 0.26360809803009033,
"rewards/exp_len_reward": 0.5902325585484505,
"step": 316
},
{
"completion_length": 1962.6473999023438,
"epoch": 0.9392592592592592,
"grad_norm": 0.7797115036676918,
"kl": 0.483642578125,
"learning_rate": 1.0964051414760065e-07,
"loss": 0.0519,
"reward": 0.6097806543111801,
"reward_std": 0.19469193182885647,
"rewards/exp_len_reward": 0.6097806543111801,
"step": 317
},
{
"completion_length": 1495.165283203125,
"epoch": 0.9422222222222222,
"grad_norm": 1.724934156584613,
"kl": 0.27099609375,
"learning_rate": 1.087036064261106e-07,
"loss": 0.0479,
"reward": 0.7044764161109924,
"reward_std": 0.22360007464885712,
"rewards/exp_len_reward": 0.7044764161109924,
"step": 318
},
{
"completion_length": 1907.3750610351562,
"epoch": 0.9451851851851852,
"grad_norm": 0.9397015929298037,
"kl": 0.43603515625,
"learning_rate": 1.0781413829998135e-07,
"loss": 0.0703,
"reward": 0.6270845979452133,
"reward_std": 0.1956428363919258,
"rewards/exp_len_reward": 0.6270845979452133,
"step": 319
},
{
"completion_length": 2248.6384887695312,
"epoch": 0.9481481481481482,
"grad_norm": 2.3924203042429424,
"kl": 0.767578125,
"learning_rate": 1.0697220538750631e-07,
"loss": 0.1143,
"reward": 0.4779609218239784,
"reward_std": 0.2641923241317272,
"rewards/exp_len_reward": 0.4779609218239784,
"step": 320
},
{
"completion_length": 2031.3527526855469,
"epoch": 0.9511111111111111,
"grad_norm": 1.5717510752992125,
"kl": 0.57177734375,
"learning_rate": 1.0617789819691819e-07,
"loss": 0.0913,
"reward": 0.5689445361495018,
"reward_std": 0.27545909211039543,
"rewards/exp_len_reward": 0.5689445361495018,
"step": 321
},
{
"completion_length": 2053.80810546875,
"epoch": 0.9540740740740741,
"grad_norm": 2.3514432726094316,
"kl": 0.568359375,
"learning_rate": 1.054313021166595e-07,
"loss": 0.0835,
"reward": 0.6623236984014511,
"reward_std": 0.258603822439909,
"rewards/exp_len_reward": 0.6623236984014511,
"step": 322
},
{
"completion_length": 1866.009033203125,
"epoch": 0.957037037037037,
"grad_norm": 1.7616535804704176,
"kl": 0.535400390625,
"learning_rate": 1.0473249740620304e-07,
"loss": 0.0234,
"reward": 0.6161750108003616,
"reward_std": 0.2101491615176201,
"rewards/exp_len_reward": 0.6161750108003616,
"step": 323
},
{
"completion_length": 2126.464385986328,
"epoch": 0.96,
"grad_norm": 1.8176658288354475,
"kl": 0.7197265625,
"learning_rate": 1.0408155918742432e-07,
"loss": 0.1053,
"reward": 0.619974821805954,
"reward_std": 0.21160422265529633,
"rewards/exp_len_reward": 0.619974821805954,
"step": 324
},
{
"completion_length": 2201.9510192871094,
"epoch": 0.9629629629629629,
"grad_norm": 2.611363321049143,
"kl": 0.8994140625,
"learning_rate": 1.034785574365256e-07,
"loss": 0.0785,
"reward": 0.5216581001877785,
"reward_std": 0.22227726504206657,
"rewards/exp_len_reward": 0.5216581001877785,
"step": 325
},
{
"completion_length": 1939.4777526855469,
"epoch": 0.965925925925926,
"grad_norm": 1.9447889354625958,
"kl": 0.57666015625,
"learning_rate": 1.0292355697651348e-07,
"loss": 0.0518,
"reward": 0.5352144092321396,
"reward_std": 0.18627181835472584,
"rewards/exp_len_reward": 0.5352144092321396,
"step": 326
},
{
"completion_length": 1660.0223999023438,
"epoch": 0.9688888888888889,
"grad_norm": 1.3319739321021469,
"kl": 0.615234375,
"learning_rate": 1.0241661747023064e-07,
"loss": 0.0264,
"reward": 0.5945611968636513,
"reward_std": 0.19085084274411201,
"rewards/exp_len_reward": 0.5945611968636513,
"step": 327
},
{
"completion_length": 1870.8482666015625,
"epoch": 0.9718518518518519,
"grad_norm": 1.2683542624045563,
"kl": 0.6884765625,
"learning_rate": 1.0195779341394164e-07,
"loss": 0.0875,
"reward": 0.5801157727837563,
"reward_std": 0.2543545439839363,
"rewards/exp_len_reward": 0.5801157727837563,
"step": 328
},
{
"completion_length": 1804.2098693847656,
"epoch": 0.9748148148148148,
"grad_norm": 1.532251011882061,
"kl": 0.546142578125,
"learning_rate": 1.0154713413147486e-07,
"loss": 0.0935,
"reward": 0.5890957191586494,
"reward_std": 0.1959761083126068,
"rewards/exp_len_reward": 0.5890957191586494,
"step": 329
},
{
"completion_length": 2037.2098693847656,
"epoch": 0.9777777777777777,
"grad_norm": 1.51448980281598,
"kl": 0.75634765625,
"learning_rate": 1.0118468376892005e-07,
"loss": 0.116,
"reward": 0.53825593739748,
"reward_std": 0.2599205709993839,
"rewards/exp_len_reward": 0.53825593739748,
"step": 330
},
{
"completion_length": 1872.2322082519531,
"epoch": 0.9807407407407407,
"grad_norm": 0.8135192710770781,
"kl": 0.63916015625,
"learning_rate": 1.0087048128988256e-07,
"loss": 0.0755,
"reward": 0.5870219618082047,
"reward_std": 0.23178360238671303,
"rewards/exp_len_reward": 0.5870219618082047,
"step": 331
},
{
"completion_length": 1804.0000610351562,
"epoch": 0.9837037037037037,
"grad_norm": 2.521244978635537,
"kl": 0.60009765625,
"learning_rate": 1.0060456047129485e-07,
"loss": 0.0965,
"reward": 0.7236264944076538,
"reward_std": 0.2475818656384945,
"rewards/exp_len_reward": 0.7236264944076538,
"step": 332
},
{
"completion_length": 1929.8483276367188,
"epoch": 0.9866666666666667,
"grad_norm": 1.4125480684524767,
"kl": 0.4892578125,
"learning_rate": 1.0038694989978531e-07,
"loss": 0.0505,
"reward": 0.5696776583790779,
"reward_std": 0.24001475051045418,
"rewards/exp_len_reward": 0.5696776583790779,
"step": 333
},
{
"completion_length": 2289.2322692871094,
"epoch": 0.9896296296296296,
"grad_norm": 1.2411060851104214,
"kl": 1.0234375,
"learning_rate": 1.0021767296860537e-07,
"loss": 0.1065,
"reward": 0.5802329778671265,
"reward_std": 0.255879282951355,
"rewards/exp_len_reward": 0.5802329778671265,
"step": 334
},
{
"completion_length": 2055.4866943359375,
"epoch": 0.9925925925925926,
"grad_norm": 4.508535676079954,
"kl": 0.82861328125,
"learning_rate": 1.0009674787511447e-07,
"loss": -0.0012,
"reward": 0.5173570811748505,
"reward_std": 0.22090869024395943,
"rewards/exp_len_reward": 0.5173570811748505,
"step": 335
},
{
"completion_length": 1760.1474304199219,
"epoch": 0.9955555555555555,
"grad_norm": 2.1627630685797605,
"kl": 0.75048828125,
"learning_rate": 1.0002418761882409e-07,
"loss": 0.0715,
"reward": 0.5967651307582855,
"reward_std": 0.23602332174777985,
"rewards/exp_len_reward": 0.5967651307582855,
"step": 336
},
{
"completion_length": 1915.9599304199219,
"epoch": 0.9985185185185185,
"grad_norm": 0.9639784041944786,
"kl": 0.678466796875,
"learning_rate": 1e-07,
"loss": 0.0641,
"reward": 0.5330347046256065,
"reward_std": 0.26336976513266563,
"rewards/exp_len_reward": 0.5330347046256065,
"step": 337
},
{
"epoch": 0.9985185185185185,
"step": 337,
"total_flos": 0.0,
"train_loss": 0.04468778468586763,
"train_runtime": 66671.6072,
"train_samples_per_second": 0.162,
"train_steps_per_second": 0.005
}
],
"logging_steps": 1,
"max_steps": 337,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}