|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9985185185185185, |
|
"eval_steps": 500, |
|
"global_step": 337, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 1801.2277526855469, |
|
"epoch": 0.002962962962962963, |
|
"grad_norm": 0.17165035705260304, |
|
"kl": 0.0, |
|
"learning_rate": 2.941176470588235e-08, |
|
"loss": 0.0224, |
|
"reward": 0.5656742379069328, |
|
"reward_std": 0.2888262942433357, |
|
"rewards/exp_len_reward": 0.5656742379069328, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 2185.982208251953, |
|
"epoch": 0.005925925925925926, |
|
"grad_norm": 0.15400600491691718, |
|
"kl": 0.0, |
|
"learning_rate": 5.88235294117647e-08, |
|
"loss": 0.0623, |
|
"reward": 0.3711318001151085, |
|
"reward_std": 0.24151454865932465, |
|
"rewards/exp_len_reward": 0.3711318001151085, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 2091.6697387695312, |
|
"epoch": 0.008888888888888889, |
|
"grad_norm": 0.12985062585096363, |
|
"kl": 0.0001380443572998047, |
|
"learning_rate": 8.823529411764706e-08, |
|
"loss": -0.0138, |
|
"reward": 0.45638658851385117, |
|
"reward_std": 0.27105605974793434, |
|
"rewards/exp_len_reward": 0.45638658851385117, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 2074.7500915527344, |
|
"epoch": 0.011851851851851851, |
|
"grad_norm": 0.15982273028009736, |
|
"kl": 0.0001361370086669922, |
|
"learning_rate": 1.176470588235294e-07, |
|
"loss": 0.0803, |
|
"reward": 0.5727517828345299, |
|
"reward_std": 0.2747129164636135, |
|
"rewards/exp_len_reward": 0.5727517828345299, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 1972.2679138183594, |
|
"epoch": 0.014814814814814815, |
|
"grad_norm": 0.14384158083081833, |
|
"kl": 0.00012791156768798828, |
|
"learning_rate": 1.4705882352941175e-07, |
|
"loss": -0.0259, |
|
"reward": 0.49002690985798836, |
|
"reward_std": 0.19663079921156168, |
|
"rewards/exp_len_reward": 0.49002690985798836, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 1804.8885192871094, |
|
"epoch": 0.017777777777777778, |
|
"grad_norm": 0.16133143911721617, |
|
"kl": 0.00011050701141357422, |
|
"learning_rate": 1.764705882352941e-07, |
|
"loss": 0.0338, |
|
"reward": 0.5603420734405518, |
|
"reward_std": 0.3139008916914463, |
|
"rewards/exp_len_reward": 0.5603420734405518, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 1797.9956359863281, |
|
"epoch": 0.02074074074074074, |
|
"grad_norm": 0.1746647382731447, |
|
"kl": 0.00010704994201660156, |
|
"learning_rate": 2.0588235294117645e-07, |
|
"loss": 0.0104, |
|
"reward": 0.5684476867318153, |
|
"reward_std": 0.2409583255648613, |
|
"rewards/exp_len_reward": 0.5684476867318153, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 2083.8929748535156, |
|
"epoch": 0.023703703703703703, |
|
"grad_norm": 0.1420875124823215, |
|
"kl": 0.00012505054473876953, |
|
"learning_rate": 2.352941176470588e-07, |
|
"loss": 0.0083, |
|
"reward": 0.4780779331922531, |
|
"reward_std": 0.27319788932800293, |
|
"rewards/exp_len_reward": 0.4780779331922531, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 2139.6027221679688, |
|
"epoch": 0.02666666666666667, |
|
"grad_norm": 0.13758669498682333, |
|
"kl": 0.0001379251480102539, |
|
"learning_rate": 2.6470588235294114e-07, |
|
"loss": 0.037, |
|
"reward": 0.3995143547654152, |
|
"reward_std": 0.2271974515169859, |
|
"rewards/exp_len_reward": 0.3995143547654152, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 1866.290283203125, |
|
"epoch": 0.02962962962962963, |
|
"grad_norm": 0.18570234734655267, |
|
"kl": 0.00011456012725830078, |
|
"learning_rate": 2.941176470588235e-07, |
|
"loss": 0.1181, |
|
"reward": 0.46772801876068115, |
|
"reward_std": 0.2898196689784527, |
|
"rewards/exp_len_reward": 0.46772801876068115, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 2405.1697387695312, |
|
"epoch": 0.03259259259259259, |
|
"grad_norm": 0.13468269595632437, |
|
"kl": 0.0001614093780517578, |
|
"learning_rate": 3.2352941176470586e-07, |
|
"loss": -0.0412, |
|
"reward": 0.4437425658106804, |
|
"reward_std": 0.2463722713291645, |
|
"rewards/exp_len_reward": 0.4437425658106804, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 1774.8750915527344, |
|
"epoch": 0.035555555555555556, |
|
"grad_norm": 0.1541607968056605, |
|
"kl": 0.00010025501251220703, |
|
"learning_rate": 3.529411764705882e-07, |
|
"loss": 0.0198, |
|
"reward": 0.4462169408798218, |
|
"reward_std": 0.25852715596556664, |
|
"rewards/exp_len_reward": 0.4462169408798218, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 1651.9152374267578, |
|
"epoch": 0.03851851851851852, |
|
"grad_norm": 0.17738313728076366, |
|
"kl": 9.834766387939453e-05, |
|
"learning_rate": 3.8235294117647053e-07, |
|
"loss": 0.0837, |
|
"reward": 0.5214016065001488, |
|
"reward_std": 0.1839424017816782, |
|
"rewards/exp_len_reward": 0.5214016065001488, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 1320.7232666015625, |
|
"epoch": 0.04148148148148148, |
|
"grad_norm": 0.16613281883713185, |
|
"kl": 9.1552734375e-05, |
|
"learning_rate": 4.117647058823529e-07, |
|
"loss": 0.0156, |
|
"reward": 0.5551810637116432, |
|
"reward_std": 0.28313471004366875, |
|
"rewards/exp_len_reward": 0.5551810637116432, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 1841.2857971191406, |
|
"epoch": 0.044444444444444446, |
|
"grad_norm": 0.146202640780426, |
|
"kl": 0.00010335445404052734, |
|
"learning_rate": 4.4117647058823526e-07, |
|
"loss": 0.0368, |
|
"reward": 0.5522215813398361, |
|
"reward_std": 0.337805338203907, |
|
"rewards/exp_len_reward": 0.5522215813398361, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 1566.0045166015625, |
|
"epoch": 0.047407407407407405, |
|
"grad_norm": 0.18526362140745664, |
|
"kl": 0.00010216236114501953, |
|
"learning_rate": 4.705882352941176e-07, |
|
"loss": 0.0547, |
|
"reward": 0.5769367516040802, |
|
"reward_std": 0.29369358718395233, |
|
"rewards/exp_len_reward": 0.5769367516040802, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 1614.1385040283203, |
|
"epoch": 0.05037037037037037, |
|
"grad_norm": 0.1437351588405587, |
|
"kl": 0.00010508298873901367, |
|
"learning_rate": 5e-07, |
|
"loss": -0.02, |
|
"reward": 0.41402483731508255, |
|
"reward_std": 0.35327037796378136, |
|
"rewards/exp_len_reward": 0.41402483731508255, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 1919.7411499023438, |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 0.16214726757721953, |
|
"kl": 0.0001310110092163086, |
|
"learning_rate": 5.294117647058823e-07, |
|
"loss": -0.0029, |
|
"reward": 0.5019121393561363, |
|
"reward_std": 0.3405100703239441, |
|
"rewards/exp_len_reward": 0.5019121393561363, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 2193.102783203125, |
|
"epoch": 0.056296296296296296, |
|
"grad_norm": 0.17578873894389108, |
|
"kl": 0.0001233816146850586, |
|
"learning_rate": 5.588235294117647e-07, |
|
"loss": 0.0736, |
|
"reward": 0.5333621501922607, |
|
"reward_std": 0.332721009850502, |
|
"rewards/exp_len_reward": 0.5333621501922607, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 1631.3750610351562, |
|
"epoch": 0.05925925925925926, |
|
"grad_norm": 0.16308413233344032, |
|
"kl": 8.952617645263672e-05, |
|
"learning_rate": 5.88235294117647e-07, |
|
"loss": 0.039, |
|
"reward": 0.5893106684088707, |
|
"reward_std": 0.20388014614582062, |
|
"rewards/exp_len_reward": 0.5893106684088707, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 1881.4911499023438, |
|
"epoch": 0.06222222222222222, |
|
"grad_norm": 0.185982449259781, |
|
"kl": 0.00013494491577148438, |
|
"learning_rate": 6.176470588235294e-07, |
|
"loss": 0.032, |
|
"reward": 0.5047426372766495, |
|
"reward_std": 0.3071717321872711, |
|
"rewards/exp_len_reward": 0.5047426372766495, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 2141.5001220703125, |
|
"epoch": 0.06518518518518518, |
|
"grad_norm": 0.12297826000738939, |
|
"kl": 0.00014662742614746094, |
|
"learning_rate": 6.470588235294117e-07, |
|
"loss": 0.01, |
|
"reward": 0.4561139643192291, |
|
"reward_std": 0.26503079757094383, |
|
"rewards/exp_len_reward": 0.4561139643192291, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 1731.0938110351562, |
|
"epoch": 0.06814814814814815, |
|
"grad_norm": 0.16752422299971495, |
|
"kl": 0.00011110305786132812, |
|
"learning_rate": 6.764705882352941e-07, |
|
"loss": 0.0578, |
|
"reward": 0.5439532399177551, |
|
"reward_std": 0.2806715965270996, |
|
"rewards/exp_len_reward": 0.5439532399177551, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 1966.1787109375, |
|
"epoch": 0.07111111111111111, |
|
"grad_norm": 0.21835387563377565, |
|
"kl": 0.00012505054473876953, |
|
"learning_rate": 7.058823529411765e-07, |
|
"loss": 0.0838, |
|
"reward": 0.5456492155790329, |
|
"reward_std": 0.21898872777819633, |
|
"rewards/exp_len_reward": 0.5456492155790329, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 1735.8259582519531, |
|
"epoch": 0.07407407407407407, |
|
"grad_norm": 0.16254233382968727, |
|
"kl": 0.0001264810562133789, |
|
"learning_rate": 7.352941176470589e-07, |
|
"loss": -0.0254, |
|
"reward": 0.5086465999484062, |
|
"reward_std": 0.28585580736398697, |
|
"rewards/exp_len_reward": 0.5086465999484062, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 1944.1384887695312, |
|
"epoch": 0.07703703703703704, |
|
"grad_norm": 0.1278452103013438, |
|
"kl": 0.00012123584747314453, |
|
"learning_rate": 7.647058823529411e-07, |
|
"loss": -0.0574, |
|
"reward": 0.521670825779438, |
|
"reward_std": 0.2907986231148243, |
|
"rewards/exp_len_reward": 0.521670825779438, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 2062.15185546875, |
|
"epoch": 0.08, |
|
"grad_norm": 0.14646903593402805, |
|
"kl": 0.00015497207641601562, |
|
"learning_rate": 7.941176470588235e-07, |
|
"loss": 0.0199, |
|
"reward": 0.5175457671284676, |
|
"reward_std": 0.21266279742121696, |
|
"rewards/exp_len_reward": 0.5175457671284676, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 1839.6786804199219, |
|
"epoch": 0.08296296296296296, |
|
"grad_norm": 0.14287471559793075, |
|
"kl": 0.00013077259063720703, |
|
"learning_rate": 8.235294117647058e-07, |
|
"loss": 0.0364, |
|
"reward": 0.40920490026474, |
|
"reward_std": 0.3160223700106144, |
|
"rewards/exp_len_reward": 0.40920490026474, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 1802.4777526855469, |
|
"epoch": 0.08592592592592592, |
|
"grad_norm": 0.18816138571640492, |
|
"kl": 0.0001322031021118164, |
|
"learning_rate": 8.529411764705882e-07, |
|
"loss": 0.077, |
|
"reward": 0.5730812773108482, |
|
"reward_std": 0.25979165360331535, |
|
"rewards/exp_len_reward": 0.5730812773108482, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 2252.700958251953, |
|
"epoch": 0.08888888888888889, |
|
"grad_norm": 0.1563414689939465, |
|
"kl": 0.00017523765563964844, |
|
"learning_rate": 8.823529411764705e-07, |
|
"loss": 0.0581, |
|
"reward": 0.4275534115731716, |
|
"reward_std": 0.3341764882206917, |
|
"rewards/exp_len_reward": 0.4275534115731716, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 2195.9510192871094, |
|
"epoch": 0.09185185185185185, |
|
"grad_norm": 0.14534598607811303, |
|
"kl": 0.000186920166015625, |
|
"learning_rate": 9.117647058823529e-07, |
|
"loss": 0.0581, |
|
"reward": 0.369928453117609, |
|
"reward_std": 0.2257540374994278, |
|
"rewards/exp_len_reward": 0.369928453117609, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 2284.1072387695312, |
|
"epoch": 0.09481481481481481, |
|
"grad_norm": 0.13504634261216167, |
|
"kl": 0.0002334117889404297, |
|
"learning_rate": 9.411764705882352e-07, |
|
"loss": 0.0254, |
|
"reward": 0.5308222621679306, |
|
"reward_std": 0.2156723290681839, |
|
"rewards/exp_len_reward": 0.5308222621679306, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 1720.0759735107422, |
|
"epoch": 0.09777777777777778, |
|
"grad_norm": 0.2262086823960172, |
|
"kl": 0.00020706653594970703, |
|
"learning_rate": 9.705882352941176e-07, |
|
"loss": 0.1291, |
|
"reward": 0.6482533067464828, |
|
"reward_std": 0.23323534801602364, |
|
"rewards/exp_len_reward": 0.6482533067464828, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 2521.1072387695312, |
|
"epoch": 0.10074074074074074, |
|
"grad_norm": 0.12083975758187475, |
|
"kl": 0.00023365020751953125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0182, |
|
"reward": 0.37041275948286057, |
|
"reward_std": 0.27552830427885056, |
|
"rewards/exp_len_reward": 0.37041275948286057, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 2528.1741943359375, |
|
"epoch": 0.1037037037037037, |
|
"grad_norm": 0.12437434921657496, |
|
"kl": 0.0003223419189453125, |
|
"learning_rate": 9.99975812381176e-07, |
|
"loss": 0.015, |
|
"reward": 0.331495076417923, |
|
"reward_std": 0.2619011141359806, |
|
"rewards/exp_len_reward": 0.331495076417923, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 1851.3170776367188, |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 0.15925522443930307, |
|
"kl": 0.00037288665771484375, |
|
"learning_rate": 9.999032521248854e-07, |
|
"loss": -0.026, |
|
"reward": 0.4777600094676018, |
|
"reward_std": 0.31441882997751236, |
|
"rewards/exp_len_reward": 0.4777600094676018, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 1999.2813110351562, |
|
"epoch": 0.10962962962962963, |
|
"grad_norm": 0.1217664172693966, |
|
"kl": 0.0003540515899658203, |
|
"learning_rate": 9.997823270313945e-07, |
|
"loss": -0.0106, |
|
"reward": 0.5654740855097771, |
|
"reward_std": 0.2559405229985714, |
|
"rewards/exp_len_reward": 0.5654740855097771, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 1974.83935546875, |
|
"epoch": 0.11259259259259259, |
|
"grad_norm": 0.1707475640038693, |
|
"kl": 0.0003287792205810547, |
|
"learning_rate": 9.996130501002146e-07, |
|
"loss": 0.0946, |
|
"reward": 0.4990657716989517, |
|
"reward_std": 0.22302044555544853, |
|
"rewards/exp_len_reward": 0.4990657716989517, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 2258.2500915527344, |
|
"epoch": 0.11555555555555555, |
|
"grad_norm": 0.13998814580057628, |
|
"kl": 0.00038623809814453125, |
|
"learning_rate": 9.99395439528705e-07, |
|
"loss": 0.0104, |
|
"reward": 0.4020570404827595, |
|
"reward_std": 0.35356171429157257, |
|
"rewards/exp_len_reward": 0.4020570404827595, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 2405.2233276367188, |
|
"epoch": 0.11851851851851852, |
|
"grad_norm": 0.13044077504292861, |
|
"kl": 0.00035953521728515625, |
|
"learning_rate": 9.991295187101175e-07, |
|
"loss": 0.0227, |
|
"reward": 0.3418276160955429, |
|
"reward_std": 0.30284278094768524, |
|
"rewards/exp_len_reward": 0.3418276160955429, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 2109.4688720703125, |
|
"epoch": 0.12148148148148148, |
|
"grad_norm": 0.1529163557190866, |
|
"kl": 0.0004558563232421875, |
|
"learning_rate": 9.988153162310798e-07, |
|
"loss": 0.0287, |
|
"reward": 0.3564532473683357, |
|
"reward_std": 0.2458956204354763, |
|
"rewards/exp_len_reward": 0.3564532473683357, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 2029.4866638183594, |
|
"epoch": 0.12444444444444444, |
|
"grad_norm": 0.17325144566551426, |
|
"kl": 0.000431060791015625, |
|
"learning_rate": 9.98452865868525e-07, |
|
"loss": 0.0635, |
|
"reward": 0.5387638658285141, |
|
"reward_std": 0.20850694179534912, |
|
"rewards/exp_len_reward": 0.5387638658285141, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 1609.05810546875, |
|
"epoch": 0.1274074074074074, |
|
"grad_norm": 0.18834562491158613, |
|
"kl": 0.0004696846008300781, |
|
"learning_rate": 9.980422065860585e-07, |
|
"loss": 0.0148, |
|
"reward": 0.5655806735157967, |
|
"reward_std": 0.25856464356184006, |
|
"rewards/exp_len_reward": 0.5655806735157967, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 2234.6385192871094, |
|
"epoch": 0.13037037037037036, |
|
"grad_norm": 0.13146085912536554, |
|
"kl": 0.0006403923034667969, |
|
"learning_rate": 9.975833825297694e-07, |
|
"loss": -0.0197, |
|
"reward": 0.5414331331849098, |
|
"reward_std": 0.2674776539206505, |
|
"rewards/exp_len_reward": 0.5414331331849098, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 2471.6028442382812, |
|
"epoch": 0.13333333333333333, |
|
"grad_norm": 0.12838849490986115, |
|
"kl": 0.0006742477416992188, |
|
"learning_rate": 9.970764430234865e-07, |
|
"loss": -0.0289, |
|
"reward": 0.4237719103693962, |
|
"reward_std": 0.25649312883615494, |
|
"rewards/exp_len_reward": 0.4237719103693962, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 2365.5001220703125, |
|
"epoch": 0.1362962962962963, |
|
"grad_norm": 0.14914654023839766, |
|
"kl": 0.0007715225219726562, |
|
"learning_rate": 9.965214425634744e-07, |
|
"loss": 0.0748, |
|
"reward": 0.5114802047610283, |
|
"reward_std": 0.2153051160275936, |
|
"rewards/exp_len_reward": 0.5114802047610283, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 1474.0982666015625, |
|
"epoch": 0.13925925925925925, |
|
"grad_norm": 0.17420480563840932, |
|
"kl": 0.0010280609130859375, |
|
"learning_rate": 9.959184408125757e-07, |
|
"loss": 0.0243, |
|
"reward": 0.5414484888315201, |
|
"reward_std": 0.23497811146080494, |
|
"rewards/exp_len_reward": 0.5414484888315201, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 2123.9866943359375, |
|
"epoch": 0.14222222222222222, |
|
"grad_norm": 0.20318427732480254, |
|
"kl": 0.0011425018310546875, |
|
"learning_rate": 9.952675025937969e-07, |
|
"loss": 0.0641, |
|
"reward": 0.3801772743463516, |
|
"reward_std": 0.26251309737563133, |
|
"rewards/exp_len_reward": 0.3801772743463516, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 2290.9911499023438, |
|
"epoch": 0.1451851851851852, |
|
"grad_norm": 0.16137418781063556, |
|
"kl": 0.0010118484497070312, |
|
"learning_rate": 9.945686978833404e-07, |
|
"loss": 0.0867, |
|
"reward": 0.5667200461030006, |
|
"reward_std": 0.2713882625102997, |
|
"rewards/exp_len_reward": 0.5667200461030006, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 2380.5492248535156, |
|
"epoch": 0.14814814814814814, |
|
"grad_norm": 0.16965865430432733, |
|
"kl": 0.0011739730834960938, |
|
"learning_rate": 9.938221018030818e-07, |
|
"loss": 0.1294, |
|
"reward": 0.5124416798353195, |
|
"reward_std": 0.25596321001648903, |
|
"rewards/exp_len_reward": 0.5124416798353195, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 2247.0670776367188, |
|
"epoch": 0.1511111111111111, |
|
"grad_norm": 0.12970462538444932, |
|
"kl": 0.001293182373046875, |
|
"learning_rate": 9.930277946124936e-07, |
|
"loss": 0.0198, |
|
"reward": 0.4916500821709633, |
|
"reward_std": 0.2014484405517578, |
|
"rewards/exp_len_reward": 0.4916500821709633, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 2249.3125610351562, |
|
"epoch": 0.15407407407407409, |
|
"grad_norm": 0.1622954753109236, |
|
"kl": 0.001811981201171875, |
|
"learning_rate": 9.921858617000186e-07, |
|
"loss": 0.0112, |
|
"reward": 0.6037572771310806, |
|
"reward_std": 0.24519889429211617, |
|
"rewards/exp_len_reward": 0.6037572771310806, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 2421.6787109375, |
|
"epoch": 0.15703703703703703, |
|
"grad_norm": 0.18286903039083063, |
|
"kl": 0.0019397735595703125, |
|
"learning_rate": 9.912963935738895e-07, |
|
"loss": 0.0844, |
|
"reward": 0.503461018204689, |
|
"reward_std": 0.26762050203979015, |
|
"rewards/exp_len_reward": 0.503461018204689, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 2319.352813720703, |
|
"epoch": 0.16, |
|
"grad_norm": 0.15710291606441482, |
|
"kl": 0.001514434814453125, |
|
"learning_rate": 9.903594858523993e-07, |
|
"loss": 0.0332, |
|
"reward": 0.560577280819416, |
|
"reward_std": 0.25206807255744934, |
|
"rewards/exp_len_reward": 0.560577280819416, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 2609.1741638183594, |
|
"epoch": 0.16296296296296298, |
|
"grad_norm": 0.13990344576481042, |
|
"kl": 0.001995086669921875, |
|
"learning_rate": 9.893752392536231e-07, |
|
"loss": 0.0342, |
|
"reward": 0.46819788962602615, |
|
"reward_std": 0.215255219489336, |
|
"rewards/exp_len_reward": 0.46819788962602615, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 2860.785888671875, |
|
"epoch": 0.16592592592592592, |
|
"grad_norm": 0.12137029587563823, |
|
"kl": 0.00226593017578125, |
|
"learning_rate": 9.883437595845901e-07, |
|
"loss": 0.0001, |
|
"reward": 0.47793612629175186, |
|
"reward_std": 0.21438376046717167, |
|
"rewards/exp_len_reward": 0.47793612629175186, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 2219.946533203125, |
|
"epoch": 0.1688888888888889, |
|
"grad_norm": 0.16329103001651094, |
|
"kl": 0.003246307373046875, |
|
"learning_rate": 9.872651577299092e-07, |
|
"loss": 0.0075, |
|
"reward": 0.5384046509861946, |
|
"reward_std": 0.22491934522986412, |
|
"rewards/exp_len_reward": 0.5384046509861946, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 2603.6117553710938, |
|
"epoch": 0.17185185185185184, |
|
"grad_norm": 0.15535812470619684, |
|
"kl": 0.002307891845703125, |
|
"learning_rate": 9.861395496398497e-07, |
|
"loss": 0.0209, |
|
"reward": 0.44340164959430695, |
|
"reward_std": 0.2533542141318321, |
|
"rewards/exp_len_reward": 0.44340164959430695, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 2278.138458251953, |
|
"epoch": 0.1748148148148148, |
|
"grad_norm": 0.19628078093756599, |
|
"kl": 0.0034637451171875, |
|
"learning_rate": 9.849670563178756e-07, |
|
"loss": 0.0342, |
|
"reward": 0.6375216767191887, |
|
"reward_std": 0.22566119581460953, |
|
"rewards/exp_len_reward": 0.6375216767191887, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 2012.8616638183594, |
|
"epoch": 0.17777777777777778, |
|
"grad_norm": 0.1834506950176924, |
|
"kl": 0.00319671630859375, |
|
"learning_rate": 9.83747803807638e-07, |
|
"loss": -0.0158, |
|
"reward": 0.6068644598126411, |
|
"reward_std": 0.2145768441259861, |
|
"rewards/exp_len_reward": 0.6068644598126411, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 2115.3438720703125, |
|
"epoch": 0.18074074074074073, |
|
"grad_norm": 0.15523600283402922, |
|
"kl": 0.003215789794921875, |
|
"learning_rate": 9.82481923179426e-07, |
|
"loss": 0.0184, |
|
"reward": 0.6350785046815872, |
|
"reward_std": 0.18117142282426357, |
|
"rewards/exp_len_reward": 0.6350785046815872, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 2114.2411193847656, |
|
"epoch": 0.1837037037037037, |
|
"grad_norm": 0.20605772552254983, |
|
"kl": 0.003391265869140625, |
|
"learning_rate": 9.811695505160755e-07, |
|
"loss": 0.1202, |
|
"reward": 0.4864572286605835, |
|
"reward_std": 0.28768010064959526, |
|
"rewards/exp_len_reward": 0.4864572286605835, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 2333.384063720703, |
|
"epoch": 0.18666666666666668, |
|
"grad_norm": 0.17526874157597688, |
|
"kl": 0.00322723388671875, |
|
"learning_rate": 9.79810826898341e-07, |
|
"loss": 0.0574, |
|
"reward": 0.5434901565313339, |
|
"reward_std": 0.2581823952496052, |
|
"rewards/exp_len_reward": 0.5434901565313339, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 2263.5625610351562, |
|
"epoch": 0.18962962962962962, |
|
"grad_norm": 0.1678305474956245, |
|
"kl": 0.0043487548828125, |
|
"learning_rate": 9.784058983897284e-07, |
|
"loss": -0.0221, |
|
"reward": 0.5998831987380981, |
|
"reward_std": 0.20506521314382553, |
|
"rewards/exp_len_reward": 0.5998831987380981, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 2293.4688415527344, |
|
"epoch": 0.1925925925925926, |
|
"grad_norm": 0.12491619040889941, |
|
"kl": 0.003841400146484375, |
|
"learning_rate": 9.769549160207952e-07, |
|
"loss": -0.0029, |
|
"reward": 0.4781326428055763, |
|
"reward_std": 0.24647967144846916, |
|
"rewards/exp_len_reward": 0.4781326428055763, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 2674.1876220703125, |
|
"epoch": 0.19555555555555557, |
|
"grad_norm": 0.17123713322573883, |
|
"kl": 0.003971099853515625, |
|
"learning_rate": 9.754580357729116e-07, |
|
"loss": 0.0074, |
|
"reward": 0.6343429163098335, |
|
"reward_std": 0.17385547421872616, |
|
"rewards/exp_len_reward": 0.6343429163098335, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 2128.9553833007812, |
|
"epoch": 0.1985185185185185, |
|
"grad_norm": 0.19102462314359206, |
|
"kl": 0.00537109375, |
|
"learning_rate": 9.739154185614949e-07, |
|
"loss": 0.0521, |
|
"reward": 0.5997566878795624, |
|
"reward_std": 0.25122974812984467, |
|
"rewards/exp_len_reward": 0.5997566878795624, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 2614.040283203125, |
|
"epoch": 0.20148148148148148, |
|
"grad_norm": 0.13959648308338587, |
|
"kl": 0.005168914794921875, |
|
"learning_rate": 9.723272302187106e-07, |
|
"loss": -0.0082, |
|
"reward": 0.5444743484258652, |
|
"reward_std": 0.22692475281655788, |
|
"rewards/exp_len_reward": 0.5444743484258652, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 2289.901885986328, |
|
"epoch": 0.20444444444444446, |
|
"grad_norm": 0.19300943423240471, |
|
"kl": 0.004718780517578125, |
|
"learning_rate": 9.706936414756435e-07, |
|
"loss": 0.0497, |
|
"reward": 0.5558685436844826, |
|
"reward_std": 0.2169661819934845, |
|
"rewards/exp_len_reward": 0.5558685436844826, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 2974.8751220703125, |
|
"epoch": 0.2074074074074074, |
|
"grad_norm": 0.121339310942439, |
|
"kl": 0.004192352294921875, |
|
"learning_rate": 9.69014827943947e-07, |
|
"loss": -0.0285, |
|
"reward": 0.40666233375668526, |
|
"reward_std": 0.2304704710841179, |
|
"rewards/exp_len_reward": 0.40666233375668526, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 2327.964385986328, |
|
"epoch": 0.21037037037037037, |
|
"grad_norm": 0.14030638320428154, |
|
"kl": 0.0042724609375, |
|
"learning_rate": 9.672909700969612e-07, |
|
"loss": 0.0189, |
|
"reward": 0.5954511985182762, |
|
"reward_std": 0.1756008304655552, |
|
"rewards/exp_len_reward": 0.5954511985182762, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 2386.6920776367188, |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 0.15142935641482305, |
|
"kl": 0.0072174072265625, |
|
"learning_rate": 9.65522253250316e-07, |
|
"loss": 0.0134, |
|
"reward": 0.5968082100152969, |
|
"reward_std": 0.24449098855257034, |
|
"rewards/exp_len_reward": 0.5968082100152969, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 2349.0001220703125, |
|
"epoch": 0.2162962962962963, |
|
"grad_norm": 0.19733194845778884, |
|
"kl": 0.0043792724609375, |
|
"learning_rate": 9.637088675420063e-07, |
|
"loss": 0.0693, |
|
"reward": 0.6912636756896973, |
|
"reward_std": 0.23656904697418213, |
|
"rewards/exp_len_reward": 0.6912636756896973, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 2502.040283203125, |
|
"epoch": 0.21925925925925926, |
|
"grad_norm": 0.17306578944129042, |
|
"kl": 0.00576019287109375, |
|
"learning_rate": 9.618510079119533e-07, |
|
"loss": 0.0302, |
|
"reward": 0.5406814813613892, |
|
"reward_std": 0.22251397371292114, |
|
"rewards/exp_len_reward": 0.5406814813613892, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 2595.4509887695312, |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 0.15937561200793735, |
|
"kl": 0.005767822265625, |
|
"learning_rate": 9.59948874081048e-07, |
|
"loss": -0.0046, |
|
"reward": 0.4833944961428642, |
|
"reward_std": 0.2554270029067993, |
|
"rewards/exp_len_reward": 0.4833944961428642, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 2212.65185546875, |
|
"epoch": 0.22518518518518518, |
|
"grad_norm": 0.16902423297675423, |
|
"kl": 0.00504302978515625, |
|
"learning_rate": 9.580026705296824e-07, |
|
"loss": 0.0447, |
|
"reward": 0.6849584132432938, |
|
"reward_std": 0.19782762601971626, |
|
"rewards/exp_len_reward": 0.6849584132432938, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 2508.058074951172, |
|
"epoch": 0.22814814814814816, |
|
"grad_norm": 0.14168432606139286, |
|
"kl": 0.00524139404296875, |
|
"learning_rate": 9.56012606475766e-07, |
|
"loss": -0.02, |
|
"reward": 0.5538096725940704, |
|
"reward_std": 0.1790675725787878, |
|
"rewards/exp_len_reward": 0.5538096725940704, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 2099.8438720703125, |
|
"epoch": 0.2311111111111111, |
|
"grad_norm": 0.16648937358394447, |
|
"kl": 0.0051116943359375, |
|
"learning_rate": 9.539788958522353e-07, |
|
"loss": 0.0118, |
|
"reward": 0.5786676853895187, |
|
"reward_std": 0.20153935626149178, |
|
"rewards/exp_len_reward": 0.5786676853895187, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 2088.1697387695312, |
|
"epoch": 0.23407407407407407, |
|
"grad_norm": 0.18793602052290423, |
|
"kl": 0.0059051513671875, |
|
"learning_rate": 9.519017572840562e-07, |
|
"loss": 0.0275, |
|
"reward": 0.6274498999118805, |
|
"reward_std": 0.27273761481046677, |
|
"rewards/exp_len_reward": 0.6274498999118805, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 2347.9019165039062, |
|
"epoch": 0.23703703703703705, |
|
"grad_norm": 0.1591086590370902, |
|
"kl": 0.005828857421875, |
|
"learning_rate": 9.49781414064722e-07, |
|
"loss": 0.0009, |
|
"reward": 0.5423298478126526, |
|
"reward_std": 0.27614232525229454, |
|
"rewards/exp_len_reward": 0.5423298478126526, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 2335.3259887695312, |
|
"epoch": 0.24, |
|
"grad_norm": 0.1492302716898933, |
|
"kl": 0.00699615478515625, |
|
"learning_rate": 9.476180941322485e-07, |
|
"loss": -0.0002, |
|
"reward": 0.48675865679979324, |
|
"reward_std": 0.22459113597869873, |
|
"rewards/exp_len_reward": 0.48675865679979324, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 2585.65185546875, |
|
"epoch": 0.24296296296296296, |
|
"grad_norm": 0.17333826527197865, |
|
"kl": 0.0056304931640625, |
|
"learning_rate": 9.454120300446708e-07, |
|
"loss": 0.0085, |
|
"reward": 0.46361441165208817, |
|
"reward_std": 0.2632403336465359, |
|
"rewards/exp_len_reward": 0.46361441165208817, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 2431.7322998046875, |
|
"epoch": 0.24592592592592594, |
|
"grad_norm": 0.1602558093279175, |
|
"kl": 0.006927490234375, |
|
"learning_rate": 9.431634589550437e-07, |
|
"loss": -0.0156, |
|
"reward": 0.6010517254471779, |
|
"reward_std": 0.1429296052083373, |
|
"rewards/exp_len_reward": 0.6010517254471779, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 2337.3037109375, |
|
"epoch": 0.24888888888888888, |
|
"grad_norm": 0.23867947641732404, |
|
"kl": 0.00533294677734375, |
|
"learning_rate": 9.408726225859463e-07, |
|
"loss": 0.0472, |
|
"reward": 0.5745537057518959, |
|
"reward_std": 0.19299127161502838, |
|
"rewards/exp_len_reward": 0.5745537057518959, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 1958.0089721679688, |
|
"epoch": 0.2518518518518518, |
|
"grad_norm": 0.18352640456742336, |
|
"kl": 0.00751495361328125, |
|
"learning_rate": 9.385397672034984e-07, |
|
"loss": 0.0373, |
|
"reward": 0.5863458216190338, |
|
"reward_std": 0.18559462763369083, |
|
"rewards/exp_len_reward": 0.5863458216190338, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 2164.3348693847656, |
|
"epoch": 0.2548148148148148, |
|
"grad_norm": 0.1583111043850771, |
|
"kl": 0.0073394775390625, |
|
"learning_rate": 9.361651435908859e-07, |
|
"loss": 0.0039, |
|
"reward": 0.6185845136642456, |
|
"reward_std": 0.23790935426950455, |
|
"rewards/exp_len_reward": 0.6185845136642456, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 1974.8393859863281, |
|
"epoch": 0.2577777777777778, |
|
"grad_norm": 0.24011600157172625, |
|
"kl": 0.0063018798828125, |
|
"learning_rate": 9.337490070214005e-07, |
|
"loss": 0.0782, |
|
"reward": 0.6514532268047333, |
|
"reward_std": 0.21611288189888, |
|
"rewards/exp_len_reward": 0.6514532268047333, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 2044.7188415527344, |
|
"epoch": 0.2607407407407407, |
|
"grad_norm": 0.16828102239112172, |
|
"kl": 0.0050811767578125, |
|
"learning_rate": 9.312916172309998e-07, |
|
"loss": 0.0209, |
|
"reward": 0.6684047281742096, |
|
"reward_std": 0.26073355227708817, |
|
"rewards/exp_len_reward": 0.6684047281742096, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 2310.620635986328, |
|
"epoch": 0.2637037037037037, |
|
"grad_norm": 0.14135906575873353, |
|
"kl": 0.0082855224609375, |
|
"learning_rate": 9.287932383903842e-07, |
|
"loss": -0.0474, |
|
"reward": 0.580662876367569, |
|
"reward_std": 0.16773580014705658, |
|
"rewards/exp_len_reward": 0.580662876367569, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 2240.1117553710938, |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 0.1981410777840034, |
|
"kl": 0.00653839111328125, |
|
"learning_rate": 9.262541390765981e-07, |
|
"loss": 0.0485, |
|
"reward": 0.5460301488637924, |
|
"reward_std": 0.20029782131314278, |
|
"rewards/exp_len_reward": 0.5460301488637924, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 2113.919708251953, |
|
"epoch": 0.2696296296296296, |
|
"grad_norm": 0.17969252200032312, |
|
"kl": 0.00701141357421875, |
|
"learning_rate": 9.236745922441589e-07, |
|
"loss": 0.0784, |
|
"reward": 0.6915992498397827, |
|
"reward_std": 0.19186005368828773, |
|
"rewards/exp_len_reward": 0.6915992498397827, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 2258.0224609375, |
|
"epoch": 0.2725925925925926, |
|
"grad_norm": 0.19839390162400178, |
|
"kl": 0.00594329833984375, |
|
"learning_rate": 9.210548751957133e-07, |
|
"loss": 0.0849, |
|
"reward": 0.6840342581272125, |
|
"reward_std": 0.24512023478746414, |
|
"rewards/exp_len_reward": 0.6840342581272125, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 2275.4107971191406, |
|
"epoch": 0.27555555555555555, |
|
"grad_norm": 0.25858993059609997, |
|
"kl": 0.0076751708984375, |
|
"learning_rate": 9.183952695522273e-07, |
|
"loss": 0.0955, |
|
"reward": 0.685549259185791, |
|
"reward_std": 0.23211624845862389, |
|
"rewards/exp_len_reward": 0.685549259185791, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 2360.0581665039062, |
|
"epoch": 0.2785185185185185, |
|
"grad_norm": 0.19051489587675371, |
|
"kl": 0.00785064697265625, |
|
"learning_rate": 9.156960612227125e-07, |
|
"loss": 0.0796, |
|
"reward": 0.5155021622776985, |
|
"reward_std": 0.2668054960668087, |
|
"rewards/exp_len_reward": 0.5155021622776985, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 2579.8260192871094, |
|
"epoch": 0.2814814814814815, |
|
"grad_norm": 0.13837089717937884, |
|
"kl": 0.00725555419921875, |
|
"learning_rate": 9.129575403734897e-07, |
|
"loss": 0.0142, |
|
"reward": 0.5060503482818604, |
|
"reward_std": 0.23962176218628883, |
|
"rewards/exp_len_reward": 0.5060503482818604, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 2322.634002685547, |
|
"epoch": 0.28444444444444444, |
|
"grad_norm": 0.19969897102985795, |
|
"kl": 0.00792694091796875, |
|
"learning_rate": 9.101800013969962e-07, |
|
"loss": 0.0381, |
|
"reward": 0.5826811380684376, |
|
"reward_std": 0.21555687859654427, |
|
"rewards/exp_len_reward": 0.5826811380684376, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 2316.5670776367188, |
|
"epoch": 0.2874074074074074, |
|
"grad_norm": 0.16955049457906315, |
|
"kl": 0.00732421875, |
|
"learning_rate": 9.07363742880139e-07, |
|
"loss": -0.0193, |
|
"reward": 0.6406743228435516, |
|
"reward_std": 0.2392715960741043, |
|
"rewards/exp_len_reward": 0.6406743228435516, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 2166.6920471191406, |
|
"epoch": 0.2903703703703704, |
|
"grad_norm": 0.16741896253103236, |
|
"kl": 0.0088043212890625, |
|
"learning_rate": 9.045090675721959e-07, |
|
"loss": 0.0393, |
|
"reward": 0.6196507066488266, |
|
"reward_std": 0.2079339139163494, |
|
"rewards/exp_len_reward": 0.6196507066488266, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 2266.2010192871094, |
|
"epoch": 0.29333333333333333, |
|
"grad_norm": 0.19630807827817112, |
|
"kl": 0.008056640625, |
|
"learning_rate": 9.016162823522701e-07, |
|
"loss": -0.0373, |
|
"reward": 0.6069852858781815, |
|
"reward_std": 0.2058863341808319, |
|
"rewards/exp_len_reward": 0.6069852858781815, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 2042.7992248535156, |
|
"epoch": 0.2962962962962963, |
|
"grad_norm": 0.22170963608651797, |
|
"kl": 0.0109100341796875, |
|
"learning_rate": 8.986856981963004e-07, |
|
"loss": 0.0303, |
|
"reward": 0.6604138612747192, |
|
"reward_std": 0.20395291596651077, |
|
"rewards/exp_len_reward": 0.6604138612747192, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 2354.2366943359375, |
|
"epoch": 0.2992592592592593, |
|
"grad_norm": 0.17072493336044253, |
|
"kl": 0.0085601806640625, |
|
"learning_rate": 8.957176301436312e-07, |
|
"loss": 0.0235, |
|
"reward": 0.5688716098666191, |
|
"reward_std": 0.21174464374780655, |
|
"rewards/exp_len_reward": 0.5688716098666191, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 2221.0625915527344, |
|
"epoch": 0.3022222222222222, |
|
"grad_norm": 0.20959585853001003, |
|
"kl": 0.00896453857421875, |
|
"learning_rate": 8.927123972631457e-07, |
|
"loss": 0.0744, |
|
"reward": 0.6062222719192505, |
|
"reward_std": 0.22734928503632545, |
|
"rewards/exp_len_reward": 0.6062222719192505, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 2169.2947692871094, |
|
"epoch": 0.30518518518518517, |
|
"grad_norm": 0.16321546987755797, |
|
"kl": 0.00870513916015625, |
|
"learning_rate": 8.896703226189656e-07, |
|
"loss": 0.0455, |
|
"reward": 0.6819661110639572, |
|
"reward_std": 0.19446462020277977, |
|
"rewards/exp_len_reward": 0.6819661110639572, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 2295.540252685547, |
|
"epoch": 0.30814814814814817, |
|
"grad_norm": 0.15015615986614694, |
|
"kl": 0.00914764404296875, |
|
"learning_rate": 8.865917332357217e-07, |
|
"loss": 0.0034, |
|
"reward": 0.5188455395400524, |
|
"reward_std": 0.19476320780813694, |
|
"rewards/exp_len_reward": 0.5188455395400524, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 2407.134033203125, |
|
"epoch": 0.3111111111111111, |
|
"grad_norm": 0.16080385779787676, |
|
"kl": 0.008636474609375, |
|
"learning_rate": 8.834769600633986e-07, |
|
"loss": 0.0888, |
|
"reward": 0.5705942884087563, |
|
"reward_std": 0.2538597658276558, |
|
"rewards/exp_len_reward": 0.5705942884087563, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 1935.7322387695312, |
|
"epoch": 0.31407407407407406, |
|
"grad_norm": 0.17954312631984032, |
|
"kl": 0.00833892822265625, |
|
"learning_rate": 8.803263379417572e-07, |
|
"loss": -0.0044, |
|
"reward": 0.5462605357170105, |
|
"reward_std": 0.2077418938279152, |
|
"rewards/exp_len_reward": 0.5462605357170105, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 2878.9197998046875, |
|
"epoch": 0.31703703703703706, |
|
"grad_norm": 0.1997205984738251, |
|
"kl": 0.0114898681640625, |
|
"learning_rate": 8.771402055643391e-07, |
|
"loss": 0.0063, |
|
"reward": 0.5251818224787712, |
|
"reward_std": 0.18359562009572983, |
|
"rewards/exp_len_reward": 0.5251818224787712, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 2381.1295776367188, |
|
"epoch": 0.32, |
|
"grad_norm": 0.15668192052340332, |
|
"kl": 0.00910186767578125, |
|
"learning_rate": 8.73918905442058e-07, |
|
"loss": -0.0058, |
|
"reward": 0.6536043733358383, |
|
"reward_std": 0.1777793299406767, |
|
"rewards/exp_len_reward": 0.6536043733358383, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 2140.8616943359375, |
|
"epoch": 0.32296296296296295, |
|
"grad_norm": 0.1908188055608542, |
|
"kl": 0.0096588134765625, |
|
"learning_rate": 8.706627838663782e-07, |
|
"loss": -0.0087, |
|
"reward": 0.5826982110738754, |
|
"reward_std": 0.23715216293931007, |
|
"rewards/exp_len_reward": 0.5826982110738754, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 2211.8750610351562, |
|
"epoch": 0.32592592592592595, |
|
"grad_norm": 0.17454162730160738, |
|
"kl": 0.0082244873046875, |
|
"learning_rate": 8.673721908720884e-07, |
|
"loss": 0.0936, |
|
"reward": 0.6324276328086853, |
|
"reward_std": 0.19550849869847298, |
|
"rewards/exp_len_reward": 0.6324276328086853, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 1898.7500915527344, |
|
"epoch": 0.3288888888888889, |
|
"grad_norm": 0.19019621715427518, |
|
"kl": 0.00946807861328125, |
|
"learning_rate": 8.640474801996732e-07, |
|
"loss": 0.0601, |
|
"reward": 0.706642210483551, |
|
"reward_std": 0.13929060846567154, |
|
"rewards/exp_len_reward": 0.706642210483551, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 2158.0179443359375, |
|
"epoch": 0.33185185185185184, |
|
"grad_norm": 0.1803886062628602, |
|
"kl": 0.008941650390625, |
|
"learning_rate": 8.606890092572861e-07, |
|
"loss": 0.0214, |
|
"reward": 0.5730146244168282, |
|
"reward_std": 0.28394924849271774, |
|
"rewards/exp_len_reward": 0.5730146244168282, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 1975.2322082519531, |
|
"epoch": 0.3348148148148148, |
|
"grad_norm": 0.26887475456345783, |
|
"kl": 0.012176513671875, |
|
"learning_rate": 8.572971390823266e-07, |
|
"loss": 0.09, |
|
"reward": 0.6306671500205994, |
|
"reward_std": 0.236881572753191, |
|
"rewards/exp_len_reward": 0.6306671500205994, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 2393.74560546875, |
|
"epoch": 0.3377777777777778, |
|
"grad_norm": 0.1680385692386688, |
|
"kl": 0.00844573974609375, |
|
"learning_rate": 8.538722343026302e-07, |
|
"loss": 0.0391, |
|
"reward": 0.403280146420002, |
|
"reward_std": 0.22043467685580254, |
|
"rewards/exp_len_reward": 0.403280146420002, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 1869.9822540283203, |
|
"epoch": 0.34074074074074073, |
|
"grad_norm": 0.2100589530247618, |
|
"kl": 0.00864410400390625, |
|
"learning_rate": 8.50414663097269e-07, |
|
"loss": 0.0856, |
|
"reward": 0.7126432359218597, |
|
"reward_std": 0.2112839464098215, |
|
"rewards/exp_len_reward": 0.7126432359218597, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 2621.9688720703125, |
|
"epoch": 0.3437037037037037, |
|
"grad_norm": 0.1833213887219074, |
|
"kl": 0.0106201171875, |
|
"learning_rate": 8.46924797156974e-07, |
|
"loss": 0.0355, |
|
"reward": 0.4800976812839508, |
|
"reward_std": 0.2597590982913971, |
|
"rewards/exp_len_reward": 0.4800976812839508, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 2430.9598999023438, |
|
"epoch": 0.3466666666666667, |
|
"grad_norm": 0.1777526950884752, |
|
"kl": 0.0114288330078125, |
|
"learning_rate": 8.434030116441765e-07, |
|
"loss": -0.0294, |
|
"reward": 0.45774422585964203, |
|
"reward_std": 0.14171775989234447, |
|
"rewards/exp_len_reward": 0.45774422585964203, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 2442.321533203125, |
|
"epoch": 0.3496296296296296, |
|
"grad_norm": 0.1793512093791192, |
|
"kl": 0.0110931396484375, |
|
"learning_rate": 8.39849685152679e-07, |
|
"loss": 0.0494, |
|
"reward": 0.5156404674053192, |
|
"reward_std": 0.23575026541948318, |
|
"rewards/exp_len_reward": 0.5156404674053192, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 1924.6116638183594, |
|
"epoch": 0.35259259259259257, |
|
"grad_norm": 0.28968627912789613, |
|
"kl": 0.007965087890625, |
|
"learning_rate": 8.36265199666956e-07, |
|
"loss": 0.1686, |
|
"reward": 0.7478295713663101, |
|
"reward_std": 0.1679641492664814, |
|
"rewards/exp_len_reward": 0.7478295713663101, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 1653.5670471191406, |
|
"epoch": 0.35555555555555557, |
|
"grad_norm": 0.21781918537576359, |
|
"kl": 0.00855255126953125, |
|
"learning_rate": 8.326499405210902e-07, |
|
"loss": 0.0401, |
|
"reward": 0.6878243908286095, |
|
"reward_std": 0.19577785581350327, |
|
"rewards/exp_len_reward": 0.6878243908286095, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 2593.0269165039062, |
|
"epoch": 0.3585185185185185, |
|
"grad_norm": 0.15518499757489337, |
|
"kl": 0.0120086669921875, |
|
"learning_rate": 8.290042963573488e-07, |
|
"loss": 0.011, |
|
"reward": 0.5845073834061623, |
|
"reward_std": 0.23032733984291553, |
|
"rewards/exp_len_reward": 0.5845073834061623, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 1934.8036499023438, |
|
"epoch": 0.36148148148148146, |
|
"grad_norm": 0.3199511661475365, |
|
"kl": 0.009521484375, |
|
"learning_rate": 8.25328659084405e-07, |
|
"loss": 0.1144, |
|
"reward": 0.6791598200798035, |
|
"reward_std": 0.17451436072587967, |
|
"rewards/exp_len_reward": 0.6791598200798035, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 2042.8750915527344, |
|
"epoch": 0.36444444444444446, |
|
"grad_norm": 0.1947938475896873, |
|
"kl": 0.0099334716796875, |
|
"learning_rate": 8.216234238352065e-07, |
|
"loss": 0.0709, |
|
"reward": 0.7000842541456223, |
|
"reward_std": 0.25582029670476913, |
|
"rewards/exp_len_reward": 0.7000842541456223, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 1863.2098999023438, |
|
"epoch": 0.3674074074074074, |
|
"grad_norm": 0.2066827286740214, |
|
"kl": 0.0110931396484375, |
|
"learning_rate": 8.178889889244996e-07, |
|
"loss": 0.0224, |
|
"reward": 0.5978061109781265, |
|
"reward_std": 0.17980634421110153, |
|
"rewards/exp_len_reward": 0.5978061109781265, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 1728.6876220703125, |
|
"epoch": 0.37037037037037035, |
|
"grad_norm": 0.1729347976132675, |
|
"kl": 0.0093841552734375, |
|
"learning_rate": 8.141257558060092e-07, |
|
"loss": 0.0214, |
|
"reward": 0.7174255400896072, |
|
"reward_std": 0.1986971478909254, |
|
"rewards/exp_len_reward": 0.7174255400896072, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 2421.3482666015625, |
|
"epoch": 0.37333333333333335, |
|
"grad_norm": 0.1407316534343894, |
|
"kl": 0.0113677978515625, |
|
"learning_rate": 8.103341290292833e-07, |
|
"loss": 0.0084, |
|
"reward": 0.5087610557675362, |
|
"reward_std": 0.2041846662759781, |
|
"rewards/exp_len_reward": 0.5087610557675362, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 1954.4152526855469, |
|
"epoch": 0.3762962962962963, |
|
"grad_norm": 0.19211843182621693, |
|
"kl": 0.0105438232421875, |
|
"learning_rate": 8.065145161962021e-07, |
|
"loss": 0.0738, |
|
"reward": 0.6312553137540817, |
|
"reward_std": 0.14810450747609138, |
|
"rewards/exp_len_reward": 0.6312553137540817, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 2153.8795776367188, |
|
"epoch": 0.37925925925925924, |
|
"grad_norm": 0.170330613639256, |
|
"kl": 0.0113677978515625, |
|
"learning_rate": 8.02667327917163e-07, |
|
"loss": 0.0318, |
|
"reward": 0.6820006817579269, |
|
"reward_std": 0.19498692452907562, |
|
"rewards/exp_len_reward": 0.6820006817579269, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 2104.5179138183594, |
|
"epoch": 0.38222222222222224, |
|
"grad_norm": 0.2463458601299465, |
|
"kl": 0.0134429931640625, |
|
"learning_rate": 7.987929777669372e-07, |
|
"loss": 0.0701, |
|
"reward": 0.6237293034791946, |
|
"reward_std": 0.22337394580245018, |
|
"rewards/exp_len_reward": 0.6237293034791946, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 2244.7634887695312, |
|
"epoch": 0.3851851851851852, |
|
"grad_norm": 0.21232997470712528, |
|
"kl": 0.0127105712890625, |
|
"learning_rate": 7.948918822402123e-07, |
|
"loss": -0.01, |
|
"reward": 0.5622997805476189, |
|
"reward_std": 0.21293797343969345, |
|
"rewards/exp_len_reward": 0.5622997805476189, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 2267.8349609375, |
|
"epoch": 0.38814814814814813, |
|
"grad_norm": 0.18491480276734742, |
|
"kl": 0.014862060546875, |
|
"learning_rate": 7.909644607068174e-07, |
|
"loss": -0.0161, |
|
"reward": 0.5492302775382996, |
|
"reward_std": 0.19176549836993217, |
|
"rewards/exp_len_reward": 0.5492302775382996, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 2307.584930419922, |
|
"epoch": 0.39111111111111113, |
|
"grad_norm": 0.18853458559326913, |
|
"kl": 0.0153656005859375, |
|
"learning_rate": 7.870111353666414e-07, |
|
"loss": 0.0551, |
|
"reward": 0.5810948982834816, |
|
"reward_std": 0.21469852700829506, |
|
"rewards/exp_len_reward": 0.5810948982834816, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 2127.2501525878906, |
|
"epoch": 0.3940740740740741, |
|
"grad_norm": 0.2079768750933827, |
|
"kl": 0.0130462646484375, |
|
"learning_rate": 7.830323312042464e-07, |
|
"loss": 0.0753, |
|
"reward": 0.5947326272726059, |
|
"reward_std": 0.2402301263064146, |
|
"rewards/exp_len_reward": 0.5947326272726059, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 2028.3483276367188, |
|
"epoch": 0.397037037037037, |
|
"grad_norm": 0.21498398205311295, |
|
"kl": 0.01544189453125, |
|
"learning_rate": 7.790284759431809e-07, |
|
"loss": 0.0471, |
|
"reward": 0.6623428612947464, |
|
"reward_std": 0.16073044575750828, |
|
"rewards/exp_len_reward": 0.6623428612947464, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 2291.884002685547, |
|
"epoch": 0.4, |
|
"grad_norm": 0.1715810834745006, |
|
"kl": 0.01556396484375, |
|
"learning_rate": 7.75e-07, |
|
"loss": 0.0255, |
|
"reward": 0.5680599883198738, |
|
"reward_std": 0.2503676153719425, |
|
"rewards/exp_len_reward": 0.5680599883198738, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 2042.2545776367188, |
|
"epoch": 0.40296296296296297, |
|
"grad_norm": 0.2520458310084413, |
|
"kl": 0.012939453125, |
|
"learning_rate": 7.709473364379949e-07, |
|
"loss": 0.0913, |
|
"reward": 0.5720359832048416, |
|
"reward_std": 0.2734212428331375, |
|
"rewards/exp_len_reward": 0.5720359832048416, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 1867.6385192871094, |
|
"epoch": 0.4059259259259259, |
|
"grad_norm": 0.1612993062652069, |
|
"kl": 0.0109100341796875, |
|
"learning_rate": 7.668709209206391e-07, |
|
"loss": 0.0005, |
|
"reward": 0.6729157119989395, |
|
"reward_std": 0.24489626288414001, |
|
"rewards/exp_len_reward": 0.6729157119989395, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 2194.866180419922, |
|
"epoch": 0.4088888888888889, |
|
"grad_norm": 0.1838415012898094, |
|
"kl": 0.01513671875, |
|
"learning_rate": 7.627711916647531e-07, |
|
"loss": 0.0393, |
|
"reward": 0.616047129034996, |
|
"reward_std": 0.241153996437788, |
|
"rewards/exp_len_reward": 0.616047129034996, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 2489.272430419922, |
|
"epoch": 0.41185185185185186, |
|
"grad_norm": 0.15970472042816589, |
|
"kl": 0.016876220703125, |
|
"learning_rate": 7.586485893933972e-07, |
|
"loss": -0.0079, |
|
"reward": 0.6617710441350937, |
|
"reward_std": 0.21483541280031204, |
|
"rewards/exp_len_reward": 0.6617710441350937, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 2220.1385192871094, |
|
"epoch": 0.4148148148148148, |
|
"grad_norm": 0.18462009649943625, |
|
"kl": 0.0157470703125, |
|
"learning_rate": 7.545035572884928e-07, |
|
"loss": 0.0096, |
|
"reward": 0.48423583060503006, |
|
"reward_std": 0.26414601504802704, |
|
"rewards/exp_len_reward": 0.48423583060503006, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 2394.5001220703125, |
|
"epoch": 0.4177777777777778, |
|
"grad_norm": 0.17730147014895184, |
|
"kl": 0.017822265625, |
|
"learning_rate": 7.503365409431801e-07, |
|
"loss": 0.0339, |
|
"reward": 0.6387054920196533, |
|
"reward_std": 0.17836992628872395, |
|
"rewards/exp_len_reward": 0.6387054920196533, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 1699.62060546875, |
|
"epoch": 0.42074074074074075, |
|
"grad_norm": 0.27025204836335937, |
|
"kl": 0.012969970703125, |
|
"learning_rate": 7.46147988313917e-07, |
|
"loss": 0.0775, |
|
"reward": 0.6824662685394287, |
|
"reward_std": 0.27711663395166397, |
|
"rewards/exp_len_reward": 0.6824662685394287, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 2110.464385986328, |
|
"epoch": 0.4237037037037037, |
|
"grad_norm": 0.20686431019260695, |
|
"kl": 0.0187225341796875, |
|
"learning_rate": 7.419383496723229e-07, |
|
"loss": 0.0448, |
|
"reward": 0.6168643683195114, |
|
"reward_std": 0.24179954081773758, |
|
"rewards/exp_len_reward": 0.6168643683195114, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 2662.9599609375, |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 0.18577680951510714, |
|
"kl": 0.021453857421875, |
|
"learning_rate": 7.377080775567751e-07, |
|
"loss": 0.0196, |
|
"reward": 0.4251635745167732, |
|
"reward_std": 0.24519287049770355, |
|
"rewards/exp_len_reward": 0.4251635745167732, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 1741.3394165039062, |
|
"epoch": 0.42962962962962964, |
|
"grad_norm": 0.181100819553374, |
|
"kl": 0.015777587890625, |
|
"learning_rate": 7.334576267237599e-07, |
|
"loss": 0.0253, |
|
"reward": 0.653263047337532, |
|
"reward_std": 0.21376236528158188, |
|
"rewards/exp_len_reward": 0.653263047337532, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 2283.5670776367188, |
|
"epoch": 0.4325925925925926, |
|
"grad_norm": 0.20288310670933243, |
|
"kl": 0.017974853515625, |
|
"learning_rate": 7.291874540989869e-07, |
|
"loss": 0.063, |
|
"reward": 0.6847885400056839, |
|
"reward_std": 0.22203149646520615, |
|
"rewards/exp_len_reward": 0.6847885400056839, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 2459.8483276367188, |
|
"epoch": 0.43555555555555553, |
|
"grad_norm": 0.24617751418167974, |
|
"kl": 0.023956298828125, |
|
"learning_rate": 7.248980187282679e-07, |
|
"loss": 0.0514, |
|
"reward": 0.5492689982056618, |
|
"reward_std": 0.29514792189002037, |
|
"rewards/exp_len_reward": 0.5492689982056618, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 2116.6295776367188, |
|
"epoch": 0.43851851851851853, |
|
"grad_norm": 0.20910355061187821, |
|
"kl": 0.018707275390625, |
|
"learning_rate": 7.205897817281707e-07, |
|
"loss": -0.0376, |
|
"reward": 0.562318354845047, |
|
"reward_std": 0.20038180239498615, |
|
"rewards/exp_len_reward": 0.562318354845047, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 1637.2054138183594, |
|
"epoch": 0.4414814814814815, |
|
"grad_norm": 0.3052837937915986, |
|
"kl": 0.0164947509765625, |
|
"learning_rate": 7.162632062364482e-07, |
|
"loss": 0.0718, |
|
"reward": 0.6785788387060165, |
|
"reward_std": 0.28277434036135674, |
|
"rewards/exp_len_reward": 0.6785788387060165, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 2099.5491943359375, |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 0.21016234434727396, |
|
"kl": 0.02056884765625, |
|
"learning_rate": 7.119187573622503e-07, |
|
"loss": 0.0004, |
|
"reward": 0.5978891626000404, |
|
"reward_std": 0.21479224599897861, |
|
"rewards/exp_len_reward": 0.5978891626000404, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 1984.9420776367188, |
|
"epoch": 0.4474074074074074, |
|
"grad_norm": 0.17085682034823446, |
|
"kl": 0.017852783203125, |
|
"learning_rate": 7.075569021361258e-07, |
|
"loss": 0.0227, |
|
"reward": 0.5734822899103165, |
|
"reward_std": 0.24261553958058357, |
|
"rewards/exp_len_reward": 0.5734822899103165, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 2491.6206970214844, |
|
"epoch": 0.45037037037037037, |
|
"grad_norm": 0.20118499173178536, |
|
"kl": 0.0238037109375, |
|
"learning_rate": 7.031781094598147e-07, |
|
"loss": 0.0491, |
|
"reward": 0.4708263725042343, |
|
"reward_std": 0.288173146545887, |
|
"rewards/exp_len_reward": 0.4708263725042343, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 2146.6563415527344, |
|
"epoch": 0.4533333333333333, |
|
"grad_norm": 0.20235562160674747, |
|
"kl": 0.0205841064453125, |
|
"learning_rate": 6.987828500558422e-07, |
|
"loss": 0.0515, |
|
"reward": 0.615074560046196, |
|
"reward_std": 0.2827143333852291, |
|
"rewards/exp_len_reward": 0.615074560046196, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 1729.3304443359375, |
|
"epoch": 0.4562962962962963, |
|
"grad_norm": 0.19165529870144216, |
|
"kl": 0.016082763671875, |
|
"learning_rate": 6.943715964169153e-07, |
|
"loss": -0.0319, |
|
"reward": 0.59318608045578, |
|
"reward_std": 0.21333957836031914, |
|
"rewards/exp_len_reward": 0.59318608045578, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 2026.9911499023438, |
|
"epoch": 0.45925925925925926, |
|
"grad_norm": 0.26603840660660094, |
|
"kl": 0.017608642578125, |
|
"learning_rate": 6.899448227551302e-07, |
|
"loss": 0.1068, |
|
"reward": 0.6835269778966904, |
|
"reward_std": 0.22044039890170097, |
|
"rewards/exp_len_reward": 0.6835269778966904, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 2220.4911499023438, |
|
"epoch": 0.4622222222222222, |
|
"grad_norm": 0.18158002675146767, |
|
"kl": 0.01971435546875, |
|
"learning_rate": 6.85503004950993e-07, |
|
"loss": 0.0614, |
|
"reward": 0.5589229390025139, |
|
"reward_std": 0.21225098706781864, |
|
"rewards/exp_len_reward": 0.5589229390025139, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 1817.1831359863281, |
|
"epoch": 0.4651851851851852, |
|
"grad_norm": 0.24149007108576753, |
|
"kl": 0.017913818359375, |
|
"learning_rate": 6.810466205022635e-07, |
|
"loss": 0.0515, |
|
"reward": 0.539856381714344, |
|
"reward_std": 0.26797987148165703, |
|
"rewards/exp_len_reward": 0.539856381714344, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 2117.3929443359375, |
|
"epoch": 0.46814814814814815, |
|
"grad_norm": 0.25938118733261895, |
|
"kl": 0.02740478515625, |
|
"learning_rate": 6.765761484726232e-07, |
|
"loss": 0.0564, |
|
"reward": 0.5961438491940498, |
|
"reward_std": 0.24437472596764565, |
|
"rewards/exp_len_reward": 0.5961438491940498, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 2225.0179443359375, |
|
"epoch": 0.4711111111111111, |
|
"grad_norm": 0.1700079832657491, |
|
"kl": 0.0214080810546875, |
|
"learning_rate": 6.720920694401765e-07, |
|
"loss": 0.0528, |
|
"reward": 0.5992478281259537, |
|
"reward_std": 0.28642022609710693, |
|
"rewards/exp_len_reward": 0.5992478281259537, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 1754.8750915527344, |
|
"epoch": 0.4740740740740741, |
|
"grad_norm": 0.17436341962581006, |
|
"kl": 0.020294189453125, |
|
"learning_rate": 6.675948654457873e-07, |
|
"loss": 0.0133, |
|
"reward": 0.5765155255794525, |
|
"reward_std": 0.18946415930986404, |
|
"rewards/exp_len_reward": 0.5765155255794525, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 1651.9732971191406, |
|
"epoch": 0.47703703703703704, |
|
"grad_norm": 0.27021245157611135, |
|
"kl": 0.01971435546875, |
|
"learning_rate": 6.6308501994126e-07, |
|
"loss": 0.0449, |
|
"reward": 0.6996115148067474, |
|
"reward_std": 0.19840912148356438, |
|
"rewards/exp_len_reward": 0.6996115148067474, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 1784.4465026855469, |
|
"epoch": 0.48, |
|
"grad_norm": 0.18201737347411745, |
|
"kl": 0.0213623046875, |
|
"learning_rate": 6.585630177373679e-07, |
|
"loss": 0.0101, |
|
"reward": 0.6633335798978806, |
|
"reward_std": 0.28136105462908745, |
|
"rewards/exp_len_reward": 0.6633335798978806, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 2037.9107971191406, |
|
"epoch": 0.482962962962963, |
|
"grad_norm": 0.18837562477978834, |
|
"kl": 0.026611328125, |
|
"learning_rate": 6.540293449517364e-07, |
|
"loss": -0.008, |
|
"reward": 0.5584470629692078, |
|
"reward_std": 0.2016864065080881, |
|
"rewards/exp_len_reward": 0.5584470629692078, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 1733.7366943359375, |
|
"epoch": 0.48592592592592593, |
|
"grad_norm": 0.2840070123751475, |
|
"kl": 0.0224151611328125, |
|
"learning_rate": 6.494844889565838e-07, |
|
"loss": 0.0569, |
|
"reward": 0.6604474782943726, |
|
"reward_std": 0.25152015686035156, |
|
"rewards/exp_len_reward": 0.6604474782943726, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 2190.232208251953, |
|
"epoch": 0.4888888888888889, |
|
"grad_norm": 0.2130269522908582, |
|
"kl": 0.03131103515625, |
|
"learning_rate": 6.449289383263299e-07, |
|
"loss": 0.0263, |
|
"reward": 0.587119996547699, |
|
"reward_std": 0.18609843030571938, |
|
"rewards/exp_len_reward": 0.587119996547699, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 1967.9108276367188, |
|
"epoch": 0.4918518518518519, |
|
"grad_norm": 0.23106631062493835, |
|
"kl": 0.026458740234375, |
|
"learning_rate": 6.403631827850733e-07, |
|
"loss": 0.0458, |
|
"reward": 0.6920860558748245, |
|
"reward_std": 0.2177874594926834, |
|
"rewards/exp_len_reward": 0.6920860558748245, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 1848.1875915527344, |
|
"epoch": 0.4948148148148148, |
|
"grad_norm": 0.2557642214220052, |
|
"kl": 0.025726318359375, |
|
"learning_rate": 6.357877131539459e-07, |
|
"loss": 0.0119, |
|
"reward": 0.5580969974398613, |
|
"reward_std": 0.2476295307278633, |
|
"rewards/exp_len_reward": 0.5580969974398613, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 1784.1741333007812, |
|
"epoch": 0.49777777777777776, |
|
"grad_norm": 0.2020034406354174, |
|
"kl": 0.02728271484375, |
|
"learning_rate": 6.312030212983492e-07, |
|
"loss": 0.009, |
|
"reward": 0.6462460905313492, |
|
"reward_std": 0.27480896189808846, |
|
"rewards/exp_len_reward": 0.6462460905313492, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 1966.8393249511719, |
|
"epoch": 0.5007407407407407, |
|
"grad_norm": 0.20624757112600367, |
|
"kl": 0.030059814453125, |
|
"learning_rate": 6.266096000750794e-07, |
|
"loss": 0.0173, |
|
"reward": 0.6448132321238518, |
|
"reward_std": 0.2088510636240244, |
|
"rewards/exp_len_reward": 0.6448132321238518, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 1901.9420166015625, |
|
"epoch": 0.5037037037037037, |
|
"grad_norm": 0.2570931569292836, |
|
"kl": 0.028076171875, |
|
"learning_rate": 6.220079432793434e-07, |
|
"loss": 0.0486, |
|
"reward": 0.5949899107217789, |
|
"reward_std": 0.22542590275406837, |
|
"rewards/exp_len_reward": 0.5949899107217789, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 2003.0358276367188, |
|
"epoch": 0.5066666666666667, |
|
"grad_norm": 0.20834827415235962, |
|
"kl": 0.035552978515625, |
|
"learning_rate": 6.173985455916767e-07, |
|
"loss": 0.0339, |
|
"reward": 0.5371049828827381, |
|
"reward_std": 0.2185331992805004, |
|
"rewards/exp_len_reward": 0.5371049828827381, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 2454.7857971191406, |
|
"epoch": 0.5096296296296297, |
|
"grad_norm": 0.19172881294229105, |
|
"kl": 0.042144775390625, |
|
"learning_rate": 6.127819025247654e-07, |
|
"loss": 0.0363, |
|
"reward": 0.5926978290081024, |
|
"reward_std": 0.2542005889117718, |
|
"rewards/exp_len_reward": 0.5926978290081024, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 1779.415283203125, |
|
"epoch": 0.5125925925925926, |
|
"grad_norm": 0.2287442270331349, |
|
"kl": 0.03466796875, |
|
"learning_rate": 6.081585103701769e-07, |
|
"loss": 0.0649, |
|
"reward": 0.7005281001329422, |
|
"reward_std": 0.20357034727931023, |
|
"rewards/exp_len_reward": 0.7005281001329422, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 2120.3482666015625, |
|
"epoch": 0.5155555555555555, |
|
"grad_norm": 0.287867669455274, |
|
"kl": 0.039642333984375, |
|
"learning_rate": 6.0352886614501e-07, |
|
"loss": 0.0649, |
|
"reward": 0.6480746418237686, |
|
"reward_std": 0.23573359474539757, |
|
"rewards/exp_len_reward": 0.6480746418237686, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 1984.0626220703125, |
|
"epoch": 0.5185185185185185, |
|
"grad_norm": 0.22117625934907972, |
|
"kl": 0.038360595703125, |
|
"learning_rate": 5.988934675384635e-07, |
|
"loss": 0.0294, |
|
"reward": 0.6022319048643112, |
|
"reward_std": 0.24885358661413193, |
|
"rewards/exp_len_reward": 0.6022319048643112, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 1547.2277526855469, |
|
"epoch": 0.5214814814814814, |
|
"grad_norm": 0.23764266854092705, |
|
"kl": 0.031890869140625, |
|
"learning_rate": 5.942528128583356e-07, |
|
"loss": -0.0127, |
|
"reward": 0.581740252673626, |
|
"reward_std": 0.2612038552761078, |
|
"rewards/exp_len_reward": 0.581740252673626, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 1882.9866943359375, |
|
"epoch": 0.5244444444444445, |
|
"grad_norm": 0.257891426495327, |
|
"kl": 0.036163330078125, |
|
"learning_rate": 5.896074009774554e-07, |
|
"loss": 0.08, |
|
"reward": 0.6895754784345627, |
|
"reward_std": 0.18230854347348213, |
|
"rewards/exp_len_reward": 0.6895754784345627, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 1950.5625610351562, |
|
"epoch": 0.5274074074074074, |
|
"grad_norm": 0.23376992182966935, |
|
"kl": 0.033843994140625, |
|
"learning_rate": 5.849577312800529e-07, |
|
"loss": 0.0192, |
|
"reward": 0.6687831208109856, |
|
"reward_std": 0.2672557160258293, |
|
"rewards/exp_len_reward": 0.6687831208109856, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 1712.3616638183594, |
|
"epoch": 0.5303703703703704, |
|
"grad_norm": 0.22585330405547638, |
|
"kl": 0.036285400390625, |
|
"learning_rate": 5.803043036080764e-07, |
|
"loss": 0.0197, |
|
"reward": 0.6192082017660141, |
|
"reward_std": 0.2819724902510643, |
|
"rewards/exp_len_reward": 0.6192082017660141, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 1884.2367248535156, |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.41283804620716, |
|
"kl": 0.044189453125, |
|
"learning_rate": 5.756476182074582e-07, |
|
"loss": 0.0724, |
|
"reward": 0.6782168745994568, |
|
"reward_std": 0.1702322345227003, |
|
"rewards/exp_len_reward": 0.6782168745994568, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 2027.8750915527344, |
|
"epoch": 0.5362962962962963, |
|
"grad_norm": 0.2937087831499392, |
|
"kl": 0.05908203125, |
|
"learning_rate": 5.709881756743379e-07, |
|
"loss": 0.0467, |
|
"reward": 0.5191835761070251, |
|
"reward_std": 0.19608749821782112, |
|
"rewards/exp_len_reward": 0.5191835761070251, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 1563.6473541259766, |
|
"epoch": 0.5392592592592592, |
|
"grad_norm": 0.3970129789902169, |
|
"kl": 0.04388427734375, |
|
"learning_rate": 5.663264769012486e-07, |
|
"loss": 0.0596, |
|
"reward": 0.7023471593856812, |
|
"reward_std": 0.20404362678527832, |
|
"rewards/exp_len_reward": 0.7023471593856812, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 1748.5402526855469, |
|
"epoch": 0.5422222222222223, |
|
"grad_norm": 0.3017203217726933, |
|
"kl": 0.05078125, |
|
"learning_rate": 5.616630230232704e-07, |
|
"loss": 0.0113, |
|
"reward": 0.5868410617113113, |
|
"reward_std": 0.2523919604718685, |
|
"rewards/exp_len_reward": 0.5868410617113113, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 2259.8885192871094, |
|
"epoch": 0.5451851851851852, |
|
"grad_norm": 2.1483873089235024, |
|
"kl": 0.09649658203125, |
|
"learning_rate": 5.569983153641579e-07, |
|
"loss": 0.0481, |
|
"reward": 0.5765419751405716, |
|
"reward_std": 0.23718996345996857, |
|
"rewards/exp_len_reward": 0.5765419751405716, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 1957.0090026855469, |
|
"epoch": 0.5481481481481482, |
|
"grad_norm": 0.2960652403592979, |
|
"kl": 0.0684814453125, |
|
"learning_rate": 5.523328553824479e-07, |
|
"loss": 0.0223, |
|
"reward": 0.6493587493896484, |
|
"reward_std": 0.19187942519783974, |
|
"rewards/exp_len_reward": 0.6493587493896484, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 2201.696502685547, |
|
"epoch": 0.5511111111111111, |
|
"grad_norm": 0.3800732482018625, |
|
"kl": 0.0880126953125, |
|
"learning_rate": 5.476671446175522e-07, |
|
"loss": 0.0025, |
|
"reward": 0.6268903613090515, |
|
"reward_std": 0.18401159532368183, |
|
"rewards/exp_len_reward": 0.6268903613090515, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 1674.7188720703125, |
|
"epoch": 0.554074074074074, |
|
"grad_norm": 0.6923481954334814, |
|
"kl": 0.056396484375, |
|
"learning_rate": 5.43001684635842e-07, |
|
"loss": 0.0794, |
|
"reward": 0.7184347957372665, |
|
"reward_std": 0.18241577968001366, |
|
"rewards/exp_len_reward": 0.7184347957372665, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 1843.6741638183594, |
|
"epoch": 0.557037037037037, |
|
"grad_norm": 0.6224586398259572, |
|
"kl": 0.07568359375, |
|
"learning_rate": 5.383369769767296e-07, |
|
"loss": 0.0439, |
|
"reward": 0.6129633188247681, |
|
"reward_std": 0.25642842054367065, |
|
"rewards/exp_len_reward": 0.6129633188247681, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 2510.1161499023438, |
|
"epoch": 0.56, |
|
"grad_norm": 0.4420205774969637, |
|
"kl": 0.1275634765625, |
|
"learning_rate": 5.336735230987514e-07, |
|
"loss": 0.0268, |
|
"reward": 0.4878092482686043, |
|
"reward_std": 0.20435325056314468, |
|
"rewards/exp_len_reward": 0.4878092482686043, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 2056.4420471191406, |
|
"epoch": 0.562962962962963, |
|
"grad_norm": 0.6305416339191662, |
|
"kl": 0.1219482421875, |
|
"learning_rate": 5.290118243256622e-07, |
|
"loss": -0.0364, |
|
"reward": 0.4746796190738678, |
|
"reward_std": 0.21284806914627552, |
|
"rewards/exp_len_reward": 0.4746796190738678, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 2051.919708251953, |
|
"epoch": 0.5659259259259259, |
|
"grad_norm": 0.4598638399702875, |
|
"kl": 0.130859375, |
|
"learning_rate": 5.243523817925418e-07, |
|
"loss": 0.0496, |
|
"reward": 0.6630957126617432, |
|
"reward_std": 0.17974085174500942, |
|
"rewards/exp_len_reward": 0.6630957126617432, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 2432.370635986328, |
|
"epoch": 0.5688888888888889, |
|
"grad_norm": 1.3858309739065304, |
|
"kl": 0.266845703125, |
|
"learning_rate": 5.196956963919237e-07, |
|
"loss": 0.1059, |
|
"reward": 0.5846623033285141, |
|
"reward_std": 0.2576068378984928, |
|
"rewards/exp_len_reward": 0.5846623033285141, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 2348.6563415527344, |
|
"epoch": 0.5718518518518518, |
|
"grad_norm": 0.446976505403873, |
|
"kl": 0.21044921875, |
|
"learning_rate": 5.150422687199471e-07, |
|
"loss": 0.0607, |
|
"reward": 0.6032482236623764, |
|
"reward_std": 0.25601741299033165, |
|
"rewards/exp_len_reward": 0.6032482236623764, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 1901.40185546875, |
|
"epoch": 0.5748148148148148, |
|
"grad_norm": 0.5635960519086024, |
|
"kl": 0.1746826171875, |
|
"learning_rate": 5.103925990225448e-07, |
|
"loss": 0.0442, |
|
"reward": 0.5330104827880859, |
|
"reward_std": 0.19548507407307625, |
|
"rewards/exp_len_reward": 0.5330104827880859, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 1449.602767944336, |
|
"epoch": 0.5777777777777777, |
|
"grad_norm": 0.4543937743090975, |
|
"kl": 0.107879638671875, |
|
"learning_rate": 5.057471871416644e-07, |
|
"loss": 0.0058, |
|
"reward": 0.6681396141648293, |
|
"reward_std": 0.23483269661664963, |
|
"rewards/exp_len_reward": 0.6681396141648293, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 2431.8617248535156, |
|
"epoch": 0.5807407407407408, |
|
"grad_norm": 1.1028267878445535, |
|
"kl": 0.212646484375, |
|
"learning_rate": 5.011065324615364e-07, |
|
"loss": -0.0174, |
|
"reward": 0.571851409971714, |
|
"reward_std": 0.26459021866321564, |
|
"rewards/exp_len_reward": 0.571851409971714, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 1839.2723693847656, |
|
"epoch": 0.5837037037037037, |
|
"grad_norm": 0.5633254396853296, |
|
"kl": 0.1365966796875, |
|
"learning_rate": 4.964711338549901e-07, |
|
"loss": 0.0059, |
|
"reward": 0.6712978407740593, |
|
"reward_std": 0.16716519370675087, |
|
"rewards/exp_len_reward": 0.6712978407740593, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 2141.5313415527344, |
|
"epoch": 0.5866666666666667, |
|
"grad_norm": 0.43139745186321504, |
|
"kl": 0.117919921875, |
|
"learning_rate": 4.918414896298229e-07, |
|
"loss": 0.0106, |
|
"reward": 0.555988572537899, |
|
"reward_std": 0.21953130513429642, |
|
"rewards/exp_len_reward": 0.555988572537899, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 1696.3572387695312, |
|
"epoch": 0.5896296296296296, |
|
"grad_norm": 0.44555781304916003, |
|
"kl": 0.08502197265625, |
|
"learning_rate": 4.872180974752347e-07, |
|
"loss": 0.011, |
|
"reward": 0.6980317980051041, |
|
"reward_std": 0.1718193106353283, |
|
"rewards/exp_len_reward": 0.6980317980051041, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 1863.7188415527344, |
|
"epoch": 0.5925925925925926, |
|
"grad_norm": 0.7676065950646701, |
|
"kl": 0.0809326171875, |
|
"learning_rate": 4.826014544083234e-07, |
|
"loss": 0.0557, |
|
"reward": 0.6198651492595673, |
|
"reward_std": 0.21527405828237534, |
|
"rewards/exp_len_reward": 0.6198651492595673, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 1915.4956359863281, |
|
"epoch": 0.5955555555555555, |
|
"grad_norm": 0.8609900913318874, |
|
"kl": 0.09375, |
|
"learning_rate": 4.779920567206568e-07, |
|
"loss": 0.0506, |
|
"reward": 0.635828509926796, |
|
"reward_std": 0.21409705840051174, |
|
"rewards/exp_len_reward": 0.635828509926796, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 1836.6920776367188, |
|
"epoch": 0.5985185185185186, |
|
"grad_norm": 0.8749272432326957, |
|
"kl": 0.1002197265625, |
|
"learning_rate": 4.733903999249206e-07, |
|
"loss": 0.0628, |
|
"reward": 0.5262488052248955, |
|
"reward_std": 0.21715521067380905, |
|
"rewards/exp_len_reward": 0.5262488052248955, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 1819.7635192871094, |
|
"epoch": 0.6014814814814815, |
|
"grad_norm": 0.7926613477880593, |
|
"kl": 0.131591796875, |
|
"learning_rate": 4.687969787016507e-07, |
|
"loss": 0.0605, |
|
"reward": 0.5331440269947052, |
|
"reward_std": 0.19602959603071213, |
|
"rewards/exp_len_reward": 0.5331440269947052, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 2419.7991943359375, |
|
"epoch": 0.6044444444444445, |
|
"grad_norm": 0.9961905231609627, |
|
"kl": 0.25341796875, |
|
"learning_rate": 4.642122868460542e-07, |
|
"loss": 0.0808, |
|
"reward": 0.6300962418317795, |
|
"reward_std": 0.24434982240200043, |
|
"rewards/exp_len_reward": 0.6300962418317795, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 2225.384002685547, |
|
"epoch": 0.6074074074074074, |
|
"grad_norm": 0.8274200921128466, |
|
"kl": 0.35498046875, |
|
"learning_rate": 4.596368172149268e-07, |
|
"loss": 0.0202, |
|
"reward": 0.5711120814085007, |
|
"reward_std": 0.2728967033326626, |
|
"rewards/exp_len_reward": 0.5711120814085007, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 1870.5804748535156, |
|
"epoch": 0.6103703703703703, |
|
"grad_norm": 0.6794454475943669, |
|
"kl": 0.325439453125, |
|
"learning_rate": 4.550710616736702e-07, |
|
"loss": 0.041, |
|
"reward": 0.6079469621181488, |
|
"reward_std": 0.26122210919857025, |
|
"rewards/exp_len_reward": 0.6079469621181488, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 1911.4643859863281, |
|
"epoch": 0.6133333333333333, |
|
"grad_norm": 1.1683585185836836, |
|
"kl": 0.41748046875, |
|
"learning_rate": 4.505155110434162e-07, |
|
"loss": 0.085, |
|
"reward": 0.5756408721208572, |
|
"reward_std": 0.2867981418967247, |
|
"rewards/exp_len_reward": 0.5756408721208572, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 2093.513458251953, |
|
"epoch": 0.6162962962962963, |
|
"grad_norm": 0.6479840159540543, |
|
"kl": 0.46435546875, |
|
"learning_rate": 4.459706550482638e-07, |
|
"loss": 0.1024, |
|
"reward": 0.6004444509744644, |
|
"reward_std": 0.26299645751714706, |
|
"rewards/exp_len_reward": 0.6004444509744644, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 2323.15185546875, |
|
"epoch": 0.6192592592592593, |
|
"grad_norm": 1.3601936776142103, |
|
"kl": 0.5498046875, |
|
"learning_rate": 4.4143698226263207e-07, |
|
"loss": 0.0791, |
|
"reward": 0.5422740504145622, |
|
"reward_std": 0.3434370458126068, |
|
"rewards/exp_len_reward": 0.5422740504145622, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 1607.0223999023438, |
|
"epoch": 0.6222222222222222, |
|
"grad_norm": 1.293326584722951, |
|
"kl": 0.20556640625, |
|
"learning_rate": 4.3691498005874007e-07, |
|
"loss": 0.0862, |
|
"reward": 0.6710554957389832, |
|
"reward_std": 0.21959074586629868, |
|
"rewards/exp_len_reward": 0.6710554957389832, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 1978.8706665039062, |
|
"epoch": 0.6251851851851852, |
|
"grad_norm": 0.9582295626587259, |
|
"kl": 0.349609375, |
|
"learning_rate": 4.324051345542128e-07, |
|
"loss": 0.0559, |
|
"reward": 0.5671351253986359, |
|
"reward_std": 0.24335385113954544, |
|
"rewards/exp_len_reward": 0.5671351253986359, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 2163.9420166015625, |
|
"epoch": 0.6281481481481481, |
|
"grad_norm": 0.5722113261698506, |
|
"kl": 0.343017578125, |
|
"learning_rate": 4.2790793055982354e-07, |
|
"loss": 0.0844, |
|
"reward": 0.5790135860443115, |
|
"reward_std": 0.21604818850755692, |
|
"rewards/exp_len_reward": 0.5790135860443115, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 1913.8036499023438, |
|
"epoch": 0.6311111111111111, |
|
"grad_norm": 0.5324405845923617, |
|
"kl": 0.32373046875, |
|
"learning_rate": 4.234238515273768e-07, |
|
"loss": 0.0253, |
|
"reward": 0.648313857614994, |
|
"reward_std": 0.17545541189610958, |
|
"rewards/exp_len_reward": 0.648313857614994, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 2254.8616943359375, |
|
"epoch": 0.6340740740740741, |
|
"grad_norm": 1.4273403179290647, |
|
"kl": 0.43359375, |
|
"learning_rate": 4.189533794977367e-07, |
|
"loss": 0.1264, |
|
"reward": 0.5515103414654732, |
|
"reward_std": 0.2388550043106079, |
|
"rewards/exp_len_reward": 0.5515103414654732, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 2051.8617553710938, |
|
"epoch": 0.6370370370370371, |
|
"grad_norm": 1.1393120895451125, |
|
"kl": 0.42724609375, |
|
"learning_rate": 4.14496995049007e-07, |
|
"loss": 0.0296, |
|
"reward": 0.5932003408670425, |
|
"reward_std": 0.15118649788200855, |
|
"rewards/exp_len_reward": 0.5932003408670425, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 2098.0491943359375, |
|
"epoch": 0.64, |
|
"grad_norm": 1.6151914612075735, |
|
"kl": 0.38916015625, |
|
"learning_rate": 4.100551772448697e-07, |
|
"loss": 0.0656, |
|
"reward": 0.568665586411953, |
|
"reward_std": 0.22934136912226677, |
|
"rewards/exp_len_reward": 0.568665586411953, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 2233.6652221679688, |
|
"epoch": 0.642962962962963, |
|
"grad_norm": 1.394565883893917, |
|
"kl": 0.591796875, |
|
"learning_rate": 4.056284035830846e-07, |
|
"loss": 0.0706, |
|
"reward": 0.48754215240478516, |
|
"reward_std": 0.2107255533337593, |
|
"rewards/exp_len_reward": 0.48754215240478516, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 2153.3750915527344, |
|
"epoch": 0.6459259259259259, |
|
"grad_norm": 0.7394942057986161, |
|
"kl": 0.56396484375, |
|
"learning_rate": 4.012171499441578e-07, |
|
"loss": 0.067, |
|
"reward": 0.5630225837230682, |
|
"reward_std": 0.2632727436721325, |
|
"rewards/exp_len_reward": 0.5630225837230682, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 2178.5223693847656, |
|
"epoch": 0.6488888888888888, |
|
"grad_norm": 0.8833824117642055, |
|
"kl": 0.521484375, |
|
"learning_rate": 3.968218905401853e-07, |
|
"loss": 0.0932, |
|
"reward": 0.6287193298339844, |
|
"reward_std": 0.1800774559378624, |
|
"rewards/exp_len_reward": 0.6287193298339844, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 2182.4912109375, |
|
"epoch": 0.6518518518518519, |
|
"grad_norm": 1.3858833504387356, |
|
"kl": 0.4931640625, |
|
"learning_rate": 3.924430978638742e-07, |
|
"loss": 0.0321, |
|
"reward": 0.5301230028271675, |
|
"reward_std": 0.20102717354893684, |
|
"rewards/exp_len_reward": 0.5301230028271675, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 2076.0536193847656, |
|
"epoch": 0.6548148148148148, |
|
"grad_norm": 0.7482996746885904, |
|
"kl": 0.423095703125, |
|
"learning_rate": 3.8808124263774955e-07, |
|
"loss": 0.0427, |
|
"reward": 0.49979688227176666, |
|
"reward_std": 0.29293133690953255, |
|
"rewards/exp_len_reward": 0.49979688227176666, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 2390.9510192871094, |
|
"epoch": 0.6577777777777778, |
|
"grad_norm": 1.1580322595016712, |
|
"kl": 0.43798828125, |
|
"learning_rate": 3.8373679376355195e-07, |
|
"loss": 0.0593, |
|
"reward": 0.49847787618637085, |
|
"reward_std": 0.21575787663459778, |
|
"rewards/exp_len_reward": 0.49847787618637085, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 2259.2635192871094, |
|
"epoch": 0.6607407407407407, |
|
"grad_norm": 1.5796351783929345, |
|
"kl": 0.482421875, |
|
"learning_rate": 3.794102182718294e-07, |
|
"loss": 0.1097, |
|
"reward": 0.5217412784695625, |
|
"reward_std": 0.26718301698565483, |
|
"rewards/exp_len_reward": 0.5217412784695625, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 2008.919677734375, |
|
"epoch": 0.6637037037037037, |
|
"grad_norm": 1.528742381770616, |
|
"kl": 0.42919921875, |
|
"learning_rate": 3.751019812717322e-07, |
|
"loss": 0.0972, |
|
"reward": 0.5147194415330887, |
|
"reward_std": 0.25649960711598396, |
|
"rewards/exp_len_reward": 0.5147194415330887, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 1913.9598999023438, |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 1.0049224868904445, |
|
"kl": 0.462890625, |
|
"learning_rate": 3.708125459010134e-07, |
|
"loss": 0.0227, |
|
"reward": 0.5306781381368637, |
|
"reward_std": 0.22224940732121468, |
|
"rewards/exp_len_reward": 0.5306781381368637, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 2082.8170776367188, |
|
"epoch": 0.6696296296296296, |
|
"grad_norm": 1.184822942762592, |
|
"kl": 0.59326171875, |
|
"learning_rate": 3.6654237327624003e-07, |
|
"loss": 0.0692, |
|
"reward": 0.5259907096624374, |
|
"reward_std": 0.1462160311639309, |
|
"rewards/exp_len_reward": 0.5259907096624374, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 2022.6920776367188, |
|
"epoch": 0.6725925925925926, |
|
"grad_norm": 1.5849715003252363, |
|
"kl": 0.4501953125, |
|
"learning_rate": 3.622919224432248e-07, |
|
"loss": 0.0682, |
|
"reward": 0.563248299062252, |
|
"reward_std": 0.19807949475944042, |
|
"rewards/exp_len_reward": 0.563248299062252, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 1900.1384582519531, |
|
"epoch": 0.6755555555555556, |
|
"grad_norm": 1.1910228260375635, |
|
"kl": 0.4029541015625, |
|
"learning_rate": 3.580616503276772e-07, |
|
"loss": 0.045, |
|
"reward": 0.5678588896989822, |
|
"reward_std": 0.24164490401744843, |
|
"rewards/exp_len_reward": 0.5678588896989822, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 2144.4822998046875, |
|
"epoch": 0.6785185185185185, |
|
"grad_norm": 0.6989125543878815, |
|
"kl": 0.500244140625, |
|
"learning_rate": 3.5385201168608303e-07, |
|
"loss": 0.0246, |
|
"reward": 0.5909338667988777, |
|
"reward_std": 0.19285215064883232, |
|
"rewards/exp_len_reward": 0.5909338667988777, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 2379.0760192871094, |
|
"epoch": 0.6814814814814815, |
|
"grad_norm": 1.3916488255594746, |
|
"kl": 0.61767578125, |
|
"learning_rate": 3.4966345905681984e-07, |
|
"loss": 0.0633, |
|
"reward": 0.5990354269742966, |
|
"reward_std": 0.23781514167785645, |
|
"rewards/exp_len_reward": 0.5990354269742966, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 1947.0536804199219, |
|
"epoch": 0.6844444444444444, |
|
"grad_norm": 1.1973429701947933, |
|
"kl": 0.43896484375, |
|
"learning_rate": 3.4549644271150723e-07, |
|
"loss": 0.0619, |
|
"reward": 0.6357074603438377, |
|
"reward_std": 0.186561593785882, |
|
"rewards/exp_len_reward": 0.6357074603438377, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 1959.9375915527344, |
|
"epoch": 0.6874074074074074, |
|
"grad_norm": 1.0316979947324525, |
|
"kl": 0.5341796875, |
|
"learning_rate": 3.413514106066026e-07, |
|
"loss": 0.055, |
|
"reward": 0.593490794301033, |
|
"reward_std": 0.22011198103427887, |
|
"rewards/exp_len_reward": 0.593490794301033, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 2068.4956665039062, |
|
"epoch": 0.6903703703703704, |
|
"grad_norm": 1.0526407604676598, |
|
"kl": 0.68310546875, |
|
"learning_rate": 3.3722880833524704e-07, |
|
"loss": 0.0815, |
|
"reward": 0.5511805862188339, |
|
"reward_std": 0.29247505962848663, |
|
"rewards/exp_len_reward": 0.5511805862188339, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 1586.1429138183594, |
|
"epoch": 0.6933333333333334, |
|
"grad_norm": 9.824678025864428, |
|
"kl": 0.67529296875, |
|
"learning_rate": 3.3312907907936097e-07, |
|
"loss": 0.0485, |
|
"reward": 0.6389699578285217, |
|
"reward_std": 0.1864270232617855, |
|
"rewards/exp_len_reward": 0.6389699578285217, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 1837.5581359863281, |
|
"epoch": 0.6962962962962963, |
|
"grad_norm": 1.9026040821940675, |
|
"kl": 0.7041015625, |
|
"learning_rate": 3.2905266356200506e-07, |
|
"loss": 0.1006, |
|
"reward": 0.5509998500347137, |
|
"reward_std": 0.2821981944143772, |
|
"rewards/exp_len_reward": 0.5509998500347137, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 2152.6295471191406, |
|
"epoch": 0.6992592592592592, |
|
"grad_norm": 11.662060157681738, |
|
"kl": 0.9638671875, |
|
"learning_rate": 3.250000000000001e-07, |
|
"loss": 0.0456, |
|
"reward": 0.5501847416162491, |
|
"reward_std": 0.1710715489462018, |
|
"rewards/exp_len_reward": 0.5501847416162491, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 1924.6519165039062, |
|
"epoch": 0.7022222222222222, |
|
"grad_norm": 1.3789345707628888, |
|
"kl": 0.728515625, |
|
"learning_rate": 3.2097152405681904e-07, |
|
"loss": 0.0905, |
|
"reward": 0.5505119562149048, |
|
"reward_std": 0.20416779816150665, |
|
"rewards/exp_len_reward": 0.5505119562149048, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 2016.696533203125, |
|
"epoch": 0.7051851851851851, |
|
"grad_norm": 2.230914856374322, |
|
"kl": 0.5777587890625, |
|
"learning_rate": 3.1696766879575354e-07, |
|
"loss": 0.0708, |
|
"reward": 0.5033136606216431, |
|
"reward_std": 0.21769768744707108, |
|
"rewards/exp_len_reward": 0.5033136606216431, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 2177.9866638183594, |
|
"epoch": 0.7081481481481482, |
|
"grad_norm": 1.3770470235796741, |
|
"kl": 0.62841796875, |
|
"learning_rate": 3.1298886463335857e-07, |
|
"loss": 0.0397, |
|
"reward": 0.5401712283492088, |
|
"reward_std": 0.16454584524035454, |
|
"rewards/exp_len_reward": 0.5401712283492088, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 2085.6607971191406, |
|
"epoch": 0.7111111111111111, |
|
"grad_norm": 1.0539338304475274, |
|
"kl": 0.30908203125, |
|
"learning_rate": 3.090355392931827e-07, |
|
"loss": 0.0564, |
|
"reward": 0.517996683716774, |
|
"reward_std": 0.2007257491350174, |
|
"rewards/exp_len_reward": 0.517996683716774, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 1860.1116943359375, |
|
"epoch": 0.7140740740740741, |
|
"grad_norm": 3.5014067092047294, |
|
"kl": 0.22900390625, |
|
"learning_rate": 3.051081177597876e-07, |
|
"loss": 0.0691, |
|
"reward": 0.6099975854158401, |
|
"reward_std": 0.2467595972120762, |
|
"rewards/exp_len_reward": 0.6099975854158401, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 1915.2724304199219, |
|
"epoch": 0.717037037037037, |
|
"grad_norm": 1.3719140200453745, |
|
"kl": 0.228271484375, |
|
"learning_rate": 3.012070222330629e-07, |
|
"loss": 0.0217, |
|
"reward": 0.4658224508166313, |
|
"reward_std": 0.2403927743434906, |
|
"rewards/exp_len_reward": 0.4658224508166313, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 2076.2188415527344, |
|
"epoch": 0.72, |
|
"grad_norm": 4.382757308947479, |
|
"kl": 0.2587890625, |
|
"learning_rate": 2.97332672082837e-07, |
|
"loss": 0.0487, |
|
"reward": 0.5252245962619781, |
|
"reward_std": 0.25723912566900253, |
|
"rewards/exp_len_reward": 0.5252245962619781, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 2111.9375610351562, |
|
"epoch": 0.7229629629629629, |
|
"grad_norm": 1.3459524492369142, |
|
"kl": 0.290771484375, |
|
"learning_rate": 2.934854838037978e-07, |
|
"loss": 0.0753, |
|
"reward": 0.591300830245018, |
|
"reward_std": 0.2962731011211872, |
|
"rewards/exp_len_reward": 0.591300830245018, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 1875.7143859863281, |
|
"epoch": 0.725925925925926, |
|
"grad_norm": 1.0059471369170592, |
|
"kl": 0.2734375, |
|
"learning_rate": 2.8966587097071683e-07, |
|
"loss": 0.0397, |
|
"reward": 0.6712532192468643, |
|
"reward_std": 0.1884814165532589, |
|
"rewards/exp_len_reward": 0.6712532192468643, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 1451.8304443359375, |
|
"epoch": 0.7288888888888889, |
|
"grad_norm": 3.334470597565311, |
|
"kl": 0.21923828125, |
|
"learning_rate": 2.8587424419399055e-07, |
|
"loss": 0.0743, |
|
"reward": 0.7117358893156052, |
|
"reward_std": 0.25990375503897667, |
|
"rewards/exp_len_reward": 0.7117358893156052, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 1776.2679443359375, |
|
"epoch": 0.7318518518518519, |
|
"grad_norm": 1.3678103662021623, |
|
"kl": 0.25830078125, |
|
"learning_rate": 2.821110110755004e-07, |
|
"loss": 0.0406, |
|
"reward": 0.5916131287813187, |
|
"reward_std": 0.21059276908636093, |
|
"rewards/exp_len_reward": 0.5916131287813187, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 1554.9465026855469, |
|
"epoch": 0.7348148148148148, |
|
"grad_norm": 1.574805240114687, |
|
"kl": 0.3394775390625, |
|
"learning_rate": 2.783765761647934e-07, |
|
"loss": 0.0373, |
|
"reward": 0.6932453364133835, |
|
"reward_std": 0.18446229957044125, |
|
"rewards/exp_len_reward": 0.6932453364133835, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 1849.1875305175781, |
|
"epoch": 0.7377777777777778, |
|
"grad_norm": 2.96754901799222, |
|
"kl": 0.6630859375, |
|
"learning_rate": 2.746713409155951e-07, |
|
"loss": 0.0009, |
|
"reward": 0.5517724305391312, |
|
"reward_std": 0.2094859890639782, |
|
"rewards/exp_len_reward": 0.5517724305391312, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 1902.3438415527344, |
|
"epoch": 0.7407407407407407, |
|
"grad_norm": 2.620462966652135, |
|
"kl": 0.81787109375, |
|
"learning_rate": 2.709957036426512e-07, |
|
"loss": 0.0541, |
|
"reward": 0.6254279538989067, |
|
"reward_std": 0.22460020706057549, |
|
"rewards/exp_len_reward": 0.6254279538989067, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 2094.65185546875, |
|
"epoch": 0.7437037037037038, |
|
"grad_norm": 4.270339518374745, |
|
"kl": 0.9814453125, |
|
"learning_rate": 2.6735005947890986e-07, |
|
"loss": 0.0523, |
|
"reward": 0.5432867407798767, |
|
"reward_std": 0.25716196186840534, |
|
"rewards/exp_len_reward": 0.5432867407798767, |
|
"step": 251 |
|
}, |
|
{ |
|
"completion_length": 1923.1384887695312, |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 4.4007623566003335, |
|
"kl": 0.876953125, |
|
"learning_rate": 2.6373480033304397e-07, |
|
"loss": 0.0662, |
|
"reward": 0.5263698920607567, |
|
"reward_std": 0.23529189638793468, |
|
"rewards/exp_len_reward": 0.5263698920607567, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 1774.7947082519531, |
|
"epoch": 0.7496296296296296, |
|
"grad_norm": 4.102095204442662, |
|
"kl": 0.74267578125, |
|
"learning_rate": 2.6015031484732103e-07, |
|
"loss": 0.0294, |
|
"reward": 0.5787611454725266, |
|
"reward_std": 0.23660384491086006, |
|
"rewards/exp_len_reward": 0.5787611454725266, |
|
"step": 253 |
|
}, |
|
{ |
|
"completion_length": 2020.52685546875, |
|
"epoch": 0.7525925925925926, |
|
"grad_norm": 3.494895709470538, |
|
"kl": 0.59033203125, |
|
"learning_rate": 2.565969883558236e-07, |
|
"loss": 0.1393, |
|
"reward": 0.5668770894408226, |
|
"reward_std": 0.2578126862645149, |
|
"rewards/exp_len_reward": 0.5668770894408226, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 2249.1875915527344, |
|
"epoch": 0.7555555555555555, |
|
"grad_norm": 1.3887241916882165, |
|
"kl": 0.81689453125, |
|
"learning_rate": 2.5307520284302606e-07, |
|
"loss": 0.0922, |
|
"reward": 0.598124660551548, |
|
"reward_std": 0.16249966993927956, |
|
"rewards/exp_len_reward": 0.598124660551548, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 2033.2679443359375, |
|
"epoch": 0.7585185185185185, |
|
"grad_norm": 4.011350770756549, |
|
"kl": 0.75, |
|
"learning_rate": 2.495853369027309e-07, |
|
"loss": 0.0559, |
|
"reward": 0.5327246338129044, |
|
"reward_std": 0.2595828250050545, |
|
"rewards/exp_len_reward": 0.5327246338129044, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 2089.6384887695312, |
|
"epoch": 0.7614814814814815, |
|
"grad_norm": 2.4045500748983955, |
|
"kl": 0.5302734375, |
|
"learning_rate": 2.4612776569736984e-07, |
|
"loss": 0.1014, |
|
"reward": 0.6376392692327499, |
|
"reward_std": 0.23245985060930252, |
|
"rewards/exp_len_reward": 0.6376392692327499, |
|
"step": 257 |
|
}, |
|
{ |
|
"completion_length": 1895.6607971191406, |
|
"epoch": 0.7644444444444445, |
|
"grad_norm": 1.4906449441401861, |
|
"kl": 0.5322265625, |
|
"learning_rate": 2.4270286091767335e-07, |
|
"loss": 0.0467, |
|
"reward": 0.46556220203638077, |
|
"reward_std": 0.2202283851802349, |
|
"rewards/exp_len_reward": 0.46556220203638077, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 1196.276840209961, |
|
"epoch": 0.7674074074074074, |
|
"grad_norm": 1.0592729473109168, |
|
"kl": 0.14678955078125, |
|
"learning_rate": 2.39310990742714e-07, |
|
"loss": 0.0688, |
|
"reward": 0.7091374546289444, |
|
"reward_std": 0.20984918251633644, |
|
"rewards/exp_len_reward": 0.7091374546289444, |
|
"step": 259 |
|
}, |
|
{ |
|
"completion_length": 1845.0536193847656, |
|
"epoch": 0.7703703703703704, |
|
"grad_norm": 0.6134870917567257, |
|
"kl": 0.40478515625, |
|
"learning_rate": 2.3595251980032673e-07, |
|
"loss": 0.0446, |
|
"reward": 0.62698695063591, |
|
"reward_std": 0.23867091536521912, |
|
"rewards/exp_len_reward": 0.62698695063591, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 1902.5268859863281, |
|
"epoch": 0.7733333333333333, |
|
"grad_norm": 3.0873733945089636, |
|
"kl": 0.41748046875, |
|
"learning_rate": 2.3262780912791183e-07, |
|
"loss": 0.0941, |
|
"reward": 0.6978839188814163, |
|
"reward_std": 0.16607779264450073, |
|
"rewards/exp_len_reward": 0.6978839188814163, |
|
"step": 261 |
|
}, |
|
{ |
|
"completion_length": 1507.7098693847656, |
|
"epoch": 0.7762962962962963, |
|
"grad_norm": 0.7472079939240217, |
|
"kl": 0.2447509765625, |
|
"learning_rate": 2.2933721613362188e-07, |
|
"loss": 0.0344, |
|
"reward": 0.7389920055866241, |
|
"reward_std": 0.16782627813518047, |
|
"rewards/exp_len_reward": 0.7389920055866241, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 1831.7724304199219, |
|
"epoch": 0.7792592592592592, |
|
"grad_norm": 2.303477094889597, |
|
"kl": 0.4873046875, |
|
"learning_rate": 2.2608109455794197e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6041949242353439, |
|
"reward_std": 0.18537207320332527, |
|
"rewards/exp_len_reward": 0.6041949242353439, |
|
"step": 263 |
|
}, |
|
{ |
|
"completion_length": 2072.5447387695312, |
|
"epoch": 0.7822222222222223, |
|
"grad_norm": 1.162365568500677, |
|
"kl": 0.521484375, |
|
"learning_rate": 2.2285979443566093e-07, |
|
"loss": 0.0353, |
|
"reward": 0.47308728843927383, |
|
"reward_std": 0.2114715836942196, |
|
"rewards/exp_len_reward": 0.47308728843927383, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 2082.0982666015625, |
|
"epoch": 0.7851851851851852, |
|
"grad_norm": 1.1540954056683599, |
|
"kl": 0.4599609375, |
|
"learning_rate": 2.196736620582429e-07, |
|
"loss": 0.0681, |
|
"reward": 0.6188310533761978, |
|
"reward_std": 0.21996454149484634, |
|
"rewards/exp_len_reward": 0.6188310533761978, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 1896.6295471191406, |
|
"epoch": 0.7881481481481482, |
|
"grad_norm": 1.5229341500025346, |
|
"kl": 0.4105224609375, |
|
"learning_rate": 2.1652303993660146e-07, |
|
"loss": 0.0771, |
|
"reward": 0.619941383600235, |
|
"reward_std": 0.23672576248645782, |
|
"rewards/exp_len_reward": 0.619941383600235, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 2306.241180419922, |
|
"epoch": 0.7911111111111111, |
|
"grad_norm": 1.5965567483211722, |
|
"kl": 0.603515625, |
|
"learning_rate": 2.1340826676427826e-07, |
|
"loss": 0.0899, |
|
"reward": 0.46971995383501053, |
|
"reward_std": 0.25150875374674797, |
|
"rewards/exp_len_reward": 0.46971995383501053, |
|
"step": 267 |
|
}, |
|
{ |
|
"completion_length": 2109.759063720703, |
|
"epoch": 0.794074074074074, |
|
"grad_norm": 1.0057063473264853, |
|
"kl": 0.5146484375, |
|
"learning_rate": 2.103296773810344e-07, |
|
"loss": 0.0736, |
|
"reward": 0.646474152803421, |
|
"reward_std": 0.1665012501180172, |
|
"rewards/exp_len_reward": 0.646474152803421, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 1648.3706665039062, |
|
"epoch": 0.797037037037037, |
|
"grad_norm": 1.399927607428829, |
|
"kl": 0.41455078125, |
|
"learning_rate": 2.0728760273685435e-07, |
|
"loss": 0.0669, |
|
"reward": 0.6383180469274521, |
|
"reward_std": 0.24311606958508492, |
|
"rewards/exp_len_reward": 0.6383180469274521, |
|
"step": 269 |
|
}, |
|
{ |
|
"completion_length": 1812.5000915527344, |
|
"epoch": 0.8, |
|
"grad_norm": 2.0638539548725543, |
|
"kl": 0.58056640625, |
|
"learning_rate": 2.0428236985636878e-07, |
|
"loss": 0.0273, |
|
"reward": 0.5707896202802658, |
|
"reward_std": 0.14662024565041065, |
|
"rewards/exp_len_reward": 0.5707896202802658, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 1855.99560546875, |
|
"epoch": 0.802962962962963, |
|
"grad_norm": 1.077724114851918, |
|
"kl": 0.573486328125, |
|
"learning_rate": 2.0131430180369957e-07, |
|
"loss": 0.0689, |
|
"reward": 0.6811731457710266, |
|
"reward_std": 0.21652160212397575, |
|
"rewards/exp_len_reward": 0.6811731457710266, |
|
"step": 271 |
|
}, |
|
{ |
|
"completion_length": 1906.46435546875, |
|
"epoch": 0.8059259259259259, |
|
"grad_norm": 3.8395631016480345, |
|
"kl": 0.7919921875, |
|
"learning_rate": 1.9838371764772992e-07, |
|
"loss": 0.0734, |
|
"reward": 0.5762921273708344, |
|
"reward_std": 0.21297482959926128, |
|
"rewards/exp_len_reward": 0.5762921273708344, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 1850.71435546875, |
|
"epoch": 0.8088888888888889, |
|
"grad_norm": 2.5571773473006285, |
|
"kl": 0.69580078125, |
|
"learning_rate": 1.954909324278041e-07, |
|
"loss": 0.0464, |
|
"reward": 0.6064716130495071, |
|
"reward_std": 0.19751618057489395, |
|
"rewards/exp_len_reward": 0.6064716130495071, |
|
"step": 273 |
|
}, |
|
{ |
|
"completion_length": 1855.6205749511719, |
|
"epoch": 0.8118518518518518, |
|
"grad_norm": 1.234393016428857, |
|
"kl": 0.59619140625, |
|
"learning_rate": 1.9263625711986092e-07, |
|
"loss": 0.0481, |
|
"reward": 0.6006206125020981, |
|
"reward_std": 0.24168968573212624, |
|
"rewards/exp_len_reward": 0.6006206125020981, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 1901.0803833007812, |
|
"epoch": 0.8148148148148148, |
|
"grad_norm": 1.638661716998031, |
|
"kl": 0.62451171875, |
|
"learning_rate": 1.8981999860300385e-07, |
|
"loss": 0.0825, |
|
"reward": 0.5825950875878334, |
|
"reward_std": 0.20379779115319252, |
|
"rewards/exp_len_reward": 0.5825950875878334, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 2026.5090637207031, |
|
"epoch": 0.8177777777777778, |
|
"grad_norm": 1.448189554080957, |
|
"kl": 0.6962890625, |
|
"learning_rate": 1.8704245962651026e-07, |
|
"loss": 0.0915, |
|
"reward": 0.5586806088685989, |
|
"reward_std": 0.21963583678007126, |
|
"rewards/exp_len_reward": 0.5586806088685989, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 1590.4777526855469, |
|
"epoch": 0.8207407407407408, |
|
"grad_norm": 0.8289937373611839, |
|
"kl": 0.5087890625, |
|
"learning_rate": 1.8430393877728745e-07, |
|
"loss": 0.0715, |
|
"reward": 0.6978113353252411, |
|
"reward_std": 0.22020583972334862, |
|
"rewards/exp_len_reward": 0.6978113353252411, |
|
"step": 277 |
|
}, |
|
{ |
|
"completion_length": 2163.5938415527344, |
|
"epoch": 0.8237037037037037, |
|
"grad_norm": 1.0094687173541748, |
|
"kl": 0.767578125, |
|
"learning_rate": 1.8160473044777263e-07, |
|
"loss": 0.1185, |
|
"reward": 0.5574893727898598, |
|
"reward_std": 0.25213854014873505, |
|
"rewards/exp_len_reward": 0.5574893727898598, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 1669.3616638183594, |
|
"epoch": 0.8266666666666667, |
|
"grad_norm": 1.3169001272779028, |
|
"kl": 0.4873046875, |
|
"learning_rate": 1.789451248042867e-07, |
|
"loss": 0.0525, |
|
"reward": 0.6423463597893715, |
|
"reward_std": 0.21679977793246508, |
|
"rewards/exp_len_reward": 0.6423463597893715, |
|
"step": 279 |
|
}, |
|
{ |
|
"completion_length": 2108.071502685547, |
|
"epoch": 0.8296296296296296, |
|
"grad_norm": 1.5089860120727316, |
|
"kl": 0.72509765625, |
|
"learning_rate": 1.763254077558411e-07, |
|
"loss": 0.0624, |
|
"reward": 0.6306832581758499, |
|
"reward_std": 0.23130958899855614, |
|
"rewards/exp_len_reward": 0.6306832581758499, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 1408.0223693847656, |
|
"epoch": 0.8325925925925926, |
|
"grad_norm": 0.626851236326197, |
|
"kl": 0.3817138671875, |
|
"learning_rate": 1.7374586092340194e-07, |
|
"loss": 0.052, |
|
"reward": 0.7289980947971344, |
|
"reward_std": 0.1484288088977337, |
|
"rewards/exp_len_reward": 0.7289980947971344, |
|
"step": 281 |
|
}, |
|
{ |
|
"completion_length": 1850.3304443359375, |
|
"epoch": 0.8355555555555556, |
|
"grad_norm": 1.2880854913101176, |
|
"kl": 0.61328125, |
|
"learning_rate": 1.712067616096159e-07, |
|
"loss": 0.0664, |
|
"reward": 0.6098030656576157, |
|
"reward_std": 0.27465640753507614, |
|
"rewards/exp_len_reward": 0.6098030656576157, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 2180.772430419922, |
|
"epoch": 0.8385185185185186, |
|
"grad_norm": 1.3173048932404967, |
|
"kl": 0.7880859375, |
|
"learning_rate": 1.6870838276900018e-07, |
|
"loss": 0.068, |
|
"reward": 0.5274906530976295, |
|
"reward_std": 0.17172403447329998, |
|
"rewards/exp_len_reward": 0.5274906530976295, |
|
"step": 283 |
|
}, |
|
{ |
|
"completion_length": 1963.9598693847656, |
|
"epoch": 0.8414814814814815, |
|
"grad_norm": 2.352592735238779, |
|
"kl": 0.5947265625, |
|
"learning_rate": 1.6625099297859945e-07, |
|
"loss": 0.0435, |
|
"reward": 0.5498563274741173, |
|
"reward_std": 0.22014831006526947, |
|
"rewards/exp_len_reward": 0.5498563274741173, |
|
"step": 284 |
|
}, |
|
{ |
|
"completion_length": 1755.5179748535156, |
|
"epoch": 0.8444444444444444, |
|
"grad_norm": 0.7783642288316969, |
|
"kl": 0.53125, |
|
"learning_rate": 1.638348564091142e-07, |
|
"loss": 0.0537, |
|
"reward": 0.6466532945632935, |
|
"reward_std": 0.21114975400269032, |
|
"rewards/exp_len_reward": 0.6466532945632935, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 1900.9108276367188, |
|
"epoch": 0.8474074074074074, |
|
"grad_norm": 0.7325869249506614, |
|
"kl": 0.56689453125, |
|
"learning_rate": 1.6146023279650146e-07, |
|
"loss": 0.0552, |
|
"reward": 0.5337693318724632, |
|
"reward_std": 0.2679591439664364, |
|
"rewards/exp_len_reward": 0.5337693318724632, |
|
"step": 286 |
|
}, |
|
{ |
|
"completion_length": 1815.821533203125, |
|
"epoch": 0.8503703703703703, |
|
"grad_norm": 2.5993122855161217, |
|
"kl": 0.48291015625, |
|
"learning_rate": 1.5912737741405364e-07, |
|
"loss": 0.0855, |
|
"reward": 0.6207796633243561, |
|
"reward_std": 0.28707681968808174, |
|
"rewards/exp_len_reward": 0.6207796633243561, |
|
"step": 287 |
|
}, |
|
{ |
|
"completion_length": 2066.6116943359375, |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 1.6367894996791235, |
|
"kl": 0.658203125, |
|
"learning_rate": 1.5683654104495627e-07, |
|
"loss": 0.0136, |
|
"reward": 0.5434933006763458, |
|
"reward_std": 0.20752229169011116, |
|
"rewards/exp_len_reward": 0.5434933006763458, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 1812.1653137207031, |
|
"epoch": 0.8562962962962963, |
|
"grad_norm": 1.6394731751923866, |
|
"kl": 0.48291015625, |
|
"learning_rate": 1.5458796995532915e-07, |
|
"loss": 0.0547, |
|
"reward": 0.676224872469902, |
|
"reward_std": 0.20963529124855995, |
|
"rewards/exp_len_reward": 0.676224872469902, |
|
"step": 289 |
|
}, |
|
{ |
|
"completion_length": 1776.732177734375, |
|
"epoch": 0.8592592592592593, |
|
"grad_norm": 1.1773115253318147, |
|
"kl": 0.44091796875, |
|
"learning_rate": 1.5238190586775145e-07, |
|
"loss": 0.0524, |
|
"reward": 0.5728821009397507, |
|
"reward_std": 0.19821078144013882, |
|
"rewards/exp_len_reward": 0.5728821009397507, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 1930.7991943359375, |
|
"epoch": 0.8622222222222222, |
|
"grad_norm": 1.217553347358791, |
|
"kl": 0.544921875, |
|
"learning_rate": 1.50218585935278e-07, |
|
"loss": 0.0898, |
|
"reward": 0.5744712874293327, |
|
"reward_std": 0.2638898529112339, |
|
"rewards/exp_len_reward": 0.5744712874293327, |
|
"step": 291 |
|
}, |
|
{ |
|
"completion_length": 1568.3929443359375, |
|
"epoch": 0.8651851851851852, |
|
"grad_norm": 2.1855891688795217, |
|
"kl": 0.3900146484375, |
|
"learning_rate": 1.4809824271594384e-07, |
|
"loss": 0.0756, |
|
"reward": 0.6319922655820847, |
|
"reward_std": 0.18087825924158096, |
|
"rewards/exp_len_reward": 0.6319922655820847, |
|
"step": 292 |
|
}, |
|
{ |
|
"completion_length": 1893.5537109375, |
|
"epoch": 0.8681481481481481, |
|
"grad_norm": 0.8745208848244106, |
|
"kl": 0.60498046875, |
|
"learning_rate": 1.4602110414776475e-07, |
|
"loss": 0.0793, |
|
"reward": 0.5885374248027802, |
|
"reward_std": 0.2328047677874565, |
|
"rewards/exp_len_reward": 0.5885374248027802, |
|
"step": 293 |
|
}, |
|
{ |
|
"completion_length": 2152.7456665039062, |
|
"epoch": 0.8711111111111111, |
|
"grad_norm": 3.1078833340758987, |
|
"kl": 0.7197265625, |
|
"learning_rate": 1.4398739352423406e-07, |
|
"loss": 0.0136, |
|
"reward": 0.5296469628810883, |
|
"reward_std": 0.1996788065880537, |
|
"rewards/exp_len_reward": 0.5296469628810883, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 2409.5358276367188, |
|
"epoch": 0.8740740740740741, |
|
"grad_norm": 3.4314169496830558, |
|
"kl": 0.951171875, |
|
"learning_rate": 1.419973294703174e-07, |
|
"loss": 0.0447, |
|
"reward": 0.3807084336876869, |
|
"reward_std": 0.22457972541451454, |
|
"rewards/exp_len_reward": 0.3807084336876869, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 2120.7813720703125, |
|
"epoch": 0.8770370370370371, |
|
"grad_norm": 3.439087218546517, |
|
"kl": 0.82958984375, |
|
"learning_rate": 1.400511259189518e-07, |
|
"loss": 0.026, |
|
"reward": 0.5302798449993134, |
|
"reward_std": 0.19236281886696815, |
|
"rewards/exp_len_reward": 0.5302798449993134, |
|
"step": 296 |
|
}, |
|
{ |
|
"completion_length": 1617.3125610351562, |
|
"epoch": 0.88, |
|
"grad_norm": 0.947761415688406, |
|
"kl": 0.3895263671875, |
|
"learning_rate": 1.3814899208804677e-07, |
|
"loss": 0.0739, |
|
"reward": 0.7116686105728149, |
|
"reward_std": 0.17973697930574417, |
|
"rewards/exp_len_reward": 0.7116686105728149, |
|
"step": 297 |
|
}, |
|
{ |
|
"completion_length": 2093.3616943359375, |
|
"epoch": 0.882962962962963, |
|
"grad_norm": 2.663080594359249, |
|
"kl": 0.7080078125, |
|
"learning_rate": 1.3629113245799361e-07, |
|
"loss": 0.0267, |
|
"reward": 0.4907858446240425, |
|
"reward_std": 0.18806980550289154, |
|
"rewards/exp_len_reward": 0.4907858446240425, |
|
"step": 298 |
|
}, |
|
{ |
|
"completion_length": 1681.8840026855469, |
|
"epoch": 0.8859259259259259, |
|
"grad_norm": 1.4501228440522578, |
|
"kl": 0.40087890625, |
|
"learning_rate": 1.3447774674968387e-07, |
|
"loss": 0.0953, |
|
"reward": 0.6455406174063683, |
|
"reward_std": 0.21742986515164375, |
|
"rewards/exp_len_reward": 0.6455406174063683, |
|
"step": 299 |
|
}, |
|
{ |
|
"completion_length": 1697.2188110351562, |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.8931595775633989, |
|
"kl": 0.4423828125, |
|
"learning_rate": 1.3270902990303869e-07, |
|
"loss": 0.0021, |
|
"reward": 0.5877698212862015, |
|
"reward_std": 0.2616008296608925, |
|
"rewards/exp_len_reward": 0.5877698212862015, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 1755.1697387695312, |
|
"epoch": 0.8918518518518519, |
|
"grad_norm": 1.5838910503748744, |
|
"kl": 0.42138671875, |
|
"learning_rate": 1.3098517205605325e-07, |
|
"loss": 0.0896, |
|
"reward": 0.6868456155061722, |
|
"reward_std": 0.18149937316775322, |
|
"rewards/exp_len_reward": 0.6868456155061722, |
|
"step": 301 |
|
}, |
|
{ |
|
"completion_length": 1789.8304595947266, |
|
"epoch": 0.8948148148148148, |
|
"grad_norm": 0.6954424759579384, |
|
"kl": 0.455291748046875, |
|
"learning_rate": 1.2930635852435634e-07, |
|
"loss": 0.0637, |
|
"reward": 0.6176896244287491, |
|
"reward_std": 0.2546579912304878, |
|
"rewards/exp_len_reward": 0.6176896244287491, |
|
"step": 302 |
|
}, |
|
{ |
|
"completion_length": 2092.6742248535156, |
|
"epoch": 0.8977777777777778, |
|
"grad_norm": 1.055499700957037, |
|
"kl": 0.59423828125, |
|
"learning_rate": 1.276727697812894e-07, |
|
"loss": 0.0759, |
|
"reward": 0.5725482404232025, |
|
"reward_std": 0.2711305655539036, |
|
"rewards/exp_len_reward": 0.5725482404232025, |
|
"step": 303 |
|
}, |
|
{ |
|
"completion_length": 2076.634033203125, |
|
"epoch": 0.9007407407407407, |
|
"grad_norm": 1.0741468220182044, |
|
"kl": 0.5556640625, |
|
"learning_rate": 1.2608458143850493e-07, |
|
"loss": 0.0704, |
|
"reward": 0.6020158976316452, |
|
"reward_std": 0.2659350074827671, |
|
"rewards/exp_len_reward": 0.6020158976316452, |
|
"step": 304 |
|
}, |
|
{ |
|
"completion_length": 2139.4733276367188, |
|
"epoch": 0.9037037037037037, |
|
"grad_norm": 0.9471440274301567, |
|
"kl": 0.590576171875, |
|
"learning_rate": 1.2454196422708843e-07, |
|
"loss": 0.0492, |
|
"reward": 0.5845741108059883, |
|
"reward_std": 0.21030431985855103, |
|
"rewards/exp_len_reward": 0.5845741108059883, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 2160.6697692871094, |
|
"epoch": 0.9066666666666666, |
|
"grad_norm": 1.0546222999465211, |
|
"kl": 0.73681640625, |
|
"learning_rate": 1.2304508397920499e-07, |
|
"loss": 0.01, |
|
"reward": 0.5044809579849243, |
|
"reward_std": 0.2162732593715191, |
|
"rewards/exp_len_reward": 0.5044809579849243, |
|
"step": 306 |
|
}, |
|
{ |
|
"completion_length": 1802.0626220703125, |
|
"epoch": 0.9096296296296297, |
|
"grad_norm": 0.7273536659582915, |
|
"kl": 0.45458984375, |
|
"learning_rate": 1.2159410161027153e-07, |
|
"loss": 0.061, |
|
"reward": 0.6756877303123474, |
|
"reward_std": 0.17008201032876968, |
|
"rewards/exp_len_reward": 0.6756877303123474, |
|
"step": 307 |
|
}, |
|
{ |
|
"completion_length": 1900.1429138183594, |
|
"epoch": 0.9125925925925926, |
|
"grad_norm": 1.0377925081151909, |
|
"kl": 0.4912109375, |
|
"learning_rate": 1.2018917310165926e-07, |
|
"loss": 0.0756, |
|
"reward": 0.6221350133419037, |
|
"reward_std": 0.21066963486373425, |
|
"rewards/exp_len_reward": 0.6221350133419037, |
|
"step": 308 |
|
}, |
|
{ |
|
"completion_length": 2008.040283203125, |
|
"epoch": 0.9155555555555556, |
|
"grad_norm": 1.1323707827713791, |
|
"kl": 0.5888671875, |
|
"learning_rate": 1.1883044948392453e-07, |
|
"loss": 0.0239, |
|
"reward": 0.6152837574481964, |
|
"reward_std": 0.20816011540591717, |
|
"rewards/exp_len_reward": 0.6152837574481964, |
|
"step": 309 |
|
}, |
|
{ |
|
"completion_length": 1659.3572082519531, |
|
"epoch": 0.9185185185185185, |
|
"grad_norm": 1.0741422911073732, |
|
"kl": 0.3739013671875, |
|
"learning_rate": 1.1751807682057396e-07, |
|
"loss": 0.0697, |
|
"reward": 0.6434877663850784, |
|
"reward_std": 0.22068125009536743, |
|
"rewards/exp_len_reward": 0.6434877663850784, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 1905.5492248535156, |
|
"epoch": 0.9214814814814815, |
|
"grad_norm": 1.4331895848772223, |
|
"kl": 0.52294921875, |
|
"learning_rate": 1.1625219619236196e-07, |
|
"loss": 0.0179, |
|
"reward": 0.6263534277677536, |
|
"reward_std": 0.17030689865350723, |
|
"rewards/exp_len_reward": 0.6263534277677536, |
|
"step": 311 |
|
}, |
|
{ |
|
"completion_length": 1953.1831665039062, |
|
"epoch": 0.9244444444444444, |
|
"grad_norm": 2.1662592035207973, |
|
"kl": 0.55517578125, |
|
"learning_rate": 1.1503294368212441e-07, |
|
"loss": 0.0041, |
|
"reward": 0.5517635121941566, |
|
"reward_std": 0.16691016405820847, |
|
"rewards/exp_len_reward": 0.5517635121941566, |
|
"step": 312 |
|
}, |
|
{ |
|
"completion_length": 2065.2679443359375, |
|
"epoch": 0.9274074074074075, |
|
"grad_norm": 0.8474773056034575, |
|
"kl": 0.51318359375, |
|
"learning_rate": 1.1386045036015024e-07, |
|
"loss": 0.0518, |
|
"reward": 0.6386523991823196, |
|
"reward_std": 0.28626545891165733, |
|
"rewards/exp_len_reward": 0.6386523991823196, |
|
"step": 313 |
|
}, |
|
{ |
|
"completion_length": 1785.6920471191406, |
|
"epoch": 0.9303703703703704, |
|
"grad_norm": 3.330669113813351, |
|
"kl": 0.39599609375, |
|
"learning_rate": 1.1273484227009072e-07, |
|
"loss": 0.1397, |
|
"reward": 0.6923246830701828, |
|
"reward_std": 0.23065154626965523, |
|
"rewards/exp_len_reward": 0.6923246830701828, |
|
"step": 314 |
|
}, |
|
{ |
|
"completion_length": 2113.290283203125, |
|
"epoch": 0.9333333333333333, |
|
"grad_norm": 2.208786377683472, |
|
"kl": 0.6357421875, |
|
"learning_rate": 1.116562404154099e-07, |
|
"loss": 0.0675, |
|
"reward": 0.5248966738581657, |
|
"reward_std": 0.20559153519570827, |
|
"rewards/exp_len_reward": 0.5248966738581657, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 1636.8036193847656, |
|
"epoch": 0.9362962962962963, |
|
"grad_norm": 0.8582998929826559, |
|
"kl": 0.359130859375, |
|
"learning_rate": 1.1062476074637685e-07, |
|
"loss": 0.0267, |
|
"reward": 0.5902325585484505, |
|
"reward_std": 0.26360809803009033, |
|
"rewards/exp_len_reward": 0.5902325585484505, |
|
"step": 316 |
|
}, |
|
{ |
|
"completion_length": 1962.6473999023438, |
|
"epoch": 0.9392592592592592, |
|
"grad_norm": 0.7797115036676918, |
|
"kl": 0.483642578125, |
|
"learning_rate": 1.0964051414760065e-07, |
|
"loss": 0.0519, |
|
"reward": 0.6097806543111801, |
|
"reward_std": 0.19469193182885647, |
|
"rewards/exp_len_reward": 0.6097806543111801, |
|
"step": 317 |
|
}, |
|
{ |
|
"completion_length": 1495.165283203125, |
|
"epoch": 0.9422222222222222, |
|
"grad_norm": 1.724934156584613, |
|
"kl": 0.27099609375, |
|
"learning_rate": 1.087036064261106e-07, |
|
"loss": 0.0479, |
|
"reward": 0.7044764161109924, |
|
"reward_std": 0.22360007464885712, |
|
"rewards/exp_len_reward": 0.7044764161109924, |
|
"step": 318 |
|
}, |
|
{ |
|
"completion_length": 1907.3750610351562, |
|
"epoch": 0.9451851851851852, |
|
"grad_norm": 0.9397015929298037, |
|
"kl": 0.43603515625, |
|
"learning_rate": 1.0781413829998135e-07, |
|
"loss": 0.0703, |
|
"reward": 0.6270845979452133, |
|
"reward_std": 0.1956428363919258, |
|
"rewards/exp_len_reward": 0.6270845979452133, |
|
"step": 319 |
|
}, |
|
{ |
|
"completion_length": 2248.6384887695312, |
|
"epoch": 0.9481481481481482, |
|
"grad_norm": 2.3924203042429424, |
|
"kl": 0.767578125, |
|
"learning_rate": 1.0697220538750631e-07, |
|
"loss": 0.1143, |
|
"reward": 0.4779609218239784, |
|
"reward_std": 0.2641923241317272, |
|
"rewards/exp_len_reward": 0.4779609218239784, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 2031.3527526855469, |
|
"epoch": 0.9511111111111111, |
|
"grad_norm": 1.5717510752992125, |
|
"kl": 0.57177734375, |
|
"learning_rate": 1.0617789819691819e-07, |
|
"loss": 0.0913, |
|
"reward": 0.5689445361495018, |
|
"reward_std": 0.27545909211039543, |
|
"rewards/exp_len_reward": 0.5689445361495018, |
|
"step": 321 |
|
}, |
|
{ |
|
"completion_length": 2053.80810546875, |
|
"epoch": 0.9540740740740741, |
|
"grad_norm": 2.3514432726094316, |
|
"kl": 0.568359375, |
|
"learning_rate": 1.054313021166595e-07, |
|
"loss": 0.0835, |
|
"reward": 0.6623236984014511, |
|
"reward_std": 0.258603822439909, |
|
"rewards/exp_len_reward": 0.6623236984014511, |
|
"step": 322 |
|
}, |
|
{ |
|
"completion_length": 1866.009033203125, |
|
"epoch": 0.957037037037037, |
|
"grad_norm": 1.7616535804704176, |
|
"kl": 0.535400390625, |
|
"learning_rate": 1.0473249740620304e-07, |
|
"loss": 0.0234, |
|
"reward": 0.6161750108003616, |
|
"reward_std": 0.2101491615176201, |
|
"rewards/exp_len_reward": 0.6161750108003616, |
|
"step": 323 |
|
}, |
|
{ |
|
"completion_length": 2126.464385986328, |
|
"epoch": 0.96, |
|
"grad_norm": 1.8176658288354475, |
|
"kl": 0.7197265625, |
|
"learning_rate": 1.0408155918742432e-07, |
|
"loss": 0.1053, |
|
"reward": 0.619974821805954, |
|
"reward_std": 0.21160422265529633, |
|
"rewards/exp_len_reward": 0.619974821805954, |
|
"step": 324 |
|
}, |
|
{ |
|
"completion_length": 2201.9510192871094, |
|
"epoch": 0.9629629629629629, |
|
"grad_norm": 2.611363321049143, |
|
"kl": 0.8994140625, |
|
"learning_rate": 1.034785574365256e-07, |
|
"loss": 0.0785, |
|
"reward": 0.5216581001877785, |
|
"reward_std": 0.22227726504206657, |
|
"rewards/exp_len_reward": 0.5216581001877785, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 1939.4777526855469, |
|
"epoch": 0.965925925925926, |
|
"grad_norm": 1.9447889354625958, |
|
"kl": 0.57666015625, |
|
"learning_rate": 1.0292355697651348e-07, |
|
"loss": 0.0518, |
|
"reward": 0.5352144092321396, |
|
"reward_std": 0.18627181835472584, |
|
"rewards/exp_len_reward": 0.5352144092321396, |
|
"step": 326 |
|
}, |
|
{ |
|
"completion_length": 1660.0223999023438, |
|
"epoch": 0.9688888888888889, |
|
"grad_norm": 1.3319739321021469, |
|
"kl": 0.615234375, |
|
"learning_rate": 1.0241661747023064e-07, |
|
"loss": 0.0264, |
|
"reward": 0.5945611968636513, |
|
"reward_std": 0.19085084274411201, |
|
"rewards/exp_len_reward": 0.5945611968636513, |
|
"step": 327 |
|
}, |
|
{ |
|
"completion_length": 1870.8482666015625, |
|
"epoch": 0.9718518518518519, |
|
"grad_norm": 1.2683542624045563, |
|
"kl": 0.6884765625, |
|
"learning_rate": 1.0195779341394164e-07, |
|
"loss": 0.0875, |
|
"reward": 0.5801157727837563, |
|
"reward_std": 0.2543545439839363, |
|
"rewards/exp_len_reward": 0.5801157727837563, |
|
"step": 328 |
|
}, |
|
{ |
|
"completion_length": 1804.2098693847656, |
|
"epoch": 0.9748148148148148, |
|
"grad_norm": 1.532251011882061, |
|
"kl": 0.546142578125, |
|
"learning_rate": 1.0154713413147486e-07, |
|
"loss": 0.0935, |
|
"reward": 0.5890957191586494, |
|
"reward_std": 0.1959761083126068, |
|
"rewards/exp_len_reward": 0.5890957191586494, |
|
"step": 329 |
|
}, |
|
{ |
|
"completion_length": 2037.2098693847656, |
|
"epoch": 0.9777777777777777, |
|
"grad_norm": 1.51448980281598, |
|
"kl": 0.75634765625, |
|
"learning_rate": 1.0118468376892005e-07, |
|
"loss": 0.116, |
|
"reward": 0.53825593739748, |
|
"reward_std": 0.2599205709993839, |
|
"rewards/exp_len_reward": 0.53825593739748, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 1872.2322082519531, |
|
"epoch": 0.9807407407407407, |
|
"grad_norm": 0.8135192710770781, |
|
"kl": 0.63916015625, |
|
"learning_rate": 1.0087048128988256e-07, |
|
"loss": 0.0755, |
|
"reward": 0.5870219618082047, |
|
"reward_std": 0.23178360238671303, |
|
"rewards/exp_len_reward": 0.5870219618082047, |
|
"step": 331 |
|
}, |
|
{ |
|
"completion_length": 1804.0000610351562, |
|
"epoch": 0.9837037037037037, |
|
"grad_norm": 2.521244978635537, |
|
"kl": 0.60009765625, |
|
"learning_rate": 1.0060456047129485e-07, |
|
"loss": 0.0965, |
|
"reward": 0.7236264944076538, |
|
"reward_std": 0.2475818656384945, |
|
"rewards/exp_len_reward": 0.7236264944076538, |
|
"step": 332 |
|
}, |
|
{ |
|
"completion_length": 1929.8483276367188, |
|
"epoch": 0.9866666666666667, |
|
"grad_norm": 1.4125480684524767, |
|
"kl": 0.4892578125, |
|
"learning_rate": 1.0038694989978531e-07, |
|
"loss": 0.0505, |
|
"reward": 0.5696776583790779, |
|
"reward_std": 0.24001475051045418, |
|
"rewards/exp_len_reward": 0.5696776583790779, |
|
"step": 333 |
|
}, |
|
{ |
|
"completion_length": 2289.2322692871094, |
|
"epoch": 0.9896296296296296, |
|
"grad_norm": 1.2411060851104214, |
|
"kl": 1.0234375, |
|
"learning_rate": 1.0021767296860537e-07, |
|
"loss": 0.1065, |
|
"reward": 0.5802329778671265, |
|
"reward_std": 0.255879282951355, |
|
"rewards/exp_len_reward": 0.5802329778671265, |
|
"step": 334 |
|
}, |
|
{ |
|
"completion_length": 2055.4866943359375, |
|
"epoch": 0.9925925925925926, |
|
"grad_norm": 4.508535676079954, |
|
"kl": 0.82861328125, |
|
"learning_rate": 1.0009674787511447e-07, |
|
"loss": -0.0012, |
|
"reward": 0.5173570811748505, |
|
"reward_std": 0.22090869024395943, |
|
"rewards/exp_len_reward": 0.5173570811748505, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 1760.1474304199219, |
|
"epoch": 0.9955555555555555, |
|
"grad_norm": 2.1627630685797605, |
|
"kl": 0.75048828125, |
|
"learning_rate": 1.0002418761882409e-07, |
|
"loss": 0.0715, |
|
"reward": 0.5967651307582855, |
|
"reward_std": 0.23602332174777985, |
|
"rewards/exp_len_reward": 0.5967651307582855, |
|
"step": 336 |
|
}, |
|
{ |
|
"completion_length": 1915.9599304199219, |
|
"epoch": 0.9985185185185185, |
|
"grad_norm": 0.9639784041944786, |
|
"kl": 0.678466796875, |
|
"learning_rate": 1e-07, |
|
"loss": 0.0641, |
|
"reward": 0.5330347046256065, |
|
"reward_std": 0.26336976513266563, |
|
"rewards/exp_len_reward": 0.5330347046256065, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.9985185185185185, |
|
"step": 337, |
|
"total_flos": 0.0, |
|
"train_loss": 0.04468778468586763, |
|
"train_runtime": 66671.6072, |
|
"train_samples_per_second": 0.162, |
|
"train_steps_per_second": 0.005 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 337, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|