|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 7.992, |
|
"eval_steps": 100, |
|
"global_step": 496, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 575.0117378234863, |
|
"epoch": 0.08, |
|
"grad_norm": 0.5159733891487122, |
|
"kl": 0.0005777359008789062, |
|
"learning_rate": 2e-07, |
|
"loss": 0.0211, |
|
"reward": 0.41477865651249884, |
|
"reward_std": 0.3093508366495371, |
|
"rewards/accuracy_reward": 0.08098958514165133, |
|
"rewards/format_reward": 0.007812500209547579, |
|
"rewards/tag_count_reward": 0.32597657293081284, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 569.7036651611328, |
|
"epoch": 0.16, |
|
"grad_norm": 0.5224132537841797, |
|
"kl": 0.001135540008544922, |
|
"learning_rate": 4e-07, |
|
"loss": 0.0197, |
|
"reward": 0.41028647050261496, |
|
"reward_std": 0.30269680023193357, |
|
"rewards/accuracy_reward": 0.07421875218860804, |
|
"rewards/format_reward": 0.005729166814126075, |
|
"rewards/tag_count_reward": 0.3303385525941849, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 586.758869934082, |
|
"epoch": 0.24, |
|
"grad_norm": 0.3704665005207062, |
|
"kl": 0.0023041725158691405, |
|
"learning_rate": 6e-07, |
|
"loss": 0.0267, |
|
"reward": 0.473372408002615, |
|
"reward_std": 0.3123403754085302, |
|
"rewards/accuracy_reward": 0.08750000239815563, |
|
"rewards/format_reward": 0.009635416907258332, |
|
"rewards/tag_count_reward": 0.3762369878590107, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 589.0830902099609, |
|
"epoch": 0.32, |
|
"grad_norm": 1.0813833475112915, |
|
"kl": 0.0357421875, |
|
"learning_rate": 8e-07, |
|
"loss": 0.032, |
|
"reward": 0.6087890841066838, |
|
"reward_std": 0.32254689410328863, |
|
"rewards/accuracy_reward": 0.07447916907258331, |
|
"rewards/format_reward": 0.025781250488944352, |
|
"rewards/tag_count_reward": 0.5085286632180214, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 598.2661651611328, |
|
"epoch": 0.4, |
|
"grad_norm": 0.3315693140029907, |
|
"kl": 0.0186279296875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0404, |
|
"reward": 0.6679036632180214, |
|
"reward_std": 0.3646054796874523, |
|
"rewards/accuracy_reward": 0.08750000272411854, |
|
"rewards/format_reward": 0.044791668001562354, |
|
"rewards/tag_count_reward": 0.5356119990348815, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 603.8648612976074, |
|
"epoch": 0.48, |
|
"grad_norm": 0.34996700286865234, |
|
"kl": 0.0280853271484375, |
|
"learning_rate": 1.2e-06, |
|
"loss": 0.0484, |
|
"reward": 0.7606770992279053, |
|
"reward_std": 0.42633651196956635, |
|
"rewards/accuracy_reward": 0.0877604190260172, |
|
"rewards/format_reward": 0.08750000251457095, |
|
"rewards/tag_count_reward": 0.5854166865348815, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 598.5612190246582, |
|
"epoch": 0.56, |
|
"grad_norm": 0.3991927206516266, |
|
"kl": 0.0320770263671875, |
|
"learning_rate": 1.4e-06, |
|
"loss": 0.0474, |
|
"reward": 0.8298828288912773, |
|
"reward_std": 0.46221103593707086, |
|
"rewards/accuracy_reward": 0.07890625211875886, |
|
"rewards/format_reward": 0.12395833758637309, |
|
"rewards/tag_count_reward": 0.6270182520151139, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 599.6370010375977, |
|
"epoch": 0.64, |
|
"grad_norm": 0.32159847021102905, |
|
"kl": 0.0319732666015625, |
|
"learning_rate": 1.6e-06, |
|
"loss": 0.057, |
|
"reward": 1.0265625357627868, |
|
"reward_std": 0.5343712478876114, |
|
"rewards/accuracy_reward": 0.06770833521150052, |
|
"rewards/format_reward": 0.25104167461395266, |
|
"rewards/tag_count_reward": 0.7078125223517417, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 579.9875183105469, |
|
"epoch": 0.72, |
|
"grad_norm": 0.3186890482902527, |
|
"kl": 0.0441070556640625, |
|
"learning_rate": 1.8e-06, |
|
"loss": 0.0737, |
|
"reward": 1.2731771171092987, |
|
"reward_std": 0.5938673868775368, |
|
"rewards/accuracy_reward": 0.058072918513789776, |
|
"rewards/format_reward": 0.446614596247673, |
|
"rewards/tag_count_reward": 0.7684896036982536, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 568.0612159729004, |
|
"epoch": 0.8, |
|
"grad_norm": 0.28663673996925354, |
|
"kl": 0.0451324462890625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0684, |
|
"reward": 1.470442745089531, |
|
"reward_std": 0.5922676548361778, |
|
"rewards/accuracy_reward": 0.06432291842065752, |
|
"rewards/format_reward": 0.5750000163912773, |
|
"rewards/tag_count_reward": 0.8311198085546494, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 539.609130859375, |
|
"epoch": 0.88, |
|
"grad_norm": 0.2857872247695923, |
|
"kl": 0.0481170654296875, |
|
"learning_rate": 1.9993798522846506e-06, |
|
"loss": 0.0487, |
|
"reward": 1.6721354514360427, |
|
"reward_std": 0.5311627298593521, |
|
"rewards/accuracy_reward": 0.06093750160653144, |
|
"rewards/format_reward": 0.7151041895151138, |
|
"rewards/tag_count_reward": 0.8960937678813934, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 541.1231971740723, |
|
"epoch": 0.96, |
|
"grad_norm": 0.273100882768631, |
|
"kl": 0.05677490234375, |
|
"learning_rate": 1.9975201783049802e-06, |
|
"loss": 0.0228, |
|
"reward": 1.768489620089531, |
|
"reward_std": 0.4786494873464108, |
|
"rewards/accuracy_reward": 0.052343751583248374, |
|
"rewards/format_reward": 0.7924479350447655, |
|
"rewards/tag_count_reward": 0.9236979350447655, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 534.5526145935058, |
|
"epoch": 1.048, |
|
"grad_norm": 0.23924335837364197, |
|
"kl": 0.053448486328125, |
|
"learning_rate": 1.994423284606128e-06, |
|
"loss": 0.0239, |
|
"reward": 1.8424479603767394, |
|
"reward_std": 0.4263153623789549, |
|
"rewards/accuracy_reward": 0.06171875204890966, |
|
"rewards/format_reward": 0.8372396007180214, |
|
"rewards/tag_count_reward": 0.9434895992279053, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 553.1002761840821, |
|
"epoch": 1.1280000000000001, |
|
"grad_norm": 0.24568906426429749, |
|
"kl": 0.0635986328125, |
|
"learning_rate": 1.990093012251199e-06, |
|
"loss": 0.0147, |
|
"reward": 1.8621745228767395, |
|
"reward_std": 0.37093897461891173, |
|
"rewards/accuracy_reward": 0.0471354179084301, |
|
"rewards/format_reward": 0.8682291880249977, |
|
"rewards/tag_count_reward": 0.9468099132180214, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 502.05652542114257, |
|
"epoch": 1.208, |
|
"grad_norm": 0.2926892638206482, |
|
"kl": 0.06356201171875, |
|
"learning_rate": 1.9845347320572078e-06, |
|
"loss": 0.0112, |
|
"reward": 1.8821615010499955, |
|
"reward_std": 0.3585973784327507, |
|
"rewards/accuracy_reward": 0.05546875158324838, |
|
"rewards/format_reward": 0.873697929084301, |
|
"rewards/tag_count_reward": 0.9529948100447655, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 495.64324111938475, |
|
"epoch": 1.288, |
|
"grad_norm": 0.24777106940746307, |
|
"kl": 0.0506591796875, |
|
"learning_rate": 1.977755337933682e-06, |
|
"loss": 0.0009, |
|
"reward": 1.8709635853767395, |
|
"reward_std": 0.3816621009260416, |
|
"rewards/accuracy_reward": 0.05234375139698386, |
|
"rewards/format_reward": 0.8700521066784859, |
|
"rewards/tag_count_reward": 0.9485677301883697, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 485.4924621582031, |
|
"epoch": 1.3679999999999999, |
|
"grad_norm": 0.24417904019355774, |
|
"kl": 0.057867431640625, |
|
"learning_rate": 1.9697632383321754e-06, |
|
"loss": 0.0134, |
|
"reward": 1.9161458700895309, |
|
"reward_std": 0.35734121650457384, |
|
"rewards/accuracy_reward": 0.06484375179279596, |
|
"rewards/format_reward": 0.8960937723517418, |
|
"rewards/tag_count_reward": 0.9552083522081375, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 500.1916786193848, |
|
"epoch": 1.448, |
|
"grad_norm": 0.23515592515468597, |
|
"kl": 0.0543365478515625, |
|
"learning_rate": 1.960568345817306e-06, |
|
"loss": 0.0188, |
|
"reward": 1.9377604484558106, |
|
"reward_std": 0.33337037824094296, |
|
"rewards/accuracy_reward": 0.07031250232830644, |
|
"rewards/format_reward": 0.9088541835546493, |
|
"rewards/tag_count_reward": 0.9585937723517418, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 495.4093902587891, |
|
"epoch": 1.528, |
|
"grad_norm": 0.26944634318351746, |
|
"kl": 0.048583984375, |
|
"learning_rate": 1.9501820647722456e-06, |
|
"loss": 0.0118, |
|
"reward": 1.9946615099906921, |
|
"reward_std": 0.29993436820805075, |
|
"rewards/accuracy_reward": 0.09088541890960186, |
|
"rewards/format_reward": 0.9322916865348816, |
|
"rewards/tag_count_reward": 0.9714843943715096, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.608, |
|
"grad_norm": 0.2572639584541321, |
|
"learning_rate": 1.938617277253916e-06, |
|
"loss": 0.0144, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.608, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 484.87662212665265, |
|
"eval_kl": 0.057382436899038464, |
|
"eval_loss": 0.015068341977894306, |
|
"eval_reward": 1.952323785194984, |
|
"eval_reward_std": 0.38110188108224136, |
|
"eval_rewards/accuracy_reward": 0.10897436236532834, |
|
"eval_rewards/format_reward": 0.89262822499642, |
|
"eval_rewards/tag_count_reward": 0.9507211676010718, |
|
"eval_runtime": 151.6564, |
|
"eval_samples_per_second": 0.653, |
|
"eval_steps_per_second": 0.02, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 506.50118865966795, |
|
"epoch": 1.688, |
|
"grad_norm": 0.2550075650215149, |
|
"kl": 0.05778656005859375, |
|
"learning_rate": 1.925888327015434e-06, |
|
"loss": 0.0138, |
|
"reward": 2.005696675181389, |
|
"reward_std": 0.35690731797367337, |
|
"rewards/accuracy_reward": 0.1282552123069763, |
|
"rewards/format_reward": 0.9125000208616256, |
|
"rewards/tag_count_reward": 0.9649414233863354, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 532.497412109375, |
|
"epoch": 1.768, |
|
"grad_norm": 0.22832101583480835, |
|
"kl": 0.106121826171875, |
|
"learning_rate": 1.9120110017156167e-06, |
|
"loss": 0.0192, |
|
"reward": 1.9944662123918533, |
|
"reward_std": 0.3744897425174713, |
|
"rewards/accuracy_reward": 0.13489583702757954, |
|
"rewards/format_reward": 0.9020833536982537, |
|
"rewards/tag_count_reward": 0.9574869960546494, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 505.0661560058594, |
|
"epoch": 1.8479999999999999, |
|
"grad_norm": 0.23704102635383606, |
|
"kl": 0.055731201171875, |
|
"learning_rate": 1.8970025133376251e-06, |
|
"loss": 0.0128, |
|
"reward": 2.009244841337204, |
|
"reward_std": 0.33148185834288596, |
|
"rewards/accuracy_reward": 0.12421875419095159, |
|
"rewards/format_reward": 0.9169271022081376, |
|
"rewards/tag_count_reward": 0.968098983168602, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 519.771109008789, |
|
"epoch": 1.928, |
|
"grad_norm": 0.2956863045692444, |
|
"kl": 0.060504150390625, |
|
"learning_rate": 1.8808814768410154e-06, |
|
"loss": 0.0325, |
|
"reward": 2.0304688274860383, |
|
"reward_std": 0.3420341990888119, |
|
"rewards/accuracy_reward": 0.13880208730697632, |
|
"rewards/format_reward": 0.9231771007180214, |
|
"rewards/tag_count_reward": 0.9684896007180214, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 512.1099105834961, |
|
"epoch": 2.016, |
|
"grad_norm": 0.22449135780334473, |
|
"kl": 0.06649169921875, |
|
"learning_rate": 1.8636678870736926e-06, |
|
"loss": 0.0198, |
|
"reward": 2.039778712391853, |
|
"reward_std": 0.3516721487045288, |
|
"rewards/accuracy_reward": 0.15156250335276128, |
|
"rewards/format_reward": 0.9226562693715096, |
|
"rewards/tag_count_reward": 0.9655599161982537, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 530.5299659729004, |
|
"epoch": 2.096, |
|
"grad_norm": 0.22710081934928894, |
|
"kl": 0.0750244140625, |
|
"learning_rate": 1.8453830939723912e-06, |
|
"loss": 0.0315, |
|
"reward": 2.042122468352318, |
|
"reward_std": 0.37393513284623625, |
|
"rewards/accuracy_reward": 0.16276042107492686, |
|
"rewards/format_reward": 0.9140625193715095, |
|
"rewards/tag_count_reward": 0.9652994975447655, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 549.8502754211426, |
|
"epoch": 2.176, |
|
"grad_norm": 0.28876593708992004, |
|
"kl": 0.09521484375, |
|
"learning_rate": 1.8260497760824456e-06, |
|
"loss": 0.0474, |
|
"reward": 2.0036458909511565, |
|
"reward_std": 0.3934386149048805, |
|
"rewards/accuracy_reward": 0.15026042023673652, |
|
"rewards/format_reward": 0.8989583522081375, |
|
"rewards/tag_count_reward": 0.9544271066784858, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 612.0536613464355, |
|
"epoch": 2.2560000000000002, |
|
"grad_norm": 0.8413172960281372, |
|
"kl": 0.14200439453125, |
|
"learning_rate": 1.8056919124296956e-06, |
|
"loss": 0.1363, |
|
"reward": 1.8713542193174362, |
|
"reward_std": 0.536376877874136, |
|
"rewards/accuracy_reward": 0.15989583814516664, |
|
"rewards/format_reward": 0.7997395992279053, |
|
"rewards/tag_count_reward": 0.911718773841858, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1079.364094543457, |
|
"epoch": 2.336, |
|
"grad_norm": 0.6317133903503418, |
|
"kl": 0.334130859375, |
|
"learning_rate": 1.784334752779408e-06, |
|
"loss": 0.28, |
|
"reward": 1.164257851243019, |
|
"reward_std": 0.6237203784286975, |
|
"rewards/accuracy_reward": 0.08359375211875886, |
|
"rewards/format_reward": 0.42317709475755694, |
|
"rewards/tag_count_reward": 0.6574869945645332, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1432.9927520751953, |
|
"epoch": 2.416, |
|
"grad_norm": 2.980980634689331, |
|
"kl": 0.5193359375, |
|
"learning_rate": 1.7620047863191097e-06, |
|
"loss": 0.3054, |
|
"reward": 0.794856795668602, |
|
"reward_std": 0.5762915194034577, |
|
"rewards/accuracy_reward": 0.042187501187436284, |
|
"rewards/format_reward": 0.2578125067055225, |
|
"rewards/tag_count_reward": 0.49485678449273107, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 579.490380859375, |
|
"epoch": 2.496, |
|
"grad_norm": 1.2213436365127563, |
|
"kl": 0.2888671875, |
|
"learning_rate": 1.7387297088041692e-06, |
|
"loss": 0.1924, |
|
"reward": 1.7270182639360427, |
|
"reward_std": 0.5595091491937637, |
|
"rewards/accuracy_reward": 0.1138020868645981, |
|
"rewards/format_reward": 0.7473958514630794, |
|
"rewards/tag_count_reward": 0.8658203348517418, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 401.75886688232424, |
|
"epoch": 2.576, |
|
"grad_norm": 0.8756619095802307, |
|
"kl": 0.21771240234375, |
|
"learning_rate": 1.7145383882068778e-06, |
|
"loss": 0.02, |
|
"reward": 1.8417318046092988, |
|
"reward_std": 0.4633196383714676, |
|
"rewards/accuracy_reward": 0.08697916942182929, |
|
"rewards/format_reward": 0.8268229380249977, |
|
"rewards/tag_count_reward": 0.9279297098517418, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 424.29662628173827, |
|
"epoch": 2.656, |
|
"grad_norm": 0.4502270221710205, |
|
"kl": 0.13651123046875, |
|
"learning_rate": 1.6894608289116341e-06, |
|
"loss": 0.0061, |
|
"reward": 1.8492838889360428, |
|
"reward_std": 0.44692824259400366, |
|
"rewards/accuracy_reward": 0.08515625311993062, |
|
"rewards/format_reward": 0.8325521051883698, |
|
"rewards/tag_count_reward": 0.9315755411982536, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 438.4726676940918, |
|
"epoch": 2.7359999999999998, |
|
"grad_norm": 0.37484192848205566, |
|
"kl": 0.14183349609375, |
|
"learning_rate": 1.663528134500646e-06, |
|
"loss": 0.0114, |
|
"reward": 1.8793620347976685, |
|
"reward_std": 0.43491987958550454, |
|
"rewards/accuracy_reward": 0.08671875244472176, |
|
"rewards/format_reward": 0.8562500238418579, |
|
"rewards/tag_count_reward": 0.9363932520151138, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 443.8067802429199, |
|
"epoch": 2.816, |
|
"grad_norm": 2.34854793548584, |
|
"kl": 0.1514892578125, |
|
"learning_rate": 1.6367724691762965e-06, |
|
"loss": 0.0216, |
|
"reward": 1.8856771260499954, |
|
"reward_std": 0.44159807488322256, |
|
"rewards/accuracy_reward": 0.09062500288709999, |
|
"rewards/format_reward": 0.8583333492279053, |
|
"rewards/tag_count_reward": 0.9367187693715096, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 428.94193878173826, |
|
"epoch": 2.896, |
|
"grad_norm": 0.9070159792900085, |
|
"kl": 0.166015625, |
|
"learning_rate": 1.6092270178680326e-06, |
|
"loss": 0.0118, |
|
"reward": 1.823046910762787, |
|
"reward_std": 0.49542137011885645, |
|
"rewards/accuracy_reward": 0.08932291937526315, |
|
"rewards/format_reward": 0.8166666850447655, |
|
"rewards/tag_count_reward": 0.9170573070645333, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 422.6744903564453, |
|
"epoch": 2.976, |
|
"grad_norm": 0.5818607211112976, |
|
"kl": 0.22315673828125, |
|
"learning_rate": 1.5809259450732493e-06, |
|
"loss": 0.0246, |
|
"reward": 1.8421875447034837, |
|
"reward_std": 0.48024289682507515, |
|
"rewards/accuracy_reward": 0.08046875253785402, |
|
"rewards/format_reward": 0.8382812723517418, |
|
"rewards/tag_count_reward": 0.9234375193715095, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 408.40235137939453, |
|
"epoch": 3.064, |
|
"grad_norm": 1.1170427799224854, |
|
"kl": 0.3676025390625, |
|
"learning_rate": 1.5519043524832167e-06, |
|
"loss": 0.0297, |
|
"reward": 1.8282552510499954, |
|
"reward_std": 0.49146031588315964, |
|
"rewards/accuracy_reward": 0.0791666692122817, |
|
"rewards/format_reward": 0.8320312708616256, |
|
"rewards/tag_count_reward": 0.9170573130249977, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 394.2843864440918, |
|
"epoch": 3.144, |
|
"grad_norm": 2.2475008964538574, |
|
"kl": 0.3608642578125, |
|
"learning_rate": 1.522198235446617e-06, |
|
"loss": 0.0145, |
|
"reward": 1.911328172683716, |
|
"reward_std": 0.43749314919114113, |
|
"rewards/accuracy_reward": 0.09192708560731262, |
|
"rewards/format_reward": 0.8776041835546493, |
|
"rewards/tag_count_reward": 0.9417968943715096, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 3.224, |
|
"grad_norm": 0.7201040387153625, |
|
"learning_rate": 1.4918444383246736e-06, |
|
"loss": 0.0284, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.224, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 421.5977736253005, |
|
"eval_kl": 0.96875, |
|
"eval_loss": 0.050301969051361084, |
|
"eval_reward": 1.9030449115313017, |
|
"eval_reward_std": 0.45874117200191206, |
|
"eval_rewards/accuracy_reward": 0.11217949052269642, |
|
"eval_rewards/format_reward": 0.8589743742576013, |
|
"eval_rewards/tag_count_reward": 0.9318910470375648, |
|
"eval_runtime": 143.8236, |
|
"eval_samples_per_second": 0.688, |
|
"eval_steps_per_second": 0.021, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 436.63086853027346, |
|
"epoch": 3.304, |
|
"grad_norm": 3.313720941543579, |
|
"kl": 0.51187744140625, |
|
"learning_rate": 1.4608806087932616e-06, |
|
"loss": 0.0457, |
|
"reward": 1.9524414613842964, |
|
"reward_std": 0.4343319511041045, |
|
"rewards/accuracy_reward": 0.13098958672489971, |
|
"rewards/format_reward": 0.8803385585546494, |
|
"rewards/tag_count_reward": 0.9411133043467999, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 436.59454345703125, |
|
"epoch": 3.384, |
|
"grad_norm": 3.354438066482544, |
|
"kl": 0.853369140625, |
|
"learning_rate": 1.4293451511486655e-06, |
|
"loss": 0.0496, |
|
"reward": 1.9438151478767396, |
|
"reward_std": 0.456201434135437, |
|
"rewards/accuracy_reward": 0.13255208693444728, |
|
"rewards/format_reward": 0.8734375208616256, |
|
"rewards/tag_count_reward": 0.9378255411982537, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 438.16745910644534, |
|
"epoch": 3.464, |
|
"grad_norm": 1.7152291536331177, |
|
"kl": 0.615625, |
|
"learning_rate": 1.3972771786749071e-06, |
|
"loss": 0.0382, |
|
"reward": 1.9890625536441804, |
|
"reward_std": 0.39320042356848717, |
|
"rewards/accuracy_reward": 0.13333333656191826, |
|
"rewards/format_reward": 0.9054687723517418, |
|
"rewards/tag_count_reward": 0.9502604335546494, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 438.5679801940918, |
|
"epoch": 3.544, |
|
"grad_norm": 3.926846504211426, |
|
"kl": 1.026171875, |
|
"learning_rate": 1.3647164651317176e-06, |
|
"loss": 0.0547, |
|
"reward": 1.9758463978767395, |
|
"reward_std": 0.4062122445553541, |
|
"rewards/accuracy_reward": 0.12291666923556477, |
|
"rewards/format_reward": 0.9041666865348816, |
|
"rewards/tag_count_reward": 0.9487630411982536, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 444.0872512817383, |
|
"epoch": 3.624, |
|
"grad_norm": 3.1936042308807373, |
|
"kl": 0.9034423828125, |
|
"learning_rate": 1.3317033954233242e-06, |
|
"loss": 0.048, |
|
"reward": 1.9549479603767395, |
|
"reward_std": 0.38959153592586515, |
|
"rewards/accuracy_reward": 0.1049479195382446, |
|
"rewards/format_reward": 0.9028646066784859, |
|
"rewards/tag_count_reward": 0.9471354380249977, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 443.2856895446777, |
|
"epoch": 3.7039999999999997, |
|
"grad_norm": 3.45194149017334, |
|
"kl": 3.086962890625, |
|
"learning_rate": 1.2982789155092406e-06, |
|
"loss": 0.1277, |
|
"reward": 1.8746094167232514, |
|
"reward_std": 0.5123490341007709, |
|
"rewards/accuracy_reward": 0.10755208698101341, |
|
"rewards/format_reward": 0.8559895992279053, |
|
"rewards/tag_count_reward": 0.9110677286982536, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 459.6276145935059, |
|
"epoch": 3.784, |
|
"grad_norm": 2.9876277446746826, |
|
"kl": 1.5169921875, |
|
"learning_rate": 1.264484481619177e-06, |
|
"loss": 0.0562, |
|
"reward": 1.8753906697034837, |
|
"reward_std": 0.47588179595768454, |
|
"rewards/accuracy_reward": 0.08463541921228171, |
|
"rewards/format_reward": 0.8697916865348816, |
|
"rewards/tag_count_reward": 0.9209635615348816, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 453.0531379699707, |
|
"epoch": 3.864, |
|
"grad_norm": 2.683588981628418, |
|
"kl": 1.91220703125, |
|
"learning_rate": 1.2303620088350698e-06, |
|
"loss": 0.0736, |
|
"reward": 1.8893880665302276, |
|
"reward_std": 0.4542847402393818, |
|
"rewards/accuracy_reward": 0.09505208649206906, |
|
"rewards/format_reward": 0.8731771022081375, |
|
"rewards/tag_count_reward": 0.9211588725447655, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 476.33881454467775, |
|
"epoch": 3.944, |
|
"grad_norm": 1.4992015361785889, |
|
"kl": 1.704296875, |
|
"learning_rate": 1.1959538191039984e-06, |
|
"loss": 0.0533, |
|
"reward": 1.9165365010499955, |
|
"reward_std": 0.44639485478401186, |
|
"rewards/accuracy_reward": 0.10234375309664756, |
|
"rewards/format_reward": 0.88619794100523, |
|
"rewards/tag_count_reward": 0.9279948130249978, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 459.0494895935059, |
|
"epoch": 4.032, |
|
"grad_norm": 2.175201416015625, |
|
"kl": 1.41162109375, |
|
"learning_rate": 1.161302588746464e-06, |
|
"loss": 0.0501, |
|
"reward": 1.9514974415302277, |
|
"reward_std": 0.38656867146492, |
|
"rewards/accuracy_reward": 0.09531250256113707, |
|
"rewards/format_reward": 0.9125000149011612, |
|
"rewards/tag_count_reward": 0.9436849191784858, |
|
"step": 250 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 456.7244903564453, |
|
"epoch": 4.112, |
|
"grad_norm": 2.2169039249420166, |
|
"kl": 1.56513671875, |
|
"learning_rate": 1.1264512955251477e-06, |
|
"loss": 0.0673, |
|
"reward": 1.9486979573965073, |
|
"reward_std": 0.3977412488311529, |
|
"rewards/accuracy_reward": 0.0968750029336661, |
|
"rewards/format_reward": 0.9091146051883697, |
|
"rewards/tag_count_reward": 0.9427083522081375, |
|
"step": 255 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 438.6416763305664, |
|
"epoch": 4.192, |
|
"grad_norm": 1.7040432691574097, |
|
"kl": 1.164892578125, |
|
"learning_rate": 1.0914431653397856e-06, |
|
"loss": 0.0609, |
|
"reward": 1.9953776597976685, |
|
"reward_std": 0.3568487804383039, |
|
"rewards/accuracy_reward": 0.10859375339932739, |
|
"rewards/format_reward": 0.93151044100523, |
|
"rewards/tag_count_reward": 0.9552734643220901, |
|
"step": 260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 421.86694107055666, |
|
"epoch": 4.272, |
|
"grad_norm": 1.1802079677581787, |
|
"kl": 1.281884765625, |
|
"learning_rate": 1.0563216186142838e-06, |
|
"loss": 0.0606, |
|
"reward": 1.9884766191244125, |
|
"reward_std": 0.38088383749127386, |
|
"rewards/accuracy_reward": 0.11223958663176745, |
|
"rewards/format_reward": 0.9242187678813935, |
|
"rewards/tag_count_reward": 0.9520182505249977, |
|
"step": 265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 450.229695892334, |
|
"epoch": 4.352, |
|
"grad_norm": 0.8549647927284241, |
|
"kl": 1.15458984375, |
|
"learning_rate": 1.0211302164425654e-06, |
|
"loss": 0.0333, |
|
"reward": 2.0100261092185976, |
|
"reward_std": 0.39018577747046945, |
|
"rewards/accuracy_reward": 0.13671875335276126, |
|
"rewards/format_reward": 0.9205729365348816, |
|
"rewards/tag_count_reward": 0.952734398841858, |
|
"step": 270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 473.3450653076172, |
|
"epoch": 4.432, |
|
"grad_norm": 1.360452651977539, |
|
"kl": 0.996142578125, |
|
"learning_rate": 9.859126065599434e-07, |
|
"loss": 0.0378, |
|
"reward": 1.9733073562383652, |
|
"reward_std": 0.361860204860568, |
|
"rewards/accuracy_reward": 0.10338541967794299, |
|
"rewards/format_reward": 0.9190104350447654, |
|
"rewards/tag_count_reward": 0.9509114801883698, |
|
"step": 275 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 456.2471473693848, |
|
"epoch": 4.5120000000000005, |
|
"grad_norm": 1.2597142457962036, |
|
"kl": 1.180615234375, |
|
"learning_rate": 9.507124692070355e-07, |
|
"loss": 0.0429, |
|
"reward": 2.0083985060453413, |
|
"reward_std": 0.4000336788594723, |
|
"rewards/accuracy_reward": 0.13541667046956718, |
|
"rewards/format_reward": 0.9218750208616256, |
|
"rewards/tag_count_reward": 0.9511067897081376, |
|
"step": 280 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 465.9554801940918, |
|
"epoch": 4.592, |
|
"grad_norm": 1.3279640674591064, |
|
"kl": 0.97479248046875, |
|
"learning_rate": 9.15573462953361e-07, |
|
"loss": 0.0412, |
|
"reward": 2.039453184604645, |
|
"reward_std": 0.3459701970219612, |
|
"rewards/accuracy_reward": 0.13671875298023223, |
|
"rewards/format_reward": 0.9406250178813934, |
|
"rewards/tag_count_reward": 0.9621093913912773, |
|
"step": 285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 487.58568954467773, |
|
"epoch": 4.672, |
|
"grad_norm": 1.1848392486572266, |
|
"kl": 0.62801513671875, |
|
"learning_rate": 8.805391705478147e-07, |
|
"loss": 0.0341, |
|
"reward": 2.0368490278720857, |
|
"reward_std": 0.34789668396115303, |
|
"rewards/accuracy_reward": 0.14088542181998492, |
|
"rewards/format_reward": 0.93697919100523, |
|
"rewards/tag_count_reward": 0.958984388411045, |
|
"step": 290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 462.78100204467773, |
|
"epoch": 4.752, |
|
"grad_norm": 1.9356491565704346, |
|
"kl": 0.9264404296875, |
|
"learning_rate": 8.456530448631855e-07, |
|
"loss": 0.0383, |
|
"reward": 1.9699219435453414, |
|
"reward_std": 0.4028118785470724, |
|
"rewards/accuracy_reward": 0.12838542023673655, |
|
"rewards/format_reward": 0.9046875193715096, |
|
"rewards/tag_count_reward": 0.9368489772081375, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 4.832, |
|
"grad_norm": 1.8601847887039185, |
|
"learning_rate": 8.109583550017549e-07, |
|
"loss": 0.0298, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.832, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 448.06251408503607, |
|
"eval_kl": 0.5677208533653846, |
|
"eval_loss": 0.0052544595673680305, |
|
"eval_reward": 2.0404648138926578, |
|
"eval_reward_std": 0.3293027098362262, |
|
"eval_rewards/accuracy_reward": 0.1250000040118511, |
|
"eval_rewards/format_reward": 0.9503205281037551, |
|
"eval_rewards/tag_count_reward": 0.9651442399391761, |
|
"eval_runtime": 140.3516, |
|
"eval_samples_per_second": 0.705, |
|
"eval_steps_per_second": 0.021, |
|
"step": 300 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 473.3700660705566, |
|
"epoch": 4.912, |
|
"grad_norm": 1.6324626207351685, |
|
"kl": 0.8969482421875, |
|
"learning_rate": 7.764981326288272e-07, |
|
"loss": 0.0462, |
|
"reward": 2.001009176671505, |
|
"reward_std": 0.38462002836167813, |
|
"rewards/accuracy_reward": 0.1316406281432137, |
|
"rewards/format_reward": 0.9210937708616257, |
|
"rewards/tag_count_reward": 0.9482747592031956, |
|
"step": 305 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 476.5562614440918, |
|
"epoch": 4.992, |
|
"grad_norm": 1.3346195220947266, |
|
"kl": 0.7723388671875, |
|
"learning_rate": 7.423151186007526e-07, |
|
"loss": 0.0428, |
|
"reward": 2.023112043738365, |
|
"reward_std": 0.3479595385491848, |
|
"rewards/accuracy_reward": 0.12942708758637309, |
|
"rewards/format_reward": 0.9343750178813934, |
|
"rewards/tag_count_reward": 0.9593099161982537, |
|
"step": 310 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 474.70105209350584, |
|
"epoch": 5.08, |
|
"grad_norm": 3.4773247241973877, |
|
"kl": 1.13505859375, |
|
"learning_rate": 7.084517099536377e-07, |
|
"loss": 0.0536, |
|
"reward": 2.0288412243127825, |
|
"reward_std": 0.37681432962417605, |
|
"rewards/accuracy_reward": 0.1414062537252903, |
|
"rewards/format_reward": 0.9312500193715095, |
|
"rewards/tag_count_reward": 0.9561849161982536, |
|
"step": 315 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 449.5380317687988, |
|
"epoch": 5.16, |
|
"grad_norm": 3.157280683517456, |
|
"kl": 0.80751953125, |
|
"learning_rate": 6.749499073184957e-07, |
|
"loss": 0.0222, |
|
"reward": 2.015625074505806, |
|
"reward_std": 0.36971147619187833, |
|
"rewards/accuracy_reward": 0.13515625351574273, |
|
"rewards/format_reward": 0.9281250193715096, |
|
"rewards/tag_count_reward": 0.9523437649011612, |
|
"step": 320 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 453.49766845703124, |
|
"epoch": 5.24, |
|
"grad_norm": 2.6990413665771484, |
|
"kl": 1.14775390625, |
|
"learning_rate": 6.418512628280544e-07, |
|
"loss": 0.0459, |
|
"reward": 2.000455787777901, |
|
"reward_std": 0.3810987573117018, |
|
"rewards/accuracy_reward": 0.1320312530733645, |
|
"rewards/format_reward": 0.91979169100523, |
|
"rewards/tag_count_reward": 0.9486328348517418, |
|
"step": 325 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 438.2487075805664, |
|
"epoch": 5.32, |
|
"grad_norm": 14.02304744720459, |
|
"kl": 1.029150390625, |
|
"learning_rate": 6.091968285798378e-07, |
|
"loss": 0.0235, |
|
"reward": 1.9692057698965073, |
|
"reward_std": 0.44279313534498216, |
|
"rewards/accuracy_reward": 0.13697916995733977, |
|
"rewards/format_reward": 0.9005208522081375, |
|
"rewards/tag_count_reward": 0.93170575350523, |
|
"step": 330 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 442.1963684082031, |
|
"epoch": 5.4, |
|
"grad_norm": 2.4285733699798584, |
|
"kl": 0.85400390625, |
|
"learning_rate": 5.770271057194369e-07, |
|
"loss": 0.0206, |
|
"reward": 1.9694662004709245, |
|
"reward_std": 0.4486567251384258, |
|
"rewards/accuracy_reward": 0.14244791967794299, |
|
"rewards/format_reward": 0.8971354350447655, |
|
"rewards/tag_count_reward": 0.9298828288912773, |
|
"step": 335 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 434.9536582946777, |
|
"epoch": 5.48, |
|
"grad_norm": 1.456663966178894, |
|
"kl": 1.02734375, |
|
"learning_rate": 5.453819942071211e-07, |
|
"loss": 0.0171, |
|
"reward": 1.986132875084877, |
|
"reward_std": 0.45432515181601046, |
|
"rewards/accuracy_reward": 0.15182292135432363, |
|
"rewards/format_reward": 0.9010416820645333, |
|
"rewards/tag_count_reward": 0.9332682505249977, |
|
"step": 340 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 440.12631301879884, |
|
"epoch": 5.5600000000000005, |
|
"grad_norm": 6.417184352874756, |
|
"kl": 1.1162109375, |
|
"learning_rate": 5.143007433301034e-07, |
|
"loss": 0.0256, |
|
"reward": 1.994401106238365, |
|
"reward_std": 0.42431050203740595, |
|
"rewards/accuracy_reward": 0.1507812541909516, |
|
"rewards/format_reward": 0.9062500223517418, |
|
"rewards/tag_count_reward": 0.9373698100447655, |
|
"step": 345 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 435.890633392334, |
|
"epoch": 5.64, |
|
"grad_norm": 1.4459128379821777, |
|
"kl": 1.022119140625, |
|
"learning_rate": 4.838219030218274e-07, |
|
"loss": 0.0165, |
|
"reward": 2.000455787777901, |
|
"reward_std": 0.441185887157917, |
|
"rewards/accuracy_reward": 0.15000000391155482, |
|
"rewards/format_reward": 0.9106771036982536, |
|
"rewards/tag_count_reward": 0.9397786661982537, |
|
"step": 350 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 454.38621215820314, |
|
"epoch": 5.72, |
|
"grad_norm": 3.449855327606201, |
|
"kl": 2.3652099609375, |
|
"learning_rate": 4.5398327604866047e-07, |
|
"loss": 0.1717, |
|
"reward": 2.000716209411621, |
|
"reward_std": 0.4119129028171301, |
|
"rewards/accuracy_reward": 0.13723958674818276, |
|
"rewards/format_reward": 0.9182291850447655, |
|
"rewards/tag_count_reward": 0.9452474132180214, |
|
"step": 355 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 465.8750114440918, |
|
"epoch": 5.8, |
|
"grad_norm": 2.101736545562744, |
|
"kl": 1.1593994140625, |
|
"learning_rate": 4.2482187112329515e-07, |
|
"loss": 0.0262, |
|
"reward": 2.005924531817436, |
|
"reward_std": 0.40926145724952223, |
|
"rewards/accuracy_reward": 0.14505208674818276, |
|
"rewards/format_reward": 0.9166666850447655, |
|
"rewards/tag_count_reward": 0.9442057460546494, |
|
"step": 360 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 431.95730514526366, |
|
"epoch": 5.88, |
|
"grad_norm": 1.7132809162139893, |
|
"kl": 0.95352783203125, |
|
"learning_rate": 3.963738570030134e-07, |
|
"loss": 0.0317, |
|
"reward": 2.0475261181592943, |
|
"reward_std": 0.38608854487538335, |
|
"rewards/accuracy_reward": 0.1666666718199849, |
|
"rewards/format_reward": 0.9283854395151139, |
|
"rewards/tag_count_reward": 0.9524739846587181, |
|
"step": 365 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 446.39740829467775, |
|
"epoch": 5.96, |
|
"grad_norm": 1.1850680112838745, |
|
"kl": 0.68291015625, |
|
"learning_rate": 3.686745176297411e-07, |
|
"loss": 0.0102, |
|
"reward": 2.046158942580223, |
|
"reward_std": 0.3435225561261177, |
|
"rewards/accuracy_reward": 0.14583333767950535, |
|
"rewards/format_reward": 0.9406250163912773, |
|
"rewards/tag_count_reward": 0.9597005382180214, |
|
"step": 370 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 435.18698806762694, |
|
"epoch": 6.048, |
|
"grad_norm": 1.8223094940185547, |
|
"kl": 0.9990966796875, |
|
"learning_rate": 3.4175820836753646e-07, |
|
"loss": 0.0236, |
|
"reward": 2.0619141340255736, |
|
"reward_std": 0.34180368296802044, |
|
"rewards/accuracy_reward": 0.1572916701436043, |
|
"rewards/format_reward": 0.9429687693715095, |
|
"rewards/tag_count_reward": 0.9616536617279052, |
|
"step": 375 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 453.92188491821287, |
|
"epoch": 6.128, |
|
"grad_norm": 1.4606250524520874, |
|
"kl": 1.0483154296875, |
|
"learning_rate": 3.156583133917884e-07, |
|
"loss": 0.0246, |
|
"reward": 2.0753255993127824, |
|
"reward_std": 0.3737676966935396, |
|
"rewards/accuracy_reward": 0.1804687550291419, |
|
"rewards/format_reward": 0.9361979395151139, |
|
"rewards/tag_count_reward": 0.9586588710546493, |
|
"step": 380 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 443.9789176940918, |
|
"epoch": 6.208, |
|
"grad_norm": 1.4942998886108398, |
|
"kl": 0.8615966796875, |
|
"learning_rate": 2.904072042829775e-07, |
|
"loss": 0.0161, |
|
"reward": 2.043815162777901, |
|
"reward_std": 0.3786072336137295, |
|
"rewards/accuracy_reward": 0.15390625409781933, |
|
"rewards/format_reward": 0.9343750208616257, |
|
"rewards/tag_count_reward": 0.9555338725447655, |
|
"step": 385 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 448.456778717041, |
|
"epoch": 6.288, |
|
"grad_norm": 3.1579020023345947, |
|
"kl": 0.98740234375, |
|
"learning_rate": 2.660361998763508e-07, |
|
"loss": 0.0237, |
|
"reward": 2.04114590883255, |
|
"reward_std": 0.3731150545179844, |
|
"rewards/accuracy_reward": 0.15442708749324083, |
|
"rewards/format_reward": 0.9322916850447655, |
|
"rewards/tag_count_reward": 0.9544271036982537, |
|
"step": 390 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 449.2880340576172, |
|
"epoch": 6.368, |
|
"grad_norm": 1.9755905866622925, |
|
"kl": 0.92861328125, |
|
"learning_rate": 2.425755274173159e-07, |
|
"loss": 0.0252, |
|
"reward": 2.0747396677732466, |
|
"reward_std": 0.39273047521710397, |
|
"rewards/accuracy_reward": 0.18593750540167092, |
|
"rewards/format_reward": 0.9341146007180214, |
|
"rewards/tag_count_reward": 0.9546875223517418, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 6.448, |
|
"grad_norm": 1.7979745864868164, |
|
"learning_rate": 2.2005428507072465e-07, |
|
"loss": 0.0208, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 6.448, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 427.849369929387, |
|
"eval_kl": 0.7789963942307693, |
|
"eval_loss": 0.026709111407399178, |
|
"eval_reward": 2.0749199298711924, |
|
"eval_reward_std": 0.36127966069258177, |
|
"eval_rewards/accuracy_reward": 0.17147436250860876, |
|
"eval_rewards/format_reward": 0.9407051434883704, |
|
"eval_rewards/tag_count_reward": 0.9627403983703027, |
|
"eval_runtime": 137.2381, |
|
"eval_samples_per_second": 0.721, |
|
"eval_steps_per_second": 0.022, |
|
"step": 400 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 450.2595161437988, |
|
"epoch": 6.5280000000000005, |
|
"grad_norm": 3.3182427883148193, |
|
"kl": 0.9768798828125, |
|
"learning_rate": 1.985004058305535e-07, |
|
"loss": 0.0313, |
|
"reward": 2.052799554169178, |
|
"reward_std": 0.38064534645527603, |
|
"rewards/accuracy_reward": 0.16210937928408384, |
|
"rewards/format_reward": 0.9345052309334279, |
|
"rewards/tag_count_reward": 0.9561849169433116, |
|
"step": 405 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 448.1330825805664, |
|
"epoch": 6.608, |
|
"grad_norm": 3.9416732788085938, |
|
"kl": 1.0939453125, |
|
"learning_rate": 1.7794062287473733e-07, |
|
"loss": 0.0289, |
|
"reward": 2.0518229812383653, |
|
"reward_std": 0.35645177885890006, |
|
"rewards/accuracy_reward": 0.15468750335276127, |
|
"rewards/format_reward": 0.9377604350447655, |
|
"rewards/tag_count_reward": 0.9593750149011612, |
|
"step": 410 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 436.55521926879885, |
|
"epoch": 6.688, |
|
"grad_norm": 2.4022459983825684, |
|
"kl": 1.0167724609375, |
|
"learning_rate": 1.5840043640813272e-07, |
|
"loss": 0.0213, |
|
"reward": 2.047461000084877, |
|
"reward_std": 0.3820289634168148, |
|
"rewards/accuracy_reward": 0.16432292014360428, |
|
"rewards/format_reward": 0.9296875223517418, |
|
"rewards/tag_count_reward": 0.9534505411982537, |
|
"step": 415 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 449.48621063232423, |
|
"epoch": 6.768, |
|
"grad_norm": 2.1802432537078857, |
|
"kl": 0.945361328125, |
|
"learning_rate": 1.3990408203472937e-07, |
|
"loss": 0.0146, |
|
"reward": 2.051237052679062, |
|
"reward_std": 0.3935232628136873, |
|
"rewards/accuracy_reward": 0.16536458812188357, |
|
"rewards/format_reward": 0.932031263411045, |
|
"rewards/tag_count_reward": 0.9538411647081375, |
|
"step": 420 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 448.03881378173827, |
|
"epoch": 6.848, |
|
"grad_norm": 1.2726682424545288, |
|
"kl": 1.0002685546875, |
|
"learning_rate": 1.2247450069834076e-07, |
|
"loss": 0.0156, |
|
"reward": 2.064778706431389, |
|
"reward_std": 0.3714164044708014, |
|
"rewards/accuracy_reward": 0.16979167144745588, |
|
"rewards/format_reward": 0.9377604365348816, |
|
"rewards/tag_count_reward": 0.9572265848517418, |
|
"step": 425 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 447.07500915527345, |
|
"epoch": 6.928, |
|
"grad_norm": 2.3944952487945557, |
|
"kl": 0.728662109375, |
|
"learning_rate": 1.0613331022905758e-07, |
|
"loss": 0.0221, |
|
"reward": 2.0807943403720857, |
|
"reward_std": 0.35667028427124026, |
|
"rewards/accuracy_reward": 0.17161458870396018, |
|
"rewards/format_reward": 0.9460937738418579, |
|
"rewards/tag_count_reward": 0.9630859613418579, |
|
"step": 430 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1584.4791870117188, |
|
"epoch": 7.016, |
|
"grad_norm": 3.8696494102478027, |
|
"kl": 0.512451171875, |
|
"learning_rate": 9.090077853075118e-08, |
|
"loss": 0.3338, |
|
"reward": 0.6214192882180214, |
|
"reward_std": 0.8211026787757874, |
|
"rewards/accuracy_reward": 0.06380208488553762, |
|
"rewards/format_reward": 0.25130208767950535, |
|
"rewards/tag_count_reward": 0.3063151128590107, |
|
"step": 435 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1574.7263488769531, |
|
"epoch": 7.096, |
|
"grad_norm": 5.592410087585449, |
|
"kl": 0.470166015625, |
|
"learning_rate": 7.679579844288509e-08, |
|
"loss": 0.3414, |
|
"reward": 0.6222656443715096, |
|
"reward_std": 0.8386320501565934, |
|
"rewards/accuracy_reward": 0.06223958514165133, |
|
"rewards/format_reward": 0.2518229246139526, |
|
"rewards/tag_count_reward": 0.3082031346857548, |
|
"step": 440 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1548.7713928222656, |
|
"epoch": 7.176, |
|
"grad_norm": 6.609188079833984, |
|
"kl": 0.45009765625, |
|
"learning_rate": 6.383586430781196e-08, |
|
"loss": 0.3274, |
|
"reward": 0.6048828318715096, |
|
"reward_std": 0.8250703603029251, |
|
"rewards/accuracy_reward": 0.06588541865348815, |
|
"rewards/format_reward": 0.23958334065973758, |
|
"rewards/tag_count_reward": 0.2994140714406967, |
|
"step": 445 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1514.1794677734374, |
|
"epoch": 7.256, |
|
"grad_norm": 9.236469268798828, |
|
"kl": 0.4421875, |
|
"learning_rate": 5.203705027262184e-08, |
|
"loss": 0.3091, |
|
"reward": 0.6100911624729634, |
|
"reward_std": 0.7998277023434639, |
|
"rewards/accuracy_reward": 0.06171875186264515, |
|
"rewards/format_reward": 0.24322917498648167, |
|
"rewards/tag_count_reward": 0.3051432378590107, |
|
"step": 450 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1422.5672241210937, |
|
"epoch": 7.336, |
|
"grad_norm": 14.974526405334473, |
|
"kl": 0.447216796875, |
|
"learning_rate": 4.141399035245052e-08, |
|
"loss": 0.332, |
|
"reward": 0.6535156451165676, |
|
"reward_std": 0.8434477031230927, |
|
"rewards/accuracy_reward": 0.06458333530463278, |
|
"rewards/format_reward": 0.2614583432674408, |
|
"rewards/tag_count_reward": 0.3274739678949118, |
|
"step": 455 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1339.1979614257812, |
|
"epoch": 7.416, |
|
"grad_norm": 19.16950035095215, |
|
"kl": 0.4712890625, |
|
"learning_rate": 3.1979860279976564e-08, |
|
"loss": 0.3387, |
|
"reward": 0.710091170668602, |
|
"reward_std": 0.8780084311962127, |
|
"rewards/accuracy_reward": 0.07421875204890967, |
|
"rewards/format_reward": 0.28307292610406876, |
|
"rewards/tag_count_reward": 0.3527994900941849, |
|
"step": 460 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1225.218276977539, |
|
"epoch": 7.496, |
|
"grad_norm": 22.52800750732422, |
|
"kl": 0.523876953125, |
|
"learning_rate": 2.374636116362172e-08, |
|
"loss": 0.3504, |
|
"reward": 0.7427083536982536, |
|
"reward_std": 0.8980668410658836, |
|
"rewards/accuracy_reward": 0.07317708597984166, |
|
"rewards/format_reward": 0.30130209159106014, |
|
"rewards/tag_count_reward": 0.36822917237877845, |
|
"step": 465 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1114.6648742675782, |
|
"epoch": 7.576, |
|
"grad_norm": 25.05874252319336, |
|
"kl": 0.5748046875, |
|
"learning_rate": 1.6723704974718757e-08, |
|
"loss": 0.3561, |
|
"reward": 0.7968099161982536, |
|
"reward_std": 0.9075167685747146, |
|
"rewards/accuracy_reward": 0.08072916930541396, |
|
"rewards/format_reward": 0.3229166749864817, |
|
"rewards/tag_count_reward": 0.3931640740483999, |
|
"step": 470 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1007.0466384887695, |
|
"epoch": 7.656, |
|
"grad_norm": 24.392803192138672, |
|
"kl": 0.617919921875, |
|
"learning_rate": 1.0920601881650005e-08, |
|
"loss": 0.3497, |
|
"reward": 0.8455729380249977, |
|
"reward_std": 0.9318716049194335, |
|
"rewards/accuracy_reward": 0.08203125311993062, |
|
"rewards/format_reward": 0.3481770932674408, |
|
"rewards/tag_count_reward": 0.41536459475755694, |
|
"step": 475 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 946.3583541870117, |
|
"epoch": 7.736, |
|
"grad_norm": 22.616287231445312, |
|
"kl": 0.68662109375, |
|
"learning_rate": 6.344249446665673e-09, |
|
"loss": 0.3505, |
|
"reward": 0.8694010645151138, |
|
"reward_std": 0.9415230333805085, |
|
"rewards/accuracy_reward": 0.0838541688863188, |
|
"rewards/format_reward": 0.3583333432674408, |
|
"rewards/tag_count_reward": 0.42721355259418486, |
|
"step": 480 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 865.5632995605469, |
|
"epoch": 7.816, |
|
"grad_norm": 24.148319244384766, |
|
"kl": 0.79765625, |
|
"learning_rate": 3.0003236987802272e-09, |
|
"loss": 0.3502, |
|
"reward": 0.9250651299953461, |
|
"reward_std": 0.9858234718441963, |
|
"rewards/accuracy_reward": 0.09817708579357713, |
|
"rewards/format_reward": 0.3794270932674408, |
|
"rewards/tag_count_reward": 0.4474609471857548, |
|
"step": 485 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 807.2963745117188, |
|
"epoch": 7.896, |
|
"grad_norm": 19.522138595581055, |
|
"kl": 0.9572265625, |
|
"learning_rate": 8.929720938193331e-10, |
|
"loss": 0.3453, |
|
"reward": 0.9965495049953461, |
|
"reward_std": 0.9948423892259598, |
|
"rewards/accuracy_reward": 0.10338541963137686, |
|
"rewards/format_reward": 0.41250001043081286, |
|
"rewards/tag_count_reward": 0.4806640759110451, |
|
"step": 490 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 747.9367416381835, |
|
"epoch": 7.976, |
|
"grad_norm": 17.482969284057617, |
|
"kl": 1.2998046875, |
|
"learning_rate": 2.4808370349460595e-11, |
|
"loss": 0.3647, |
|
"reward": 1.0485026329755782, |
|
"reward_std": 1.0057428479194641, |
|
"rewards/accuracy_reward": 0.10468750277068466, |
|
"rewards/format_reward": 0.43854167610406875, |
|
"rewards/tag_count_reward": 0.5052734553813935, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 7.992, |
|
"step": 496, |
|
"total_flos": 0.0, |
|
"train_loss": 0.0, |
|
"train_runtime": 1.7643, |
|
"train_samples_per_second": 27205.631, |
|
"train_steps_per_second": 210.844 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 372, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 12, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|