|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.08, |
|
"eval_steps": 500, |
|
"global_step": 450, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 492.29167556762695, |
|
"epoch": 0.00017777777777777779, |
|
"grad_norm": 0.08746972070242835, |
|
"kl": 0.0, |
|
"learning_rate": 7.142857142857142e-08, |
|
"loss": 0.0, |
|
"reward": 0.02083333395421505, |
|
"reward_std": 0.05103103443980217, |
|
"rewards/equation_reward_func": 0.02083333395421505, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 488.62500762939453, |
|
"epoch": 0.00035555555555555557, |
|
"grad_norm": 0.18281958866590367, |
|
"kl": 0.0, |
|
"learning_rate": 1.4285714285714285e-07, |
|
"loss": 0.0, |
|
"reward": 0.06250000186264515, |
|
"reward_std": 0.1530931070446968, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 481.8958396911621, |
|
"epoch": 0.0005333333333333334, |
|
"grad_norm": 0.18273061607712965, |
|
"kl": 0.00029015541076660156, |
|
"learning_rate": 2.1428571428571426e-07, |
|
"loss": 0.0, |
|
"reward": 0.0833333358168602, |
|
"reward_std": 0.20412413775920868, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.02083333395421505, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 477.2708396911621, |
|
"epoch": 0.0007111111111111111, |
|
"grad_norm": 0.1403227179482258, |
|
"kl": 0.00024211406707763672, |
|
"learning_rate": 2.857142857142857e-07, |
|
"loss": 0.0, |
|
"reward": 0.0416666679084301, |
|
"reward_std": 0.10206207260489464, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 485.31250381469727, |
|
"epoch": 0.0008888888888888889, |
|
"grad_norm": 0.13329145692597189, |
|
"kl": 0.0002715587615966797, |
|
"learning_rate": 3.5714285714285716e-07, |
|
"loss": 0.0, |
|
"reward": 0.0833333358168602, |
|
"reward_std": 0.16661180555820465, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.02083333395421505, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 485.3958396911621, |
|
"epoch": 0.0010666666666666667, |
|
"grad_norm": 0.1366397496686443, |
|
"kl": 0.0002875328063964844, |
|
"learning_rate": 4.285714285714285e-07, |
|
"loss": 0.0, |
|
"reward": 0.0416666679084301, |
|
"reward_std": 0.10206207260489464, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 489.83333587646484, |
|
"epoch": 0.0012444444444444445, |
|
"grad_norm": 0.0015282362483640434, |
|
"kl": 0.0002818107604980469, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/equation_reward_func": 0.0, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 478.1041717529297, |
|
"epoch": 0.0014222222222222223, |
|
"grad_norm": 0.5082362557559295, |
|
"kl": 0.0039789676666259766, |
|
"learning_rate": 5.714285714285714e-07, |
|
"loss": 0.0002, |
|
"reward": 0.02083333395421505, |
|
"reward_std": 0.05103103816509247, |
|
"rewards/equation_reward_func": 0.02083333395421505, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 498.25000381469727, |
|
"epoch": 0.0016, |
|
"grad_norm": 0.1284152446682543, |
|
"kl": 0.00029969215393066406, |
|
"learning_rate": 6.428571428571429e-07, |
|
"loss": 0.0, |
|
"reward": 0.0416666679084301, |
|
"reward_std": 0.10206207260489464, |
|
"rewards/equation_reward_func": 0.02083333395421505, |
|
"rewards/format_reward_func": 0.02083333395421505, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 493.75000762939453, |
|
"epoch": 0.0017777777777777779, |
|
"grad_norm": 0.003296653133727949, |
|
"kl": 0.0004258155822753906, |
|
"learning_rate": 7.142857142857143e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/equation_reward_func": 0.0, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 479.81250381469727, |
|
"epoch": 0.0019555555555555554, |
|
"grad_norm": 0.15068988372298686, |
|
"kl": 0.0005173683166503906, |
|
"learning_rate": 7.857142857142856e-07, |
|
"loss": 0.0, |
|
"reward": 0.06250000186264515, |
|
"reward_std": 0.11558076366782188, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 472.79167556762695, |
|
"epoch": 0.0021333333333333334, |
|
"grad_norm": 0.004443001493268414, |
|
"kl": 0.0008454322814941406, |
|
"learning_rate": 8.57142857142857e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/equation_reward_func": 0.0, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 477.2708435058594, |
|
"epoch": 0.002311111111111111, |
|
"grad_norm": 0.17120574292118212, |
|
"kl": 0.0010666847229003906, |
|
"learning_rate": 9.285714285714285e-07, |
|
"loss": 0.0, |
|
"reward": 0.0416666679084301, |
|
"reward_std": 0.10206206887960434, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 487.87500381469727, |
|
"epoch": 0.002488888888888889, |
|
"grad_norm": 0.08858314482153737, |
|
"kl": 0.0020389556884765625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0001, |
|
"reward": 0.02083333395421505, |
|
"reward_std": 0.05103103443980217, |
|
"rewards/equation_reward_func": 0.02083333395421505, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 463.54167556762695, |
|
"epoch": 0.0026666666666666666, |
|
"grad_norm": 0.21660240108333578, |
|
"kl": 0.0032138824462890625, |
|
"learning_rate": 9.999870202927739e-07, |
|
"loss": 0.0001, |
|
"reward": 0.0833333358168602, |
|
"reward_std": 0.16661180183291435, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 442.29167556762695, |
|
"epoch": 0.0028444444444444446, |
|
"grad_norm": 0.23716444884739032, |
|
"kl": 0.00571441650390625, |
|
"learning_rate": 9.999480818449865e-07, |
|
"loss": 0.0002, |
|
"reward": 0.1250000037252903, |
|
"reward_std": 0.2686738707125187, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.02083333395421505, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 471.8958435058594, |
|
"epoch": 0.003022222222222222, |
|
"grad_norm": 0.17669793982814788, |
|
"kl": 0.00925445556640625, |
|
"learning_rate": 9.998831866782768e-07, |
|
"loss": 0.0004, |
|
"reward": 0.06250000186264515, |
|
"reward_std": 0.11558076366782188, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.02083333395421505, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 480.1041793823242, |
|
"epoch": 0.0032, |
|
"grad_norm": 0.14804922191678244, |
|
"kl": 0.01300811767578125, |
|
"learning_rate": 9.997923381619255e-07, |
|
"loss": 0.0005, |
|
"reward": 0.06250000186264515, |
|
"reward_std": 0.1530931033194065, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.02083333395421505, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 450.3125114440918, |
|
"epoch": 0.0033777777777777777, |
|
"grad_norm": 0.20021449335204797, |
|
"kl": 0.0184478759765625, |
|
"learning_rate": 9.996755410126814e-07, |
|
"loss": 0.0007, |
|
"reward": 0.0833333358168602, |
|
"reward_std": 0.16661180183291435, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 444.7708435058594, |
|
"epoch": 0.0035555555555555557, |
|
"grad_norm": 0.1748648048131685, |
|
"kl": 0.0309295654296875, |
|
"learning_rate": 9.995328012945157e-07, |
|
"loss": 0.0012, |
|
"reward": 0.08333333395421505, |
|
"reward_std": 0.15561354532837868, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.02083333395421505, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 421.8541793823242, |
|
"epoch": 0.0037333333333333333, |
|
"grad_norm": 0.5042804386943773, |
|
"kl": 0.04931640625, |
|
"learning_rate": 9.993641264183072e-07, |
|
"loss": 0.002, |
|
"reward": 0.18750000558793545, |
|
"reward_std": 0.3842546343803406, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.06250000186264515, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 395.2291793823242, |
|
"epoch": 0.003911111111111111, |
|
"grad_norm": 0.22336033886380158, |
|
"kl": 0.08929443359375, |
|
"learning_rate": 9.991695251414583e-07, |
|
"loss": 0.0036, |
|
"reward": 0.06250000186264515, |
|
"reward_std": 0.1530931070446968, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.02083333395421505, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 396.81250762939453, |
|
"epoch": 0.004088888888888889, |
|
"grad_norm": 0.2743029521713877, |
|
"kl": 0.12481689453125, |
|
"learning_rate": 9.989490075674389e-07, |
|
"loss": 0.005, |
|
"reward": 0.0833333358168602, |
|
"reward_std": 0.16661179810762405, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.02083333395421505, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 386.7291793823242, |
|
"epoch": 0.004266666666666667, |
|
"grad_norm": 0.5231775627781411, |
|
"kl": 0.15594482421875, |
|
"learning_rate": 9.987025851452636e-07, |
|
"loss": 0.0062, |
|
"reward": 0.1666666716337204, |
|
"reward_std": 0.3707359507679939, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.0833333358168602, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 389.8541793823242, |
|
"epoch": 0.0044444444444444444, |
|
"grad_norm": 0.685343943475841, |
|
"kl": 0.3055419921875, |
|
"learning_rate": 9.984302706688961e-07, |
|
"loss": 0.0122, |
|
"reward": 0.20833333767950535, |
|
"reward_std": 0.42428741604089737, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.0833333358168602, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 427.4166717529297, |
|
"epoch": 0.004622222222222222, |
|
"grad_norm": 0.40943932820956164, |
|
"kl": 0.3759765625, |
|
"learning_rate": 9.981320782765846e-07, |
|
"loss": 0.0151, |
|
"reward": 0.20833333767950535, |
|
"reward_std": 0.35973768681287766, |
|
"rewards/equation_reward_func": 0.12500000186264515, |
|
"rewards/format_reward_func": 0.0833333358168602, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 360.0208435058594, |
|
"epoch": 0.0048, |
|
"grad_norm": 4.089935587345267, |
|
"kl": 0.62060546875, |
|
"learning_rate": 9.978080234501292e-07, |
|
"loss": 0.0248, |
|
"reward": 0.31250000931322575, |
|
"reward_std": 0.4177170805633068, |
|
"rewards/equation_reward_func": 0.18750000186264515, |
|
"rewards/format_reward_func": 0.1250000037252903, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 446.8125114440918, |
|
"epoch": 0.004977777777777778, |
|
"grad_norm": 0.39386940795175784, |
|
"kl": 0.4873046875, |
|
"learning_rate": 9.974581230140768e-07, |
|
"loss": 0.0195, |
|
"reward": 0.06250000186264515, |
|
"reward_std": 0.1530931144952774, |
|
"rewards/equation_reward_func": 0.02083333395421505, |
|
"rewards/format_reward_func": 0.0416666679084301, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 358.2916774749756, |
|
"epoch": 0.005155555555555556, |
|
"grad_norm": 0.47853926879988334, |
|
"kl": 0.381591796875, |
|
"learning_rate": 9.970823951348486e-07, |
|
"loss": 0.0153, |
|
"reward": 0.43750001676380634, |
|
"reward_std": 0.5070193596184254, |
|
"rewards/equation_reward_func": 0.18750000558793545, |
|
"rewards/format_reward_func": 0.25000000186264515, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 392.0625190734863, |
|
"epoch": 0.005333333333333333, |
|
"grad_norm": 0.9169421130733736, |
|
"kl": 0.2674560546875, |
|
"learning_rate": 9.966808593197956e-07, |
|
"loss": 0.0107, |
|
"reward": 0.29166667349636555, |
|
"reward_std": 0.45132481306791306, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.14583333767950535, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 402.68750762939453, |
|
"epoch": 0.005511111111111111, |
|
"grad_norm": 0.48152738156086744, |
|
"kl": 0.390869140625, |
|
"learning_rate": 9.962535364161878e-07, |
|
"loss": 0.0157, |
|
"reward": 0.2500000074505806, |
|
"reward_std": 0.3977733328938484, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.16666666977107525, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 397.1875114440918, |
|
"epoch": 0.005688888888888889, |
|
"grad_norm": 0.5358456569457446, |
|
"kl": 0.632568359375, |
|
"learning_rate": 9.958004486101293e-07, |
|
"loss": 0.0253, |
|
"reward": 0.14583333767950535, |
|
"reward_std": 0.3572172485291958, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.0416666679084301, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 395.5000114440918, |
|
"epoch": 0.005866666666666667, |
|
"grad_norm": 0.45393406597096747, |
|
"kl": 0.548828125, |
|
"learning_rate": 9.953216194254085e-07, |
|
"loss": 0.0219, |
|
"reward": 0.3125000074505806, |
|
"reward_std": 0.4986758381128311, |
|
"rewards/equation_reward_func": 0.1875000037252903, |
|
"rewards/format_reward_func": 0.1250000037252903, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 418.12500762939453, |
|
"epoch": 0.006044444444444444, |
|
"grad_norm": 1.721174464990145, |
|
"kl": 0.89892578125, |
|
"learning_rate": 9.948170737222762e-07, |
|
"loss": 0.0359, |
|
"reward": 0.2083333358168602, |
|
"reward_std": 0.37717197462916374, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.1458333358168602, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 404.9791793823242, |
|
"epoch": 0.006222222222222222, |
|
"grad_norm": 0.42311176292986397, |
|
"kl": 0.53369140625, |
|
"learning_rate": 9.94286837696154e-07, |
|
"loss": 0.0213, |
|
"reward": 0.31250000931322575, |
|
"reward_std": 0.45383426919579506, |
|
"rewards/equation_reward_func": 0.1458333358168602, |
|
"rewards/format_reward_func": 0.16666666977107525, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 424.4791793823242, |
|
"epoch": 0.0064, |
|
"grad_norm": 0.7438227989937438, |
|
"kl": 0.407958984375, |
|
"learning_rate": 9.937309388762758e-07, |
|
"loss": 0.0163, |
|
"reward": 0.1250000037252903, |
|
"reward_std": 0.306186206638813, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.0833333358168602, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 432.0833435058594, |
|
"epoch": 0.006577777777777778, |
|
"grad_norm": 0.5301151073899182, |
|
"kl": 0.273681640625, |
|
"learning_rate": 9.931494061242571e-07, |
|
"loss": 0.0109, |
|
"reward": 0.33333333767950535, |
|
"reward_std": 0.5959400944411755, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.1666666716337204, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 389.27084732055664, |
|
"epoch": 0.0067555555555555554, |
|
"grad_norm": 0.6632742047974075, |
|
"kl": 0.218505859375, |
|
"learning_rate": 9.925422696325974e-07, |
|
"loss": 0.0087, |
|
"reward": 0.43750000931322575, |
|
"reward_std": 0.6827219277620316, |
|
"rewards/equation_reward_func": 0.20833333767950535, |
|
"rewards/format_reward_func": 0.2291666716337204, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 353.2708435058594, |
|
"epoch": 0.006933333333333333, |
|
"grad_norm": 0.391058687294098, |
|
"kl": 0.21881103515625, |
|
"learning_rate": 9.919095609231123e-07, |
|
"loss": 0.0087, |
|
"reward": 0.5000000167638063, |
|
"reward_std": 0.5392209477722645, |
|
"rewards/equation_reward_func": 0.3125000074505806, |
|
"rewards/format_reward_func": 0.18750000558793545, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 407.3333435058594, |
|
"epoch": 0.0071111111111111115, |
|
"grad_norm": 0.469200729103397, |
|
"kl": 0.32666015625, |
|
"learning_rate": 9.912513128452973e-07, |
|
"loss": 0.0131, |
|
"reward": 0.2708333358168602, |
|
"reward_std": 0.48175449296832085, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.1458333358168602, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 398.5208435058594, |
|
"epoch": 0.007288888888888889, |
|
"grad_norm": 0.4795098047053158, |
|
"kl": 0.3126220703125, |
|
"learning_rate": 9.905675595746213e-07, |
|
"loss": 0.0125, |
|
"reward": 0.479166679084301, |
|
"reward_std": 0.6324757561087608, |
|
"rewards/equation_reward_func": 0.20833333767950535, |
|
"rewards/format_reward_func": 0.27083333767950535, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 424.1458396911621, |
|
"epoch": 0.007466666666666667, |
|
"grad_norm": 0.517106705455673, |
|
"kl": 0.434326171875, |
|
"learning_rate": 9.898583366107536e-07, |
|
"loss": 0.0174, |
|
"reward": 0.2708333395421505, |
|
"reward_std": 0.3604965806007385, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.18750000186264515, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 312.1875114440918, |
|
"epoch": 0.007644444444444444, |
|
"grad_norm": 1.5051043389807441, |
|
"kl": 0.644775390625, |
|
"learning_rate": 9.8912368077572e-07, |
|
"loss": 0.0258, |
|
"reward": 0.6041666846722364, |
|
"reward_std": 0.7486668117344379, |
|
"rewards/equation_reward_func": 0.2083333358168602, |
|
"rewards/format_reward_func": 0.3958333395421505, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 392.2291793823242, |
|
"epoch": 0.007822222222222222, |
|
"grad_norm": 0.6184585307648679, |
|
"kl": 0.533447265625, |
|
"learning_rate": 9.88363630211991e-07, |
|
"loss": 0.0214, |
|
"reward": 0.33333334513008595, |
|
"reward_std": 0.5071536600589752, |
|
"rewards/equation_reward_func": 0.12500000186264515, |
|
"rewards/format_reward_func": 0.2083333395421505, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 403.43750953674316, |
|
"epoch": 0.008, |
|
"grad_norm": 0.4758095462502418, |
|
"kl": 0.531005859375, |
|
"learning_rate": 9.875782243805017e-07, |
|
"loss": 0.0213, |
|
"reward": 0.2708333395421505, |
|
"reward_std": 0.6148928552865982, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.1666666716337204, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 338.5625057220459, |
|
"epoch": 0.008177777777777779, |
|
"grad_norm": 0.5117314282157389, |
|
"kl": 0.47265625, |
|
"learning_rate": 9.867675040586033e-07, |
|
"loss": 0.0189, |
|
"reward": 0.45833334513008595, |
|
"reward_std": 0.6152770519256592, |
|
"rewards/equation_reward_func": 0.25000000186264515, |
|
"rewards/format_reward_func": 0.20833333767950535, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 361.04167556762695, |
|
"epoch": 0.008355555555555555, |
|
"grad_norm": 0.5540118176815925, |
|
"kl": 0.56494140625, |
|
"learning_rate": 9.859315113379452e-07, |
|
"loss": 0.0226, |
|
"reward": 0.2291666716337204, |
|
"reward_std": 0.4107687212526798, |
|
"rewards/equation_reward_func": 0.12500000186264515, |
|
"rewards/format_reward_func": 0.10416666977107525, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 376.60417556762695, |
|
"epoch": 0.008533333333333334, |
|
"grad_norm": 0.6882325279267957, |
|
"kl": 0.453369140625, |
|
"learning_rate": 9.850702896222908e-07, |
|
"loss": 0.0181, |
|
"reward": 0.37500001303851604, |
|
"reward_std": 0.5071536600589752, |
|
"rewards/equation_reward_func": 0.2083333358168602, |
|
"rewards/format_reward_func": 0.1666666716337204, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 371.43750762939453, |
|
"epoch": 0.00871111111111111, |
|
"grad_norm": 0.6655719658883651, |
|
"kl": 0.404541015625, |
|
"learning_rate": 9.841838836252625e-07, |
|
"loss": 0.0162, |
|
"reward": 0.2916666753590107, |
|
"reward_std": 0.49615539610385895, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.1875000037252903, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 301.39584159851074, |
|
"epoch": 0.008888888888888889, |
|
"grad_norm": 0.4870272919286295, |
|
"kl": 0.3218994140625, |
|
"learning_rate": 9.83272339368022e-07, |
|
"loss": 0.0129, |
|
"reward": 0.41666667349636555, |
|
"reward_std": 0.6257677860558033, |
|
"rewards/equation_reward_func": 0.1875000037252903, |
|
"rewards/format_reward_func": 0.2291666716337204, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 398.3333435058594, |
|
"epoch": 0.009066666666666667, |
|
"grad_norm": 0.28160060568929596, |
|
"kl": 0.2677001953125, |
|
"learning_rate": 9.823357041768796e-07, |
|
"loss": 0.0107, |
|
"reward": 0.16666666977107525, |
|
"reward_std": 0.32222534343600273, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.10416666977107525, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 366.0208396911621, |
|
"epoch": 0.009244444444444444, |
|
"grad_norm": 0.44161935224618987, |
|
"kl": 0.281494140625, |
|
"learning_rate": 9.813740266808373e-07, |
|
"loss": 0.0112, |
|
"reward": 0.27083333767950535, |
|
"reward_std": 0.455240398645401, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.14583333767950535, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 289.2083396911621, |
|
"epoch": 0.009422222222222222, |
|
"grad_norm": 0.4634094627007982, |
|
"kl": 0.295166015625, |
|
"learning_rate": 9.803873568090647e-07, |
|
"loss": 0.0118, |
|
"reward": 0.5625000037252903, |
|
"reward_std": 0.6678489372134209, |
|
"rewards/equation_reward_func": 0.20833333767950535, |
|
"rewards/format_reward_func": 0.35416667349636555, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 247.25000762939453, |
|
"epoch": 0.0096, |
|
"grad_norm": 0.5186115988333515, |
|
"kl": 0.3134765625, |
|
"learning_rate": 9.793757457883061e-07, |
|
"loss": 0.0125, |
|
"reward": 0.5208333525806665, |
|
"reward_std": 0.6603549271821976, |
|
"rewards/equation_reward_func": 0.25000000558793545, |
|
"rewards/format_reward_func": 0.27083333767950535, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 315.91667556762695, |
|
"epoch": 0.009777777777777778, |
|
"grad_norm": 0.5492950399694214, |
|
"kl": 0.3515625, |
|
"learning_rate": 9.783392461402207e-07, |
|
"loss": 0.0141, |
|
"reward": 0.39583334513008595, |
|
"reward_std": 0.6474834568798542, |
|
"rewards/equation_reward_func": 0.22916667349636555, |
|
"rewards/format_reward_func": 0.1666666716337204, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 309.16666984558105, |
|
"epoch": 0.009955555555555556, |
|
"grad_norm": 0.5069091296779727, |
|
"kl": 0.338623046875, |
|
"learning_rate": 9.772779116786567e-07, |
|
"loss": 0.0136, |
|
"reward": 0.5833333525806665, |
|
"reward_std": 0.6927030570805073, |
|
"rewards/equation_reward_func": 0.2500000037252903, |
|
"rewards/format_reward_func": 0.33333333767950535, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 271.520845413208, |
|
"epoch": 0.010133333333333333, |
|
"grad_norm": 0.4630715136373293, |
|
"kl": 0.3236083984375, |
|
"learning_rate": 9.761917975068563e-07, |
|
"loss": 0.013, |
|
"reward": 0.6250000074505806, |
|
"reward_std": 0.661014586687088, |
|
"rewards/equation_reward_func": 0.25000000558793545, |
|
"rewards/format_reward_func": 0.3750000074505806, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 296.2916717529297, |
|
"epoch": 0.010311111111111111, |
|
"grad_norm": 0.652647892352053, |
|
"kl": 0.392822265625, |
|
"learning_rate": 9.750809600145952e-07, |
|
"loss": 0.0157, |
|
"reward": 0.5208333525806665, |
|
"reward_std": 0.6347126960754395, |
|
"rewards/equation_reward_func": 0.12500000186264515, |
|
"rewards/format_reward_func": 0.39583334140479565, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 282.5000047683716, |
|
"epoch": 0.01048888888888889, |
|
"grad_norm": 0.5320134109583597, |
|
"kl": 0.41748046875, |
|
"learning_rate": 9.739454568752555e-07, |
|
"loss": 0.0167, |
|
"reward": 0.645833358168602, |
|
"reward_std": 0.7514187395572662, |
|
"rewards/equation_reward_func": 0.2083333358168602, |
|
"rewards/format_reward_func": 0.4375000111758709, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 317.3541793823242, |
|
"epoch": 0.010666666666666666, |
|
"grad_norm": 0.7774866396355492, |
|
"kl": 0.62548828125, |
|
"learning_rate": 9.7278534704283e-07, |
|
"loss": 0.025, |
|
"reward": 0.5625000111758709, |
|
"reward_std": 0.6851340346038342, |
|
"rewards/equation_reward_func": 0.1875000037252903, |
|
"rewards/format_reward_func": 0.37500000558793545, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 325.020845413208, |
|
"epoch": 0.010844444444444445, |
|
"grad_norm": 1.0948810326989156, |
|
"kl": 0.64453125, |
|
"learning_rate": 9.716006907488628e-07, |
|
"loss": 0.0258, |
|
"reward": 0.5208333488553762, |
|
"reward_std": 0.605813056230545, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.4583333432674408, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 244.68750762939453, |
|
"epoch": 0.011022222222222221, |
|
"grad_norm": 1.563284310620131, |
|
"kl": 0.613525390625, |
|
"learning_rate": 9.703915494993213e-07, |
|
"loss": 0.0245, |
|
"reward": 0.7916666846722364, |
|
"reward_std": 0.7340253219008446, |
|
"rewards/equation_reward_func": 0.22916666977107525, |
|
"rewards/format_reward_func": 0.5625000167638063, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 282.50000762939453, |
|
"epoch": 0.0112, |
|
"grad_norm": 2.1064327877521154, |
|
"kl": 1.42578125, |
|
"learning_rate": 9.691579860714032e-07, |
|
"loss": 0.057, |
|
"reward": 0.5625000149011612, |
|
"reward_std": 0.6274054050445557, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.500000013038516, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 194.25000667572021, |
|
"epoch": 0.011377777777777778, |
|
"grad_norm": 1.7833461932829624, |
|
"kl": 1.71484375, |
|
"learning_rate": 9.67900064510277e-07, |
|
"loss": 0.0686, |
|
"reward": 0.8125000260770321, |
|
"reward_std": 0.6131136827170849, |
|
"rewards/equation_reward_func": 0.16666666977107525, |
|
"rewards/format_reward_func": 0.6458333544433117, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 187.25000286102295, |
|
"epoch": 0.011555555555555555, |
|
"grad_norm": 3.6199284336418573, |
|
"kl": 1.58935546875, |
|
"learning_rate": 9.666178501257572e-07, |
|
"loss": 0.0635, |
|
"reward": 0.7708333618938923, |
|
"reward_std": 0.48367293551564217, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.6666666828095913, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 186.5625057220459, |
|
"epoch": 0.011733333333333333, |
|
"grad_norm": 6.170181718041983, |
|
"kl": 2.625, |
|
"learning_rate": 9.653114094889126e-07, |
|
"loss": 0.1052, |
|
"reward": 0.7916667051613331, |
|
"reward_std": 0.6234788559377193, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.6458333544433117, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 172.43750381469727, |
|
"epoch": 0.011911111111111112, |
|
"grad_norm": 3.3588942097695202, |
|
"kl": 1.7451171875, |
|
"learning_rate": 9.639808104286116e-07, |
|
"loss": 0.0698, |
|
"reward": 0.8541667014360428, |
|
"reward_std": 0.753696121275425, |
|
"rewards/equation_reward_func": 0.20833333767950535, |
|
"rewards/format_reward_func": 0.6458333507180214, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 167.35416984558105, |
|
"epoch": 0.012088888888888889, |
|
"grad_norm": 1.2101146404816208, |
|
"kl": 1.1162109375, |
|
"learning_rate": 9.626261220279987e-07, |
|
"loss": 0.0447, |
|
"reward": 0.9166667051613331, |
|
"reward_std": 0.6435392610728741, |
|
"rewards/equation_reward_func": 0.1875000037252903, |
|
"rewards/format_reward_func": 0.7291666977107525, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 153.66667079925537, |
|
"epoch": 0.012266666666666667, |
|
"grad_norm": 1.0564151985609145, |
|
"kl": 0.7333984375, |
|
"learning_rate": 9.612474146209095e-07, |
|
"loss": 0.0294, |
|
"reward": 1.0416666865348816, |
|
"reward_std": 0.4375460147857666, |
|
"rewards/equation_reward_func": 0.1875000037252903, |
|
"rewards/format_reward_func": 0.8541666865348816, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 156.87500762939453, |
|
"epoch": 0.012444444444444444, |
|
"grad_norm": 1.2014127678752624, |
|
"kl": 1.32470703125, |
|
"learning_rate": 9.598447597882179e-07, |
|
"loss": 0.053, |
|
"reward": 0.6875000223517418, |
|
"reward_std": 0.6247506737709045, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.604166679084301, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 124.20833778381348, |
|
"epoch": 0.012622222222222222, |
|
"grad_norm": 1.3939165689430868, |
|
"kl": 1.7919921875, |
|
"learning_rate": 9.584182303541204e-07, |
|
"loss": 0.0716, |
|
"reward": 0.7916667014360428, |
|
"reward_std": 0.5915197134017944, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.708333358168602, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 120.33333683013916, |
|
"epoch": 0.0128, |
|
"grad_norm": 4.070697361675846, |
|
"kl": 2.76123046875, |
|
"learning_rate": 9.56967900382354e-07, |
|
"loss": 0.1107, |
|
"reward": 0.9166666939854622, |
|
"reward_std": 0.681879960000515, |
|
"rewards/equation_reward_func": 0.2083333395421505, |
|
"rewards/format_reward_func": 0.7083333507180214, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 112.27083587646484, |
|
"epoch": 0.012977777777777777, |
|
"grad_norm": 5.057509148199061, |
|
"kl": 3.8662109375, |
|
"learning_rate": 9.55493845172353e-07, |
|
"loss": 0.1549, |
|
"reward": 0.8125000223517418, |
|
"reward_std": 0.6485148780047894, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.6666666828095913, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 100.75000381469727, |
|
"epoch": 0.013155555555555556, |
|
"grad_norm": 1.6660490540082615, |
|
"kl": 1.98828125, |
|
"learning_rate": 9.539961412553374e-07, |
|
"loss": 0.0795, |
|
"reward": 0.8750000149011612, |
|
"reward_std": 0.5296371467411518, |
|
"rewards/equation_reward_func": 0.2083333358168602, |
|
"rewards/format_reward_func": 0.6666666865348816, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 113.87500190734863, |
|
"epoch": 0.013333333333333334, |
|
"grad_norm": 1.6938950842175522, |
|
"kl": 1.33642578125, |
|
"learning_rate": 9.524748663903406e-07, |
|
"loss": 0.0535, |
|
"reward": 0.8125000223517418, |
|
"reward_std": 0.5695351995527744, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.6875000223517418, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 97.41666889190674, |
|
"epoch": 0.013511111111111111, |
|
"grad_norm": 1.8987706737359447, |
|
"kl": 1.123291015625, |
|
"learning_rate": 9.509300995601719e-07, |
|
"loss": 0.045, |
|
"reward": 0.7916666902601719, |
|
"reward_std": 0.533297847956419, |
|
"rewards/equation_reward_func": 0.1041666679084301, |
|
"rewards/format_reward_func": 0.6875000186264515, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 98.72916984558105, |
|
"epoch": 0.01368888888888889, |
|
"grad_norm": 3.104011972298094, |
|
"kl": 2.25341796875, |
|
"learning_rate": 9.493619209673163e-07, |
|
"loss": 0.0902, |
|
"reward": 0.8541666939854622, |
|
"reward_std": 0.7043437324464321, |
|
"rewards/equation_reward_func": 0.18750000558793545, |
|
"rewards/format_reward_func": 0.666666679084301, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 89.43750286102295, |
|
"epoch": 0.013866666666666666, |
|
"grad_norm": 1.9133019088326877, |
|
"kl": 1.931640625, |
|
"learning_rate": 9.477704120297696e-07, |
|
"loss": 0.0773, |
|
"reward": 0.7708333618938923, |
|
"reward_std": 0.6641731485724449, |
|
"rewards/equation_reward_func": 0.1875000037252903, |
|
"rewards/format_reward_func": 0.5833333544433117, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 93.45833587646484, |
|
"epoch": 0.014044444444444444, |
|
"grad_norm": 1.5024014724527306, |
|
"kl": 1.484130859375, |
|
"learning_rate": 9.461556553768123e-07, |
|
"loss": 0.0593, |
|
"reward": 0.8125000149011612, |
|
"reward_std": 0.6717319972813129, |
|
"rewards/equation_reward_func": 0.18750000558793545, |
|
"rewards/format_reward_func": 0.6250000223517418, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 97.62500286102295, |
|
"epoch": 0.014222222222222223, |
|
"grad_norm": 1.3010218844311383, |
|
"kl": 2.3037109375, |
|
"learning_rate": 9.445177348447186e-07, |
|
"loss": 0.0922, |
|
"reward": 0.7291666865348816, |
|
"reward_std": 0.6450728215277195, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.6250000149011612, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 93.02083587646484, |
|
"epoch": 0.0144, |
|
"grad_norm": 1.4449550603366121, |
|
"kl": 1.0791015625, |
|
"learning_rate": 9.428567354724045e-07, |
|
"loss": 0.0432, |
|
"reward": 0.8125000298023224, |
|
"reward_std": 0.5830309242010117, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.6875000186264515, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 86.27083587646484, |
|
"epoch": 0.014577777777777778, |
|
"grad_norm": 1.6082644108612607, |
|
"kl": 2.0986328125, |
|
"learning_rate": 9.41172743497012e-07, |
|
"loss": 0.0841, |
|
"reward": 0.7916666828095913, |
|
"reward_std": 0.6250231899321079, |
|
"rewards/equation_reward_func": 0.2083333395421505, |
|
"rewards/format_reward_func": 0.5833333488553762, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 81.50000238418579, |
|
"epoch": 0.014755555555555555, |
|
"grad_norm": 3.1983476547534795, |
|
"kl": 3.3251953125, |
|
"learning_rate": 9.394658463494327e-07, |
|
"loss": 0.1331, |
|
"reward": 0.8750000298023224, |
|
"reward_std": 0.49993259087204933, |
|
"rewards/equation_reward_func": 0.12500000186264515, |
|
"rewards/format_reward_func": 0.7500000149011612, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 80.02083587646484, |
|
"epoch": 0.014933333333333333, |
|
"grad_norm": 5.509760052892182, |
|
"kl": 4.7392578125, |
|
"learning_rate": 9.377361326497673e-07, |
|
"loss": 0.1899, |
|
"reward": 0.7708333525806665, |
|
"reward_std": 0.6075604781508446, |
|
"rewards/equation_reward_func": 0.1666666679084301, |
|
"rewards/format_reward_func": 0.6041666809469461, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 82.25000238418579, |
|
"epoch": 0.015111111111111112, |
|
"grad_norm": 3.5108499823149404, |
|
"kl": 3.107421875, |
|
"learning_rate": 9.359836922027254e-07, |
|
"loss": 0.1244, |
|
"reward": 0.7291666828095913, |
|
"reward_std": 0.5941584445536137, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.6458333544433117, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 82.41666984558105, |
|
"epoch": 0.015288888888888888, |
|
"grad_norm": 2.028459417088561, |
|
"kl": 1.29931640625, |
|
"learning_rate": 9.342086159929629e-07, |
|
"loss": 0.052, |
|
"reward": 0.9166666977107525, |
|
"reward_std": 0.6568441018462181, |
|
"rewards/equation_reward_func": 0.2291666679084301, |
|
"rewards/format_reward_func": 0.6875000186264515, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 76.08333683013916, |
|
"epoch": 0.015466666666666667, |
|
"grad_norm": 1.3916338154872232, |
|
"kl": 0.656005859375, |
|
"learning_rate": 9.324109961803577e-07, |
|
"loss": 0.0262, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 0.46873048692941666, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.7500000149011612, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 83.31250381469727, |
|
"epoch": 0.015644444444444443, |
|
"grad_norm": 1.5055362976480897, |
|
"kl": 0.97509765625, |
|
"learning_rate": 9.305909260952254e-07, |
|
"loss": 0.039, |
|
"reward": 0.8541666865348816, |
|
"reward_std": 0.5454338155686855, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.7500000223517418, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 83.79166984558105, |
|
"epoch": 0.015822222222222224, |
|
"grad_norm": 1.1633318801455976, |
|
"kl": 1.281494140625, |
|
"learning_rate": 9.287485002334732e-07, |
|
"loss": 0.0512, |
|
"reward": 0.8750000223517418, |
|
"reward_std": 0.4783512242138386, |
|
"rewards/equation_reward_func": 0.1458333358168602, |
|
"rewards/format_reward_func": 0.7291666939854622, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 78.64583587646484, |
|
"epoch": 0.016, |
|
"grad_norm": 2.9144797063041006, |
|
"kl": 2.5732421875, |
|
"learning_rate": 9.268838142516943e-07, |
|
"loss": 0.1028, |
|
"reward": 0.7916666865348816, |
|
"reward_std": 0.652060579508543, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.6458333469927311, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 81.66666984558105, |
|
"epoch": 0.016177777777777777, |
|
"grad_norm": 4.751929306174548, |
|
"kl": 4.068359375, |
|
"learning_rate": 9.249969649622012e-07, |
|
"loss": 0.1627, |
|
"reward": 0.6250000149011612, |
|
"reward_std": 0.6327161639928818, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.5416666716337204, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 82.79166889190674, |
|
"epoch": 0.016355555555555557, |
|
"grad_norm": 2.236969609049743, |
|
"kl": 2.87109375, |
|
"learning_rate": 9.23088050327999e-07, |
|
"loss": 0.1146, |
|
"reward": 0.8125000223517418, |
|
"reward_std": 0.6459339037537575, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.6875000223517418, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 82.45833587646484, |
|
"epoch": 0.016533333333333334, |
|
"grad_norm": 2.000499326379198, |
|
"kl": 1.4482421875, |
|
"learning_rate": 9.211571694577004e-07, |
|
"loss": 0.0579, |
|
"reward": 0.8958333507180214, |
|
"reward_std": 0.6133542768657207, |
|
"rewards/equation_reward_func": 0.18750000558793545, |
|
"rewards/format_reward_func": 0.7083333544433117, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 84.58333587646484, |
|
"epoch": 0.01671111111111111, |
|
"grad_norm": 1.2548765801344943, |
|
"kl": 0.78173828125, |
|
"learning_rate": 9.192044226003788e-07, |
|
"loss": 0.0313, |
|
"reward": 0.8750000298023224, |
|
"reward_std": 0.5373301059007645, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.770833358168602, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 81.95833587646484, |
|
"epoch": 0.016888888888888887, |
|
"grad_norm": 1.5059931362189294, |
|
"kl": 1.111083984375, |
|
"learning_rate": 9.172299111403641e-07, |
|
"loss": 0.0444, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.4846614636480808, |
|
"rewards/equation_reward_func": 0.1458333358168602, |
|
"rewards/format_reward_func": 0.8541666865348816, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 84.04166889190674, |
|
"epoch": 0.017066666666666667, |
|
"grad_norm": 1.3359826696515187, |
|
"kl": 1.60986328125, |
|
"learning_rate": 9.15233737591979e-07, |
|
"loss": 0.0643, |
|
"reward": 0.9166667014360428, |
|
"reward_std": 0.5707925632596016, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.7708333507180214, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 86.62500286102295, |
|
"epoch": 0.017244444444444444, |
|
"grad_norm": 1.8097820481931228, |
|
"kl": 1.9326171875, |
|
"learning_rate": 9.132160055942164e-07, |
|
"loss": 0.0773, |
|
"reward": 0.9166666902601719, |
|
"reward_std": 0.4999736212193966, |
|
"rewards/equation_reward_func": 0.1458333358168602, |
|
"rewards/format_reward_func": 0.7708333488553762, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 77.75000190734863, |
|
"epoch": 0.01742222222222222, |
|
"grad_norm": 1.6992570227449872, |
|
"kl": 1.365478515625, |
|
"learning_rate": 9.111768199053586e-07, |
|
"loss": 0.0546, |
|
"reward": 0.9791666939854622, |
|
"reward_std": 0.5367976725101471, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.8333333507180214, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 70.70833492279053, |
|
"epoch": 0.0176, |
|
"grad_norm": 12.904518500107274, |
|
"kl": 1.700439453125, |
|
"learning_rate": 9.091162863975388e-07, |
|
"loss": 0.0681, |
|
"reward": 1.0833333879709244, |
|
"reward_std": 0.5429981462657452, |
|
"rewards/equation_reward_func": 0.2500000074505806, |
|
"rewards/format_reward_func": 0.8333333507180214, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 74.75000333786011, |
|
"epoch": 0.017777777777777778, |
|
"grad_norm": 2.68173021529566, |
|
"kl": 1.787109375, |
|
"learning_rate": 9.070345120512435e-07, |
|
"loss": 0.0715, |
|
"reward": 0.9583333656191826, |
|
"reward_std": 0.4984116442501545, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.8125000223517418, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 70.83333492279053, |
|
"epoch": 0.017955555555555554, |
|
"grad_norm": 1.405751831531719, |
|
"kl": 1.2646484375, |
|
"learning_rate": 9.049316049497587e-07, |
|
"loss": 0.0506, |
|
"reward": 0.8750000223517418, |
|
"reward_std": 0.39079636335372925, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 68.66666889190674, |
|
"epoch": 0.018133333333333335, |
|
"grad_norm": 1.4351105118065817, |
|
"kl": 1.413818359375, |
|
"learning_rate": 9.028076742735582e-07, |
|
"loss": 0.0566, |
|
"reward": 0.8958333730697632, |
|
"reward_std": 0.5925082266330719, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.770833358168602, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 66.25000286102295, |
|
"epoch": 0.01831111111111111, |
|
"grad_norm": 2.635755872511972, |
|
"kl": 1.2666015625, |
|
"learning_rate": 9.006628302946357e-07, |
|
"loss": 0.0507, |
|
"reward": 0.8958333656191826, |
|
"reward_std": 0.5383186265826225, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.7916666939854622, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 66.04166889190674, |
|
"epoch": 0.018488888888888888, |
|
"grad_norm": 1.2582201461515905, |
|
"kl": 0.912353515625, |
|
"learning_rate": 8.984971843707787e-07, |
|
"loss": 0.0365, |
|
"reward": 0.916666679084301, |
|
"reward_std": 0.3247256837785244, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.8750000223517418, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 65.89583539962769, |
|
"epoch": 0.018666666666666668, |
|
"grad_norm": 1.970155942749344, |
|
"kl": 0.91162109375, |
|
"learning_rate": 8.963108489397875e-07, |
|
"loss": 0.0364, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 0.33336182311177254, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.854166679084301, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 64.52083396911621, |
|
"epoch": 0.018844444444444445, |
|
"grad_norm": 1.3485959618130419, |
|
"kl": 0.93115234375, |
|
"learning_rate": 8.94103937513637e-07, |
|
"loss": 0.0372, |
|
"reward": 0.9375000298023224, |
|
"reward_std": 0.2982207238674164, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 63.06250190734863, |
|
"epoch": 0.01902222222222222, |
|
"grad_norm": 2.0728028572795543, |
|
"kl": 1.34375, |
|
"learning_rate": 8.918765646725843e-07, |
|
"loss": 0.0538, |
|
"reward": 0.937500037252903, |
|
"reward_std": 0.31970490887761116, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 61.979167461395264, |
|
"epoch": 0.0192, |
|
"grad_norm": 2.225767300026043, |
|
"kl": 2.87646484375, |
|
"learning_rate": 8.896288460592185e-07, |
|
"loss": 0.1152, |
|
"reward": 0.8750000223517418, |
|
"reward_std": 0.2861081585288048, |
|
"rewards/equation_reward_func": 0.02083333395421505, |
|
"rewards/format_reward_func": 0.854166679084301, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 57.812501430511475, |
|
"epoch": 0.01937777777777778, |
|
"grad_norm": 1.3463463862492873, |
|
"kl": 1.247802734375, |
|
"learning_rate": 8.873608983724579e-07, |
|
"loss": 0.05, |
|
"reward": 1.0208333656191826, |
|
"reward_std": 0.4833719953894615, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.8750000149011612, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 58.250001430511475, |
|
"epoch": 0.019555555555555555, |
|
"grad_norm": 1.666008392324381, |
|
"kl": 1.627685546875, |
|
"learning_rate": 8.850728393614901e-07, |
|
"loss": 0.0651, |
|
"reward": 0.9375000149011612, |
|
"reward_std": 0.4140563830733299, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.8333333507180214, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 56.85416841506958, |
|
"epoch": 0.019733333333333332, |
|
"grad_norm": 0.9130518937737289, |
|
"kl": 0.876220703125, |
|
"learning_rate": 8.8276478781966e-07, |
|
"loss": 0.035, |
|
"reward": 1.0208333805203438, |
|
"reward_std": 0.4418273940682411, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 53.229167461395264, |
|
"epoch": 0.019911111111111112, |
|
"grad_norm": 2.1598126065522223, |
|
"kl": 0.886474609375, |
|
"learning_rate": 8.804368635783002e-07, |
|
"loss": 0.0355, |
|
"reward": 0.9583333730697632, |
|
"reward_std": 0.38524314761161804, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.895833358168602, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 54.791667461395264, |
|
"epoch": 0.02008888888888889, |
|
"grad_norm": 1.4035187794909976, |
|
"kl": 1.21337890625, |
|
"learning_rate": 8.780891875005114e-07, |
|
"loss": 0.0487, |
|
"reward": 0.9375000298023224, |
|
"reward_std": 0.28219257295131683, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.8958333432674408, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 49.97916793823242, |
|
"epoch": 0.020266666666666665, |
|
"grad_norm": 1.8833740064941022, |
|
"kl": 1.38671875, |
|
"learning_rate": 8.75721881474886e-07, |
|
"loss": 0.0555, |
|
"reward": 1.0416667014360428, |
|
"reward_std": 0.38524315133690834, |
|
"rewards/equation_reward_func": 0.12500000186264515, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 48.83333444595337, |
|
"epoch": 0.020444444444444446, |
|
"grad_norm": 1.3339255087510238, |
|
"kl": 0.90771484375, |
|
"learning_rate": 8.733350684091805e-07, |
|
"loss": 0.0363, |
|
"reward": 0.8333333544433117, |
|
"reward_std": 0.4297148324549198, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.7708333544433117, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 50.875000953674316, |
|
"epoch": 0.020622222222222222, |
|
"grad_norm": 1.1195993054785263, |
|
"kl": 0.71923828125, |
|
"learning_rate": 8.709288722239342e-07, |
|
"loss": 0.0288, |
|
"reward": 1.0625000447034836, |
|
"reward_std": 0.4778187908232212, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 53.10416793823242, |
|
"epoch": 0.0208, |
|
"grad_norm": 0.8633281856230262, |
|
"kl": 1.0654296875, |
|
"learning_rate": 8.685034178460353e-07, |
|
"loss": 0.0427, |
|
"reward": 1.0000000596046448, |
|
"reward_std": 0.4513138346374035, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.895833358168602, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 49.85416841506958, |
|
"epoch": 0.02097777777777778, |
|
"grad_norm": 0.6227616661901212, |
|
"kl": 0.8115234375, |
|
"learning_rate": 8.660588312022343e-07, |
|
"loss": 0.0324, |
|
"reward": 1.0625000298023224, |
|
"reward_std": 0.3397653251886368, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.9375000074505806, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 55.437500953674316, |
|
"epoch": 0.021155555555555556, |
|
"grad_norm": 0.9190492054206527, |
|
"kl": 0.992919921875, |
|
"learning_rate": 8.635952392126071e-07, |
|
"loss": 0.0397, |
|
"reward": 0.9583333432674408, |
|
"reward_std": 0.24161884933710098, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.8958333432674408, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 53.72916793823242, |
|
"epoch": 0.021333333333333333, |
|
"grad_norm": 0.8309087697870374, |
|
"kl": 0.88134765625, |
|
"learning_rate": 8.611127697839647e-07, |
|
"loss": 0.0352, |
|
"reward": 1.1041667014360428, |
|
"reward_std": 0.36124950274825096, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 48.89583492279053, |
|
"epoch": 0.021511111111111113, |
|
"grad_norm": 1.145524145717797, |
|
"kl": 1.109375, |
|
"learning_rate": 8.586115518032126e-07, |
|
"loss": 0.0444, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.3747681975364685, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.8958333432674408, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 53.83333444595337, |
|
"epoch": 0.02168888888888889, |
|
"grad_norm": 1.291809265288758, |
|
"kl": 1.53271484375, |
|
"learning_rate": 8.560917151306592e-07, |
|
"loss": 0.0613, |
|
"reward": 1.125000037252903, |
|
"reward_std": 0.4393179304897785, |
|
"rewards/equation_reward_func": 0.18750000558793545, |
|
"rewards/format_reward_func": 0.9375000074505806, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 56.89583492279053, |
|
"epoch": 0.021866666666666666, |
|
"grad_norm": 1.7427992065174527, |
|
"kl": 1.25, |
|
"learning_rate": 8.535533905932737e-07, |
|
"loss": 0.05, |
|
"reward": 1.1041666865348816, |
|
"reward_std": 0.4188222736120224, |
|
"rewards/equation_reward_func": 0.1875000037252903, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 57.666667461395264, |
|
"epoch": 0.022044444444444443, |
|
"grad_norm": 1.0883926763944543, |
|
"kl": 0.804931640625, |
|
"learning_rate": 8.509967099778933e-07, |
|
"loss": 0.0322, |
|
"reward": 1.1250000596046448, |
|
"reward_std": 0.4283087030053139, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 54.97916793823242, |
|
"epoch": 0.022222222222222223, |
|
"grad_norm": 1.3364813622555163, |
|
"kl": 1.42626953125, |
|
"learning_rate": 8.484218060243815e-07, |
|
"loss": 0.057, |
|
"reward": 0.937500037252903, |
|
"reward_std": 0.3987618461251259, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.8750000223517418, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 59.14583492279053, |
|
"epoch": 0.0224, |
|
"grad_norm": 1.025442216438611, |
|
"kl": 1.20263671875, |
|
"learning_rate": 8.458288124187358e-07, |
|
"loss": 0.0482, |
|
"reward": 1.0208333730697632, |
|
"reward_std": 0.4563346207141876, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.895833358168602, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 50.00000190734863, |
|
"epoch": 0.022577777777777776, |
|
"grad_norm": 4.84373478962823, |
|
"kl": 3.6640625, |
|
"learning_rate": 8.432178637861483e-07, |
|
"loss": 0.1463, |
|
"reward": 0.8333333507180214, |
|
"reward_std": 0.4152076132595539, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.7708333432674408, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 59.47916841506958, |
|
"epoch": 0.022755555555555557, |
|
"grad_norm": 2.7058062346789944, |
|
"kl": 1.1728515625, |
|
"learning_rate": 8.405890956840135e-07, |
|
"loss": 0.0469, |
|
"reward": 1.1458333507180214, |
|
"reward_std": 0.39611808210611343, |
|
"rewards/equation_reward_func": 0.22916666977107525, |
|
"rewards/format_reward_func": 0.916666679084301, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 61.91666841506958, |
|
"epoch": 0.022933333333333333, |
|
"grad_norm": 2.4631575245463204, |
|
"kl": 2.07666015625, |
|
"learning_rate": 8.379426445948932e-07, |
|
"loss": 0.0831, |
|
"reward": 1.145833358168602, |
|
"reward_std": 0.5193633921444416, |
|
"rewards/equation_reward_func": 0.2291666716337204, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 66.70833587646484, |
|
"epoch": 0.02311111111111111, |
|
"grad_norm": 1.230784921117314, |
|
"kl": 1.10009765625, |
|
"learning_rate": 8.352786479194287e-07, |
|
"loss": 0.044, |
|
"reward": 1.062500037252903, |
|
"reward_std": 0.32525811344385147, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.9375000074505806, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 64.60416889190674, |
|
"epoch": 0.02328888888888889, |
|
"grad_norm": 1.6700411140187803, |
|
"kl": 1.74267578125, |
|
"learning_rate": 8.325972439692074e-07, |
|
"loss": 0.0696, |
|
"reward": 1.0833333656191826, |
|
"reward_std": 0.5055268332362175, |
|
"rewards/equation_reward_func": 0.20833333767950535, |
|
"rewards/format_reward_func": 0.8750000149011612, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 66.66666889190674, |
|
"epoch": 0.023466666666666667, |
|
"grad_norm": 1.3772388429149782, |
|
"kl": 0.913818359375, |
|
"learning_rate": 8.298985719595823e-07, |
|
"loss": 0.0366, |
|
"reward": 1.0416667014360428, |
|
"reward_std": 0.3589930906891823, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.916666679084301, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 69.39583539962769, |
|
"epoch": 0.023644444444444444, |
|
"grad_norm": 3.244989830519802, |
|
"kl": 1.36328125, |
|
"learning_rate": 8.271827720024438e-07, |
|
"loss": 0.0545, |
|
"reward": 1.2083333730697632, |
|
"reward_std": 0.4152076169848442, |
|
"rewards/equation_reward_func": 0.25000000558793545, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 72.12500286102295, |
|
"epoch": 0.023822222222222224, |
|
"grad_norm": 2.0362764803071363, |
|
"kl": 1.7900390625, |
|
"learning_rate": 8.244499850989451e-07, |
|
"loss": 0.0715, |
|
"reward": 1.0000000521540642, |
|
"reward_std": 0.5644823275506496, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 77.43750381469727, |
|
"epoch": 0.024, |
|
"grad_norm": 0.9861591115726814, |
|
"kl": 0.634521484375, |
|
"learning_rate": 8.21700353132182e-07, |
|
"loss": 0.0254, |
|
"reward": 1.0625000298023224, |
|
"reward_std": 0.29669978097081184, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 74.06250095367432, |
|
"epoch": 0.024177777777777777, |
|
"grad_norm": 0.9770205438349707, |
|
"kl": 0.664306640625, |
|
"learning_rate": 8.189340188598262e-07, |
|
"loss": 0.0266, |
|
"reward": 0.979166679084301, |
|
"reward_std": 0.2591874338686466, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.916666679084301, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 79.12500238418579, |
|
"epoch": 0.024355555555555554, |
|
"grad_norm": 2.110834692267425, |
|
"kl": 1.36279296875, |
|
"learning_rate": 8.161511259067132e-07, |
|
"loss": 0.0545, |
|
"reward": 0.8750000298023224, |
|
"reward_std": 0.3977733254432678, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 75.68750286102295, |
|
"epoch": 0.024533333333333334, |
|
"grad_norm": 1.8894230791150959, |
|
"kl": 1.076416015625, |
|
"learning_rate": 8.133518187573862e-07, |
|
"loss": 0.0431, |
|
"reward": 1.0625000298023224, |
|
"reward_std": 0.40168892964720726, |
|
"rewards/equation_reward_func": 0.1458333358168602, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 77.06250095367432, |
|
"epoch": 0.02471111111111111, |
|
"grad_norm": 2.8491636564476788, |
|
"kl": 1.8115234375, |
|
"learning_rate": 8.105362427485942e-07, |
|
"loss": 0.0725, |
|
"reward": 0.9791666939854622, |
|
"reward_std": 0.5409447588026524, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.8333333656191826, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 80.18750190734863, |
|
"epoch": 0.024888888888888887, |
|
"grad_norm": 1.0644881637020627, |
|
"kl": 1.40087890625, |
|
"learning_rate": 8.077045440617464e-07, |
|
"loss": 0.0561, |
|
"reward": 0.9583333656191826, |
|
"reward_std": 0.268673874437809, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.916666679084301, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 80.12500190734863, |
|
"epoch": 0.025066666666666668, |
|
"grad_norm": 2.41329367830394, |
|
"kl": 2.546630859375, |
|
"learning_rate": 8.048568697153222e-07, |
|
"loss": 0.1021, |
|
"reward": 1.1250000298023224, |
|
"reward_std": 0.37628915905952454, |
|
"rewards/equation_reward_func": 0.1875000037252903, |
|
"rewards/format_reward_func": 0.9375000149011612, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 78.43750286102295, |
|
"epoch": 0.025244444444444444, |
|
"grad_norm": 2.160199231813677, |
|
"kl": 2.1884765625, |
|
"learning_rate": 8.019933675572388e-07, |
|
"loss": 0.0875, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.2471896894276142, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.9375000149011612, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 80.20833587646484, |
|
"epoch": 0.02542222222222222, |
|
"grad_norm": 2.3799026133423715, |
|
"kl": 1.010009765625, |
|
"learning_rate": 7.991141862571749e-07, |
|
"loss": 0.0404, |
|
"reward": 1.0625000223517418, |
|
"reward_std": 0.3212082237005234, |
|
"rewards/equation_reward_func": 0.1458333358168602, |
|
"rewards/format_reward_func": 0.916666679084301, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 77.85416984558105, |
|
"epoch": 0.0256, |
|
"grad_norm": 2.3486898873600084, |
|
"kl": 2.1669921875, |
|
"learning_rate": 7.962194752988518e-07, |
|
"loss": 0.0868, |
|
"reward": 0.8958333656191826, |
|
"reward_std": 0.4273201934993267, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 81.31250286102295, |
|
"epoch": 0.025777777777777778, |
|
"grad_norm": 5.852632022601826, |
|
"kl": 0.934326171875, |
|
"learning_rate": 7.933093849722723e-07, |
|
"loss": 0.0374, |
|
"reward": 1.166666716337204, |
|
"reward_std": 0.41935470327734947, |
|
"rewards/equation_reward_func": 0.2083333395421505, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 79.81250190734863, |
|
"epoch": 0.025955555555555555, |
|
"grad_norm": 3.627317817877323, |
|
"kl": 1.385498046875, |
|
"learning_rate": 7.903840663659184e-07, |
|
"loss": 0.0555, |
|
"reward": 0.9791666865348816, |
|
"reward_std": 0.23215004801750183, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.9375000149011612, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 79.97916889190674, |
|
"epoch": 0.026133333333333335, |
|
"grad_norm": 3.3314477031020973, |
|
"kl": 0.618408203125, |
|
"learning_rate": 7.874436713589063e-07, |
|
"loss": 0.0248, |
|
"reward": 1.1250000298023224, |
|
"reward_std": 0.2957112640142441, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.9791666716337204, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 77.14583683013916, |
|
"epoch": 0.02631111111111111, |
|
"grad_norm": 1.5297879763799356, |
|
"kl": 1.740966796875, |
|
"learning_rate": 7.844883526131013e-07, |
|
"loss": 0.0696, |
|
"reward": 1.0208333656191826, |
|
"reward_std": 0.4257992319762707, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 74.06250238418579, |
|
"epoch": 0.026488888888888888, |
|
"grad_norm": 6.233097760863968, |
|
"kl": 3.5986328125, |
|
"learning_rate": 7.815182635651912e-07, |
|
"loss": 0.1439, |
|
"reward": 0.9375000298023224, |
|
"reward_std": 0.3627704530954361, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.8750000223517418, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 73.02083539962769, |
|
"epoch": 0.02666666666666667, |
|
"grad_norm": 5.958845153705178, |
|
"kl": 4.09375, |
|
"learning_rate": 7.785335584187219e-07, |
|
"loss": 0.1641, |
|
"reward": 0.9375000223517418, |
|
"reward_std": 0.26070838794112206, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 73.68750095367432, |
|
"epoch": 0.026844444444444445, |
|
"grad_norm": 2.37146983586077, |
|
"kl": 1.56298828125, |
|
"learning_rate": 7.755343921360886e-07, |
|
"loss": 0.0625, |
|
"reward": 0.958333358168602, |
|
"reward_std": 0.2742270827293396, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.8958333432674408, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 78.08333587646484, |
|
"epoch": 0.027022222222222222, |
|
"grad_norm": 0.8124773136751642, |
|
"kl": 0.678466796875, |
|
"learning_rate": 7.725209204304928e-07, |
|
"loss": 0.0271, |
|
"reward": 0.937500037252903, |
|
"reward_std": 0.31970491260290146, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 72.68750238418579, |
|
"epoch": 0.0272, |
|
"grad_norm": 1.8438522048436141, |
|
"kl": 0.5576171875, |
|
"learning_rate": 7.694932997578564e-07, |
|
"loss": 0.0223, |
|
"reward": 1.1250000298023224, |
|
"reward_std": 0.3796723149716854, |
|
"rewards/equation_reward_func": 0.16666666977107525, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 78.06250238418579, |
|
"epoch": 0.02737777777777778, |
|
"grad_norm": 2.3953021804030485, |
|
"kl": 1.169921875, |
|
"learning_rate": 7.664516873086987e-07, |
|
"loss": 0.0469, |
|
"reward": 1.1666667014360428, |
|
"reward_std": 0.44867006316781044, |
|
"rewards/equation_reward_func": 0.22916666977107525, |
|
"rewards/format_reward_func": 0.9375000149011612, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 77.79166984558105, |
|
"epoch": 0.027555555555555555, |
|
"grad_norm": 0.8215867255038918, |
|
"kl": 0.425048828125, |
|
"learning_rate": 7.633962409999764e-07, |
|
"loss": 0.017, |
|
"reward": 1.1041667014360428, |
|
"reward_std": 0.3397653251886368, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 74.83333492279053, |
|
"epoch": 0.027733333333333332, |
|
"grad_norm": 0.7182201741238872, |
|
"kl": 0.4013671875, |
|
"learning_rate": 7.603271194668835e-07, |
|
"loss": 0.0161, |
|
"reward": 1.2291667014360428, |
|
"reward_std": 0.36417657881975174, |
|
"rewards/equation_reward_func": 0.2291666716337204, |
|
"rewards/format_reward_func": 1.0, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 77.02083683013916, |
|
"epoch": 0.027911111111111112, |
|
"grad_norm": 0.8227456410541343, |
|
"kl": 0.458984375, |
|
"learning_rate": 7.572444820546155e-07, |
|
"loss": 0.0184, |
|
"reward": 0.958333358168602, |
|
"reward_std": 0.20412414148449898, |
|
"rewards/equation_reward_func": 0.02083333395421505, |
|
"rewards/format_reward_func": 0.9375000149011612, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 73.06250190734863, |
|
"epoch": 0.02808888888888889, |
|
"grad_norm": 1.4714763458802167, |
|
"kl": 0.7119140625, |
|
"learning_rate": 7.541484888100973e-07, |
|
"loss": 0.0285, |
|
"reward": 0.9791666865348816, |
|
"reward_std": 0.4867551550269127, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 72.87500190734863, |
|
"epoch": 0.028266666666666666, |
|
"grad_norm": 1.4514512402843838, |
|
"kl": 0.871337890625, |
|
"learning_rate": 7.510393004736722e-07, |
|
"loss": 0.0349, |
|
"reward": 0.9791666939854622, |
|
"reward_std": 0.5999412871897221, |
|
"rewards/equation_reward_func": 0.16666666977107525, |
|
"rewards/format_reward_func": 0.812500037252903, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 67.31250143051147, |
|
"epoch": 0.028444444444444446, |
|
"grad_norm": 2.116607202051419, |
|
"kl": 1.553466796875, |
|
"learning_rate": 7.479170784707574e-07, |
|
"loss": 0.0621, |
|
"reward": 0.8333333544433117, |
|
"reward_std": 0.4538251422345638, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.7708333544433117, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 64.12500143051147, |
|
"epoch": 0.028622222222222223, |
|
"grad_norm": 4.988492970714175, |
|
"kl": 2.7998046875, |
|
"learning_rate": 7.447819849034628e-07, |
|
"loss": 0.1121, |
|
"reward": 0.666666692122817, |
|
"reward_std": 0.49578551203012466, |
|
"rewards/equation_reward_func": 0.02083333395421505, |
|
"rewards/format_reward_func": 0.6458333563059568, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 61.10416841506958, |
|
"epoch": 0.0288, |
|
"grad_norm": 6.132561250847984, |
|
"kl": 3.8466796875, |
|
"learning_rate": 7.416341825421753e-07, |
|
"loss": 0.1538, |
|
"reward": 0.5625000111758709, |
|
"reward_std": 0.5616070628166199, |
|
"rewards/equation_reward_func": 0.02083333395421505, |
|
"rewards/format_reward_func": 0.5416666753590107, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 62.312501430511475, |
|
"epoch": 0.02897777777777778, |
|
"grad_norm": 4.666254586473765, |
|
"kl": 3.7216796875, |
|
"learning_rate": 7.384738348171068e-07, |
|
"loss": 0.1486, |
|
"reward": 0.8541666939854622, |
|
"reward_std": 0.5078980773687363, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.7291666828095913, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 70.00000286102295, |
|
"epoch": 0.029155555555555556, |
|
"grad_norm": 1.4445543590308407, |
|
"kl": 1.42333984375, |
|
"learning_rate": 7.353011058098103e-07, |
|
"loss": 0.057, |
|
"reward": 0.937500037252903, |
|
"reward_std": 0.5709268674254417, |
|
"rewards/equation_reward_func": 0.1458333358168602, |
|
"rewards/format_reward_func": 0.7916666939854622, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 72.79167079925537, |
|
"epoch": 0.029333333333333333, |
|
"grad_norm": 1.8455254519389834, |
|
"kl": 1.66259765625, |
|
"learning_rate": 7.321161602446601e-07, |
|
"loss": 0.0666, |
|
"reward": 1.0416667014360428, |
|
"reward_std": 0.40530357509851456, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 67.22916793823242, |
|
"epoch": 0.02951111111111111, |
|
"grad_norm": 1.2830647533382114, |
|
"kl": 0.5908203125, |
|
"learning_rate": 7.289191634803002e-07, |
|
"loss": 0.0236, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.2831810861825943, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.9375000149011612, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 67.145836353302, |
|
"epoch": 0.02968888888888889, |
|
"grad_norm": 1.5595729319752896, |
|
"kl": 0.570068359375, |
|
"learning_rate": 7.257102815010584e-07, |
|
"loss": 0.0228, |
|
"reward": 1.1458333656191826, |
|
"reward_std": 0.4072421304881573, |
|
"rewards/equation_reward_func": 0.20833333767950535, |
|
"rewards/format_reward_func": 0.9375000074505806, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 65.50000190734863, |
|
"epoch": 0.029866666666666666, |
|
"grad_norm": 0.8324762943011298, |
|
"kl": 1.06201171875, |
|
"learning_rate": 7.224896809083297e-07, |
|
"loss": 0.0424, |
|
"reward": 1.1666667014360428, |
|
"reward_std": 0.37223926186561584, |
|
"rewards/equation_reward_func": 0.20833333767950535, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 62.729167461395264, |
|
"epoch": 0.030044444444444443, |
|
"grad_norm": 2.4594683537617437, |
|
"kl": 0.987060546875, |
|
"learning_rate": 7.192575289119245e-07, |
|
"loss": 0.0395, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.39079635962843895, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 58.02083444595337, |
|
"epoch": 0.030222222222222223, |
|
"grad_norm": 1.7347800603929018, |
|
"kl": 2.14453125, |
|
"learning_rate": 7.160139933213898e-07, |
|
"loss": 0.0858, |
|
"reward": 1.1666667014360428, |
|
"reward_std": 0.3841203413903713, |
|
"rewards/equation_reward_func": 0.2083333358168602, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 55.604167461395264, |
|
"epoch": 0.0304, |
|
"grad_norm": 2.866447003547887, |
|
"kl": 3.4658203125, |
|
"learning_rate": 7.12759242537295e-07, |
|
"loss": 0.1386, |
|
"reward": 1.0000000223517418, |
|
"reward_std": 0.41085678339004517, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.8750000223517418, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 53.104167461395264, |
|
"epoch": 0.030577777777777777, |
|
"grad_norm": 2.387493786129646, |
|
"kl": 4.9072265625, |
|
"learning_rate": 7.094934455424888e-07, |
|
"loss": 0.1961, |
|
"reward": 1.0416667088866234, |
|
"reward_std": 0.49133747816085815, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.8750000223517418, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 55.875001430511475, |
|
"epoch": 0.030755555555555557, |
|
"grad_norm": 2.9662179988283444, |
|
"kl": 2.531494140625, |
|
"learning_rate": 7.06216771893327e-07, |
|
"loss": 0.1012, |
|
"reward": 0.9375000223517418, |
|
"reward_std": 0.28219256922602654, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 51.500001430511475, |
|
"epoch": 0.030933333333333334, |
|
"grad_norm": 1.3148679005802646, |
|
"kl": 2.0830078125, |
|
"learning_rate": 7.029293917108677e-07, |
|
"loss": 0.0833, |
|
"reward": 0.9166667014360428, |
|
"reward_std": 0.44083888083696365, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.8333333507180214, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 55.500000953674316, |
|
"epoch": 0.03111111111111111, |
|
"grad_norm": 1.940206757858531, |
|
"kl": 0.938232421875, |
|
"learning_rate": 6.996314756720408e-07, |
|
"loss": 0.0375, |
|
"reward": 1.1250000298023224, |
|
"reward_std": 0.3506578877568245, |
|
"rewards/equation_reward_func": 0.16666666977107525, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 57.583335399627686, |
|
"epoch": 0.03128888888888889, |
|
"grad_norm": 0.4970228156471284, |
|
"kl": 0.618408203125, |
|
"learning_rate": 6.963231950007844e-07, |
|
"loss": 0.0247, |
|
"reward": 1.208333358168602, |
|
"reward_std": 0.30354244261980057, |
|
"rewards/equation_reward_func": 0.2083333358168602, |
|
"rewards/format_reward_func": 1.0, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 57.22916793823242, |
|
"epoch": 0.031466666666666664, |
|
"grad_norm": 2.1203602590832387, |
|
"kl": 0.69189453125, |
|
"learning_rate": 6.930047214591568e-07, |
|
"loss": 0.0277, |
|
"reward": 1.2500000447034836, |
|
"reward_std": 0.47683026641607285, |
|
"rewards/equation_reward_func": 0.3125000074505806, |
|
"rewards/format_reward_func": 0.9375000149011612, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 54.47916793823242, |
|
"epoch": 0.03164444444444445, |
|
"grad_norm": 1.2499222352319177, |
|
"kl": 1.194091796875, |
|
"learning_rate": 6.896762273384178e-07, |
|
"loss": 0.0477, |
|
"reward": 1.0416667014360428, |
|
"reward_std": 0.46985330432653427, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.895833358168602, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 48.437500953674316, |
|
"epoch": 0.031822222222222224, |
|
"grad_norm": 1.861799414305649, |
|
"kl": 1.77197265625, |
|
"learning_rate": 6.863378854500845e-07, |
|
"loss": 0.0708, |
|
"reward": 0.937500037252903, |
|
"reward_std": 0.3627704605460167, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.8750000149011612, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 52.33333492279053, |
|
"epoch": 0.032, |
|
"grad_norm": 1.4601874721448491, |
|
"kl": 1.70166015625, |
|
"learning_rate": 6.829898691169579e-07, |
|
"loss": 0.068, |
|
"reward": 0.9375000298023224, |
|
"reward_std": 0.27258946001529694, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.8958333432674408, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 52.47916793823242, |
|
"epoch": 0.03217777777777778, |
|
"grad_norm": 3.4537886700647316, |
|
"kl": 1.458251953125, |
|
"learning_rate": 6.796323521641256e-07, |
|
"loss": 0.0584, |
|
"reward": 1.0416667237877846, |
|
"reward_std": 0.5502192042768002, |
|
"rewards/equation_reward_func": 0.18750000558793545, |
|
"rewards/format_reward_func": 0.854166679084301, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 52.812501430511475, |
|
"epoch": 0.032355555555555554, |
|
"grad_norm": 1.1876803036822332, |
|
"kl": 1.65576171875, |
|
"learning_rate": 6.762655089099353e-07, |
|
"loss": 0.0663, |
|
"reward": 1.1041667014360428, |
|
"reward_std": 0.4688647910952568, |
|
"rewards/equation_reward_func": 0.18750000558793545, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 50.812501430511475, |
|
"epoch": 0.03253333333333333, |
|
"grad_norm": 3.60684735229285, |
|
"kl": 2.094482421875, |
|
"learning_rate": 6.728895141569462e-07, |
|
"loss": 0.0838, |
|
"reward": 1.020833358168602, |
|
"reward_std": 0.4737688973546028, |
|
"rewards/equation_reward_func": 0.1458333358168602, |
|
"rewards/format_reward_func": 0.8750000298023224, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 50.58333444595337, |
|
"epoch": 0.032711111111111114, |
|
"grad_norm": 5.166407984979809, |
|
"kl": 6.22412109375, |
|
"learning_rate": 6.695045431828524e-07, |
|
"loss": 0.2489, |
|
"reward": 0.9166667014360428, |
|
"reward_std": 0.44867006316781044, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.8125000074505806, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 52.20833396911621, |
|
"epoch": 0.03288888888888889, |
|
"grad_norm": 5.5000432690940375, |
|
"kl": 6.056640625, |
|
"learning_rate": 6.661107717313823e-07, |
|
"loss": 0.2423, |
|
"reward": 1.020833358168602, |
|
"reward_std": 0.41912320628762245, |
|
"rewards/equation_reward_func": 0.14583333395421505, |
|
"rewards/format_reward_func": 0.8750000149011612, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 48.854167461395264, |
|
"epoch": 0.03306666666666667, |
|
"grad_norm": 7.26207758284529, |
|
"kl": 9.0859375, |
|
"learning_rate": 6.627083760031754e-07, |
|
"loss": 0.3635, |
|
"reward": 0.9375000298023224, |
|
"reward_std": 0.6717033982276917, |
|
"rewards/equation_reward_func": 0.18750000558793545, |
|
"rewards/format_reward_func": 0.7500000298023224, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 49.250000953674316, |
|
"epoch": 0.033244444444444445, |
|
"grad_norm": 6.343563025728324, |
|
"kl": 7.396484375, |
|
"learning_rate": 6.592975326466336e-07, |
|
"loss": 0.2961, |
|
"reward": 0.895833358168602, |
|
"reward_std": 0.393995963037014, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.8125000149011612, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 49.625000953674316, |
|
"epoch": 0.03342222222222222, |
|
"grad_norm": 2.659807025481146, |
|
"kl": 2.344482421875, |
|
"learning_rate": 6.558784187487494e-07, |
|
"loss": 0.0939, |
|
"reward": 1.2083333730697632, |
|
"reward_std": 0.5645233578979969, |
|
"rewards/equation_reward_func": 0.31250000558793545, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 52.562500953674316, |
|
"epoch": 0.0336, |
|
"grad_norm": 1.2466332143572052, |
|
"kl": 2.30419921875, |
|
"learning_rate": 6.524512118259121e-07, |
|
"loss": 0.0922, |
|
"reward": 1.0625000298023224, |
|
"reward_std": 0.29669977352023125, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.9375000149011612, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 51.291667461395264, |
|
"epoch": 0.033777777777777775, |
|
"grad_norm": 0.8974480191913635, |
|
"kl": 1.81787109375, |
|
"learning_rate": 6.490160898146918e-07, |
|
"loss": 0.0727, |
|
"reward": 1.1458333879709244, |
|
"reward_std": 0.3842546343803406, |
|
"rewards/equation_reward_func": 0.18750000558793545, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 51.75000190734863, |
|
"epoch": 0.03395555555555556, |
|
"grad_norm": 2.4757198804027007, |
|
"kl": 1.689208984375, |
|
"learning_rate": 6.455732310626004e-07, |
|
"loss": 0.0675, |
|
"reward": 1.0416667014360428, |
|
"reward_std": 0.47692746296525, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.8750000223517418, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 51.187500953674316, |
|
"epoch": 0.034133333333333335, |
|
"grad_norm": 1.3507594543460832, |
|
"kl": 1.50927734375, |
|
"learning_rate": 6.421228143188324e-07, |
|
"loss": 0.0604, |
|
"reward": 1.1458333730697632, |
|
"reward_std": 0.49200813844799995, |
|
"rewards/equation_reward_func": 0.2291666716337204, |
|
"rewards/format_reward_func": 0.916666679084301, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 51.687501430511475, |
|
"epoch": 0.03431111111111111, |
|
"grad_norm": 0.6767908253371119, |
|
"kl": 1.2314453125, |
|
"learning_rate": 6.386650187249843e-07, |
|
"loss": 0.0493, |
|
"reward": 1.0000000223517418, |
|
"reward_std": 0.23116153106093407, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.9375000074505806, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 50.14583444595337, |
|
"epoch": 0.03448888888888889, |
|
"grad_norm": 1.0489566647059154, |
|
"kl": 1.8955078125, |
|
"learning_rate": 6.352000238057539e-07, |
|
"loss": 0.0759, |
|
"reward": 0.9791666939854622, |
|
"reward_std": 0.5161184519529343, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 51.62500190734863, |
|
"epoch": 0.034666666666666665, |
|
"grad_norm": 1.6832436149314471, |
|
"kl": 2.7314453125, |
|
"learning_rate": 6.317280094596196e-07, |
|
"loss": 0.1092, |
|
"reward": 0.8541666939854622, |
|
"reward_std": 0.4447544738650322, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.7916666865348816, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 45.08333396911621, |
|
"epoch": 0.03484444444444444, |
|
"grad_norm": 4.004059770646148, |
|
"kl": 5.3125, |
|
"learning_rate": 6.282491559495004e-07, |
|
"loss": 0.2125, |
|
"reward": 0.9375000111758709, |
|
"reward_std": 0.45106470584869385, |
|
"rewards/equation_reward_func": 0.14583333395421505, |
|
"rewards/format_reward_func": 0.7916666828095913, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 43.437500953674316, |
|
"epoch": 0.035022222222222225, |
|
"grad_norm": 5.168360687547481, |
|
"kl": 6.869140625, |
|
"learning_rate": 6.247636438933962e-07, |
|
"loss": 0.2745, |
|
"reward": 0.791666679084301, |
|
"reward_std": 0.5768304541707039, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.7083333618938923, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 51.89583444595337, |
|
"epoch": 0.0352, |
|
"grad_norm": 1.3469737406143796, |
|
"kl": 4.0185546875, |
|
"learning_rate": 6.212716542550112e-07, |
|
"loss": 0.1607, |
|
"reward": 0.937500037252903, |
|
"reward_std": 0.49327604100108147, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.791666679084301, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 50.45833492279053, |
|
"epoch": 0.03537777777777778, |
|
"grad_norm": 1.0759183343173833, |
|
"kl": 1.4365234375, |
|
"learning_rate": 6.177733683343578e-07, |
|
"loss": 0.0576, |
|
"reward": 1.166666716337204, |
|
"reward_std": 0.5774685852229595, |
|
"rewards/equation_reward_func": 0.2500000074505806, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 52.16666793823242, |
|
"epoch": 0.035555555555555556, |
|
"grad_norm": 1.3378011133376908, |
|
"kl": 1.564453125, |
|
"learning_rate": 6.142689677583445e-07, |
|
"loss": 0.0626, |
|
"reward": 1.1250000298023224, |
|
"reward_std": 0.48728758841753006, |
|
"rewards/equation_reward_func": 0.20833333767950535, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 53.35416793823242, |
|
"epoch": 0.03573333333333333, |
|
"grad_norm": 3.0978024708495218, |
|
"kl": 1.87646484375, |
|
"learning_rate": 6.107586344713451e-07, |
|
"loss": 0.075, |
|
"reward": 1.1041666865348816, |
|
"reward_std": 0.4963582567870617, |
|
"rewards/equation_reward_func": 0.18750000558793545, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 54.60416793823242, |
|
"epoch": 0.03591111111111111, |
|
"grad_norm": 1.4803852973189526, |
|
"kl": 1.585693359375, |
|
"learning_rate": 6.072425507257527e-07, |
|
"loss": 0.0633, |
|
"reward": 1.041666679084301, |
|
"reward_std": 0.2958494834601879, |
|
"rewards/equation_reward_func": 0.12500000186264515, |
|
"rewards/format_reward_func": 0.9166666716337204, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 50.97916793823242, |
|
"epoch": 0.036088888888888886, |
|
"grad_norm": 1.0545514121261095, |
|
"kl": 3.23779296875, |
|
"learning_rate": 6.03720899072518e-07, |
|
"loss": 0.1296, |
|
"reward": 0.9791667014360428, |
|
"reward_std": 0.3412862755358219, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 53.29166793823242, |
|
"epoch": 0.03626666666666667, |
|
"grad_norm": 1.258708711483678, |
|
"kl": 4.0927734375, |
|
"learning_rate": 6.001938623516705e-07, |
|
"loss": 0.1635, |
|
"reward": 0.895833358168602, |
|
"reward_std": 0.5528258420526981, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.7916666939854622, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 52.562501430511475, |
|
"epoch": 0.036444444444444446, |
|
"grad_norm": 1.192627874291934, |
|
"kl": 4.099365234375, |
|
"learning_rate": 5.966616236828262e-07, |
|
"loss": 0.1639, |
|
"reward": 0.8750000204890966, |
|
"reward_std": 0.5111859105527401, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.7291666772216558, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 51.97916841506958, |
|
"epoch": 0.03662222222222222, |
|
"grad_norm": 0.9273331385562189, |
|
"kl": 2.384033203125, |
|
"learning_rate": 5.931243664556802e-07, |
|
"loss": 0.0952, |
|
"reward": 1.0208333730697632, |
|
"reward_std": 0.40168892592191696, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.8958333432674408, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 52.937500953674316, |
|
"epoch": 0.0368, |
|
"grad_norm": 1.8010297199927365, |
|
"kl": 3.5654296875, |
|
"learning_rate": 5.895822743204855e-07, |
|
"loss": 0.1426, |
|
"reward": 0.9583333730697632, |
|
"reward_std": 0.5215550065040588, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.8125000223517418, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 51.60416841506958, |
|
"epoch": 0.036977777777777776, |
|
"grad_norm": 2.2135535055122597, |
|
"kl": 4.67578125, |
|
"learning_rate": 5.860355311785175e-07, |
|
"loss": 0.1869, |
|
"reward": 0.7916667014360428, |
|
"reward_std": 0.5062714368104935, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.7291666828095913, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 51.187501430511475, |
|
"epoch": 0.03715555555555555, |
|
"grad_norm": 1.4267067115373708, |
|
"kl": 5.53515625, |
|
"learning_rate": 5.824843211725264e-07, |
|
"loss": 0.221, |
|
"reward": 0.7916666939854622, |
|
"reward_std": 0.5978475920855999, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.6875000149011612, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 46.83333492279053, |
|
"epoch": 0.037333333333333336, |
|
"grad_norm": 2.8629336811508863, |
|
"kl": 5.829345703125, |
|
"learning_rate": 5.78928828677177e-07, |
|
"loss": 0.2335, |
|
"reward": 0.7916667014360428, |
|
"reward_std": 0.6519223563373089, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.6458333507180214, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 56.500001430511475, |
|
"epoch": 0.03751111111111111, |
|
"grad_norm": 4.250283297994686, |
|
"kl": 3.20849609375, |
|
"learning_rate": 5.753692382894759e-07, |
|
"loss": 0.1283, |
|
"reward": 0.8958333544433117, |
|
"reward_std": 0.6043893173336983, |
|
"rewards/equation_reward_func": 0.1875000037252903, |
|
"rewards/format_reward_func": 0.7083333469927311, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 54.375001430511475, |
|
"epoch": 0.03768888888888889, |
|
"grad_norm": 1.3837454584410787, |
|
"kl": 2.209228515625, |
|
"learning_rate": 5.718057348191874e-07, |
|
"loss": 0.0884, |
|
"reward": 0.8541666902601719, |
|
"reward_std": 0.6237337328493595, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.7083333544433117, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 51.89583444595337, |
|
"epoch": 0.037866666666666667, |
|
"grad_norm": 2.4493534707311713, |
|
"kl": 2.712158203125, |
|
"learning_rate": 5.682385032792385e-07, |
|
"loss": 0.1085, |
|
"reward": 0.7916666753590107, |
|
"reward_std": 0.5281161963939667, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.7083333544433117, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 55.89583492279053, |
|
"epoch": 0.03804444444444444, |
|
"grad_norm": 2.2208225510289012, |
|
"kl": 2.525634765625, |
|
"learning_rate": 5.646677288761132e-07, |
|
"loss": 0.101, |
|
"reward": 0.9791667014360428, |
|
"reward_std": 0.5173863507807255, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.8125000149011612, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 57.33333444595337, |
|
"epoch": 0.03822222222222222, |
|
"grad_norm": 0.8710411293209732, |
|
"kl": 2.434814453125, |
|
"learning_rate": 5.610935970002365e-07, |
|
"loss": 0.0974, |
|
"reward": 0.9375000223517418, |
|
"reward_std": 0.5438718348741531, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.7708333507180214, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 52.89583492279053, |
|
"epoch": 0.0384, |
|
"grad_norm": 3.567048205512723, |
|
"kl": 5.6318359375, |
|
"learning_rate": 5.575162932163501e-07, |
|
"loss": 0.2252, |
|
"reward": 0.7916666865348816, |
|
"reward_std": 0.6442962922155857, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.6666666865348816, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 47.85416793823242, |
|
"epoch": 0.03857777777777778, |
|
"grad_norm": 5.381576408790184, |
|
"kl": 7.646484375, |
|
"learning_rate": 5.53936003253877e-07, |
|
"loss": 0.3065, |
|
"reward": 0.6041666883975267, |
|
"reward_std": 0.5564196482300758, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.5625000167638063, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 57.354167461395264, |
|
"epoch": 0.03875555555555556, |
|
"grad_norm": 2.8527721738006755, |
|
"kl": 5.25146484375, |
|
"learning_rate": 5.503529129972792e-07, |
|
"loss": 0.2104, |
|
"reward": 0.8333333618938923, |
|
"reward_std": 0.5661325417459011, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.7083333469927311, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 52.77083444595337, |
|
"epoch": 0.038933333333333334, |
|
"grad_norm": 0.9510637073032893, |
|
"kl": 4.537109375, |
|
"learning_rate": 5.467672084764065e-07, |
|
"loss": 0.1813, |
|
"reward": 0.7083333469927311, |
|
"reward_std": 0.5277270041406155, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.6458333469927311, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 56.437501430511475, |
|
"epoch": 0.03911111111111111, |
|
"grad_norm": 1.4626021636984767, |
|
"kl": 3.047607421875, |
|
"learning_rate": 5.431790758568388e-07, |
|
"loss": 0.122, |
|
"reward": 0.7916666902601719, |
|
"reward_std": 0.5062714405357838, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.7083333618938923, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 52.29166793823242, |
|
"epoch": 0.03928888888888889, |
|
"grad_norm": 1.2504772253255731, |
|
"kl": 3.7666015625, |
|
"learning_rate": 5.395887014302191e-07, |
|
"loss": 0.1507, |
|
"reward": 0.8125000186264515, |
|
"reward_std": 0.4043150581419468, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.7708333544433117, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 58.39583492279053, |
|
"epoch": 0.039466666666666664, |
|
"grad_norm": 1.7464009649268544, |
|
"kl": 3.71337890625, |
|
"learning_rate": 5.359962716045835e-07, |
|
"loss": 0.1485, |
|
"reward": 0.7916666772216558, |
|
"reward_std": 0.5010788105428219, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.7083333563059568, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 61.41666841506958, |
|
"epoch": 0.03964444444444445, |
|
"grad_norm": 3.039553978693204, |
|
"kl": 1.073974609375, |
|
"learning_rate": 5.324019728946812e-07, |
|
"loss": 0.043, |
|
"reward": 0.854166679084301, |
|
"reward_std": 0.36124951019883156, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.8125000149011612, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 56.50000238418579, |
|
"epoch": 0.039822222222222224, |
|
"grad_norm": 2.0365403750402704, |
|
"kl": 1.76123046875, |
|
"learning_rate": 5.288059919122921e-07, |
|
"loss": 0.0705, |
|
"reward": 0.9583333656191826, |
|
"reward_std": 0.45534609258174896, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.8541666865348816, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 61.14583492279053, |
|
"epoch": 0.04, |
|
"grad_norm": 2.811287393829535, |
|
"kl": 0.533447265625, |
|
"learning_rate": 5.252085153565374e-07, |
|
"loss": 0.0213, |
|
"reward": 1.1875000298023224, |
|
"reward_std": 0.37868379428982735, |
|
"rewards/equation_reward_func": 0.2291666716337204, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 59.437501430511475, |
|
"epoch": 0.04017777777777778, |
|
"grad_norm": 2.3086911193675315, |
|
"kl": 0.8291015625, |
|
"learning_rate": 5.216097300041869e-07, |
|
"loss": 0.0332, |
|
"reward": 1.0416667014360428, |
|
"reward_std": 0.3477308079600334, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 62.20833492279053, |
|
"epoch": 0.040355555555555554, |
|
"grad_norm": 4.276021428219898, |
|
"kl": 0.727783203125, |
|
"learning_rate": 5.180098226999618e-07, |
|
"loss": 0.0291, |
|
"reward": 1.062500037252903, |
|
"reward_std": 0.4822668209671974, |
|
"rewards/equation_reward_func": 0.18750000186264515, |
|
"rewards/format_reward_func": 0.8750000149011612, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 65.10416889190674, |
|
"epoch": 0.04053333333333333, |
|
"grad_norm": 1.2764419644302698, |
|
"kl": 0.894775390625, |
|
"learning_rate": 5.144089803468332e-07, |
|
"loss": 0.0358, |
|
"reward": 1.1250000447034836, |
|
"reward_std": 0.4984116405248642, |
|
"rewards/equation_reward_func": 0.2083333395421505, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 55.812501430511475, |
|
"epoch": 0.040711111111111115, |
|
"grad_norm": 1.717756013089439, |
|
"kl": 1.95361328125, |
|
"learning_rate": 5.108073898963193e-07, |
|
"loss": 0.0781, |
|
"reward": 0.9583333656191826, |
|
"reward_std": 0.3477308079600334, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 57.02083444595337, |
|
"epoch": 0.04088888888888889, |
|
"grad_norm": 1.9105584928374175, |
|
"kl": 3.0869140625, |
|
"learning_rate": 5.072052383387786e-07, |
|
"loss": 0.1236, |
|
"reward": 0.9375000260770321, |
|
"reward_std": 0.37575671821832657, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.8333333469927311, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 53.437501430511475, |
|
"epoch": 0.04106666666666667, |
|
"grad_norm": 3.2606237368453375, |
|
"kl": 6.6767578125, |
|
"learning_rate": 5.036027126937013e-07, |
|
"loss": 0.2667, |
|
"reward": 0.7708333656191826, |
|
"reward_std": 0.5834501683712006, |
|
"rewards/equation_reward_func": 0.12500000186264515, |
|
"rewards/format_reward_func": 0.6458333469927311, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 57.43750190734863, |
|
"epoch": 0.041244444444444445, |
|
"grad_norm": 4.160342344463272, |
|
"kl": 5.7734375, |
|
"learning_rate": 5e-07, |
|
"loss": 0.2307, |
|
"reward": 0.7916666939854622, |
|
"reward_std": 0.550732146948576, |
|
"rewards/equation_reward_func": 0.08333333395421505, |
|
"rewards/format_reward_func": 0.708333358168602, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 50.937501430511475, |
|
"epoch": 0.04142222222222222, |
|
"grad_norm": 1.7460050228805684, |
|
"kl": 5.1337890625, |
|
"learning_rate": 4.963972873062987e-07, |
|
"loss": 0.2054, |
|
"reward": 0.8958333618938923, |
|
"reward_std": 0.609845332801342, |
|
"rewards/equation_reward_func": 0.1875000037252903, |
|
"rewards/format_reward_func": 0.7083333507180214, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 59.77083396911621, |
|
"epoch": 0.0416, |
|
"grad_norm": 0.9068735838301717, |
|
"kl": 2.849609375, |
|
"learning_rate": 4.927947616612215e-07, |
|
"loss": 0.1138, |
|
"reward": 0.958333358168602, |
|
"reward_std": 0.36931218579411507, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.8750000149011612, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 57.58333492279053, |
|
"epoch": 0.041777777777777775, |
|
"grad_norm": 2.6421618291419535, |
|
"kl": 2.1416015625, |
|
"learning_rate": 4.891926101036806e-07, |
|
"loss": 0.0856, |
|
"reward": 0.875000037252903, |
|
"reward_std": 0.45271996036171913, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.8125000223517418, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 61.14583492279053, |
|
"epoch": 0.04195555555555556, |
|
"grad_norm": 2.420138039635954, |
|
"kl": 1.884765625, |
|
"learning_rate": 4.855910196531669e-07, |
|
"loss": 0.0753, |
|
"reward": 0.8541667014360428, |
|
"reward_std": 0.5093041993677616, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.770833358168602, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 57.625000953674316, |
|
"epoch": 0.042133333333333335, |
|
"grad_norm": 2.23529671294081, |
|
"kl": 1.854248046875, |
|
"learning_rate": 4.819901773000383e-07, |
|
"loss": 0.074, |
|
"reward": 0.9791667088866234, |
|
"reward_std": 0.5189073011279106, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.8333333507180214, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 52.47916793823242, |
|
"epoch": 0.04231111111111111, |
|
"grad_norm": 1.6361655131027242, |
|
"kl": 5.17529296875, |
|
"learning_rate": 4.783902699958129e-07, |
|
"loss": 0.2068, |
|
"reward": 0.8125000298023224, |
|
"reward_std": 0.5698040388524532, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.7083333507180214, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 49.375001430511475, |
|
"epoch": 0.04248888888888889, |
|
"grad_norm": 1.222312539805785, |
|
"kl": 5.5263671875, |
|
"learning_rate": 4.747914846434627e-07, |
|
"loss": 0.2211, |
|
"reward": 0.9166667088866234, |
|
"reward_std": 0.6387733817100525, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.7500000298023224, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 51.750001430511475, |
|
"epoch": 0.042666666666666665, |
|
"grad_norm": 1.9824590980545862, |
|
"kl": 4.46875, |
|
"learning_rate": 4.711940080877079e-07, |
|
"loss": 0.1786, |
|
"reward": 1.0208333767950535, |
|
"reward_std": 0.5363415889441967, |
|
"rewards/equation_reward_func": 0.22916667349636555, |
|
"rewards/format_reward_func": 0.7916666753590107, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 54.687500953674316, |
|
"epoch": 0.04284444444444444, |
|
"grad_norm": 1.6661289610091754, |
|
"kl": 4.4775390625, |
|
"learning_rate": 4.675980271053187e-07, |
|
"loss": 0.1791, |
|
"reward": 0.8750000298023224, |
|
"reward_std": 0.4297148250043392, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.8125000149011612, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 50.437501430511475, |
|
"epoch": 0.043022222222222226, |
|
"grad_norm": 3.5074096399017862, |
|
"kl": 6.017578125, |
|
"learning_rate": 4.6400372839541647e-07, |
|
"loss": 0.2406, |
|
"reward": 0.8541666828095913, |
|
"reward_std": 0.5376026295125484, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.7291666828095913, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 52.10416793823242, |
|
"epoch": 0.0432, |
|
"grad_norm": 3.137739294752723, |
|
"kl": 4.3701171875, |
|
"learning_rate": 4.6041129856978083e-07, |
|
"loss": 0.1746, |
|
"reward": 0.8958333656191826, |
|
"reward_std": 0.47406983748078346, |
|
"rewards/equation_reward_func": 0.1041666679084301, |
|
"rewards/format_reward_func": 0.791666679084301, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 56.16666793823242, |
|
"epoch": 0.04337777777777778, |
|
"grad_norm": 1.638039054641607, |
|
"kl": 2.74462890625, |
|
"learning_rate": 4.568209241431614e-07, |
|
"loss": 0.11, |
|
"reward": 0.9375000447034836, |
|
"reward_std": 0.3842546418309212, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.8750000149011612, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 59.791667461395264, |
|
"epoch": 0.043555555555555556, |
|
"grad_norm": 1.4835502515862988, |
|
"kl": 2.812744140625, |
|
"learning_rate": 4.532327915235935e-07, |
|
"loss": 0.1124, |
|
"reward": 1.0416666939854622, |
|
"reward_std": 0.49689069390296936, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.8750000223517418, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 58.87500238418579, |
|
"epoch": 0.04373333333333333, |
|
"grad_norm": 1.2125553729452667, |
|
"kl": 1.87158203125, |
|
"learning_rate": 4.4964708700272086e-07, |
|
"loss": 0.0746, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.3747682049870491, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.8958333432674408, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 59.83333444595337, |
|
"epoch": 0.04391111111111111, |
|
"grad_norm": 0.8400742214946505, |
|
"kl": 1.407470703125, |
|
"learning_rate": 4.4606399674612306e-07, |
|
"loss": 0.0563, |
|
"reward": 1.1041666865348816, |
|
"reward_std": 0.2591874338686466, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 56.31250238418579, |
|
"epoch": 0.044088888888888886, |
|
"grad_norm": 1.5894946494651951, |
|
"kl": 2.568603515625, |
|
"learning_rate": 4.424837067836499e-07, |
|
"loss": 0.1027, |
|
"reward": 1.1458333507180214, |
|
"reward_std": 0.41730131581425667, |
|
"rewards/equation_reward_func": 0.2500000037252903, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 57.97916793823242, |
|
"epoch": 0.04426666666666667, |
|
"grad_norm": 1.5682356442872543, |
|
"kl": 0.796142578125, |
|
"learning_rate": 4.389064029997634e-07, |
|
"loss": 0.0319, |
|
"reward": 1.208333358168602, |
|
"reward_std": 0.4256649389863014, |
|
"rewards/equation_reward_func": 0.2500000037252903, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 53.187501430511475, |
|
"epoch": 0.044444444444444446, |
|
"grad_norm": 1.616534745852269, |
|
"kl": 2.35107421875, |
|
"learning_rate": 4.353322711238869e-07, |
|
"loss": 0.094, |
|
"reward": 1.0833333805203438, |
|
"reward_std": 0.5344030410051346, |
|
"rewards/equation_reward_func": 0.18750000558793545, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 56.437501430511475, |
|
"epoch": 0.04462222222222222, |
|
"grad_norm": 0.9960822449051799, |
|
"kl": 2.34619140625, |
|
"learning_rate": 4.3176149672076143e-07, |
|
"loss": 0.0939, |
|
"reward": 1.0000000149011612, |
|
"reward_std": 0.3006153665482998, |
|
"rewards/equation_reward_func": 0.08333333395421505, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 251 |
|
}, |
|
{ |
|
"completion_length": 58.354167461395264, |
|
"epoch": 0.0448, |
|
"grad_norm": 1.4205394021526243, |
|
"kl": 2.93505859375, |
|
"learning_rate": 4.2819426518081256e-07, |
|
"loss": 0.1178, |
|
"reward": 1.1458333656191826, |
|
"reward_std": 0.5342973358929157, |
|
"rewards/equation_reward_func": 0.2708333395421505, |
|
"rewards/format_reward_func": 0.8750000223517418, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 52.416667461395264, |
|
"epoch": 0.044977777777777776, |
|
"grad_norm": 4.465478199693993, |
|
"kl": 4.697509765625, |
|
"learning_rate": 4.246307617105241e-07, |
|
"loss": 0.1876, |
|
"reward": 1.0625000223517418, |
|
"reward_std": 0.5653560161590576, |
|
"rewards/equation_reward_func": 0.20833333767950535, |
|
"rewards/format_reward_func": 0.8541666865348816, |
|
"step": 253 |
|
}, |
|
{ |
|
"completion_length": 53.77083396911621, |
|
"epoch": 0.04515555555555555, |
|
"grad_norm": 2.9278233844283457, |
|
"kl": 5.1484375, |
|
"learning_rate": 4.21071171322823e-07, |
|
"loss": 0.206, |
|
"reward": 0.9583333693444729, |
|
"reward_std": 0.5784117728471756, |
|
"rewards/equation_reward_func": 0.18750000558793545, |
|
"rewards/format_reward_func": 0.7708333544433117, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 46.81250071525574, |
|
"epoch": 0.04533333333333334, |
|
"grad_norm": 3.0060965561056654, |
|
"kl": 6.7744140625, |
|
"learning_rate": 4.1751567882747373e-07, |
|
"loss": 0.2713, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.6698537915945053, |
|
"rewards/equation_reward_func": 0.25000000186264515, |
|
"rewards/format_reward_func": 0.7500000223517418, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 52.812501430511475, |
|
"epoch": 0.04551111111111111, |
|
"grad_norm": 2.435192469284996, |
|
"kl": 3.97900390625, |
|
"learning_rate": 4.139644688214826e-07, |
|
"loss": 0.1592, |
|
"reward": 0.9583333730697632, |
|
"reward_std": 0.44083887711167336, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.8541666865348816, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 54.500000953674316, |
|
"epoch": 0.04568888888888889, |
|
"grad_norm": 2.3477327869102136, |
|
"kl": 4.3671875, |
|
"learning_rate": 4.104177256795144e-07, |
|
"loss": 0.1747, |
|
"reward": 1.1250000298023224, |
|
"reward_std": 0.5271082073450089, |
|
"rewards/equation_reward_func": 0.25000000558793545, |
|
"rewards/format_reward_func": 0.8750000149011612, |
|
"step": 257 |
|
}, |
|
{ |
|
"completion_length": 55.562501430511475, |
|
"epoch": 0.04586666666666667, |
|
"grad_norm": 1.462739785577378, |
|
"kl": 1.19873046875, |
|
"learning_rate": 4.068756335443198e-07, |
|
"loss": 0.0481, |
|
"reward": 1.083333358168602, |
|
"reward_std": 0.42273785918951035, |
|
"rewards/equation_reward_func": 0.16666666977107525, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 54.10416841506958, |
|
"epoch": 0.04604444444444444, |
|
"grad_norm": 1.0194230408797653, |
|
"kl": 1.746826171875, |
|
"learning_rate": 4.0333837631717376e-07, |
|
"loss": 0.07, |
|
"reward": 1.1250000447034836, |
|
"reward_std": 0.4783512204885483, |
|
"rewards/equation_reward_func": 0.20833333767950535, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 259 |
|
}, |
|
{ |
|
"completion_length": 54.27083444595337, |
|
"epoch": 0.04622222222222222, |
|
"grad_norm": 1.2154097528831294, |
|
"kl": 2.1083984375, |
|
"learning_rate": 3.998061376483297e-07, |
|
"loss": 0.0844, |
|
"reward": 1.0833333730697632, |
|
"reward_std": 0.44083888456225395, |
|
"rewards/equation_reward_func": 0.18750000558793545, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 52.75000190734863, |
|
"epoch": 0.0464, |
|
"grad_norm": 0.8928800361092635, |
|
"kl": 2.84619140625, |
|
"learning_rate": 3.9627910092748204e-07, |
|
"loss": 0.1137, |
|
"reward": 0.8750000074505806, |
|
"reward_std": 0.3061685785651207, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.8333333432674408, |
|
"step": 261 |
|
}, |
|
{ |
|
"completion_length": 48.687500953674316, |
|
"epoch": 0.04657777777777778, |
|
"grad_norm": 2.0850690944306476, |
|
"kl": 4.7197265625, |
|
"learning_rate": 3.9275744927424723e-07, |
|
"loss": 0.1885, |
|
"reward": 0.958333358168602, |
|
"reward_std": 0.5193834900856018, |
|
"rewards/equation_reward_func": 0.18750000558793545, |
|
"rewards/format_reward_func": 0.7708333544433117, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 48.187501430511475, |
|
"epoch": 0.04675555555555556, |
|
"grad_norm": 1.408092281724432, |
|
"kl": 4.294189453125, |
|
"learning_rate": 3.89241365528655e-07, |
|
"loss": 0.1719, |
|
"reward": 0.8333333507180214, |
|
"reward_std": 0.5240839421749115, |
|
"rewards/equation_reward_func": 0.12500000186264515, |
|
"rewards/format_reward_func": 0.708333358168602, |
|
"step": 263 |
|
}, |
|
{ |
|
"completion_length": 51.64583396911621, |
|
"epoch": 0.046933333333333334, |
|
"grad_norm": 1.9193528605234629, |
|
"kl": 3.56005859375, |
|
"learning_rate": 3.8573103224165547e-07, |
|
"loss": 0.1424, |
|
"reward": 0.8333333656191826, |
|
"reward_std": 0.6091980896890163, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.7083333507180214, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 51.37500047683716, |
|
"epoch": 0.04711111111111111, |
|
"grad_norm": 1.8731610861564136, |
|
"kl": 3.638671875, |
|
"learning_rate": 3.8222663166564207e-07, |
|
"loss": 0.1455, |
|
"reward": 0.8125000298023224, |
|
"reward_std": 0.6027945913374424, |
|
"rewards/equation_reward_func": 0.12500000186264515, |
|
"rewards/format_reward_func": 0.6875000149011612, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 58.45833492279053, |
|
"epoch": 0.04728888888888889, |
|
"grad_norm": 1.1952492746934869, |
|
"kl": 1.924560546875, |
|
"learning_rate": 3.787283457449889e-07, |
|
"loss": 0.0769, |
|
"reward": 1.1666666939854622, |
|
"reward_std": 0.48326630517840385, |
|
"rewards/equation_reward_func": 0.25000000558793545, |
|
"rewards/format_reward_func": 0.916666679084301, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 45.64583444595337, |
|
"epoch": 0.047466666666666664, |
|
"grad_norm": 3.8058624476316156, |
|
"kl": 7.12890625, |
|
"learning_rate": 3.752363561066039e-07, |
|
"loss": 0.285, |
|
"reward": 0.729166692122817, |
|
"reward_std": 0.5802789963781834, |
|
"rewards/equation_reward_func": 0.1041666679084301, |
|
"rewards/format_reward_func": 0.6250000204890966, |
|
"step": 267 |
|
}, |
|
{ |
|
"completion_length": 52.70833492279053, |
|
"epoch": 0.04764444444444445, |
|
"grad_norm": 1.2974618772894446, |
|
"kl": 3.54248046875, |
|
"learning_rate": 3.717508440504997e-07, |
|
"loss": 0.1417, |
|
"reward": 0.8750000298023224, |
|
"reward_std": 0.37628914788365364, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 51.687500953674316, |
|
"epoch": 0.047822222222222224, |
|
"grad_norm": 1.8625385243751729, |
|
"kl": 3.452392578125, |
|
"learning_rate": 3.6827199054038036e-07, |
|
"loss": 0.1382, |
|
"reward": 0.9791666865348816, |
|
"reward_std": 0.5766182914376259, |
|
"rewards/equation_reward_func": 0.1875000037252903, |
|
"rewards/format_reward_func": 0.7916666865348816, |
|
"step": 269 |
|
}, |
|
{ |
|
"completion_length": 50.500000953674316, |
|
"epoch": 0.048, |
|
"grad_norm": 1.3683659232487602, |
|
"kl": 3.94140625, |
|
"learning_rate": 3.64799976194246e-07, |
|
"loss": 0.1577, |
|
"reward": 0.895833358168602, |
|
"reward_std": 0.4043150581419468, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 53.187501430511475, |
|
"epoch": 0.04817777777777778, |
|
"grad_norm": 1.7916494340378104, |
|
"kl": 4.187255859375, |
|
"learning_rate": 3.613349812750158e-07, |
|
"loss": 0.1672, |
|
"reward": 0.9375000223517418, |
|
"reward_std": 0.5739921554923058, |
|
"rewards/equation_reward_func": 0.16666666977107525, |
|
"rewards/format_reward_func": 0.7708333507180214, |
|
"step": 271 |
|
}, |
|
{ |
|
"completion_length": 49.10416793823242, |
|
"epoch": 0.048355555555555554, |
|
"grad_norm": 1.9583453292652802, |
|
"kl": 2.42236328125, |
|
"learning_rate": 3.5787718568116757e-07, |
|
"loss": 0.0969, |
|
"reward": 0.895833358168602, |
|
"reward_std": 0.425799235701561, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 40.875001430511475, |
|
"epoch": 0.04853333333333333, |
|
"grad_norm": 2.7960145232227065, |
|
"kl": 5.267578125, |
|
"learning_rate": 3.544267689373995e-07, |
|
"loss": 0.2103, |
|
"reward": 0.9375000223517418, |
|
"reward_std": 0.6141306385397911, |
|
"rewards/equation_reward_func": 0.1875000037252903, |
|
"rewards/format_reward_func": 0.7500000223517418, |
|
"step": 273 |
|
}, |
|
{ |
|
"completion_length": 52.95833444595337, |
|
"epoch": 0.04871111111111111, |
|
"grad_norm": 1.182957198324338, |
|
"kl": 1.4658203125, |
|
"learning_rate": 3.5098391018530813e-07, |
|
"loss": 0.0587, |
|
"reward": 1.1458333730697632, |
|
"reward_std": 0.48782002180814743, |
|
"rewards/equation_reward_func": 0.2291666716337204, |
|
"rewards/format_reward_func": 0.916666679084301, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 47.39583444595337, |
|
"epoch": 0.04888888888888889, |
|
"grad_norm": 3.75146574426156, |
|
"kl": 5.2431640625, |
|
"learning_rate": 3.4754878817408783e-07, |
|
"loss": 0.2094, |
|
"reward": 0.9166666939854622, |
|
"reward_std": 0.5990909859538078, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.770833358168602, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 55.395835399627686, |
|
"epoch": 0.04906666666666667, |
|
"grad_norm": 0.6517175065786031, |
|
"kl": 1.81494140625, |
|
"learning_rate": 3.4412158125125073e-07, |
|
"loss": 0.0726, |
|
"reward": 1.1041666939854622, |
|
"reward_std": 0.36417657881975174, |
|
"rewards/equation_reward_func": 0.16666666977107525, |
|
"rewards/format_reward_func": 0.9375000074505806, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 51.45833396911621, |
|
"epoch": 0.049244444444444445, |
|
"grad_norm": 1.1960838188628722, |
|
"kl": 2.79052734375, |
|
"learning_rate": 3.4070246735336645e-07, |
|
"loss": 0.1116, |
|
"reward": 0.9583333656191826, |
|
"reward_std": 0.4338619150221348, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.8541666865348816, |
|
"step": 277 |
|
}, |
|
{ |
|
"completion_length": 51.39583492279053, |
|
"epoch": 0.04942222222222222, |
|
"grad_norm": 1.8424306070085834, |
|
"kl": 2.34814453125, |
|
"learning_rate": 3.372916239968245e-07, |
|
"loss": 0.094, |
|
"reward": 0.9791666939854622, |
|
"reward_std": 0.28219256550073624, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.916666679084301, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 54.187500953674316, |
|
"epoch": 0.0496, |
|
"grad_norm": 1.1766152581682274, |
|
"kl": 0.583251953125, |
|
"learning_rate": 3.3388922826861785e-07, |
|
"loss": 0.0233, |
|
"reward": 1.1875000298023224, |
|
"reward_std": 0.40168892219662666, |
|
"rewards/equation_reward_func": 0.22916666977107525, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 279 |
|
}, |
|
{ |
|
"completion_length": 54.66666793823242, |
|
"epoch": 0.049777777777777775, |
|
"grad_norm": 1.8840677531860919, |
|
"kl": 0.906982421875, |
|
"learning_rate": 3.3049545681714775e-07, |
|
"loss": 0.0363, |
|
"reward": 1.1041667014360428, |
|
"reward_std": 0.33713918924331665, |
|
"rewards/equation_reward_func": 0.1458333358168602, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 55.312500953674316, |
|
"epoch": 0.04995555555555556, |
|
"grad_norm": 1.2002604740905347, |
|
"kl": 0.96142578125, |
|
"learning_rate": 3.271104858430537e-07, |
|
"loss": 0.0384, |
|
"reward": 1.1666667014360428, |
|
"reward_std": 0.47278038039803505, |
|
"rewards/equation_reward_func": 0.2291666716337204, |
|
"rewards/format_reward_func": 0.9375000149011612, |
|
"step": 281 |
|
}, |
|
{ |
|
"completion_length": 56.79166841506958, |
|
"epoch": 0.050133333333333335, |
|
"grad_norm": 0.6508397200310873, |
|
"kl": 0.695556640625, |
|
"learning_rate": 3.2373449109006474e-07, |
|
"loss": 0.0278, |
|
"reward": 1.2291667014360428, |
|
"reward_std": 0.299626849591732, |
|
"rewards/equation_reward_func": 0.2291666716337204, |
|
"rewards/format_reward_func": 1.0, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 55.833335399627686, |
|
"epoch": 0.05031111111111111, |
|
"grad_norm": 0.8421705085055283, |
|
"kl": 0.91455078125, |
|
"learning_rate": 3.2036764783587444e-07, |
|
"loss": 0.0366, |
|
"reward": 1.0833333730697632, |
|
"reward_std": 0.3589930906891823, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.9375000074505806, |
|
"step": 283 |
|
}, |
|
{ |
|
"completion_length": 54.187501430511475, |
|
"epoch": 0.05048888888888889, |
|
"grad_norm": 0.852398579960793, |
|
"kl": 1.685791015625, |
|
"learning_rate": 3.1701013088304206e-07, |
|
"loss": 0.0673, |
|
"reward": 1.0208333507180214, |
|
"reward_std": 0.43918363004922867, |
|
"rewards/equation_reward_func": 0.16666666977107525, |
|
"rewards/format_reward_func": 0.854166679084301, |
|
"step": 284 |
|
}, |
|
{ |
|
"completion_length": 51.70833444595337, |
|
"epoch": 0.050666666666666665, |
|
"grad_norm": 2.0270999635419984, |
|
"kl": 2.473388671875, |
|
"learning_rate": 3.1366211454991556e-07, |
|
"loss": 0.0991, |
|
"reward": 0.958333358168602, |
|
"reward_std": 0.41783374920487404, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.8541666865348816, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 53.750001430511475, |
|
"epoch": 0.05084444444444444, |
|
"grad_norm": 2.1019015776731607, |
|
"kl": 1.8466796875, |
|
"learning_rate": 3.1032377266158214e-07, |
|
"loss": 0.0738, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.4847872592508793, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"step": 286 |
|
}, |
|
{ |
|
"completion_length": 54.64583492279053, |
|
"epoch": 0.05102222222222222, |
|
"grad_norm": 2.0006844871954352, |
|
"kl": 2.72509765625, |
|
"learning_rate": 3.0699527854084335e-07, |
|
"loss": 0.109, |
|
"reward": 0.9791667014360428, |
|
"reward_std": 0.47457385435700417, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.8541666865348816, |
|
"step": 287 |
|
}, |
|
{ |
|
"completion_length": 56.270835399627686, |
|
"epoch": 0.0512, |
|
"grad_norm": 0.9939066440838259, |
|
"kl": 2.009765625, |
|
"learning_rate": 3.036768049992157e-07, |
|
"loss": 0.0803, |
|
"reward": 1.062500037252903, |
|
"reward_std": 0.41619613766670227, |
|
"rewards/equation_reward_func": 0.1458333358168602, |
|
"rewards/format_reward_func": 0.916666679084301, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 52.83333492279053, |
|
"epoch": 0.05137777777777778, |
|
"grad_norm": 1.3462986622451496, |
|
"kl": 2.93408203125, |
|
"learning_rate": 3.003685243279592e-07, |
|
"loss": 0.1172, |
|
"reward": 1.0625000298023224, |
|
"reward_std": 0.5250724591314793, |
|
"rewards/equation_reward_func": 0.1875000037252903, |
|
"rewards/format_reward_func": 0.8750000223517418, |
|
"step": 289 |
|
}, |
|
{ |
|
"completion_length": 54.416667461395264, |
|
"epoch": 0.051555555555555556, |
|
"grad_norm": 1.2162218711888282, |
|
"kl": 2.576171875, |
|
"learning_rate": 2.9707060828913224e-07, |
|
"loss": 0.1031, |
|
"reward": 1.145833358168602, |
|
"reward_std": 0.495253074914217, |
|
"rewards/equation_reward_func": 0.2500000037252903, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 50.77083396911621, |
|
"epoch": 0.05173333333333333, |
|
"grad_norm": 3.4755720807784884, |
|
"kl": 5.18798828125, |
|
"learning_rate": 2.9378322810667304e-07, |
|
"loss": 0.2078, |
|
"reward": 1.0625000298023224, |
|
"reward_std": 0.6527481563389301, |
|
"rewards/equation_reward_func": 0.22916667349636555, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"step": 291 |
|
}, |
|
{ |
|
"completion_length": 52.39583444595337, |
|
"epoch": 0.05191111111111111, |
|
"grad_norm": 2.583945749010597, |
|
"kl": 5.158203125, |
|
"learning_rate": 2.9050655445751137e-07, |
|
"loss": 0.2066, |
|
"reward": 0.9791666865348816, |
|
"reward_std": 0.6126096844673157, |
|
"rewards/equation_reward_func": 0.1875000037252903, |
|
"rewards/format_reward_func": 0.7916666939854622, |
|
"step": 292 |
|
}, |
|
{ |
|
"completion_length": 53.70833444595337, |
|
"epoch": 0.052088888888888886, |
|
"grad_norm": 2.4978389077752206, |
|
"kl": 4.4462890625, |
|
"learning_rate": 2.872407574627051e-07, |
|
"loss": 0.1783, |
|
"reward": 1.1041666865348816, |
|
"reward_std": 0.4737689010798931, |
|
"rewards/equation_reward_func": 0.2083333358168602, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 293 |
|
}, |
|
{ |
|
"completion_length": 53.89583444595337, |
|
"epoch": 0.05226666666666667, |
|
"grad_norm": 4.46081810506806, |
|
"kl": 6.01220703125, |
|
"learning_rate": 2.839860066786103e-07, |
|
"loss": 0.2403, |
|
"reward": 0.937500037252903, |
|
"reward_std": 0.5994852036237717, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.7708333507180214, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 55.66666841506958, |
|
"epoch": 0.052444444444444446, |
|
"grad_norm": 1.7753949998719913, |
|
"kl": 4.2880859375, |
|
"learning_rate": 2.807424710880756e-07, |
|
"loss": 0.1712, |
|
"reward": 1.0000000149011612, |
|
"reward_std": 0.6212911605834961, |
|
"rewards/equation_reward_func": 0.2083333358168602, |
|
"rewards/format_reward_func": 0.7916666939854622, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 49.72916793823242, |
|
"epoch": 0.05262222222222222, |
|
"grad_norm": 3.7499389172897573, |
|
"kl": 5.09375, |
|
"learning_rate": 2.7751031909167045e-07, |
|
"loss": 0.204, |
|
"reward": 0.8750000298023224, |
|
"reward_std": 0.49983540177345276, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.7708333432674408, |
|
"step": 296 |
|
}, |
|
{ |
|
"completion_length": 53.10416793823242, |
|
"epoch": 0.0528, |
|
"grad_norm": 1.3578432580665998, |
|
"kl": 2.974609375, |
|
"learning_rate": 2.742897184989414e-07, |
|
"loss": 0.1191, |
|
"reward": 1.0625000521540642, |
|
"reward_std": 0.4058360084891319, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.916666679084301, |
|
"step": 297 |
|
}, |
|
{ |
|
"completion_length": 57.458335399627686, |
|
"epoch": 0.052977777777777776, |
|
"grad_norm": 1.2056020602520727, |
|
"kl": 1.7373046875, |
|
"learning_rate": 2.710808365197e-07, |
|
"loss": 0.0695, |
|
"reward": 1.2083333730697632, |
|
"reward_std": 0.49666832759976387, |
|
"rewards/equation_reward_func": 0.27083333767950535, |
|
"rewards/format_reward_func": 0.9375000149011612, |
|
"step": 298 |
|
}, |
|
{ |
|
"completion_length": 55.02083396911621, |
|
"epoch": 0.05315555555555555, |
|
"grad_norm": 1.2324543500459841, |
|
"kl": 2.1708984375, |
|
"learning_rate": 2.6788383975533993e-07, |
|
"loss": 0.0867, |
|
"reward": 1.0208333805203438, |
|
"reward_std": 0.4488043636083603, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 299 |
|
}, |
|
{ |
|
"completion_length": 55.08333492279053, |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 0.6124221980932599, |
|
"kl": 1.42626953125, |
|
"learning_rate": 2.646988941901898e-07, |
|
"loss": 0.057, |
|
"reward": 1.1041666865348816, |
|
"reward_std": 0.2350771240890026, |
|
"rewards/equation_reward_func": 0.12500000186264515, |
|
"rewards/format_reward_func": 0.9791666716337204, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 56.312501430511475, |
|
"epoch": 0.05351111111111111, |
|
"grad_norm": 0.4101111309943839, |
|
"kl": 1.00390625, |
|
"learning_rate": 2.6152616518289305e-07, |
|
"loss": 0.0401, |
|
"reward": 1.1458333730697632, |
|
"reward_std": 0.26070838421583176, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.9791666716337204, |
|
"step": 301 |
|
}, |
|
{ |
|
"completion_length": 56.9166693687439, |
|
"epoch": 0.05368888888888889, |
|
"grad_norm": 2.0632242124320124, |
|
"kl": 0.875, |
|
"learning_rate": 2.583658174578247e-07, |
|
"loss": 0.035, |
|
"reward": 1.1041667014360428, |
|
"reward_std": 0.28219256177544594, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.9791666716337204, |
|
"step": 302 |
|
}, |
|
{ |
|
"completion_length": 56.04166793823242, |
|
"epoch": 0.05386666666666667, |
|
"grad_norm": 3.089941069383616, |
|
"kl": 0.99755859375, |
|
"learning_rate": 2.5521801509653717e-07, |
|
"loss": 0.0399, |
|
"reward": 1.1875000447034836, |
|
"reward_std": 0.4592616818845272, |
|
"rewards/equation_reward_func": 0.2291666716337204, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 303 |
|
}, |
|
{ |
|
"completion_length": 53.312501430511475, |
|
"epoch": 0.054044444444444444, |
|
"grad_norm": 1.232358463835666, |
|
"kl": 2.2294921875, |
|
"learning_rate": 2.520829215292426e-07, |
|
"loss": 0.0892, |
|
"reward": 1.2291667088866234, |
|
"reward_std": 0.471791859716177, |
|
"rewards/equation_reward_func": 0.29166667349636555, |
|
"rewards/format_reward_func": 0.9375000074505806, |
|
"step": 304 |
|
}, |
|
{ |
|
"completion_length": 52.687501430511475, |
|
"epoch": 0.05422222222222222, |
|
"grad_norm": 1.1619189366071008, |
|
"kl": 3.21044921875, |
|
"learning_rate": 2.4896069952632787e-07, |
|
"loss": 0.1283, |
|
"reward": 1.0208333730697632, |
|
"reward_std": 0.5709268562495708, |
|
"rewards/equation_reward_func": 0.20833333767950535, |
|
"rewards/format_reward_func": 0.8125000223517418, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 58.81250190734863, |
|
"epoch": 0.0544, |
|
"grad_norm": 1.1697787944882119, |
|
"kl": 1.681640625, |
|
"learning_rate": 2.4585151118990285e-07, |
|
"loss": 0.0673, |
|
"reward": 1.1666666865348816, |
|
"reward_std": 0.4442220404744148, |
|
"rewards/equation_reward_func": 0.22916666977107525, |
|
"rewards/format_reward_func": 0.9375000149011612, |
|
"step": 306 |
|
}, |
|
{ |
|
"completion_length": 53.22916841506958, |
|
"epoch": 0.05457777777777778, |
|
"grad_norm": 1.701139042149517, |
|
"kl": 3.260986328125, |
|
"learning_rate": 2.427555179453844e-07, |
|
"loss": 0.1302, |
|
"reward": 0.9583333656191826, |
|
"reward_std": 0.46985330432653427, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.8541666939854622, |
|
"step": 307 |
|
}, |
|
{ |
|
"completion_length": 55.81250190734863, |
|
"epoch": 0.05475555555555556, |
|
"grad_norm": 1.1582168355268272, |
|
"kl": 2.69091796875, |
|
"learning_rate": 2.396728805331167e-07, |
|
"loss": 0.1079, |
|
"reward": 1.0625000074505806, |
|
"reward_std": 0.5002285167574883, |
|
"rewards/equation_reward_func": 0.20833333767950535, |
|
"rewards/format_reward_func": 0.854166679084301, |
|
"step": 308 |
|
}, |
|
{ |
|
"completion_length": 50.500001430511475, |
|
"epoch": 0.054933333333333334, |
|
"grad_norm": 1.666774374427531, |
|
"kl": 3.728515625, |
|
"learning_rate": 2.366037590000236e-07, |
|
"loss": 0.1491, |
|
"reward": 0.8750000298023224, |
|
"reward_std": 0.46232305839657784, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.7916666939854622, |
|
"step": 309 |
|
}, |
|
{ |
|
"completion_length": 56.000000953674316, |
|
"epoch": 0.05511111111111111, |
|
"grad_norm": 0.9155411365595406, |
|
"kl": 2.46533203125, |
|
"learning_rate": 2.3354831269130132e-07, |
|
"loss": 0.0986, |
|
"reward": 1.1250000298023224, |
|
"reward_std": 0.49133748933672905, |
|
"rewards/equation_reward_func": 0.20833333767950535, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 55.83333444595337, |
|
"epoch": 0.05528888888888889, |
|
"grad_norm": 1.0525255657485806, |
|
"kl": 3.048828125, |
|
"learning_rate": 2.3050670024214375e-07, |
|
"loss": 0.1218, |
|
"reward": 1.0000000447034836, |
|
"reward_std": 0.5053886137902737, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.8541666865348816, |
|
"step": 311 |
|
}, |
|
{ |
|
"completion_length": 52.08333492279053, |
|
"epoch": 0.055466666666666664, |
|
"grad_norm": 2.6485128650154963, |
|
"kl": 3.696044921875, |
|
"learning_rate": 2.2747907956950707e-07, |
|
"loss": 0.1479, |
|
"reward": 0.9583333656191826, |
|
"reward_std": 0.39079636707901955, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.8750000223517418, |
|
"step": 312 |
|
}, |
|
{ |
|
"completion_length": 58.85416793823242, |
|
"epoch": 0.05564444444444445, |
|
"grad_norm": 1.7377493985395709, |
|
"kl": 3.342041015625, |
|
"learning_rate": 2.2446560786391132e-07, |
|
"loss": 0.1339, |
|
"reward": 0.9791666939854622, |
|
"reward_std": 0.4161961302161217, |
|
"rewards/equation_reward_func": 0.1041666679084301, |
|
"rewards/format_reward_func": 0.8750000223517418, |
|
"step": 313 |
|
}, |
|
{ |
|
"completion_length": 57.10416841506958, |
|
"epoch": 0.055822222222222224, |
|
"grad_norm": 0.9728311568166134, |
|
"kl": 2.14892578125, |
|
"learning_rate": 2.2146644158127826e-07, |
|
"loss": 0.0859, |
|
"reward": 1.0416666939854622, |
|
"reward_std": 0.3602609857916832, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.916666679084301, |
|
"step": 314 |
|
}, |
|
{ |
|
"completion_length": 52.83333444595337, |
|
"epoch": 0.056, |
|
"grad_norm": 0.8189600503332306, |
|
"kl": 2.77734375, |
|
"learning_rate": 2.1848173643480873e-07, |
|
"loss": 0.1111, |
|
"reward": 1.0208333507180214, |
|
"reward_std": 0.3170611411333084, |
|
"rewards/equation_reward_func": 0.1458333358168602, |
|
"rewards/format_reward_func": 0.8750000074505806, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 51.187501430511475, |
|
"epoch": 0.05617777777777778, |
|
"grad_norm": 1.4570844733731012, |
|
"kl": 3.875, |
|
"learning_rate": 2.1551164738689892e-07, |
|
"loss": 0.1549, |
|
"reward": 0.8958333656191826, |
|
"reward_std": 0.5094013959169388, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.7708333507180214, |
|
"step": 316 |
|
}, |
|
{ |
|
"completion_length": 58.89583492279053, |
|
"epoch": 0.056355555555555555, |
|
"grad_norm": 1.3079415239293524, |
|
"kl": 1.58935546875, |
|
"learning_rate": 2.1255632864109379e-07, |
|
"loss": 0.0637, |
|
"reward": 1.1041667014360428, |
|
"reward_std": 0.34674229100346565, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 317 |
|
}, |
|
{ |
|
"completion_length": 56.93750190734863, |
|
"epoch": 0.05653333333333333, |
|
"grad_norm": 0.9195869282916206, |
|
"kl": 2.50048828125, |
|
"learning_rate": 2.0961593363408154e-07, |
|
"loss": 0.0999, |
|
"reward": 0.9583333656191826, |
|
"reward_std": 0.42678775265812874, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.8750000223517418, |
|
"step": 318 |
|
}, |
|
{ |
|
"completion_length": 62.66666889190674, |
|
"epoch": 0.05671111111111111, |
|
"grad_norm": 1.4378742619983675, |
|
"kl": 2.0732421875, |
|
"learning_rate": 2.0669061502772772e-07, |
|
"loss": 0.083, |
|
"reward": 0.9583333507180214, |
|
"reward_std": 0.3102184720337391, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 319 |
|
}, |
|
{ |
|
"completion_length": 58.520835399627686, |
|
"epoch": 0.05688888888888889, |
|
"grad_norm": 2.5114180738618748, |
|
"kl": 2.60693359375, |
|
"learning_rate": 2.037805247011482e-07, |
|
"loss": 0.1042, |
|
"reward": 0.9375000447034836, |
|
"reward_std": 0.3842546418309212, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.8750000149011612, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 60.437501430511475, |
|
"epoch": 0.05706666666666667, |
|
"grad_norm": 3.344751006657643, |
|
"kl": 1.382568359375, |
|
"learning_rate": 2.008858137428251e-07, |
|
"loss": 0.0552, |
|
"reward": 1.0625000596046448, |
|
"reward_std": 0.4217669852077961, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.9375000149011612, |
|
"step": 321 |
|
}, |
|
{ |
|
"completion_length": 59.16666793823242, |
|
"epoch": 0.057244444444444445, |
|
"grad_norm": 2.980207110263902, |
|
"kl": 2.493896484375, |
|
"learning_rate": 1.9800663244276127e-07, |
|
"loss": 0.0999, |
|
"reward": 1.1458333805203438, |
|
"reward_std": 0.5278613045811653, |
|
"rewards/equation_reward_func": 0.2500000074505806, |
|
"rewards/format_reward_func": 0.8958333432674408, |
|
"step": 322 |
|
}, |
|
{ |
|
"completion_length": 60.35416841506958, |
|
"epoch": 0.05742222222222222, |
|
"grad_norm": 0.9630989177648202, |
|
"kl": 1.7958984375, |
|
"learning_rate": 1.9514313028467783e-07, |
|
"loss": 0.072, |
|
"reward": 0.9375000223517418, |
|
"reward_std": 0.3468805216252804, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.8541666716337204, |
|
"step": 323 |
|
}, |
|
{ |
|
"completion_length": 58.10416793823242, |
|
"epoch": 0.0576, |
|
"grad_norm": 1.144547736944523, |
|
"kl": 2.6875, |
|
"learning_rate": 1.9229545593825363e-07, |
|
"loss": 0.1075, |
|
"reward": 0.8958333656191826, |
|
"reward_std": 0.4418274015188217, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"step": 324 |
|
}, |
|
{ |
|
"completion_length": 58.562501430511475, |
|
"epoch": 0.057777777777777775, |
|
"grad_norm": 2.5145859774979207, |
|
"kl": 3.255615234375, |
|
"learning_rate": 1.8946375725140578e-07, |
|
"loss": 0.13, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 0.3747681975364685, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.8541666865348816, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 61.66666841506958, |
|
"epoch": 0.05795555555555556, |
|
"grad_norm": 1.9889895034082692, |
|
"kl": 2.33544921875, |
|
"learning_rate": 1.8664818124261373e-07, |
|
"loss": 0.0936, |
|
"reward": 1.062500037252903, |
|
"reward_std": 0.3898078463971615, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 326 |
|
}, |
|
{ |
|
"completion_length": 61.18750238418579, |
|
"epoch": 0.058133333333333335, |
|
"grad_norm": 1.3792448777248967, |
|
"kl": 0.93701171875, |
|
"learning_rate": 1.8384887409328688e-07, |
|
"loss": 0.0375, |
|
"reward": 1.1250000447034836, |
|
"reward_std": 0.3477308116853237, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.9791666716337204, |
|
"step": 327 |
|
}, |
|
{ |
|
"completion_length": 61.10416889190674, |
|
"epoch": 0.05831111111111111, |
|
"grad_norm": 0.7306205857498139, |
|
"kl": 2.73828125, |
|
"learning_rate": 1.8106598114017397e-07, |
|
"loss": 0.1093, |
|
"reward": 1.000000037252903, |
|
"reward_std": 0.4450269974768162, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.8541666865348816, |
|
"step": 328 |
|
}, |
|
{ |
|
"completion_length": 61.37500190734863, |
|
"epoch": 0.05848888888888889, |
|
"grad_norm": 1.3174084946886926, |
|
"kl": 2.44970703125, |
|
"learning_rate": 1.782996468678179e-07, |
|
"loss": 0.098, |
|
"reward": 1.000000037252903, |
|
"reward_std": 0.37628915533423424, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 329 |
|
}, |
|
{ |
|
"completion_length": 58.125001430511475, |
|
"epoch": 0.058666666666666666, |
|
"grad_norm": 2.0400973712700896, |
|
"kl": 2.13037109375, |
|
"learning_rate": 1.7555001490105486e-07, |
|
"loss": 0.0853, |
|
"reward": 1.0416666939854622, |
|
"reward_std": 0.41228054463863373, |
|
"rewards/equation_reward_func": 0.1458333358168602, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 57.95833492279053, |
|
"epoch": 0.05884444444444444, |
|
"grad_norm": 1.5514755499900446, |
|
"kl": 2.143798828125, |
|
"learning_rate": 1.728172279975561e-07, |
|
"loss": 0.0858, |
|
"reward": 1.1875000596046448, |
|
"reward_std": 0.5512077212333679, |
|
"rewards/equation_reward_func": 0.2916666753590107, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 331 |
|
}, |
|
{ |
|
"completion_length": 60.08333492279053, |
|
"epoch": 0.05902222222222222, |
|
"grad_norm": 1.3729341373528947, |
|
"kl": 1.197998046875, |
|
"learning_rate": 1.7010142804041783e-07, |
|
"loss": 0.0479, |
|
"reward": 1.2083333730697632, |
|
"reward_std": 0.45827316492795944, |
|
"rewards/equation_reward_func": 0.25000000558793545, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 332 |
|
}, |
|
{ |
|
"completion_length": 59.72916841506958, |
|
"epoch": 0.0592, |
|
"grad_norm": 1.527266988721381, |
|
"kl": 3.48388671875, |
|
"learning_rate": 1.674027560307927e-07, |
|
"loss": 0.1397, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.5845837779343128, |
|
"rewards/equation_reward_func": 0.1875000037252903, |
|
"rewards/format_reward_func": 0.8125000149011612, |
|
"step": 333 |
|
}, |
|
{ |
|
"completion_length": 59.895835399627686, |
|
"epoch": 0.05937777777777778, |
|
"grad_norm": 1.050595762515859, |
|
"kl": 2.544921875, |
|
"learning_rate": 1.6472135208057125e-07, |
|
"loss": 0.1018, |
|
"reward": 0.9583333656191826, |
|
"reward_std": 0.39079636335372925, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.8750000223517418, |
|
"step": 334 |
|
}, |
|
{ |
|
"completion_length": 54.583335876464844, |
|
"epoch": 0.059555555555555556, |
|
"grad_norm": 1.2933441010118374, |
|
"kl": 5.658203125, |
|
"learning_rate": 1.6205735540510674e-07, |
|
"loss": 0.2263, |
|
"reward": 0.8541666865348816, |
|
"reward_std": 0.6126096807420254, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.7291666939854622, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 53.47916793823242, |
|
"epoch": 0.05973333333333333, |
|
"grad_norm": 4.911862067712068, |
|
"kl": 6.6640625, |
|
"learning_rate": 1.5941090431598653e-07, |
|
"loss": 0.2666, |
|
"reward": 1.0625000223517418, |
|
"reward_std": 0.6126096844673157, |
|
"rewards/equation_reward_func": 0.25000000558793545, |
|
"rewards/format_reward_func": 0.8125000223517418, |
|
"step": 336 |
|
}, |
|
{ |
|
"completion_length": 49.62500238418579, |
|
"epoch": 0.05991111111111111, |
|
"grad_norm": 1.538888251192996, |
|
"kl": 5.263671875, |
|
"learning_rate": 1.5678213621385178e-07, |
|
"loss": 0.2103, |
|
"reward": 0.6875000242143869, |
|
"reward_std": 0.5668769627809525, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.6041666883975267, |
|
"step": 337 |
|
}, |
|
{ |
|
"completion_length": 54.29166841506958, |
|
"epoch": 0.060088888888888886, |
|
"grad_norm": 1.4585132738043682, |
|
"kl": 5.94921875, |
|
"learning_rate": 1.5417118758126408e-07, |
|
"loss": 0.2382, |
|
"reward": 0.9166667014360428, |
|
"reward_std": 0.6034007929265499, |
|
"rewards/equation_reward_func": 0.16666666977107525, |
|
"rewards/format_reward_func": 0.7500000149011612, |
|
"step": 338 |
|
}, |
|
{ |
|
"completion_length": 51.14583492279053, |
|
"epoch": 0.06026666666666667, |
|
"grad_norm": 4.340996859120451, |
|
"kl": 7.66015625, |
|
"learning_rate": 1.515781939756186e-07, |
|
"loss": 0.3064, |
|
"reward": 0.7500000149011612, |
|
"reward_std": 0.5763868018984795, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.666666679084301, |
|
"step": 339 |
|
}, |
|
{ |
|
"completion_length": 53.10416793823242, |
|
"epoch": 0.060444444444444446, |
|
"grad_norm": 7.13627262359258, |
|
"kl": 7.0458984375, |
|
"learning_rate": 1.490032900221068e-07, |
|
"loss": 0.2815, |
|
"reward": 0.7916666939854622, |
|
"reward_std": 0.4743013270199299, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.7500000223517418, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 51.14583492279053, |
|
"epoch": 0.06062222222222222, |
|
"grad_norm": 8.837913822415763, |
|
"kl": 8.7421875, |
|
"learning_rate": 1.4644660940672627e-07, |
|
"loss": 0.3497, |
|
"reward": 0.6875000167638063, |
|
"reward_std": 0.5780420526862144, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.583333345130086, |
|
"step": 341 |
|
}, |
|
{ |
|
"completion_length": 54.68750190734863, |
|
"epoch": 0.0608, |
|
"grad_norm": 4.585825527377868, |
|
"kl": 4.6943359375, |
|
"learning_rate": 1.4390828486934058e-07, |
|
"loss": 0.1878, |
|
"reward": 0.8750000149011612, |
|
"reward_std": 0.5441443584859371, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.770833358168602, |
|
"step": 342 |
|
}, |
|
{ |
|
"completion_length": 55.47916841506958, |
|
"epoch": 0.06097777777777778, |
|
"grad_norm": 2.5316402861873586, |
|
"kl": 4.51953125, |
|
"learning_rate": 1.4138844819678725e-07, |
|
"loss": 0.1809, |
|
"reward": 0.895833358168602, |
|
"reward_std": 0.552109844982624, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.7916666939854622, |
|
"step": 343 |
|
}, |
|
{ |
|
"completion_length": 58.97916841506958, |
|
"epoch": 0.06115555555555555, |
|
"grad_norm": 1.300871033486432, |
|
"kl": 3.39404296875, |
|
"learning_rate": 1.3888723021603526e-07, |
|
"loss": 0.1359, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 0.583796463906765, |
|
"rewards/equation_reward_func": 0.1458333358168602, |
|
"rewards/format_reward_func": 0.770833358168602, |
|
"step": 344 |
|
}, |
|
{ |
|
"completion_length": 58.875001430511475, |
|
"epoch": 0.06133333333333333, |
|
"grad_norm": 2.981671476687896, |
|
"kl": 3.55615234375, |
|
"learning_rate": 1.3640476078739295e-07, |
|
"loss": 0.1422, |
|
"reward": 0.8958333656191826, |
|
"reward_std": 0.43150830641388893, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 57.83333492279053, |
|
"epoch": 0.061511111111111114, |
|
"grad_norm": 2.076358536272458, |
|
"kl": 2.014404296875, |
|
"learning_rate": 1.3394116879776567e-07, |
|
"loss": 0.0805, |
|
"reward": 1.0416667014360428, |
|
"reward_std": 0.47278038039803505, |
|
"rewards/equation_reward_func": 0.16666666977107525, |
|
"rewards/format_reward_func": 0.8750000149011612, |
|
"step": 346 |
|
}, |
|
{ |
|
"completion_length": 60.250001430511475, |
|
"epoch": 0.06168888888888889, |
|
"grad_norm": 1.8103398488618279, |
|
"kl": 1.98193359375, |
|
"learning_rate": 1.3149658215396475e-07, |
|
"loss": 0.0794, |
|
"reward": 0.8750000298023224, |
|
"reward_std": 0.49578551203012466, |
|
"rewards/equation_reward_func": 0.1041666679084301, |
|
"rewards/format_reward_func": 0.770833358168602, |
|
"step": 347 |
|
}, |
|
{ |
|
"completion_length": 64.27083539962769, |
|
"epoch": 0.06186666666666667, |
|
"grad_norm": 3.140491497435094, |
|
"kl": 1.261474609375, |
|
"learning_rate": 1.2907112777606576e-07, |
|
"loss": 0.0505, |
|
"reward": 1.1875000298023224, |
|
"reward_std": 0.5744358189404011, |
|
"rewards/equation_reward_func": 0.29166667349636555, |
|
"rewards/format_reward_func": 0.895833358168602, |
|
"step": 348 |
|
}, |
|
{ |
|
"completion_length": 59.62500190734863, |
|
"epoch": 0.062044444444444444, |
|
"grad_norm": 1.061446640667196, |
|
"kl": 1.03369140625, |
|
"learning_rate": 1.2666493159081942e-07, |
|
"loss": 0.0413, |
|
"reward": 1.0000000223517418, |
|
"reward_std": 0.3102184757590294, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 349 |
|
}, |
|
{ |
|
"completion_length": 61.47916889190674, |
|
"epoch": 0.06222222222222222, |
|
"grad_norm": 3.6962335454618858, |
|
"kl": 1.625244140625, |
|
"learning_rate": 1.2427811852511395e-07, |
|
"loss": 0.0649, |
|
"reward": 1.1250000298023224, |
|
"reward_std": 0.3937234431505203, |
|
"rewards/equation_reward_func": 0.1875000037252903, |
|
"rewards/format_reward_func": 0.9375000149011612, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 59.125001430511475, |
|
"epoch": 0.0624, |
|
"grad_norm": 1.6816913213888882, |
|
"kl": 1.1396484375, |
|
"learning_rate": 1.219108124994887e-07, |
|
"loss": 0.0455, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.32624663412570953, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 351 |
|
}, |
|
{ |
|
"completion_length": 64.35416793823242, |
|
"epoch": 0.06257777777777777, |
|
"grad_norm": 0.8522481505886366, |
|
"kl": 0.689208984375, |
|
"learning_rate": 1.1956313642169973e-07, |
|
"loss": 0.0276, |
|
"reward": 1.0625000149011612, |
|
"reward_std": 0.27369464561343193, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"step": 352 |
|
}, |
|
{ |
|
"completion_length": 61.62500190734863, |
|
"epoch": 0.06275555555555555, |
|
"grad_norm": 1.7897422346452299, |
|
"kl": 0.836669921875, |
|
"learning_rate": 1.1723521218034004e-07, |
|
"loss": 0.0335, |
|
"reward": 1.0833333879709244, |
|
"reward_std": 0.41380149126052856, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.9375000149011612, |
|
"step": 353 |
|
}, |
|
{ |
|
"completion_length": 61.08333444595337, |
|
"epoch": 0.06293333333333333, |
|
"grad_norm": 2.0178846511892803, |
|
"kl": 1.092041015625, |
|
"learning_rate": 1.1492716063850971e-07, |
|
"loss": 0.0437, |
|
"reward": 0.958333358168602, |
|
"reward_std": 0.23116152733564377, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.9166666716337204, |
|
"step": 354 |
|
}, |
|
{ |
|
"completion_length": 60.52083492279053, |
|
"epoch": 0.06311111111111112, |
|
"grad_norm": 2.7610480485849185, |
|
"kl": 2.68701171875, |
|
"learning_rate": 1.126391016275422e-07, |
|
"loss": 0.1075, |
|
"reward": 1.062500037252903, |
|
"reward_std": 0.34674229100346565, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.9375000074505806, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 55.91666793823242, |
|
"epoch": 0.0632888888888889, |
|
"grad_norm": 1.7083927263129774, |
|
"kl": 2.089111328125, |
|
"learning_rate": 1.1037115394078162e-07, |
|
"loss": 0.0836, |
|
"reward": 1.0416667088866234, |
|
"reward_std": 0.46722716465592384, |
|
"rewards/equation_reward_func": 0.1458333358168602, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 356 |
|
}, |
|
{ |
|
"completion_length": 56.937501430511475, |
|
"epoch": 0.06346666666666667, |
|
"grad_norm": 1.569207458888609, |
|
"kl": 1.089111328125, |
|
"learning_rate": 1.0812343532741569e-07, |
|
"loss": 0.0436, |
|
"reward": 1.0625000223517418, |
|
"reward_std": 0.3714948333799839, |
|
"rewards/equation_reward_func": 0.16666666977107525, |
|
"rewards/format_reward_func": 0.8958333358168602, |
|
"step": 357 |
|
}, |
|
{ |
|
"completion_length": 60.500001430511475, |
|
"epoch": 0.06364444444444445, |
|
"grad_norm": 3.1252392054257516, |
|
"kl": 1.154296875, |
|
"learning_rate": 1.058960624863629e-07, |
|
"loss": 0.0462, |
|
"reward": 1.062500037252903, |
|
"reward_std": 0.432873398065567, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 358 |
|
}, |
|
{ |
|
"completion_length": 60.75000190734863, |
|
"epoch": 0.06382222222222222, |
|
"grad_norm": 2.640265426366037, |
|
"kl": 0.833984375, |
|
"learning_rate": 1.0368915106021253e-07, |
|
"loss": 0.0334, |
|
"reward": 1.1041667312383652, |
|
"reward_std": 0.4864138960838318, |
|
"rewards/equation_reward_func": 0.18750000558793545, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 359 |
|
}, |
|
{ |
|
"completion_length": 61.25000190734863, |
|
"epoch": 0.064, |
|
"grad_norm": 3.448614558728069, |
|
"kl": 1.076416015625, |
|
"learning_rate": 1.015028156292212e-07, |
|
"loss": 0.0431, |
|
"reward": 1.0833333656191826, |
|
"reward_std": 0.48238347843289375, |
|
"rewards/equation_reward_func": 0.18750000558793545, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 58.187501430511475, |
|
"epoch": 0.06417777777777778, |
|
"grad_norm": 4.146739619804781, |
|
"kl": 0.870361328125, |
|
"learning_rate": 9.933716970536427e-08, |
|
"loss": 0.0348, |
|
"reward": 1.1041667014360428, |
|
"reward_std": 0.5266816467046738, |
|
"rewards/equation_reward_func": 0.2291666716337204, |
|
"rewards/format_reward_func": 0.8750000223517418, |
|
"step": 361 |
|
}, |
|
{ |
|
"completion_length": 59.02083444595337, |
|
"epoch": 0.06435555555555555, |
|
"grad_norm": 1.9186073301177928, |
|
"kl": 1.1279296875, |
|
"learning_rate": 9.719232572644187e-08, |
|
"loss": 0.0451, |
|
"reward": 1.0625000298023224, |
|
"reward_std": 0.456334613263607, |
|
"rewards/equation_reward_func": 0.16666666977107525, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 362 |
|
}, |
|
{ |
|
"completion_length": 58.270835399627686, |
|
"epoch": 0.06453333333333333, |
|
"grad_norm": 3.1282429101773346, |
|
"kl": 1.857666015625, |
|
"learning_rate": 9.506839505024145e-08, |
|
"loss": 0.0743, |
|
"reward": 1.1250000149011612, |
|
"reward_std": 0.5089099928736687, |
|
"rewards/equation_reward_func": 0.22916666977107525, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 363 |
|
}, |
|
{ |
|
"completion_length": 58.500001430511475, |
|
"epoch": 0.06471111111111111, |
|
"grad_norm": 1.2404291380846855, |
|
"kl": 2.42529296875, |
|
"learning_rate": 9.296548794875658e-08, |
|
"loss": 0.0971, |
|
"reward": 0.9791667088866234, |
|
"reward_std": 0.5063771307468414, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.8541666939854622, |
|
"step": 364 |
|
}, |
|
{ |
|
"completion_length": 54.187501430511475, |
|
"epoch": 0.06488888888888888, |
|
"grad_norm": 1.4856882416240396, |
|
"kl": 2.43212890625, |
|
"learning_rate": 9.088371360246105e-08, |
|
"loss": 0.0974, |
|
"reward": 0.9583333767950535, |
|
"reward_std": 0.5198958218097687, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.8125000186264515, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 60.020835399627686, |
|
"epoch": 0.06506666666666666, |
|
"grad_norm": 1.2813862007137036, |
|
"kl": 2.4736328125, |
|
"learning_rate": 8.882318009464123e-08, |
|
"loss": 0.099, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 0.347730815410614, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.854166679084301, |
|
"step": 366 |
|
}, |
|
{ |
|
"completion_length": 56.47916793823242, |
|
"epoch": 0.06524444444444444, |
|
"grad_norm": 1.4700306877141611, |
|
"kl": 2.24755859375, |
|
"learning_rate": 8.678399440578365e-08, |
|
"loss": 0.0899, |
|
"reward": 0.9375000223517418, |
|
"reward_std": 0.5813841745257378, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.7708333507180214, |
|
"step": 367 |
|
}, |
|
{ |
|
"completion_length": 56.85416841506958, |
|
"epoch": 0.06542222222222223, |
|
"grad_norm": 2.152590597555675, |
|
"kl": 2.2392578125, |
|
"learning_rate": 8.476626240802099e-08, |
|
"loss": 0.0897, |
|
"reward": 0.9375000223517418, |
|
"reward_std": 0.4938579201698303, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.8125000186264515, |
|
"step": 368 |
|
}, |
|
{ |
|
"completion_length": 52.187500953674316, |
|
"epoch": 0.0656, |
|
"grad_norm": 2.381613257880136, |
|
"kl": 3.70361328125, |
|
"learning_rate": 8.277008885963593e-08, |
|
"loss": 0.1481, |
|
"reward": 0.979166679084301, |
|
"reward_std": 0.6662384197115898, |
|
"rewards/equation_reward_func": 0.20833333767950535, |
|
"rewards/format_reward_func": 0.770833358168602, |
|
"step": 369 |
|
}, |
|
{ |
|
"completion_length": 53.43750190734863, |
|
"epoch": 0.06577777777777778, |
|
"grad_norm": 1.978390854604455, |
|
"kl": 2.68603515625, |
|
"learning_rate": 8.079557739962128e-08, |
|
"loss": 0.1073, |
|
"reward": 0.937500037252903, |
|
"reward_std": 0.6371357701718807, |
|
"rewards/equation_reward_func": 0.1875000037252903, |
|
"rewards/format_reward_func": 0.7500000223517418, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 61.43750190734863, |
|
"epoch": 0.06595555555555556, |
|
"grad_norm": 1.670924504713265, |
|
"kl": 2.015625, |
|
"learning_rate": 7.884283054229956e-08, |
|
"loss": 0.0807, |
|
"reward": 0.9583333507180214, |
|
"reward_std": 0.36779123172163963, |
|
"rewards/equation_reward_func": 0.0833333358168602, |
|
"rewards/format_reward_func": 0.8750000223517418, |
|
"step": 371 |
|
}, |
|
{ |
|
"completion_length": 55.312500953674316, |
|
"epoch": 0.06613333333333334, |
|
"grad_norm": 1.7031503147281266, |
|
"kl": 2.489501953125, |
|
"learning_rate": 7.691194967200098e-08, |
|
"loss": 0.0995, |
|
"reward": 1.041666705161333, |
|
"reward_std": 0.58039565756917, |
|
"rewards/equation_reward_func": 0.2291666716337204, |
|
"rewards/format_reward_func": 0.8125000186264515, |
|
"step": 372 |
|
}, |
|
{ |
|
"completion_length": 57.29166889190674, |
|
"epoch": 0.06631111111111111, |
|
"grad_norm": 1.4511519173022578, |
|
"kl": 2.66552734375, |
|
"learning_rate": 7.500303503779897e-08, |
|
"loss": 0.1066, |
|
"reward": 1.0625000298023224, |
|
"reward_std": 0.6057954281568527, |
|
"rewards/equation_reward_func": 0.2708333395421505, |
|
"rewards/format_reward_func": 0.791666679084301, |
|
"step": 373 |
|
}, |
|
{ |
|
"completion_length": 50.22916793823242, |
|
"epoch": 0.06648888888888889, |
|
"grad_norm": 2.3542410534514118, |
|
"kl": 4.15283203125, |
|
"learning_rate": 7.311618574830569e-08, |
|
"loss": 0.1664, |
|
"reward": 0.8541667014360428, |
|
"reward_std": 0.5683979243040085, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.7291666865348816, |
|
"step": 374 |
|
}, |
|
{ |
|
"completion_length": 56.187501430511475, |
|
"epoch": 0.06666666666666667, |
|
"grad_norm": 1.3402956105324395, |
|
"kl": 2.796142578125, |
|
"learning_rate": 7.125149976652684e-08, |
|
"loss": 0.1119, |
|
"reward": 0.937500037252903, |
|
"reward_std": 0.4822668172419071, |
|
"rewards/equation_reward_func": 0.1250000037252903, |
|
"rewards/format_reward_func": 0.8125000149011612, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 56.437501430511475, |
|
"epoch": 0.06684444444444444, |
|
"grad_norm": 1.3065544985054902, |
|
"kl": 3.40673828125, |
|
"learning_rate": 6.940907390477457e-08, |
|
"loss": 0.136, |
|
"reward": 1.020833358168602, |
|
"reward_std": 0.41557733342051506, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.8750000149011612, |
|
"step": 376 |
|
}, |
|
{ |
|
"completion_length": 47.29166793823242, |
|
"epoch": 0.06702222222222222, |
|
"grad_norm": 3.1009623462965976, |
|
"kl": 5.3515625, |
|
"learning_rate": 6.758900381964228e-08, |
|
"loss": 0.2139, |
|
"reward": 0.7500000298023224, |
|
"reward_std": 0.5388510599732399, |
|
"rewards/equation_reward_func": 0.0416666679084301, |
|
"rewards/format_reward_func": 0.708333358168602, |
|
"step": 377 |
|
}, |
|
{ |
|
"completion_length": 53.54166793823242, |
|
"epoch": 0.0672, |
|
"grad_norm": 1.1872610426825552, |
|
"kl": 3.68310546875, |
|
"learning_rate": 6.579138400703715e-08, |
|
"loss": 0.1474, |
|
"reward": 0.8125000223517418, |
|
"reward_std": 0.4662386476993561, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.7500000149011612, |
|
"step": 378 |
|
}, |
|
{ |
|
"completion_length": 51.562501430511475, |
|
"epoch": 0.06737777777777777, |
|
"grad_norm": 3.57334398201852, |
|
"kl": 4.5546875, |
|
"learning_rate": 6.401630779727451e-08, |
|
"loss": 0.1822, |
|
"reward": 0.8333333395421505, |
|
"reward_std": 0.37490642443299294, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.7708333469927311, |
|
"step": 379 |
|
}, |
|
{ |
|
"completion_length": 51.125001430511475, |
|
"epoch": 0.06755555555555555, |
|
"grad_norm": 2.66140039223916, |
|
"kl": 3.92041015625, |
|
"learning_rate": 6.22638673502327e-08, |
|
"loss": 0.1571, |
|
"reward": 0.8958333544433117, |
|
"reward_std": 0.43363041803240776, |
|
"rewards/equation_reward_func": 0.1041666679084301, |
|
"rewards/format_reward_func": 0.7916666828095913, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 53.39583492279053, |
|
"epoch": 0.06773333333333334, |
|
"grad_norm": 1.3849153453116665, |
|
"kl": 2.9033203125, |
|
"learning_rate": 6.05341536505673e-08, |
|
"loss": 0.1163, |
|
"reward": 1.1041667088866234, |
|
"reward_std": 0.5925082266330719, |
|
"rewards/equation_reward_func": 0.2708333395421505, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"step": 381 |
|
}, |
|
{ |
|
"completion_length": 51.66666793823242, |
|
"epoch": 0.06791111111111112, |
|
"grad_norm": 1.217375000722545, |
|
"kl": 4.306884765625, |
|
"learning_rate": 5.882725650298787e-08, |
|
"loss": 0.1719, |
|
"reward": 0.8958333656191826, |
|
"reward_std": 0.5843112505972385, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.7500000149011612, |
|
"step": 382 |
|
}, |
|
{ |
|
"completion_length": 52.29166793823242, |
|
"epoch": 0.0680888888888889, |
|
"grad_norm": 1.3213564242297655, |
|
"kl": 3.68896484375, |
|
"learning_rate": 5.714326452759549e-08, |
|
"loss": 0.1475, |
|
"reward": 0.8750000223517418, |
|
"reward_std": 0.5200340487062931, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.7708333507180214, |
|
"step": 383 |
|
}, |
|
{ |
|
"completion_length": 53.000001430511475, |
|
"epoch": 0.06826666666666667, |
|
"grad_norm": 1.4169493034835212, |
|
"kl": 3.507080078125, |
|
"learning_rate": 5.548226515528132e-08, |
|
"loss": 0.14, |
|
"reward": 0.9791666939854622, |
|
"reward_std": 0.5409447588026524, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.8333333656191826, |
|
"step": 384 |
|
}, |
|
{ |
|
"completion_length": 58.37500190734863, |
|
"epoch": 0.06844444444444445, |
|
"grad_norm": 1.1023233854193553, |
|
"kl": 2.56787109375, |
|
"learning_rate": 5.384434462318777e-08, |
|
"loss": 0.1028, |
|
"reward": 0.9375000298023224, |
|
"reward_std": 0.4688647836446762, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"step": 385 |
|
}, |
|
{ |
|
"completion_length": 56.62500238418579, |
|
"epoch": 0.06862222222222222, |
|
"grad_norm": 1.4791889195725796, |
|
"kl": 2.52001953125, |
|
"learning_rate": 5.222958797023036e-08, |
|
"loss": 0.1008, |
|
"reward": 1.1458333693444729, |
|
"reward_std": 0.5843112505972385, |
|
"rewards/equation_reward_func": 0.2916666716337204, |
|
"rewards/format_reward_func": 0.8541666828095913, |
|
"step": 386 |
|
}, |
|
{ |
|
"completion_length": 57.45833492279053, |
|
"epoch": 0.0688, |
|
"grad_norm": 1.246850620199564, |
|
"kl": 2.88037109375, |
|
"learning_rate": 5.063807903268369e-08, |
|
"loss": 0.115, |
|
"reward": 1.0625000298023224, |
|
"reward_std": 0.5097602866590023, |
|
"rewards/equation_reward_func": 0.16666666977107525, |
|
"rewards/format_reward_func": 0.895833358168602, |
|
"step": 387 |
|
}, |
|
{ |
|
"completion_length": 52.08333444595337, |
|
"epoch": 0.06897777777777778, |
|
"grad_norm": 1.9646365955378329, |
|
"kl": 3.38916015625, |
|
"learning_rate": 4.9069900439828115e-08, |
|
"loss": 0.1355, |
|
"reward": 1.0416666865348816, |
|
"reward_std": 0.40530357882380486, |
|
"rewards/equation_reward_func": 0.1458333358168602, |
|
"rewards/format_reward_func": 0.895833358168602, |
|
"step": 388 |
|
}, |
|
{ |
|
"completion_length": 51.47916841506958, |
|
"epoch": 0.06915555555555555, |
|
"grad_norm": 5.101279066687743, |
|
"kl": 4.92578125, |
|
"learning_rate": 4.7525133609659484e-08, |
|
"loss": 0.197, |
|
"reward": 0.7916666939854622, |
|
"reward_std": 0.5373301096260548, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.7291666865348816, |
|
"step": 389 |
|
}, |
|
{ |
|
"completion_length": 55.020835876464844, |
|
"epoch": 0.06933333333333333, |
|
"grad_norm": 2.859624921800997, |
|
"kl": 3.75, |
|
"learning_rate": 4.600385874466256e-08, |
|
"loss": 0.1498, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 0.34296492487192154, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.854166679084301, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 52.208335399627686, |
|
"epoch": 0.0695111111111111, |
|
"grad_norm": 1.2049238425693272, |
|
"kl": 3.3515625, |
|
"learning_rate": 4.4506154827646915e-08, |
|
"loss": 0.1341, |
|
"reward": 0.9583333805203438, |
|
"reward_std": 0.5388510636985302, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.8125000223517418, |
|
"step": 391 |
|
}, |
|
{ |
|
"completion_length": 53.70833492279053, |
|
"epoch": 0.06968888888888888, |
|
"grad_norm": 1.8756766886500422, |
|
"kl": 3.8017578125, |
|
"learning_rate": 4.303209961764587e-08, |
|
"loss": 0.1517, |
|
"reward": 0.9375000149011612, |
|
"reward_std": 0.6559193283319473, |
|
"rewards/equation_reward_func": 0.18750000558793545, |
|
"rewards/format_reward_func": 0.7500000223517418, |
|
"step": 392 |
|
}, |
|
{ |
|
"completion_length": 53.31250047683716, |
|
"epoch": 0.06986666666666666, |
|
"grad_norm": 2.233928517978556, |
|
"kl": 2.48046875, |
|
"learning_rate": 4.158176964587967e-08, |
|
"loss": 0.0993, |
|
"reward": 1.0208333805203438, |
|
"reward_std": 0.5134512856602669, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.8541666865348816, |
|
"step": 393 |
|
}, |
|
{ |
|
"completion_length": 54.958335399627686, |
|
"epoch": 0.07004444444444445, |
|
"grad_norm": 2.1952634154416963, |
|
"kl": 2.598876953125, |
|
"learning_rate": 4.015524021178196e-08, |
|
"loss": 0.1041, |
|
"reward": 1.0833333730697632, |
|
"reward_std": 0.5025997683405876, |
|
"rewards/equation_reward_func": 0.2083333395421505, |
|
"rewards/format_reward_func": 0.8750000223517418, |
|
"step": 394 |
|
}, |
|
{ |
|
"completion_length": 54.91666793823242, |
|
"epoch": 0.07022222222222223, |
|
"grad_norm": 1.556450915650428, |
|
"kl": 3.673828125, |
|
"learning_rate": 3.8752585379090317e-08, |
|
"loss": 0.1468, |
|
"reward": 0.9791666939854622, |
|
"reward_std": 0.5681380145251751, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.8125000223517418, |
|
"step": 395 |
|
}, |
|
{ |
|
"completion_length": 59.64583492279053, |
|
"epoch": 0.0704, |
|
"grad_norm": 2.3442496072285035, |
|
"kl": 1.22265625, |
|
"learning_rate": 3.7373877972001255e-08, |
|
"loss": 0.0489, |
|
"reward": 1.166666716337204, |
|
"reward_std": 0.49578551575541496, |
|
"rewards/equation_reward_func": 0.2291666716337204, |
|
"rewards/format_reward_func": 0.9375000149011612, |
|
"step": 396 |
|
}, |
|
{ |
|
"completion_length": 56.416667461395264, |
|
"epoch": 0.07057777777777778, |
|
"grad_norm": 1.382319715238746, |
|
"kl": 2.299560546875, |
|
"learning_rate": 3.601918957138844e-08, |
|
"loss": 0.092, |
|
"reward": 1.125000037252903, |
|
"reward_std": 0.5317768938839436, |
|
"rewards/equation_reward_func": 0.22916666977107525, |
|
"rewards/format_reward_func": 0.8958333507180214, |
|
"step": 397 |
|
}, |
|
{ |
|
"completion_length": 55.00000190734863, |
|
"epoch": 0.07075555555555556, |
|
"grad_norm": 1.8307641531327195, |
|
"kl": 3.07080078125, |
|
"learning_rate": 3.46885905110873e-08, |
|
"loss": 0.1227, |
|
"reward": 1.0208333730697632, |
|
"reward_std": 0.532146617770195, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.8541666939854622, |
|
"step": 398 |
|
}, |
|
{ |
|
"completion_length": 54.66666793823242, |
|
"epoch": 0.07093333333333333, |
|
"grad_norm": 1.5553001993082431, |
|
"kl": 3.23291015625, |
|
"learning_rate": 3.3382149874242814e-08, |
|
"loss": 0.1295, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.502599760890007, |
|
"rewards/equation_reward_func": 0.14583333767950535, |
|
"rewards/format_reward_func": 0.8541666865348816, |
|
"step": 399 |
|
}, |
|
{ |
|
"completion_length": 54.60416841506958, |
|
"epoch": 0.07111111111111111, |
|
"grad_norm": 1.574316545264386, |
|
"kl": 3.5634765625, |
|
"learning_rate": 3.20999354897229e-08, |
|
"loss": 0.1431, |
|
"reward": 0.9583333507180214, |
|
"reward_std": 0.4067096970975399, |
|
"rewards/equation_reward_func": 0.10416666977107525, |
|
"rewards/format_reward_func": 0.854166679084301, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 61.43750190734863, |
|
"epoch": 0.07128888888888889, |
|
"grad_norm": 2.037039114885329, |
|
"kl": 2.79345703125, |
|
"learning_rate": 3.0842013928596754e-08, |
|
"loss": 0.1117, |
|
"reward": 0.958333358168602, |
|
"reward_std": 0.2957112602889538, |
|
"rewards/equation_reward_func": 0.06250000186264515, |
|
"rewards/format_reward_func": 0.8958333432674408, |
|
"step": 401 |
|
}, |
|
{ |
|
"completion_length": 55.125001430511475, |
|
"epoch": 0.07146666666666666, |
|
"grad_norm": 3.5875644910793727, |
|
"kl": 3.16015625, |
|
"learning_rate": 2.9608450500678562e-08, |
|
"loss": 0.1265, |
|
"reward": 0.8333333656191826, |
|
"reward_std": 0.41380149498581886, |
|
"rewards/equation_reward_func": 0.02083333395421505, |
|
"rewards/format_reward_func": 0.8125000298023224, |
|
"step": 402 |
|
}, |
|
{ |
|
"completion_length": 54.91666841506958, |
|
"epoch": 0.07164444444444444, |
|
"grad_norm": 1.6377386422578162, |
|
"kl": 1.947998046875, |
|
"learning_rate": 2.839930925113715e-08, |
|
"loss": 0.0777, |
|
"reward": 1.104166716337204, |
|
"reward_std": 0.4778187908232212, |
|
"rewards/equation_reward_func": 0.1666666716337204, |
|
"rewards/format_reward_func": 0.9375000149011612, |
|
"step": 403 |
|
}, |
|
{ |
|
"completion_length": 57.47916793823242, |
|
"epoch": 0.07182222222222222, |
|
"grad_norm": 1.0788472393855095, |
|
"kl": 2.4091796875, |
|
"learning_rate": 2.721465295716996e-08, |
|
"loss": 0.0963, |
|
"reward": 0.9791667088866234, |
|
"reward_std": 0.46483253315091133, |
|
"rewards/equation_reward_func": 0.12500000186264515, |
|
"rewards/format_reward_func": 0.8541666865348816, |
|
"step": 404 |
    },
    {
      "completion_length": 56.187500953674316,
      "epoch": 0.072,
      "grad_norm": 1.5018621132397523,
      "kl": 1.85888671875,
      "learning_rate": 2.605454312474448e-08,
      "loss": 0.0745,
      "reward": 1.1666667014360428,
      "reward_std": 0.5234047770500183,
      "rewards/equation_reward_func": 0.2500000074505806,
      "rewards/format_reward_func": 0.9166666865348816,
      "step": 405
    },
    {
      "completion_length": 56.312501430511475,
      "epoch": 0.07217777777777777,
      "grad_norm": 2.078201578902569,
      "kl": 2.600341796875,
      "learning_rate": 2.4919039985404622e-08,
      "loss": 0.1039,
      "reward": 1.000000037252903,
      "reward_std": 0.5240839384496212,
      "rewards/equation_reward_func": 0.14583333767950535,
      "rewards/format_reward_func": 0.8541666865348816,
      "step": 406
    },
    {
      "completion_length": 55.083335399627686,
      "epoch": 0.07235555555555556,
      "grad_norm": 2.2993454586501643,
      "kl": 3.27734375,
      "learning_rate": 2.380820249314375e-08,
      "loss": 0.131,
      "reward": 0.9791667088866234,
      "reward_std": 0.48782002553343773,
      "rewards/equation_reward_func": 0.14583333767950535,
      "rewards/format_reward_func": 0.8333333507180214,
      "step": 407
    },
    {
      "completion_length": 56.854167461395264,
      "epoch": 0.07253333333333334,
      "grad_norm": 1.3535072176160663,
      "kl": 2.1513671875,
      "learning_rate": 2.2722088321343258e-08,
      "loss": 0.0861,
      "reward": 0.9166666939854622,
      "reward_std": 0.42678775265812874,
      "rewards/equation_reward_func": 0.06250000186264515,
      "rewards/format_reward_func": 0.8541666939854622,
      "step": 408
    },
    {
      "completion_length": 55.250000953674316,
      "epoch": 0.07271111111111112,
      "grad_norm": 1.8133842507703675,
      "kl": 2.7421875,
      "learning_rate": 2.1660753859779223e-08,
      "loss": 0.1095,
      "reward": 1.0416667088866234,
      "reward_std": 0.5461693182587624,
      "rewards/equation_reward_func": 0.2083333358168602,
      "rewards/format_reward_func": 0.8333333507180214,
      "step": 409
    },
    {
      "completion_length": 55.14583396911621,
      "epoch": 0.07288888888888889,
      "grad_norm": 0.9743576087041721,
      "kl": 2.7861328125,
      "learning_rate": 2.0624254211693894e-08,
      "loss": 0.1113,
      "reward": 1.000000037252903,
      "reward_std": 0.39079635962843895,
      "rewards/equation_reward_func": 0.10416666977107525,
      "rewards/format_reward_func": 0.8958333507180214,
      "step": 410
    },
    {
      "completion_length": 55.02083492279053,
      "epoch": 0.07306666666666667,
      "grad_norm": 2.516245940031829,
      "kl": 3.21875,
      "learning_rate": 1.9612643190935196e-08,
      "loss": 0.1288,
      "reward": 0.9791667088866234,
      "reward_std": 0.5782451070845127,
      "rewards/equation_reward_func": 0.16666666977107525,
      "rewards/format_reward_func": 0.8125000223517418,
      "step": 411
    },
    {
      "completion_length": 60.187500953674316,
      "epoch": 0.07324444444444445,
      "grad_norm": 2.6330682117724113,
      "kl": 1.1962890625,
      "learning_rate": 1.8625973319162602e-08,
      "loss": 0.0478,
      "reward": 1.020833358168602,
      "reward_std": 0.4778187870979309,
      "rewards/equation_reward_func": 0.1458333358168602,
      "rewards/format_reward_func": 0.8750000298023224,
      "step": 412
    },
    {
      "completion_length": 51.312500953674316,
      "epoch": 0.07342222222222222,
      "grad_norm": 1.1821663445604937,
      "kl": 4.4501953125,
      "learning_rate": 1.7664295823120347e-08,
      "loss": 0.178,
      "reward": 0.8958333507180214,
      "reward_std": 0.618318747729063,
      "rewards/equation_reward_func": 0.1666666716337204,
      "rewards/format_reward_func": 0.7291666865348816,
      "step": 413
    },
    {
      "completion_length": 57.75000190734863,
      "epoch": 0.0736,
      "grad_norm": 0.8122615265168446,
      "kl": 2.16064453125,
      "learning_rate": 1.672766063197789e-08,
      "loss": 0.0861,
      "reward": 0.9791666939854622,
      "reward_std": 0.37575671449303627,
      "rewards/equation_reward_func": 0.0833333358168602,
      "rewards/format_reward_func": 0.8958333507180214,
      "step": 414
    },
    {
      "completion_length": 57.22916841506958,
      "epoch": 0.07377777777777778,
      "grad_norm": 3.0181550761407716,
      "kl": 1.967529296875,
      "learning_rate": 1.5816116374737452e-08,
      "loss": 0.0785,
      "reward": 1.0833333730697632,
      "reward_std": 0.39079636335372925,
      "rewards/equation_reward_func": 0.14583333767950535,
      "rewards/format_reward_func": 0.9375000149011612,
      "step": 415
    },
    {
      "completion_length": 55.375000953674316,
      "epoch": 0.07395555555555555,
      "grad_norm": 1.1321615634392828,
      "kl": 0.98779296875,
      "learning_rate": 1.492971037770924e-08,
      "loss": 0.0395,
      "reward": 1.1458333730697632,
      "reward_std": 0.34674229472875595,
      "rewards/equation_reward_func": 0.18750000558793545,
      "rewards/format_reward_func": 0.9583333432674408,
      "step": 416
    },
    {
      "completion_length": 55.041667461395264,
      "epoch": 0.07413333333333333,
      "grad_norm": 1.1137121433760955,
      "kl": 2.485595703125,
      "learning_rate": 1.4068488662054733e-08,
      "loss": 0.0994,
      "reward": 1.0833333805203438,
      "reward_std": 0.49578550457954407,
      "rewards/equation_reward_func": 0.1875000037252903,
      "rewards/format_reward_func": 0.8958333507180214,
      "step": 417
    },
    {
      "completion_length": 59.33333444595337,
      "epoch": 0.0743111111111111,
      "grad_norm": 1.8567494155306574,
      "kl": 3.08056640625,
      "learning_rate": 1.3232495941396637e-08,
      "loss": 0.1232,
      "reward": 1.0416667014360428,
      "reward_std": 0.581656701862812,
      "rewards/equation_reward_func": 0.18750000558793545,
      "rewards/format_reward_func": 0.8541666939854622,
      "step": 418
    },
    {
      "completion_length": 59.79166793823242,
      "epoch": 0.07448888888888888,
      "grad_norm": 1.3834584558844922,
      "kl": 2.0390625,
      "learning_rate": 1.2421775619498199e-08,
      "loss": 0.0815,
      "reward": 1.1250000298023224,
      "reward_std": 0.3532840199768543,
      "rewards/equation_reward_func": 0.1666666716337204,
      "rewards/format_reward_func": 0.9583333432674408,
      "step": 419
    },
    {
      "completion_length": 58.66666793823242,
      "epoch": 0.07466666666666667,
      "grad_norm": 0.8828120170847628,
      "kl": 1.23046875,
      "learning_rate": 1.1636369788008971e-08,
      "loss": 0.0493,
      "reward": 1.1875000298023224,
      "reward_std": 0.4722479432821274,
      "rewards/equation_reward_func": 0.25000000558793545,
      "rewards/format_reward_func": 0.9375000149011612,
      "step": 420
    },
    {
      "completion_length": 53.39583444595337,
      "epoch": 0.07484444444444445,
      "grad_norm": 1.6828918296780182,
      "kl": 2.09814453125,
      "learning_rate": 1.0876319224279895e-08,
      "loss": 0.0837,
      "reward": 0.9791666939854622,
      "reward_std": 0.44323352351784706,
      "rewards/equation_reward_func": 0.1250000037252903,
      "rewards/format_reward_func": 0.854166679084301,
      "step": 421
    },
    {
      "completion_length": 55.08333444595337,
      "epoch": 0.07502222222222223,
      "grad_norm": 1.320246490235028,
      "kl": 3.072265625,
      "learning_rate": 1.014166338924627e-08,
      "loss": 0.123,
      "reward": 0.9375000298023224,
      "reward_std": 0.4662386514246464,
      "rewards/equation_reward_func": 0.1041666679084301,
      "rewards/format_reward_func": 0.833333358168602,
      "step": 422
    },
    {
      "completion_length": 53.06250190734863,
      "epoch": 0.0752,
      "grad_norm": 1.4581841364335688,
      "kl": 2.12109375,
      "learning_rate": 9.432440425378663e-09,
      "loss": 0.0848,
      "reward": 1.0208333730697632,
      "reward_std": 0.38435182720422745,
      "rewards/equation_reward_func": 0.1250000037252903,
      "rewards/format_reward_func": 0.8958333507180214,
      "step": 423
    },
    {
      "completion_length": 56.97916841506958,
      "epoch": 0.07537777777777778,
      "grad_norm": 3.1980164553732098,
      "kl": 2.3525390625,
      "learning_rate": 8.748687154702672e-09,
      "loss": 0.0941,
      "reward": 1.0416666939854622,
      "reward_std": 0.5055268332362175,
      "rewards/equation_reward_func": 0.1875000037252903,
      "rewards/format_reward_func": 0.854166679084301,
      "step": 424
    },
    {
      "completion_length": 57.645835399627686,
      "epoch": 0.07555555555555556,
      "grad_norm": 1.5490994703153829,
      "kl": 1.429931640625,
      "learning_rate": 8.090439076887556e-09,
      "loss": 0.0573,
      "reward": 1.2500000447034836,
      "reward_std": 0.49578551575541496,
      "rewards/equation_reward_func": 0.31250000558793545,
      "rewards/format_reward_func": 0.9375000149011612,
      "step": 425
    },
    {
      "completion_length": 55.70833492279053,
      "epoch": 0.07573333333333333,
      "grad_norm": 0.9371848147502277,
      "kl": 2.319580078125,
      "learning_rate": 7.457730367402549e-09,
      "loss": 0.0928,
      "reward": 1.0416666939854622,
      "reward_std": 0.46833235025405884,
      "rewards/equation_reward_func": 0.14583333767950535,
      "rewards/format_reward_func": 0.8958333507180214,
      "step": 426
    },
    {
      "completion_length": 55.25000238418579,
      "epoch": 0.07591111111111111,
      "grad_norm": 1.4055409510031482,
      "kl": 3.28759765625,
      "learning_rate": 6.850593875742827e-09,
      "loss": 0.1316,
      "reward": 1.0208333730697632,
      "reward_std": 0.566160973161459,
      "rewards/equation_reward_func": 0.18750000558793545,
      "rewards/format_reward_func": 0.8333333507180214,
      "step": 427
    },
    {
      "completion_length": 55.562501430511475,
      "epoch": 0.07608888888888889,
      "grad_norm": 1.344704452916086,
      "kl": 3.205810546875,
      "learning_rate": 6.269061123724162e-09,
      "loss": 0.1285,
      "reward": 0.9583333730697632,
      "reward_std": 0.43123578280210495,
      "rewards/equation_reward_func": 0.10416666977107525,
      "rewards/format_reward_func": 0.8541666865348816,
      "step": 428
    },
    {
      "completion_length": 59.22916793823242,
      "epoch": 0.07626666666666666,
      "grad_norm": 2.1608184033551643,
      "kl": 1.736328125,
      "learning_rate": 5.713162303845886e-09,
      "loss": 0.0696,
      "reward": 1.2083333879709244,
      "reward_std": 0.5388510562479496,
      "rewards/equation_reward_func": 0.2708333395421505,
      "rewards/format_reward_func": 0.9375000149011612,
      "step": 429
    },
    {
      "completion_length": 54.125001430511475,
      "epoch": 0.07644444444444444,
      "grad_norm": 1.8308437940638247,
      "kl": 1.9599609375,
      "learning_rate": 5.182926277723821e-09,
      "loss": 0.0783,
      "reward": 1.0625000223517418,
      "reward_std": 0.5724712051451206,
      "rewards/equation_reward_func": 0.2291666716337204,
      "rewards/format_reward_func": 0.8333333432674408,
      "step": 430
    },
    {
      "completion_length": 56.500000953674316,
      "epoch": 0.07662222222222222,
      "grad_norm": 1.6765820181739854,
      "kl": 2.41552734375,
      "learning_rate": 4.678380574591356e-09,
      "loss": 0.0966,
      "reward": 1.0000000298023224,
      "reward_std": 0.438050027936697,
      "rewards/equation_reward_func": 0.14583333767950535,
      "rewards/format_reward_func": 0.8541666939854622,
      "step": 431
    },
    {
      "completion_length": 57.27083492279053,
      "epoch": 0.0768,
      "grad_norm": 1.3657278862024629,
      "kl": 2.685546875,
      "learning_rate": 4.199551389870659e-09,
      "loss": 0.1074,
      "reward": 0.8541666865348816,
      "reward_std": 0.45774073153734207,
      "rewards/equation_reward_func": 0.06250000186264515,
      "rewards/format_reward_func": 0.7916666865348816,
      "step": 432
    },
    {
      "completion_length": 55.64583492279053,
      "epoch": 0.07697777777777778,
      "grad_norm": 0.8539346194962987,
      "kl": 2.49462890625,
      "learning_rate": 3.746463583812143e-09,
      "loss": 0.0997,
      "reward": 1.041666679084301,
      "reward_std": 0.4067096970975399,
      "rewards/equation_reward_func": 0.1458333358168602,
      "rewards/format_reward_func": 0.8958333507180214,
      "step": 433
    },
    {
      "completion_length": 57.875000953674316,
      "epoch": 0.07715555555555556,
      "grad_norm": 1.2719357827752074,
      "kl": 2.07666015625,
      "learning_rate": 3.3191406802041688e-09,
      "loss": 0.0831,
      "reward": 1.0416666939854622,
      "reward_std": 0.3332236036658287,
      "rewards/equation_reward_func": 0.12500000186264515,
      "rewards/format_reward_func": 0.916666679084301,
      "step": 434
    },
    {
      "completion_length": 54.312500953674316,
      "epoch": 0.07733333333333334,
      "grad_norm": 1.4529499100749974,
      "kl": 3.67041015625,
      "learning_rate": 2.9176048651513575e-09,
      "loss": 0.1465,
      "reward": 0.958333358168602,
      "reward_std": 0.5915607511997223,
      "rewards/equation_reward_func": 0.1875000037252903,
      "rewards/format_reward_func": 0.7708333507180214,
      "step": 435
    },
    {
      "completion_length": 55.08333492279053,
      "epoch": 0.07751111111111111,
      "grad_norm": 2.3005383670846693,
      "kl": 3.82958984375,
      "learning_rate": 2.541876985923119e-09,
      "loss": 0.1532,
      "reward": 0.9791667014360428,
      "reward_std": 0.49742312356829643,
      "rewards/equation_reward_func": 0.14583333767950535,
      "rewards/format_reward_func": 0.8333333432674408,
      "step": 436
    },
    {
      "completion_length": 54.20833444595337,
      "epoch": 0.07768888888888889,
      "grad_norm": 1.2230209357396296,
      "kl": 2.626220703125,
      "learning_rate": 2.1919765498708554e-09,
      "loss": 0.1052,
      "reward": 0.979166716337204,
      "reward_std": 0.4918699115514755,
      "rewards/equation_reward_func": 0.1250000037252903,
      "rewards/format_reward_func": 0.854166679084301,
      "step": 437
    },
    {
      "completion_length": 53.04166841506958,
      "epoch": 0.07786666666666667,
      "grad_norm": 1.2004863808274155,
      "kl": 2.7109375,
      "learning_rate": 1.867921723415433e-09,
      "loss": 0.1085,
      "reward": 1.0208333786576986,
      "reward_std": 0.507898073643446,
      "rewards/equation_reward_func": 0.20833333767950535,
      "rewards/format_reward_func": 0.812500013038516,
      "step": 438
    },
    {
      "completion_length": 56.416667461395264,
      "epoch": 0.07804444444444444,
      "grad_norm": 2.4507514591114656,
      "kl": 1.8857421875,
      "learning_rate": 1.5697293311039973e-09,
      "loss": 0.0755,
      "reward": 1.020833358168602,
      "reward_std": 0.4759799763560295,
      "rewards/equation_reward_func": 0.16666666977107525,
      "rewards/format_reward_func": 0.854166679084301,
      "step": 439
    },
    {
      "completion_length": 54.937501430511475,
      "epoch": 0.07822222222222222,
      "grad_norm": 2.2350073546663545,
      "kl": 2.593994140625,
      "learning_rate": 1.2974148547362228e-09,
      "loss": 0.1039,
      "reward": 1.0833333656191826,
      "reward_std": 0.5678940936923027,
      "rewards/equation_reward_func": 0.2291666716337204,
      "rewards/format_reward_func": 0.8541666939854622,
      "step": 440
    },
    {
      "completion_length": 57.60416793823242,
      "epoch": 0.0784,
      "grad_norm": 2.858115206523939,
      "kl": 1.82763671875,
      "learning_rate": 1.0509924325609598e-09,
      "loss": 0.0733,
      "reward": 1.020833358168602,
      "reward_std": 0.42745841667056084,
      "rewards/equation_reward_func": 0.1458333358168602,
      "rewards/format_reward_func": 0.8750000149011612,
      "step": 441
    },
    {
      "completion_length": 58.10416793823242,
      "epoch": 0.07857777777777777,
      "grad_norm": 0.8237264126414511,
      "kl": 2.230712890625,
      "learning_rate": 8.304748585417076e-10,
      "loss": 0.0891,
      "reward": 1.1458333805203438,
      "reward_std": 0.41129202395677567,
      "rewards/equation_reward_func": 0.18750000558793545,
      "rewards/format_reward_func": 0.9583333358168602,
      "step": 442
    },
    {
      "completion_length": 55.27083492279053,
      "epoch": 0.07875555555555555,
      "grad_norm": 2.025879478872808,
      "kl": 2.39501953125,
      "learning_rate": 6.358735816926475e-10,
      "loss": 0.0957,
      "reward": 1.1458333805203438,
      "reward_std": 0.5063771307468414,
      "rewards/equation_reward_func": 0.29166667722165585,
      "rewards/format_reward_func": 0.8541666865348816,
      "step": 443
    },
    {
      "completion_length": 56.437500953674316,
      "epoch": 0.07893333333333333,
      "grad_norm": 1.7721145491613786,
      "kl": 2.5361328125,
      "learning_rate": 4.671987054842841e-10,
      "loss": 0.1016,
      "reward": 0.9375000223517418,
      "reward_std": 0.3828308768570423,
      "rewards/equation_reward_func": 0.0833333358168602,
      "rewards/format_reward_func": 0.854166679084301,
      "step": 444
    },
    {
      "completion_length": 54.43750190734863,
      "epoch": 0.0791111111111111,
      "grad_norm": 2.0018604753277365,
      "kl": 2.74658203125,
      "learning_rate": 3.2445898731853216e-10,
      "loss": 0.1102,
      "reward": 0.937500037252903,
      "reward_std": 0.5536307953298092,
      "rewards/equation_reward_func": 0.1250000037252903,
      "rewards/format_reward_func": 0.8125000298023224,
      "step": 445
    },
    {
      "completion_length": 55.375001430511475,
      "epoch": 0.0792888888888889,
      "grad_norm": 1.6563944165609519,
      "kl": 2.6494140625,
      "learning_rate": 2.076618380744133e-10,
      "loss": 0.1061,
      "reward": 0.9375000223517418,
      "reward_std": 0.4548136591911316,
      "rewards/equation_reward_func": 0.0833333358168602,
      "rewards/format_reward_func": 0.8541666939854622,
      "step": 446
    },
    {
      "completion_length": 56.208335399627686,
      "epoch": 0.07946666666666667,
      "grad_norm": 1.4046224808818546,
      "kl": 2.59716796875,
      "learning_rate": 1.16813321723197e-10,
      "loss": 0.1038,
      "reward": 1.0416667088866234,
      "reward_std": 0.528849832713604,
      "rewards/equation_reward_func": 0.16666666977107525,
      "rewards/format_reward_func": 0.8750000223517418,
      "step": 447
    },
    {
      "completion_length": 56.187501430511475,
      "epoch": 0.07964444444444445,
      "grad_norm": 1.9614743025012586,
      "kl": 2.751953125,
      "learning_rate": 5.191815501343066e-11,
      "loss": 0.11,
      "reward": 1.1041667014360428,
      "reward_std": 0.462188757956028,
      "rewards/equation_reward_func": 0.20833333767950535,
      "rewards/format_reward_func": 0.8958333432674408,
      "step": 448
    },
    {
      "completion_length": 58.187500953674316,
      "epoch": 0.07982222222222222,
      "grad_norm": 1.1288440912101967,
      "kl": 2.55517578125,
      "learning_rate": 1.2979707226135061e-11,
      "loss": 0.1023,
      "reward": 0.9791667014360428,
      "reward_std": 0.437777504324913,
      "rewards/equation_reward_func": 0.1250000037252903,
      "rewards/format_reward_func": 0.8541666865348816,
      "step": 449
    },
    {
      "completion_length": 57.750001430511475,
      "epoch": 0.08,
      "grad_norm": 1.6866388049647176,
      "kl": 1.45751953125,
      "learning_rate": 0.0,
      "loss": 0.0583,
      "reward": 1.1875000447034836,
      "reward_std": 0.32525811716914177,
      "rewards/equation_reward_func": 0.2083333395421505,
      "rewards/format_reward_func": 0.9791666716337204,
      "step": 450
    },
    {
      "epoch": 0.08,
      "step": 450,
      "total_flos": 0.0,
      "train_loss": 0.08981622397834212,
      "train_runtime": 6983.3902,
      "train_samples_per_second": 3.093,
      "train_steps_per_second": 0.064
    }
  ],
  "logging_steps": 1,
  "max_steps": 450,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}