Qwen-2.5-7B-Simple-RL / trainer_state.json
zhimeng's picture
Model save
6303069 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.997867803837953,
"eval_steps": 100,
"global_step": 117,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 627.8802337646484,
"epoch": 0.008528784648187633,
"grad_norm": 0.35644883946602257,
"kl": 0.0,
"learning_rate": 2.5e-07,
"loss": -0.0129,
"reward": 0.6093750298023224,
"reward_std": 0.3860909380018711,
"rewards/accuracy_reward": 0.6093750298023224,
"rewards/format_reward": 0.0,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 587.183611869812,
"epoch": 0.042643923240938165,
"grad_norm": 0.34939166374507424,
"kl": 0.00016704201698303223,
"learning_rate": 1.25e-06,
"loss": 0.0135,
"reward": 0.6184895997866988,
"reward_std": 0.34997194120660424,
"rewards/accuracy_reward": 0.6184895997866988,
"rewards/format_reward": 0.0,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 611.967724609375,
"epoch": 0.08528784648187633,
"grad_norm": 31.88849856902946,
"kl": 0.0043338298797607425,
"learning_rate": 2.5e-06,
"loss": 0.0188,
"reward": 0.6291666887700558,
"reward_std": 0.32357291094958784,
"rewards/accuracy_reward": 0.6291666887700558,
"rewards/format_reward": 0.0,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 616.8541862487793,
"epoch": 0.1279317697228145,
"grad_norm": 0.3815627956846007,
"kl": 0.010546112060546875,
"learning_rate": 2.993961440992859e-06,
"loss": 0.0578,
"reward": 0.6953125193715095,
"reward_std": 0.2850981580093503,
"rewards/accuracy_reward": 0.6953125193715095,
"rewards/format_reward": 0.0,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 618.81460647583,
"epoch": 0.17057569296375266,
"grad_norm": 0.21300730475167035,
"kl": 0.0043548583984375,
"learning_rate": 2.957235057439301e-06,
"loss": 0.0758,
"reward": 0.7281250193715095,
"reward_std": 0.23656688714399934,
"rewards/accuracy_reward": 0.7281250193715095,
"rewards/format_reward": 0.0,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 591.3015823364258,
"epoch": 0.21321961620469082,
"grad_norm": 0.12113342234838173,
"kl": 0.015129280090332032,
"learning_rate": 2.887956450710995e-06,
"loss": 0.0443,
"reward": 0.7828125208616257,
"reward_std": 0.177566824760288,
"rewards/accuracy_reward": 0.7828125208616257,
"rewards/format_reward": 0.0,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 579.9239768981934,
"epoch": 0.255863539445629,
"grad_norm": 0.14438849219292743,
"kl": 0.005391120910644531,
"learning_rate": 2.7876731904027993e-06,
"loss": 0.0371,
"reward": 0.7609375171363354,
"reward_std": 0.16949560260400176,
"rewards/accuracy_reward": 0.7609375171363354,
"rewards/format_reward": 0.0,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 571.2515838623046,
"epoch": 0.29850746268656714,
"grad_norm": 0.15658618448015763,
"kl": 0.004360771179199219,
"learning_rate": 2.6586254388368995e-06,
"loss": 0.0415,
"reward": 0.7854166865348816,
"reward_std": 0.17419785326346754,
"rewards/accuracy_reward": 0.7854166865348816,
"rewards/format_reward": 0.0,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 579.2099113464355,
"epoch": 0.3411513859275053,
"grad_norm": 0.6009157958287555,
"kl": 0.026328277587890626,
"learning_rate": 2.5036959095382875e-06,
"loss": 0.0321,
"reward": 0.7708333596587181,
"reward_std": 0.17964822258800267,
"rewards/accuracy_reward": 0.7708333596587181,
"rewards/format_reward": 0.0,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 586.2880401611328,
"epoch": 0.3837953091684435,
"grad_norm": 0.46893633502563103,
"kl": 0.015601730346679688,
"learning_rate": 2.3263454721781537e-06,
"loss": 0.0288,
"reward": 0.7869791895151138,
"reward_std": 0.17324934136122466,
"rewards/accuracy_reward": 0.7869791895151138,
"rewards/format_reward": 0.0,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 577.1146011352539,
"epoch": 0.42643923240938164,
"grad_norm": 0.14115724397125742,
"kl": 0.00496826171875,
"learning_rate": 2.1305358424643485e-06,
"loss": 0.0306,
"reward": 0.7510416850447654,
"reward_std": 0.1911184054799378,
"rewards/accuracy_reward": 0.7510416850447654,
"rewards/format_reward": 0.0,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 579.0744941711425,
"epoch": 0.4690831556503198,
"grad_norm": 3.183002278410084,
"kl": 0.018677902221679688,
"learning_rate": 1.9206410839590043e-06,
"loss": 0.0246,
"reward": 0.7661458477377892,
"reward_std": 0.19202441712841392,
"rewards/accuracy_reward": 0.7661458477377892,
"rewards/format_reward": 0.0,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 601.4166862487793,
"epoch": 0.511727078891258,
"grad_norm": 0.12757187329087855,
"kl": 0.006385040283203125,
"learning_rate": 1.7013498987264833e-06,
"loss": 0.0345,
"reward": 0.7364583522081375,
"reward_std": 0.19316800702363252,
"rewards/accuracy_reward": 0.7364583522081375,
"rewards/format_reward": 0.0,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 581.6609550476074,
"epoch": 0.5543710021321961,
"grad_norm": 0.17404128901148935,
"kl": 0.007928085327148438,
"learning_rate": 1.4775608894771048e-06,
"loss": 0.0328,
"reward": 0.7505208536982536,
"reward_std": 0.2075295069254935,
"rewards/accuracy_reward": 0.7505208536982536,
"rewards/format_reward": 0.0,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 596.1479362487793,
"epoch": 0.5970149253731343,
"grad_norm": 0.2944207023714681,
"kl": 0.006090927124023438,
"learning_rate": 1.2542731328772936e-06,
"loss": 0.0339,
"reward": 0.7265625208616256,
"reward_std": 0.19276394164189697,
"rewards/accuracy_reward": 0.7265625208616256,
"rewards/format_reward": 0.0,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 565.5979385375977,
"epoch": 0.6396588486140725,
"grad_norm": 0.18840980741585026,
"kl": 0.0075702667236328125,
"learning_rate": 1.036474508437579e-06,
"loss": 0.0368,
"reward": 0.7677083507180213,
"reward_std": 0.18664944088086485,
"rewards/accuracy_reward": 0.7677083507180213,
"rewards/format_reward": 0.0,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 580.122933959961,
"epoch": 0.6823027718550106,
"grad_norm": 0.11471347531056907,
"kl": 0.006945037841796875,
"learning_rate": 8.290302775265509e-07,
"loss": 0.0341,
"reward": 0.7583333522081375,
"reward_std": 0.1750888627022505,
"rewards/accuracy_reward": 0.7583333522081375,
"rewards/format_reward": 0.0,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 592.2614776611329,
"epoch": 0.7249466950959488,
"grad_norm": 0.22116249522463094,
"kl": 0.009731292724609375,
"learning_rate": 6.3657440147149e-07,
"loss": 0.0333,
"reward": 0.7656250178813935,
"reward_std": 0.21401627436280252,
"rewards/accuracy_reward": 0.7656250178813935,
"rewards/format_reward": 0.0,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 579.4776222229004,
"epoch": 0.767590618336887,
"grad_norm": 0.2030484454112882,
"kl": 0.010486984252929687,
"learning_rate": 4.63406026519703e-07,
"loss": 0.0328,
"reward": 0.7552083522081375,
"reward_std": 0.16649889973923565,
"rewards/accuracy_reward": 0.7552083522081375,
"rewards/format_reward": 0.0,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 586.7932479858398,
"epoch": 0.8102345415778252,
"grad_norm": 0.11451963582699543,
"kl": 0.006160736083984375,
"learning_rate": 3.133934480154885e-07,
"loss": 0.0294,
"reward": 0.7520833507180213,
"reward_std": 0.18174212109297513,
"rewards/accuracy_reward": 0.7520833507180213,
"rewards/format_reward": 0.0,
"step": 95
},
{
"epoch": 0.8528784648187633,
"grad_norm": 0.1403395507058327,
"learning_rate": 1.8988769907430552e-07,
"loss": 0.0343,
"step": 100
},
{
"epoch": 0.8528784648187633,
"eval_clip_ratio": 0.0,
"eval_completion_length": 571.9227185058594,
"eval_kl": 0.0098244873046875,
"eval_loss": 0.019065221771597862,
"eval_reward": 0.6759333529949189,
"eval_reward_std": 0.2166214306771755,
"eval_rewards/accuracy_reward": 0.6759333529949189,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 11129.6267,
"eval_samples_per_second": 0.449,
"eval_steps_per_second": 0.009,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 583.2151233673096,
"epoch": 0.8955223880597015,
"grad_norm": 0.1561949193212127,
"kl": 0.005991172790527344,
"learning_rate": 9.564769404039419e-08,
"loss": 0.0236,
"reward": 0.7598958529531956,
"reward_std": 0.1863211216405034,
"rewards/accuracy_reward": 0.7598958529531956,
"rewards/format_reward": 0.0,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 586.4302299499511,
"epoch": 0.9381663113006397,
"grad_norm": 5.136938540934993,
"kl": 0.008953857421875,
"learning_rate": 3.277859889929147e-08,
"loss": 0.0333,
"reward": 0.7864583492279053,
"reward_std": 0.1887844305485487,
"rewards/accuracy_reward": 0.7864583492279053,
"rewards/format_reward": 0.0,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 569.3703300476075,
"epoch": 0.9808102345415778,
"grad_norm": 0.147469744170478,
"kl": 0.00705413818359375,
"learning_rate": 2.684805348397268e-09,
"loss": 0.0296,
"reward": 0.7828125163912774,
"reward_std": 0.16681436980143188,
"rewards/accuracy_reward": 0.7828125163912774,
"rewards/format_reward": 0.0,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 583.3411636352539,
"epoch": 0.997867803837953,
"kl": 0.007679939270019531,
"reward": 0.7473958507180214,
"reward_std": 0.18170781643129885,
"rewards/accuracy_reward": 0.7473958507180214,
"rewards/format_reward": 0.0,
"step": 117,
"total_flos": 0.0,
"train_loss": 0.034317180164094664,
"train_runtime": 31905.6637,
"train_samples_per_second": 0.235,
"train_steps_per_second": 0.004
}
],
"logging_steps": 5,
"max_steps": 117,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}