qwen2.5_1.5b_ins-pt-bf16 / trainer_state.json
sheepy928's picture
Model save
0bc1fa4 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.985936343449296,
"eval_steps": 200,
"global_step": 6750,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.29607698001480387,
"grad_norm": 2.7709997274459304,
"learning_rate": 1.9407407407407407e-05,
"loss": 6.1641,
"mean_token_accuracy": 0.1623367673992674,
"step": 200
},
{
"epoch": 0.29607698001480387,
"eval_loss": 4.407998561859131,
"eval_mean_token_accuracy": 0.25814827533577533,
"eval_runtime": 17.8969,
"eval_samples_per_second": 7.152,
"eval_steps_per_second": 0.894,
"step": 200
},
{
"epoch": 0.5921539600296077,
"grad_norm": 3.4130161394563543,
"learning_rate": 1.8814814814814816e-05,
"loss": 3.9329,
"mean_token_accuracy": 0.30167891483516485,
"step": 400
},
{
"epoch": 0.5921539600296077,
"eval_loss": 3.607056140899658,
"eval_mean_token_accuracy": 0.337335927960928,
"eval_runtime": 17.8919,
"eval_samples_per_second": 7.154,
"eval_steps_per_second": 0.894,
"step": 400
},
{
"epoch": 0.8882309400444115,
"grad_norm": 1.9205457771259444,
"learning_rate": 1.8222222222222224e-05,
"loss": 3.3827,
"mean_token_accuracy": 0.35828514194139194,
"step": 600
},
{
"epoch": 0.8882309400444115,
"eval_loss": 3.2180376052856445,
"eval_mean_token_accuracy": 0.3790006868131868,
"eval_runtime": 17.8941,
"eval_samples_per_second": 7.153,
"eval_steps_per_second": 0.894,
"step": 600
},
{
"epoch": 1.1835677276091783,
"grad_norm": 2.219040531148233,
"learning_rate": 1.7629629629629633e-05,
"loss": 3.0463,
"mean_token_accuracy": 0.3991521079866944,
"step": 800
},
{
"epoch": 1.1835677276091783,
"eval_loss": 2.958108901977539,
"eval_mean_token_accuracy": 0.41293498168498166,
"eval_runtime": 17.8905,
"eval_samples_per_second": 7.155,
"eval_steps_per_second": 0.894,
"step": 800
},
{
"epoch": 1.4796447076239823,
"grad_norm": 1.9578494903008412,
"learning_rate": 1.7037037037037038e-05,
"loss": 2.8108,
"mean_token_accuracy": 0.4309763431013431,
"step": 1000
},
{
"epoch": 1.4796447076239823,
"eval_loss": 2.753509044647217,
"eval_mean_token_accuracy": 0.4397664835164835,
"eval_runtime": 17.886,
"eval_samples_per_second": 7.156,
"eval_steps_per_second": 0.895,
"step": 1000
},
{
"epoch": 1.7757216876387862,
"grad_norm": 1.6226934039550227,
"learning_rate": 1.6444444444444444e-05,
"loss": 2.6416,
"mean_token_accuracy": 0.4566705586080586,
"step": 1200
},
{
"epoch": 1.7757216876387862,
"eval_loss": 2.5862350463867188,
"eval_mean_token_accuracy": 0.4640758547008547,
"eval_runtime": 17.9046,
"eval_samples_per_second": 7.149,
"eval_steps_per_second": 0.894,
"step": 1200
},
{
"epoch": 2.071058475203553,
"grad_norm": 1.5520186396248283,
"learning_rate": 1.5851851851851852e-05,
"loss": 2.4744,
"mean_token_accuracy": 0.48079080485095527,
"step": 1400
},
{
"epoch": 2.071058475203553,
"eval_loss": 2.44563627243042,
"eval_mean_token_accuracy": 0.4833009004884005,
"eval_runtime": 17.8972,
"eval_samples_per_second": 7.152,
"eval_steps_per_second": 0.894,
"step": 1400
},
{
"epoch": 2.3671354552183566,
"grad_norm": 1.5647563757923375,
"learning_rate": 1.525925925925926e-05,
"loss": 2.3221,
"mean_token_accuracy": 0.5035665064102564,
"step": 1600
},
{
"epoch": 2.3671354552183566,
"eval_loss": 2.3370866775512695,
"eval_mean_token_accuracy": 0.49927884615384616,
"eval_runtime": 17.8991,
"eval_samples_per_second": 7.151,
"eval_steps_per_second": 0.894,
"step": 1600
},
{
"epoch": 2.6632124352331608,
"grad_norm": 1.5085290967116813,
"learning_rate": 1.4666666666666666e-05,
"loss": 2.2258,
"mean_token_accuracy": 0.5176262591575091,
"step": 1800
},
{
"epoch": 2.6632124352331608,
"eval_loss": 2.240328550338745,
"eval_mean_token_accuracy": 0.5144898504273504,
"eval_runtime": 17.8923,
"eval_samples_per_second": 7.154,
"eval_steps_per_second": 0.894,
"step": 1800
},
{
"epoch": 2.9592894152479645,
"grad_norm": 1.5061171138250924,
"learning_rate": 1.4074074074074075e-05,
"loss": 2.1464,
"mean_token_accuracy": 0.5297419108669108,
"step": 2000
},
{
"epoch": 2.9592894152479645,
"eval_loss": 2.162349224090576,
"eval_mean_token_accuracy": 0.5255132020757021,
"eval_runtime": 17.9046,
"eval_samples_per_second": 7.149,
"eval_steps_per_second": 0.894,
"step": 2000
},
{
"epoch": 3.254626202812731,
"grad_norm": 1.5604440734793847,
"learning_rate": 1.3481481481481482e-05,
"loss": 2.0498,
"mean_token_accuracy": 0.5452948610843348,
"step": 2200
},
{
"epoch": 3.254626202812731,
"eval_loss": 2.100726366043091,
"eval_mean_token_accuracy": 0.534930173992674,
"eval_runtime": 17.8785,
"eval_samples_per_second": 7.159,
"eval_steps_per_second": 0.895,
"step": 2200
},
{
"epoch": 3.5507031828275353,
"grad_norm": 1.5621392816113453,
"learning_rate": 1.288888888888889e-05,
"loss": 1.9897,
"mean_token_accuracy": 0.554459249084249,
"step": 2400
},
{
"epoch": 3.5507031828275353,
"eval_loss": 2.05090069770813,
"eval_mean_token_accuracy": 0.5429983211233211,
"eval_runtime": 17.896,
"eval_samples_per_second": 7.152,
"eval_steps_per_second": 0.894,
"step": 2400
},
{
"epoch": 3.846780162842339,
"grad_norm": 1.2766839744837124,
"learning_rate": 1.2296296296296298e-05,
"loss": 1.9316,
"mean_token_accuracy": 0.5623727106227107,
"step": 2600
},
{
"epoch": 3.846780162842339,
"eval_loss": 1.9980659484863281,
"eval_mean_token_accuracy": 0.5504884004884005,
"eval_runtime": 17.8898,
"eval_samples_per_second": 7.155,
"eval_steps_per_second": 0.894,
"step": 2600
},
{
"epoch": 4.142116950407106,
"grad_norm": 1.3499621558383104,
"learning_rate": 1.1703703703703703e-05,
"loss": 1.8843,
"mean_token_accuracy": 0.5703756491350477,
"step": 2800
},
{
"epoch": 4.142116950407106,
"eval_loss": 1.9619176387786865,
"eval_mean_token_accuracy": 0.5568185286935287,
"eval_runtime": 17.8884,
"eval_samples_per_second": 7.155,
"eval_steps_per_second": 0.894,
"step": 2800
},
{
"epoch": 4.438193930421909,
"grad_norm": 1.2966133165796498,
"learning_rate": 1.1111111111111113e-05,
"loss": 1.8236,
"mean_token_accuracy": 0.58019971001221,
"step": 3000
},
{
"epoch": 4.438193930421909,
"eval_loss": 1.928423523902893,
"eval_mean_token_accuracy": 0.5617177960927962,
"eval_runtime": 17.9022,
"eval_samples_per_second": 7.15,
"eval_steps_per_second": 0.894,
"step": 3000
},
{
"epoch": 4.734270910436713,
"grad_norm": 1.3799177772818907,
"learning_rate": 1.0518518518518519e-05,
"loss": 1.8055,
"mean_token_accuracy": 0.5831678113553114,
"step": 3200
},
{
"epoch": 4.734270910436713,
"eval_loss": 1.8989052772521973,
"eval_mean_token_accuracy": 0.5659836691086692,
"eval_runtime": 17.9121,
"eval_samples_per_second": 7.146,
"eval_steps_per_second": 0.893,
"step": 3200
},
{
"epoch": 5.02960769800148,
"grad_norm": 1.189701686001914,
"learning_rate": 9.925925925925927e-06,
"loss": 1.7795,
"mean_token_accuracy": 0.5869535331001496,
"step": 3400
},
{
"epoch": 5.02960769800148,
"eval_loss": 1.8730087280273438,
"eval_mean_token_accuracy": 0.5700644841269842,
"eval_runtime": 17.8877,
"eval_samples_per_second": 7.156,
"eval_steps_per_second": 0.894,
"step": 3400
},
{
"epoch": 5.325684678016284,
"grad_norm": 1.2336131696453405,
"learning_rate": 9.333333333333334e-06,
"loss": 1.7181,
"mean_token_accuracy": 0.5980093101343101,
"step": 3600
},
{
"epoch": 5.325684678016284,
"eval_loss": 1.8516058921813965,
"eval_mean_token_accuracy": 0.5734088827838828,
"eval_runtime": 17.8893,
"eval_samples_per_second": 7.155,
"eval_steps_per_second": 0.894,
"step": 3600
},
{
"epoch": 5.6217616580310885,
"grad_norm": 1.3072064080159254,
"learning_rate": 8.740740740740741e-06,
"loss": 1.7039,
"mean_token_accuracy": 0.5995736797924298,
"step": 3800
},
{
"epoch": 5.6217616580310885,
"eval_loss": 1.8317267894744873,
"eval_mean_token_accuracy": 0.5765376984126984,
"eval_runtime": 17.8909,
"eval_samples_per_second": 7.154,
"eval_steps_per_second": 0.894,
"step": 3800
},
{
"epoch": 5.917838638045892,
"grad_norm": 1.1207509804360356,
"learning_rate": 8.148148148148148e-06,
"loss": 1.6899,
"mean_token_accuracy": 0.6014800442612942,
"step": 4000
},
{
"epoch": 5.917838638045892,
"eval_loss": 1.8143231868743896,
"eval_mean_token_accuracy": 0.5789148351648352,
"eval_runtime": 17.9255,
"eval_samples_per_second": 7.141,
"eval_steps_per_second": 0.893,
"step": 4000
},
{
"epoch": 6.213175425610658,
"grad_norm": 1.1903087300751234,
"learning_rate": 7.555555555555556e-06,
"loss": 1.6474,
"mean_token_accuracy": 0.6092672156581931,
"step": 4200
},
{
"epoch": 6.213175425610658,
"eval_loss": 1.8032296895980835,
"eval_mean_token_accuracy": 0.580849358974359,
"eval_runtime": 17.9223,
"eval_samples_per_second": 7.142,
"eval_steps_per_second": 0.893,
"step": 4200
},
{
"epoch": 6.509252405625462,
"grad_norm": 1.1713052935721413,
"learning_rate": 6.962962962962964e-06,
"loss": 1.6235,
"mean_token_accuracy": 0.6131747557997558,
"step": 4400
},
{
"epoch": 6.509252405625462,
"eval_loss": 1.7903690338134766,
"eval_mean_token_accuracy": 0.5832226800976801,
"eval_runtime": 17.9071,
"eval_samples_per_second": 7.148,
"eval_steps_per_second": 0.893,
"step": 4400
},
{
"epoch": 6.805329385640267,
"grad_norm": 1.219773082768343,
"learning_rate": 6.370370370370371e-06,
"loss": 1.6199,
"mean_token_accuracy": 0.6140330815018316,
"step": 4600
},
{
"epoch": 6.805329385640267,
"eval_loss": 1.775641679763794,
"eval_mean_token_accuracy": 0.5847260378510378,
"eval_runtime": 17.9026,
"eval_samples_per_second": 7.15,
"eval_steps_per_second": 0.894,
"step": 4600
},
{
"epoch": 7.100666173205033,
"grad_norm": 1.227329336420539,
"learning_rate": 5.777777777777778e-06,
"loss": 1.5968,
"mean_token_accuracy": 0.6178009507896726,
"step": 4800
},
{
"epoch": 7.100666173205033,
"eval_loss": 1.767627239227295,
"eval_mean_token_accuracy": 0.5867120726495727,
"eval_runtime": 17.9009,
"eval_samples_per_second": 7.15,
"eval_steps_per_second": 0.894,
"step": 4800
},
{
"epoch": 7.3967431532198376,
"grad_norm": 1.190619502177119,
"learning_rate": 5.185185185185185e-06,
"loss": 1.564,
"mean_token_accuracy": 0.6240972603785103,
"step": 5000
},
{
"epoch": 7.3967431532198376,
"eval_loss": 1.7600514888763428,
"eval_mean_token_accuracy": 0.5872462606837607,
"eval_runtime": 17.8856,
"eval_samples_per_second": 7.157,
"eval_steps_per_second": 0.895,
"step": 5000
},
{
"epoch": 7.692820133234641,
"grad_norm": 1.0951341674513762,
"learning_rate": 4.592592592592593e-06,
"loss": 1.5614,
"mean_token_accuracy": 0.624267322954823,
"step": 5200
},
{
"epoch": 7.692820133234641,
"eval_loss": 1.7513068914413452,
"eval_mean_token_accuracy": 0.5891063797313797,
"eval_runtime": 17.8894,
"eval_samples_per_second": 7.155,
"eval_steps_per_second": 0.894,
"step": 5200
},
{
"epoch": 7.988897113249445,
"grad_norm": 1.0806878005212168,
"learning_rate": 4.000000000000001e-06,
"loss": 1.5575,
"mean_token_accuracy": 0.6250475427350427,
"step": 5400
},
{
"epoch": 7.988897113249445,
"eval_loss": 1.7434966564178467,
"eval_mean_token_accuracy": 0.5898237179487179,
"eval_runtime": 17.917,
"eval_samples_per_second": 7.144,
"eval_steps_per_second": 0.893,
"step": 5400
},
{
"epoch": 8.284233900814211,
"grad_norm": 1.1517570180426586,
"learning_rate": 3.4074074074074077e-06,
"loss": 1.5248,
"mean_token_accuracy": 0.6320280631370857,
"step": 5600
},
{
"epoch": 8.284233900814211,
"eval_loss": 1.741744041442871,
"eval_mean_token_accuracy": 0.5906135531135531,
"eval_runtime": 17.8965,
"eval_samples_per_second": 7.152,
"eval_steps_per_second": 0.894,
"step": 5600
},
{
"epoch": 8.580310880829016,
"grad_norm": 1.0340477842720268,
"learning_rate": 2.814814814814815e-06,
"loss": 1.5212,
"mean_token_accuracy": 0.6322646138583639,
"step": 5800
},
{
"epoch": 8.580310880829016,
"eval_loss": 1.735644817352295,
"eval_mean_token_accuracy": 0.5920615842490843,
"eval_runtime": 17.8899,
"eval_samples_per_second": 7.155,
"eval_steps_per_second": 0.894,
"step": 5800
},
{
"epoch": 8.876387860843819,
"grad_norm": 1.030414959224373,
"learning_rate": 2.222222222222222e-06,
"loss": 1.5159,
"mean_token_accuracy": 0.6326660561660562,
"step": 6000
},
{
"epoch": 8.876387860843819,
"eval_loss": 1.7318824529647827,
"eval_mean_token_accuracy": 0.5921760531135531,
"eval_runtime": 17.9183,
"eval_samples_per_second": 7.144,
"eval_steps_per_second": 0.893,
"step": 6000
},
{
"epoch": 9.171724648408587,
"grad_norm": 0.9859235701888986,
"learning_rate": 1.62962962962963e-06,
"loss": 1.5007,
"mean_token_accuracy": 0.6362307248585444,
"step": 6200
},
{
"epoch": 9.171724648408587,
"eval_loss": 1.7302496433258057,
"eval_mean_token_accuracy": 0.5925709706959706,
"eval_runtime": 17.9121,
"eval_samples_per_second": 7.146,
"eval_steps_per_second": 0.893,
"step": 6200
},
{
"epoch": 9.46780162842339,
"grad_norm": 0.9796094549613502,
"learning_rate": 1.0370370370370371e-06,
"loss": 1.488,
"mean_token_accuracy": 0.638957036019536,
"step": 6400
},
{
"epoch": 9.46780162842339,
"eval_loss": 1.7277941703796387,
"eval_mean_token_accuracy": 0.5929544413919414,
"eval_runtime": 17.9246,
"eval_samples_per_second": 7.141,
"eval_steps_per_second": 0.893,
"step": 6400
},
{
"epoch": 9.763878608438194,
"grad_norm": 0.9868368122782655,
"learning_rate": 4.444444444444445e-07,
"loss": 1.4899,
"mean_token_accuracy": 0.6385683379120879,
"step": 6600
},
{
"epoch": 9.763878608438194,
"eval_loss": 1.7258272171020508,
"eval_mean_token_accuracy": 0.5932234432234432,
"eval_runtime": 17.8902,
"eval_samples_per_second": 7.155,
"eval_steps_per_second": 0.894,
"step": 6600
},
{
"epoch": 9.985936343449296,
"step": 6750,
"total_flos": 768626978193408.0,
"train_loss": 0.0,
"train_runtime": 1.3815,
"train_samples_per_second": 2424.882,
"train_steps_per_second": 152.008
}
],
"logging_steps": 200,
"max_steps": 210,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 768626978193408.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}