|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.985936343449296, |
|
"eval_steps": 200, |
|
"global_step": 6750, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.29607698001480387, |
|
"grad_norm": 2.7709997274459304, |
|
"learning_rate": 1.9407407407407407e-05, |
|
"loss": 6.1641, |
|
"mean_token_accuracy": 0.1623367673992674, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.29607698001480387, |
|
"eval_loss": 4.407998561859131, |
|
"eval_mean_token_accuracy": 0.25814827533577533, |
|
"eval_runtime": 17.8969, |
|
"eval_samples_per_second": 7.152, |
|
"eval_steps_per_second": 0.894, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5921539600296077, |
|
"grad_norm": 3.4130161394563543, |
|
"learning_rate": 1.8814814814814816e-05, |
|
"loss": 3.9329, |
|
"mean_token_accuracy": 0.30167891483516485, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5921539600296077, |
|
"eval_loss": 3.607056140899658, |
|
"eval_mean_token_accuracy": 0.337335927960928, |
|
"eval_runtime": 17.8919, |
|
"eval_samples_per_second": 7.154, |
|
"eval_steps_per_second": 0.894, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8882309400444115, |
|
"grad_norm": 1.9205457771259444, |
|
"learning_rate": 1.8222222222222224e-05, |
|
"loss": 3.3827, |
|
"mean_token_accuracy": 0.35828514194139194, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8882309400444115, |
|
"eval_loss": 3.2180376052856445, |
|
"eval_mean_token_accuracy": 0.3790006868131868, |
|
"eval_runtime": 17.8941, |
|
"eval_samples_per_second": 7.153, |
|
"eval_steps_per_second": 0.894, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.1835677276091783, |
|
"grad_norm": 2.219040531148233, |
|
"learning_rate": 1.7629629629629633e-05, |
|
"loss": 3.0463, |
|
"mean_token_accuracy": 0.3991521079866944, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.1835677276091783, |
|
"eval_loss": 2.958108901977539, |
|
"eval_mean_token_accuracy": 0.41293498168498166, |
|
"eval_runtime": 17.8905, |
|
"eval_samples_per_second": 7.155, |
|
"eval_steps_per_second": 0.894, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.4796447076239823, |
|
"grad_norm": 1.9578494903008412, |
|
"learning_rate": 1.7037037037037038e-05, |
|
"loss": 2.8108, |
|
"mean_token_accuracy": 0.4309763431013431, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.4796447076239823, |
|
"eval_loss": 2.753509044647217, |
|
"eval_mean_token_accuracy": 0.4397664835164835, |
|
"eval_runtime": 17.886, |
|
"eval_samples_per_second": 7.156, |
|
"eval_steps_per_second": 0.895, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.7757216876387862, |
|
"grad_norm": 1.6226934039550227, |
|
"learning_rate": 1.6444444444444444e-05, |
|
"loss": 2.6416, |
|
"mean_token_accuracy": 0.4566705586080586, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.7757216876387862, |
|
"eval_loss": 2.5862350463867188, |
|
"eval_mean_token_accuracy": 0.4640758547008547, |
|
"eval_runtime": 17.9046, |
|
"eval_samples_per_second": 7.149, |
|
"eval_steps_per_second": 0.894, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.071058475203553, |
|
"grad_norm": 1.5520186396248283, |
|
"learning_rate": 1.5851851851851852e-05, |
|
"loss": 2.4744, |
|
"mean_token_accuracy": 0.48079080485095527, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.071058475203553, |
|
"eval_loss": 2.44563627243042, |
|
"eval_mean_token_accuracy": 0.4833009004884005, |
|
"eval_runtime": 17.8972, |
|
"eval_samples_per_second": 7.152, |
|
"eval_steps_per_second": 0.894, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.3671354552183566, |
|
"grad_norm": 1.5647563757923375, |
|
"learning_rate": 1.525925925925926e-05, |
|
"loss": 2.3221, |
|
"mean_token_accuracy": 0.5035665064102564, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.3671354552183566, |
|
"eval_loss": 2.3370866775512695, |
|
"eval_mean_token_accuracy": 0.49927884615384616, |
|
"eval_runtime": 17.8991, |
|
"eval_samples_per_second": 7.151, |
|
"eval_steps_per_second": 0.894, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.6632124352331608, |
|
"grad_norm": 1.5085290967116813, |
|
"learning_rate": 1.4666666666666666e-05, |
|
"loss": 2.2258, |
|
"mean_token_accuracy": 0.5176262591575091, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.6632124352331608, |
|
"eval_loss": 2.240328550338745, |
|
"eval_mean_token_accuracy": 0.5144898504273504, |
|
"eval_runtime": 17.8923, |
|
"eval_samples_per_second": 7.154, |
|
"eval_steps_per_second": 0.894, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.9592894152479645, |
|
"grad_norm": 1.5061171138250924, |
|
"learning_rate": 1.4074074074074075e-05, |
|
"loss": 2.1464, |
|
"mean_token_accuracy": 0.5297419108669108, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.9592894152479645, |
|
"eval_loss": 2.162349224090576, |
|
"eval_mean_token_accuracy": 0.5255132020757021, |
|
"eval_runtime": 17.9046, |
|
"eval_samples_per_second": 7.149, |
|
"eval_steps_per_second": 0.894, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.254626202812731, |
|
"grad_norm": 1.5604440734793847, |
|
"learning_rate": 1.3481481481481482e-05, |
|
"loss": 2.0498, |
|
"mean_token_accuracy": 0.5452948610843348, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.254626202812731, |
|
"eval_loss": 2.100726366043091, |
|
"eval_mean_token_accuracy": 0.534930173992674, |
|
"eval_runtime": 17.8785, |
|
"eval_samples_per_second": 7.159, |
|
"eval_steps_per_second": 0.895, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.5507031828275353, |
|
"grad_norm": 1.5621392816113453, |
|
"learning_rate": 1.288888888888889e-05, |
|
"loss": 1.9897, |
|
"mean_token_accuracy": 0.554459249084249, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.5507031828275353, |
|
"eval_loss": 2.05090069770813, |
|
"eval_mean_token_accuracy": 0.5429983211233211, |
|
"eval_runtime": 17.896, |
|
"eval_samples_per_second": 7.152, |
|
"eval_steps_per_second": 0.894, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.846780162842339, |
|
"grad_norm": 1.2766839744837124, |
|
"learning_rate": 1.2296296296296298e-05, |
|
"loss": 1.9316, |
|
"mean_token_accuracy": 0.5623727106227107, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.846780162842339, |
|
"eval_loss": 1.9980659484863281, |
|
"eval_mean_token_accuracy": 0.5504884004884005, |
|
"eval_runtime": 17.8898, |
|
"eval_samples_per_second": 7.155, |
|
"eval_steps_per_second": 0.894, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 4.142116950407106, |
|
"grad_norm": 1.3499621558383104, |
|
"learning_rate": 1.1703703703703703e-05, |
|
"loss": 1.8843, |
|
"mean_token_accuracy": 0.5703756491350477, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 4.142116950407106, |
|
"eval_loss": 1.9619176387786865, |
|
"eval_mean_token_accuracy": 0.5568185286935287, |
|
"eval_runtime": 17.8884, |
|
"eval_samples_per_second": 7.155, |
|
"eval_steps_per_second": 0.894, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 4.438193930421909, |
|
"grad_norm": 1.2966133165796498, |
|
"learning_rate": 1.1111111111111113e-05, |
|
"loss": 1.8236, |
|
"mean_token_accuracy": 0.58019971001221, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 4.438193930421909, |
|
"eval_loss": 1.928423523902893, |
|
"eval_mean_token_accuracy": 0.5617177960927962, |
|
"eval_runtime": 17.9022, |
|
"eval_samples_per_second": 7.15, |
|
"eval_steps_per_second": 0.894, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 4.734270910436713, |
|
"grad_norm": 1.3799177772818907, |
|
"learning_rate": 1.0518518518518519e-05, |
|
"loss": 1.8055, |
|
"mean_token_accuracy": 0.5831678113553114, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 4.734270910436713, |
|
"eval_loss": 1.8989052772521973, |
|
"eval_mean_token_accuracy": 0.5659836691086692, |
|
"eval_runtime": 17.9121, |
|
"eval_samples_per_second": 7.146, |
|
"eval_steps_per_second": 0.893, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 5.02960769800148, |
|
"grad_norm": 1.189701686001914, |
|
"learning_rate": 9.925925925925927e-06, |
|
"loss": 1.7795, |
|
"mean_token_accuracy": 0.5869535331001496, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 5.02960769800148, |
|
"eval_loss": 1.8730087280273438, |
|
"eval_mean_token_accuracy": 0.5700644841269842, |
|
"eval_runtime": 17.8877, |
|
"eval_samples_per_second": 7.156, |
|
"eval_steps_per_second": 0.894, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 5.325684678016284, |
|
"grad_norm": 1.2336131696453405, |
|
"learning_rate": 9.333333333333334e-06, |
|
"loss": 1.7181, |
|
"mean_token_accuracy": 0.5980093101343101, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 5.325684678016284, |
|
"eval_loss": 1.8516058921813965, |
|
"eval_mean_token_accuracy": 0.5734088827838828, |
|
"eval_runtime": 17.8893, |
|
"eval_samples_per_second": 7.155, |
|
"eval_steps_per_second": 0.894, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 5.6217616580310885, |
|
"grad_norm": 1.3072064080159254, |
|
"learning_rate": 8.740740740740741e-06, |
|
"loss": 1.7039, |
|
"mean_token_accuracy": 0.5995736797924298, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 5.6217616580310885, |
|
"eval_loss": 1.8317267894744873, |
|
"eval_mean_token_accuracy": 0.5765376984126984, |
|
"eval_runtime": 17.8909, |
|
"eval_samples_per_second": 7.154, |
|
"eval_steps_per_second": 0.894, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 5.917838638045892, |
|
"grad_norm": 1.1207509804360356, |
|
"learning_rate": 8.148148148148148e-06, |
|
"loss": 1.6899, |
|
"mean_token_accuracy": 0.6014800442612942, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 5.917838638045892, |
|
"eval_loss": 1.8143231868743896, |
|
"eval_mean_token_accuracy": 0.5789148351648352, |
|
"eval_runtime": 17.9255, |
|
"eval_samples_per_second": 7.141, |
|
"eval_steps_per_second": 0.893, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 6.213175425610658, |
|
"grad_norm": 1.1903087300751234, |
|
"learning_rate": 7.555555555555556e-06, |
|
"loss": 1.6474, |
|
"mean_token_accuracy": 0.6092672156581931, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 6.213175425610658, |
|
"eval_loss": 1.8032296895980835, |
|
"eval_mean_token_accuracy": 0.580849358974359, |
|
"eval_runtime": 17.9223, |
|
"eval_samples_per_second": 7.142, |
|
"eval_steps_per_second": 0.893, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 6.509252405625462, |
|
"grad_norm": 1.1713052935721413, |
|
"learning_rate": 6.962962962962964e-06, |
|
"loss": 1.6235, |
|
"mean_token_accuracy": 0.6131747557997558, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 6.509252405625462, |
|
"eval_loss": 1.7903690338134766, |
|
"eval_mean_token_accuracy": 0.5832226800976801, |
|
"eval_runtime": 17.9071, |
|
"eval_samples_per_second": 7.148, |
|
"eval_steps_per_second": 0.893, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 6.805329385640267, |
|
"grad_norm": 1.219773082768343, |
|
"learning_rate": 6.370370370370371e-06, |
|
"loss": 1.6199, |
|
"mean_token_accuracy": 0.6140330815018316, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 6.805329385640267, |
|
"eval_loss": 1.775641679763794, |
|
"eval_mean_token_accuracy": 0.5847260378510378, |
|
"eval_runtime": 17.9026, |
|
"eval_samples_per_second": 7.15, |
|
"eval_steps_per_second": 0.894, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 7.100666173205033, |
|
"grad_norm": 1.227329336420539, |
|
"learning_rate": 5.777777777777778e-06, |
|
"loss": 1.5968, |
|
"mean_token_accuracy": 0.6178009507896726, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 7.100666173205033, |
|
"eval_loss": 1.767627239227295, |
|
"eval_mean_token_accuracy": 0.5867120726495727, |
|
"eval_runtime": 17.9009, |
|
"eval_samples_per_second": 7.15, |
|
"eval_steps_per_second": 0.894, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 7.3967431532198376, |
|
"grad_norm": 1.190619502177119, |
|
"learning_rate": 5.185185185185185e-06, |
|
"loss": 1.564, |
|
"mean_token_accuracy": 0.6240972603785103, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 7.3967431532198376, |
|
"eval_loss": 1.7600514888763428, |
|
"eval_mean_token_accuracy": 0.5872462606837607, |
|
"eval_runtime": 17.8856, |
|
"eval_samples_per_second": 7.157, |
|
"eval_steps_per_second": 0.895, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 7.692820133234641, |
|
"grad_norm": 1.0951341674513762, |
|
"learning_rate": 4.592592592592593e-06, |
|
"loss": 1.5614, |
|
"mean_token_accuracy": 0.624267322954823, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 7.692820133234641, |
|
"eval_loss": 1.7513068914413452, |
|
"eval_mean_token_accuracy": 0.5891063797313797, |
|
"eval_runtime": 17.8894, |
|
"eval_samples_per_second": 7.155, |
|
"eval_steps_per_second": 0.894, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 7.988897113249445, |
|
"grad_norm": 1.0806878005212168, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.5575, |
|
"mean_token_accuracy": 0.6250475427350427, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 7.988897113249445, |
|
"eval_loss": 1.7434966564178467, |
|
"eval_mean_token_accuracy": 0.5898237179487179, |
|
"eval_runtime": 17.917, |
|
"eval_samples_per_second": 7.144, |
|
"eval_steps_per_second": 0.893, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 8.284233900814211, |
|
"grad_norm": 1.1517570180426586, |
|
"learning_rate": 3.4074074074074077e-06, |
|
"loss": 1.5248, |
|
"mean_token_accuracy": 0.6320280631370857, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 8.284233900814211, |
|
"eval_loss": 1.741744041442871, |
|
"eval_mean_token_accuracy": 0.5906135531135531, |
|
"eval_runtime": 17.8965, |
|
"eval_samples_per_second": 7.152, |
|
"eval_steps_per_second": 0.894, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 8.580310880829016, |
|
"grad_norm": 1.0340477842720268, |
|
"learning_rate": 2.814814814814815e-06, |
|
"loss": 1.5212, |
|
"mean_token_accuracy": 0.6322646138583639, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 8.580310880829016, |
|
"eval_loss": 1.735644817352295, |
|
"eval_mean_token_accuracy": 0.5920615842490843, |
|
"eval_runtime": 17.8899, |
|
"eval_samples_per_second": 7.155, |
|
"eval_steps_per_second": 0.894, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 8.876387860843819, |
|
"grad_norm": 1.030414959224373, |
|
"learning_rate": 2.222222222222222e-06, |
|
"loss": 1.5159, |
|
"mean_token_accuracy": 0.6326660561660562, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 8.876387860843819, |
|
"eval_loss": 1.7318824529647827, |
|
"eval_mean_token_accuracy": 0.5921760531135531, |
|
"eval_runtime": 17.9183, |
|
"eval_samples_per_second": 7.144, |
|
"eval_steps_per_second": 0.893, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 9.171724648408587, |
|
"grad_norm": 0.9859235701888986, |
|
"learning_rate": 1.62962962962963e-06, |
|
"loss": 1.5007, |
|
"mean_token_accuracy": 0.6362307248585444, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 9.171724648408587, |
|
"eval_loss": 1.7302496433258057, |
|
"eval_mean_token_accuracy": 0.5925709706959706, |
|
"eval_runtime": 17.9121, |
|
"eval_samples_per_second": 7.146, |
|
"eval_steps_per_second": 0.893, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 9.46780162842339, |
|
"grad_norm": 0.9796094549613502, |
|
"learning_rate": 1.0370370370370371e-06, |
|
"loss": 1.488, |
|
"mean_token_accuracy": 0.638957036019536, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 9.46780162842339, |
|
"eval_loss": 1.7277941703796387, |
|
"eval_mean_token_accuracy": 0.5929544413919414, |
|
"eval_runtime": 17.9246, |
|
"eval_samples_per_second": 7.141, |
|
"eval_steps_per_second": 0.893, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 9.763878608438194, |
|
"grad_norm": 0.9868368122782655, |
|
"learning_rate": 4.444444444444445e-07, |
|
"loss": 1.4899, |
|
"mean_token_accuracy": 0.6385683379120879, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 9.763878608438194, |
|
"eval_loss": 1.7258272171020508, |
|
"eval_mean_token_accuracy": 0.5932234432234432, |
|
"eval_runtime": 17.8902, |
|
"eval_samples_per_second": 7.155, |
|
"eval_steps_per_second": 0.894, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 9.985936343449296, |
|
"step": 6750, |
|
"total_flos": 768626978193408.0, |
|
"train_loss": 0.0, |
|
"train_runtime": 1.3815, |
|
"train_samples_per_second": 2424.882, |
|
"train_steps_per_second": 152.008 |
|
} |
|
], |
|
"logging_steps": 200, |
|
"max_steps": 6750, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 768626978193408.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|