|
{ |
|
"best_metric": 1.1517876386642456, |
|
"best_model_checkpoint": "/media/user/Expansion/flan-t5-small-simplifier/checkpoint-80000", |
|
"epoch": 2.0, |
|
"eval_steps": 10000, |
|
"global_step": 89920, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01112099644128114, |
|
"grad_norm": 3.2175512313842773, |
|
"learning_rate": 4.972197508896798e-05, |
|
"loss": 1.6509, |
|
"num_input_tokens_seen": 183488, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.02224199288256228, |
|
"grad_norm": 3.9029898643493652, |
|
"learning_rate": 4.9443950177935946e-05, |
|
"loss": 1.583, |
|
"num_input_tokens_seen": 363744, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.03336298932384341, |
|
"grad_norm": 3.4091885089874268, |
|
"learning_rate": 4.9165925266903915e-05, |
|
"loss": 1.5553, |
|
"num_input_tokens_seen": 547584, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.04448398576512456, |
|
"grad_norm": 2.5965495109558105, |
|
"learning_rate": 4.888790035587189e-05, |
|
"loss": 1.5444, |
|
"num_input_tokens_seen": 732480, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.055604982206405695, |
|
"grad_norm": 4.170108318328857, |
|
"learning_rate": 4.860987544483986e-05, |
|
"loss": 1.5137, |
|
"num_input_tokens_seen": 916672, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.06672597864768683, |
|
"grad_norm": 3.025068759918213, |
|
"learning_rate": 4.8331850533807835e-05, |
|
"loss": 1.5057, |
|
"num_input_tokens_seen": 1095952, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.07784697508896797, |
|
"grad_norm": 2.7047712802886963, |
|
"learning_rate": 4.80538256227758e-05, |
|
"loss": 1.5019, |
|
"num_input_tokens_seen": 1276936, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.08896797153024912, |
|
"grad_norm": 2.844285488128662, |
|
"learning_rate": 4.777580071174377e-05, |
|
"loss": 1.5074, |
|
"num_input_tokens_seen": 1464080, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.10008896797153025, |
|
"grad_norm": 3.055643081665039, |
|
"learning_rate": 4.749777580071175e-05, |
|
"loss": 1.4959, |
|
"num_input_tokens_seen": 1647288, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.11120996441281139, |
|
"grad_norm": 2.6245856285095215, |
|
"learning_rate": 4.721975088967972e-05, |
|
"loss": 1.4777, |
|
"num_input_tokens_seen": 1836832, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.12233096085409252, |
|
"grad_norm": 3.1758244037628174, |
|
"learning_rate": 4.694172597864769e-05, |
|
"loss": 1.4778, |
|
"num_input_tokens_seen": 2020728, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.13345195729537365, |
|
"grad_norm": 2.518728494644165, |
|
"learning_rate": 4.666370106761566e-05, |
|
"loss": 1.4758, |
|
"num_input_tokens_seen": 2198968, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.1445729537366548, |
|
"grad_norm": 3.8143937587738037, |
|
"learning_rate": 4.638567615658363e-05, |
|
"loss": 1.4846, |
|
"num_input_tokens_seen": 2381968, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.15569395017793594, |
|
"grad_norm": 2.766146421432495, |
|
"learning_rate": 4.6107651245551604e-05, |
|
"loss": 1.4468, |
|
"num_input_tokens_seen": 2562176, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.16681494661921709, |
|
"grad_norm": 3.891373634338379, |
|
"learning_rate": 4.582962633451958e-05, |
|
"loss": 1.4586, |
|
"num_input_tokens_seen": 2744688, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.17793594306049823, |
|
"grad_norm": 3.277316093444824, |
|
"learning_rate": 4.555160142348754e-05, |
|
"loss": 1.4606, |
|
"num_input_tokens_seen": 2926000, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.18905693950177935, |
|
"grad_norm": 3.242478132247925, |
|
"learning_rate": 4.5273576512455517e-05, |
|
"loss": 1.446, |
|
"num_input_tokens_seen": 3108520, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.2001779359430605, |
|
"grad_norm": 2.3061795234680176, |
|
"learning_rate": 4.499555160142349e-05, |
|
"loss": 1.4348, |
|
"num_input_tokens_seen": 3289352, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.21129893238434164, |
|
"grad_norm": 3.4106180667877197, |
|
"learning_rate": 4.471752669039146e-05, |
|
"loss": 1.4345, |
|
"num_input_tokens_seen": 3473784, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.22241992882562278, |
|
"grad_norm": 2.88779354095459, |
|
"learning_rate": 4.443950177935943e-05, |
|
"loss": 1.4423, |
|
"num_input_tokens_seen": 3655312, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.22241992882562278, |
|
"eval_loss": 1.2430843114852905, |
|
"eval_runtime": 2.8218, |
|
"eval_samples_per_second": 885.962, |
|
"eval_steps_per_second": 110.922, |
|
"num_input_tokens_seen": 3655312, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.23354092526690393, |
|
"grad_norm": 2.829268217086792, |
|
"learning_rate": 4.4161476868327405e-05, |
|
"loss": 1.4321, |
|
"num_input_tokens_seen": 3847328, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.24466192170818504, |
|
"grad_norm": 3.807185173034668, |
|
"learning_rate": 4.388345195729537e-05, |
|
"loss": 1.4487, |
|
"num_input_tokens_seen": 4025576, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.2557829181494662, |
|
"grad_norm": 2.449007511138916, |
|
"learning_rate": 4.360542704626335e-05, |
|
"loss": 1.413, |
|
"num_input_tokens_seen": 4213832, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.2669039145907473, |
|
"grad_norm": 3.363555431365967, |
|
"learning_rate": 4.3327402135231324e-05, |
|
"loss": 1.4313, |
|
"num_input_tokens_seen": 4395352, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.2780249110320285, |
|
"grad_norm": 2.702747344970703, |
|
"learning_rate": 4.3049377224199286e-05, |
|
"loss": 1.4179, |
|
"num_input_tokens_seen": 4582368, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.2891459074733096, |
|
"grad_norm": 3.286044120788574, |
|
"learning_rate": 4.277135231316726e-05, |
|
"loss": 1.4137, |
|
"num_input_tokens_seen": 4770640, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.30026690391459077, |
|
"grad_norm": 2.6410391330718994, |
|
"learning_rate": 4.249332740213524e-05, |
|
"loss": 1.4303, |
|
"num_input_tokens_seen": 4951280, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.3113879003558719, |
|
"grad_norm": 3.239133358001709, |
|
"learning_rate": 4.2215302491103205e-05, |
|
"loss": 1.4185, |
|
"num_input_tokens_seen": 5133552, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.322508896797153, |
|
"grad_norm": 2.943094253540039, |
|
"learning_rate": 4.1937277580071174e-05, |
|
"loss": 1.4065, |
|
"num_input_tokens_seen": 5319992, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.33362989323843417, |
|
"grad_norm": 2.180136203765869, |
|
"learning_rate": 4.165925266903915e-05, |
|
"loss": 1.4096, |
|
"num_input_tokens_seen": 5501592, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.3447508896797153, |
|
"grad_norm": 2.4302403926849365, |
|
"learning_rate": 4.138122775800712e-05, |
|
"loss": 1.4203, |
|
"num_input_tokens_seen": 5689312, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.35587188612099646, |
|
"grad_norm": 2.850964069366455, |
|
"learning_rate": 4.1103202846975093e-05, |
|
"loss": 1.4291, |
|
"num_input_tokens_seen": 5870384, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.3669928825622776, |
|
"grad_norm": 1.9641114473342896, |
|
"learning_rate": 4.082517793594306e-05, |
|
"loss": 1.4054, |
|
"num_input_tokens_seen": 6048624, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.3781138790035587, |
|
"grad_norm": 2.287353754043579, |
|
"learning_rate": 4.054715302491103e-05, |
|
"loss": 1.4118, |
|
"num_input_tokens_seen": 6229728, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.38923487544483987, |
|
"grad_norm": 3.4425182342529297, |
|
"learning_rate": 4.0269128113879006e-05, |
|
"loss": 1.4061, |
|
"num_input_tokens_seen": 6410968, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.400355871886121, |
|
"grad_norm": 2.0604770183563232, |
|
"learning_rate": 3.9991103202846975e-05, |
|
"loss": 1.3871, |
|
"num_input_tokens_seen": 6591312, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.41147686832740216, |
|
"grad_norm": 2.671599864959717, |
|
"learning_rate": 3.971307829181495e-05, |
|
"loss": 1.42, |
|
"num_input_tokens_seen": 6777912, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.4225978647686833, |
|
"grad_norm": 2.176579475402832, |
|
"learning_rate": 3.943505338078292e-05, |
|
"loss": 1.417, |
|
"num_input_tokens_seen": 6964240, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.4337188612099644, |
|
"grad_norm": 3.2378785610198975, |
|
"learning_rate": 3.915702846975089e-05, |
|
"loss": 1.3992, |
|
"num_input_tokens_seen": 7147288, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.44483985765124556, |
|
"grad_norm": 3.0986084938049316, |
|
"learning_rate": 3.887900355871886e-05, |
|
"loss": 1.3884, |
|
"num_input_tokens_seen": 7331520, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.44483985765124556, |
|
"eval_loss": 1.2093411684036255, |
|
"eval_runtime": 2.7984, |
|
"eval_samples_per_second": 893.355, |
|
"eval_steps_per_second": 111.848, |
|
"num_input_tokens_seen": 7331520, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.4559608540925267, |
|
"grad_norm": 3.4998202323913574, |
|
"learning_rate": 3.860097864768684e-05, |
|
"loss": 1.3915, |
|
"num_input_tokens_seen": 7515512, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.46708185053380785, |
|
"grad_norm": 3.0249533653259277, |
|
"learning_rate": 3.832295373665481e-05, |
|
"loss": 1.3967, |
|
"num_input_tokens_seen": 7696752, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.47820284697508897, |
|
"grad_norm": 6.8868513107299805, |
|
"learning_rate": 3.8044928825622775e-05, |
|
"loss": 1.4106, |
|
"num_input_tokens_seen": 7878832, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.4893238434163701, |
|
"grad_norm": 2.2134385108947754, |
|
"learning_rate": 3.776690391459075e-05, |
|
"loss": 1.3847, |
|
"num_input_tokens_seen": 8059592, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.5004448398576512, |
|
"grad_norm": 2.2698676586151123, |
|
"learning_rate": 3.748887900355872e-05, |
|
"loss": 1.3941, |
|
"num_input_tokens_seen": 8245432, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.5115658362989324, |
|
"grad_norm": 2.4593448638916016, |
|
"learning_rate": 3.7210854092526695e-05, |
|
"loss": 1.3716, |
|
"num_input_tokens_seen": 8429400, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.5226868327402135, |
|
"grad_norm": 2.5121207237243652, |
|
"learning_rate": 3.6932829181494664e-05, |
|
"loss": 1.3733, |
|
"num_input_tokens_seen": 8618216, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.5338078291814946, |
|
"grad_norm": 2.2493703365325928, |
|
"learning_rate": 3.665480427046263e-05, |
|
"loss": 1.3754, |
|
"num_input_tokens_seen": 8801640, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.5449288256227758, |
|
"grad_norm": 3.08921217918396, |
|
"learning_rate": 3.637677935943061e-05, |
|
"loss": 1.3694, |
|
"num_input_tokens_seen": 8977800, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.556049822064057, |
|
"grad_norm": 2.215222120285034, |
|
"learning_rate": 3.609875444839858e-05, |
|
"loss": 1.3674, |
|
"num_input_tokens_seen": 9160904, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.5671708185053381, |
|
"grad_norm": 2.553903818130493, |
|
"learning_rate": 3.582072953736655e-05, |
|
"loss": 1.3735, |
|
"num_input_tokens_seen": 9348344, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.5782918149466192, |
|
"grad_norm": 2.546022891998291, |
|
"learning_rate": 3.554270462633452e-05, |
|
"loss": 1.382, |
|
"num_input_tokens_seen": 9532664, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.5894128113879004, |
|
"grad_norm": 2.917534112930298, |
|
"learning_rate": 3.5264679715302496e-05, |
|
"loss": 1.3654, |
|
"num_input_tokens_seen": 9717800, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.6005338078291815, |
|
"grad_norm": 3.355299472808838, |
|
"learning_rate": 3.4986654804270464e-05, |
|
"loss": 1.3876, |
|
"num_input_tokens_seen": 9902536, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.6116548042704626, |
|
"grad_norm": 2.67924427986145, |
|
"learning_rate": 3.470862989323844e-05, |
|
"loss": 1.3575, |
|
"num_input_tokens_seen": 10082216, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.6227758007117438, |
|
"grad_norm": 3.040212392807007, |
|
"learning_rate": 3.44306049822064e-05, |
|
"loss": 1.3644, |
|
"num_input_tokens_seen": 10263992, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.6338967971530249, |
|
"grad_norm": 3.726254940032959, |
|
"learning_rate": 3.415258007117438e-05, |
|
"loss": 1.3616, |
|
"num_input_tokens_seen": 10444128, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.645017793594306, |
|
"grad_norm": 4.716592788696289, |
|
"learning_rate": 3.387455516014235e-05, |
|
"loss": 1.3662, |
|
"num_input_tokens_seen": 10626928, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.6561387900355872, |
|
"grad_norm": 2.9317166805267334, |
|
"learning_rate": 3.359653024911032e-05, |
|
"loss": 1.38, |
|
"num_input_tokens_seen": 10808960, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.6672597864768683, |
|
"grad_norm": 2.411684989929199, |
|
"learning_rate": 3.331850533807829e-05, |
|
"loss": 1.3782, |
|
"num_input_tokens_seen": 10990432, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.6672597864768683, |
|
"eval_loss": 1.185857892036438, |
|
"eval_runtime": 2.9618, |
|
"eval_samples_per_second": 844.079, |
|
"eval_steps_per_second": 105.679, |
|
"num_input_tokens_seen": 10990432, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.6783807829181495, |
|
"grad_norm": 2.765089273452759, |
|
"learning_rate": 3.3040480427046265e-05, |
|
"loss": 1.3808, |
|
"num_input_tokens_seen": 11172032, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.6895017793594306, |
|
"grad_norm": 2.808806896209717, |
|
"learning_rate": 3.2762455516014234e-05, |
|
"loss": 1.3925, |
|
"num_input_tokens_seen": 11356608, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.7006227758007118, |
|
"grad_norm": 2.6583220958709717, |
|
"learning_rate": 3.248443060498221e-05, |
|
"loss": 1.3716, |
|
"num_input_tokens_seen": 11538056, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.7117437722419929, |
|
"grad_norm": 2.2725088596343994, |
|
"learning_rate": 3.2206405693950184e-05, |
|
"loss": 1.3689, |
|
"num_input_tokens_seen": 11721264, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.722864768683274, |
|
"grad_norm": 2.927656412124634, |
|
"learning_rate": 3.1928380782918146e-05, |
|
"loss": 1.3722, |
|
"num_input_tokens_seen": 11896688, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.7339857651245552, |
|
"grad_norm": 2.0601186752319336, |
|
"learning_rate": 3.165035587188612e-05, |
|
"loss": 1.3408, |
|
"num_input_tokens_seen": 12084440, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.7451067615658363, |
|
"grad_norm": 2.5769150257110596, |
|
"learning_rate": 3.13723309608541e-05, |
|
"loss": 1.3874, |
|
"num_input_tokens_seen": 12264224, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.7562277580071174, |
|
"grad_norm": 2.845653772354126, |
|
"learning_rate": 3.1094306049822066e-05, |
|
"loss": 1.3755, |
|
"num_input_tokens_seen": 12446200, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.7673487544483986, |
|
"grad_norm": 2.3848676681518555, |
|
"learning_rate": 3.0816281138790034e-05, |
|
"loss": 1.3463, |
|
"num_input_tokens_seen": 12628992, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.7784697508896797, |
|
"grad_norm": 3.2360849380493164, |
|
"learning_rate": 3.053825622775801e-05, |
|
"loss": 1.3678, |
|
"num_input_tokens_seen": 12809808, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.7895907473309609, |
|
"grad_norm": 2.3211023807525635, |
|
"learning_rate": 3.026023131672598e-05, |
|
"loss": 1.3732, |
|
"num_input_tokens_seen": 12989912, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.800711743772242, |
|
"grad_norm": 3.599958658218384, |
|
"learning_rate": 2.9982206405693954e-05, |
|
"loss": 1.3606, |
|
"num_input_tokens_seen": 13170560, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.8118327402135231, |
|
"grad_norm": 2.0861263275146484, |
|
"learning_rate": 2.9704181494661926e-05, |
|
"loss": 1.3475, |
|
"num_input_tokens_seen": 13350424, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.8229537366548043, |
|
"grad_norm": 2.043938159942627, |
|
"learning_rate": 2.9426156583629895e-05, |
|
"loss": 1.3753, |
|
"num_input_tokens_seen": 13538640, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.8340747330960854, |
|
"grad_norm": 2.4880995750427246, |
|
"learning_rate": 2.9148131672597867e-05, |
|
"loss": 1.3497, |
|
"num_input_tokens_seen": 13730096, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.8451957295373665, |
|
"grad_norm": 2.535860300064087, |
|
"learning_rate": 2.8870106761565835e-05, |
|
"loss": 1.3708, |
|
"num_input_tokens_seen": 13909256, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.8563167259786477, |
|
"grad_norm": 2.499455213546753, |
|
"learning_rate": 2.8592081850533807e-05, |
|
"loss": 1.3546, |
|
"num_input_tokens_seen": 14101536, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.8674377224199288, |
|
"grad_norm": 2.3696117401123047, |
|
"learning_rate": 2.8314056939501783e-05, |
|
"loss": 1.3593, |
|
"num_input_tokens_seen": 14281392, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.87855871886121, |
|
"grad_norm": 3.260430097579956, |
|
"learning_rate": 2.803603202846975e-05, |
|
"loss": 1.3506, |
|
"num_input_tokens_seen": 14466320, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.8896797153024911, |
|
"grad_norm": 4.11997652053833, |
|
"learning_rate": 2.7758007117437723e-05, |
|
"loss": 1.3595, |
|
"num_input_tokens_seen": 14653328, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.8896797153024911, |
|
"eval_loss": 1.1787019968032837, |
|
"eval_runtime": 2.9698, |
|
"eval_samples_per_second": 841.809, |
|
"eval_steps_per_second": 105.394, |
|
"num_input_tokens_seen": 14653328, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.9008007117437722, |
|
"grad_norm": 1.932468056678772, |
|
"learning_rate": 2.7479982206405695e-05, |
|
"loss": 1.3559, |
|
"num_input_tokens_seen": 14837624, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.9119217081850534, |
|
"grad_norm": 2.6026477813720703, |
|
"learning_rate": 2.7201957295373664e-05, |
|
"loss": 1.3454, |
|
"num_input_tokens_seen": 15020192, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.9230427046263345, |
|
"grad_norm": 2.3455870151519775, |
|
"learning_rate": 2.692393238434164e-05, |
|
"loss": 1.3563, |
|
"num_input_tokens_seen": 15204288, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.9341637010676157, |
|
"grad_norm": 2.8757784366607666, |
|
"learning_rate": 2.664590747330961e-05, |
|
"loss": 1.3283, |
|
"num_input_tokens_seen": 15393744, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.9452846975088968, |
|
"grad_norm": 2.3972697257995605, |
|
"learning_rate": 2.636788256227758e-05, |
|
"loss": 1.3612, |
|
"num_input_tokens_seen": 15569824, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.9564056939501779, |
|
"grad_norm": 3.187290906906128, |
|
"learning_rate": 2.6089857651245552e-05, |
|
"loss": 1.3525, |
|
"num_input_tokens_seen": 15753768, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.9675266903914591, |
|
"grad_norm": 2.447659969329834, |
|
"learning_rate": 2.5811832740213527e-05, |
|
"loss": 1.3532, |
|
"num_input_tokens_seen": 15934952, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.9786476868327402, |
|
"grad_norm": 2.037935495376587, |
|
"learning_rate": 2.5533807829181493e-05, |
|
"loss": 1.3318, |
|
"num_input_tokens_seen": 16117896, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.9897686832740213, |
|
"grad_norm": 2.7559268474578857, |
|
"learning_rate": 2.5255782918149468e-05, |
|
"loss": 1.3325, |
|
"num_input_tokens_seen": 16298928, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 1.0008896797153024, |
|
"grad_norm": 2.2017595767974854, |
|
"learning_rate": 2.4977758007117437e-05, |
|
"loss": 1.3605, |
|
"num_input_tokens_seen": 16481072, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.0120106761565837, |
|
"grad_norm": 2.3097991943359375, |
|
"learning_rate": 2.4699733096085412e-05, |
|
"loss": 1.3404, |
|
"num_input_tokens_seen": 16664336, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 1.0231316725978647, |
|
"grad_norm": 2.6227993965148926, |
|
"learning_rate": 2.4421708185053384e-05, |
|
"loss": 1.3235, |
|
"num_input_tokens_seen": 16845800, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 1.0342526690391458, |
|
"grad_norm": 2.24474835395813, |
|
"learning_rate": 2.4143683274021353e-05, |
|
"loss": 1.3376, |
|
"num_input_tokens_seen": 17029008, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 1.045373665480427, |
|
"grad_norm": 2.7171192169189453, |
|
"learning_rate": 2.3865658362989325e-05, |
|
"loss": 1.3188, |
|
"num_input_tokens_seen": 17215936, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 1.0564946619217082, |
|
"grad_norm": 2.5323736667633057, |
|
"learning_rate": 2.3587633451957297e-05, |
|
"loss": 1.3369, |
|
"num_input_tokens_seen": 17401232, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 1.0676156583629894, |
|
"grad_norm": 2.267789363861084, |
|
"learning_rate": 2.330960854092527e-05, |
|
"loss": 1.3215, |
|
"num_input_tokens_seen": 17583344, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.0787366548042705, |
|
"grad_norm": 3.399862289428711, |
|
"learning_rate": 2.3031583629893237e-05, |
|
"loss": 1.3464, |
|
"num_input_tokens_seen": 17771600, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 1.0898576512455516, |
|
"grad_norm": 2.8749985694885254, |
|
"learning_rate": 2.2753558718861213e-05, |
|
"loss": 1.2928, |
|
"num_input_tokens_seen": 17957200, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 1.1009786476868326, |
|
"grad_norm": 2.6826517581939697, |
|
"learning_rate": 2.247553380782918e-05, |
|
"loss": 1.3191, |
|
"num_input_tokens_seen": 18138560, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 1.112099644128114, |
|
"grad_norm": 2.2963333129882812, |
|
"learning_rate": 2.2197508896797153e-05, |
|
"loss": 1.3059, |
|
"num_input_tokens_seen": 18326104, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 1.112099644128114, |
|
"eval_loss": 1.1665468215942383, |
|
"eval_runtime": 3.0497, |
|
"eval_samples_per_second": 819.748, |
|
"eval_steps_per_second": 102.632, |
|
"num_input_tokens_seen": 18326104, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 1.123220640569395, |
|
"grad_norm": 2.2697386741638184, |
|
"learning_rate": 2.1919483985765125e-05, |
|
"loss": 1.321, |
|
"num_input_tokens_seen": 18508560, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 1.1343416370106763, |
|
"grad_norm": 2.436851739883423, |
|
"learning_rate": 2.1641459074733097e-05, |
|
"loss": 1.3528, |
|
"num_input_tokens_seen": 18689264, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 1.1454626334519573, |
|
"grad_norm": 2.297527313232422, |
|
"learning_rate": 2.136343416370107e-05, |
|
"loss": 1.2987, |
|
"num_input_tokens_seen": 18874328, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 1.1565836298932384, |
|
"grad_norm": 2.5088889598846436, |
|
"learning_rate": 2.1085409252669038e-05, |
|
"loss": 1.3139, |
|
"num_input_tokens_seen": 19060472, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 1.1677046263345197, |
|
"grad_norm": 2.067575454711914, |
|
"learning_rate": 2.0807384341637014e-05, |
|
"loss": 1.2961, |
|
"num_input_tokens_seen": 19247416, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 1.1788256227758007, |
|
"grad_norm": 2.467543363571167, |
|
"learning_rate": 2.0529359430604982e-05, |
|
"loss": 1.3279, |
|
"num_input_tokens_seen": 19436888, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 1.1899466192170818, |
|
"grad_norm": 3.4245800971984863, |
|
"learning_rate": 2.0251334519572954e-05, |
|
"loss": 1.3303, |
|
"num_input_tokens_seen": 19616320, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 1.201067615658363, |
|
"grad_norm": 2.759120464324951, |
|
"learning_rate": 1.9973309608540926e-05, |
|
"loss": 1.3201, |
|
"num_input_tokens_seen": 19793576, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 1.2121886120996441, |
|
"grad_norm": 2.7749531269073486, |
|
"learning_rate": 1.9695284697508898e-05, |
|
"loss": 1.3194, |
|
"num_input_tokens_seen": 19980880, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 1.2233096085409252, |
|
"grad_norm": 2.4467661380767822, |
|
"learning_rate": 1.9417259786476867e-05, |
|
"loss": 1.3293, |
|
"num_input_tokens_seen": 20163048, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 1.2344306049822065, |
|
"grad_norm": 3.4420840740203857, |
|
"learning_rate": 1.9139234875444842e-05, |
|
"loss": 1.3015, |
|
"num_input_tokens_seen": 20340904, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 1.2455516014234875, |
|
"grad_norm": 2.4761664867401123, |
|
"learning_rate": 1.8861209964412814e-05, |
|
"loss": 1.3292, |
|
"num_input_tokens_seen": 20524328, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 1.2566725978647688, |
|
"grad_norm": 3.0505285263061523, |
|
"learning_rate": 1.8583185053380783e-05, |
|
"loss": 1.317, |
|
"num_input_tokens_seen": 20707808, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 1.2677935943060499, |
|
"grad_norm": 2.361429214477539, |
|
"learning_rate": 1.8305160142348755e-05, |
|
"loss": 1.3176, |
|
"num_input_tokens_seen": 20892744, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 1.278914590747331, |
|
"grad_norm": 1.9151511192321777, |
|
"learning_rate": 1.8027135231316727e-05, |
|
"loss": 1.3267, |
|
"num_input_tokens_seen": 21072216, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 1.290035587188612, |
|
"grad_norm": 3.0513691902160645, |
|
"learning_rate": 1.77491103202847e-05, |
|
"loss": 1.2948, |
|
"num_input_tokens_seen": 21256064, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 1.3011565836298933, |
|
"grad_norm": 1.7151504755020142, |
|
"learning_rate": 1.7471085409252668e-05, |
|
"loss": 1.2967, |
|
"num_input_tokens_seen": 21439648, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 1.3122775800711743, |
|
"grad_norm": 2.3011133670806885, |
|
"learning_rate": 1.7193060498220643e-05, |
|
"loss": 1.3199, |
|
"num_input_tokens_seen": 21621776, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 1.3233985765124556, |
|
"grad_norm": 2.1768672466278076, |
|
"learning_rate": 1.691503558718861e-05, |
|
"loss": 1.3233, |
|
"num_input_tokens_seen": 21810728, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 1.3345195729537367, |
|
"grad_norm": 3.249089241027832, |
|
"learning_rate": 1.6637010676156584e-05, |
|
"loss": 1.3298, |
|
"num_input_tokens_seen": 21991016, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 1.3345195729537367, |
|
"eval_loss": 1.1589475870132446, |
|
"eval_runtime": 3.0361, |
|
"eval_samples_per_second": 823.416, |
|
"eval_steps_per_second": 103.092, |
|
"num_input_tokens_seen": 21991016, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 1.3456405693950177, |
|
"grad_norm": 1.7282594442367554, |
|
"learning_rate": 1.6358985765124556e-05, |
|
"loss": 1.325, |
|
"num_input_tokens_seen": 22178168, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 1.3567615658362988, |
|
"grad_norm": 2.2659966945648193, |
|
"learning_rate": 1.6080960854092528e-05, |
|
"loss": 1.3321, |
|
"num_input_tokens_seen": 22359360, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 1.36788256227758, |
|
"grad_norm": 2.155791759490967, |
|
"learning_rate": 1.58029359430605e-05, |
|
"loss": 1.2995, |
|
"num_input_tokens_seen": 22543952, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 1.3790035587188612, |
|
"grad_norm": 2.902367353439331, |
|
"learning_rate": 1.5524911032028472e-05, |
|
"loss": 1.3061, |
|
"num_input_tokens_seen": 22728064, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 1.3901245551601424, |
|
"grad_norm": 2.168686866760254, |
|
"learning_rate": 1.5246886120996442e-05, |
|
"loss": 1.3231, |
|
"num_input_tokens_seen": 22913608, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 1.4012455516014235, |
|
"grad_norm": 2.682150363922119, |
|
"learning_rate": 1.4968861209964412e-05, |
|
"loss": 1.3091, |
|
"num_input_tokens_seen": 23101416, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 1.4123665480427046, |
|
"grad_norm": 2.82143497467041, |
|
"learning_rate": 1.4690836298932384e-05, |
|
"loss": 1.2922, |
|
"num_input_tokens_seen": 23281200, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 1.4234875444839858, |
|
"grad_norm": 2.1155049800872803, |
|
"learning_rate": 1.4412811387900358e-05, |
|
"loss": 1.3015, |
|
"num_input_tokens_seen": 23462552, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 1.434608540925267, |
|
"grad_norm": 2.6742069721221924, |
|
"learning_rate": 1.4134786476868328e-05, |
|
"loss": 1.3108, |
|
"num_input_tokens_seen": 23640424, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 1.445729537366548, |
|
"grad_norm": 2.575198173522949, |
|
"learning_rate": 1.3856761565836299e-05, |
|
"loss": 1.2987, |
|
"num_input_tokens_seen": 23825392, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 1.4568505338078293, |
|
"grad_norm": 3.2627415657043457, |
|
"learning_rate": 1.3578736654804272e-05, |
|
"loss": 1.3096, |
|
"num_input_tokens_seen": 24004872, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 1.4679715302491103, |
|
"grad_norm": 3.2727510929107666, |
|
"learning_rate": 1.3300711743772243e-05, |
|
"loss": 1.3108, |
|
"num_input_tokens_seen": 24186064, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 1.4790925266903914, |
|
"grad_norm": 2.187281608581543, |
|
"learning_rate": 1.3022686832740213e-05, |
|
"loss": 1.3239, |
|
"num_input_tokens_seen": 24367968, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 1.4902135231316727, |
|
"grad_norm": 3.5734856128692627, |
|
"learning_rate": 1.2744661921708187e-05, |
|
"loss": 1.3241, |
|
"num_input_tokens_seen": 24551184, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 1.5013345195729537, |
|
"grad_norm": 2.756578207015991, |
|
"learning_rate": 1.2466637010676157e-05, |
|
"loss": 1.307, |
|
"num_input_tokens_seen": 24730288, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 1.512455516014235, |
|
"grad_norm": 2.3752739429473877, |
|
"learning_rate": 1.2188612099644127e-05, |
|
"loss": 1.2963, |
|
"num_input_tokens_seen": 24914816, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 1.523576512455516, |
|
"grad_norm": 3.7108139991760254, |
|
"learning_rate": 1.1910587188612101e-05, |
|
"loss": 1.3126, |
|
"num_input_tokens_seen": 25100648, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 1.5346975088967971, |
|
"grad_norm": 2.5954089164733887, |
|
"learning_rate": 1.1632562277580072e-05, |
|
"loss": 1.3053, |
|
"num_input_tokens_seen": 25282312, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 1.5458185053380782, |
|
"grad_norm": 3.0645289421081543, |
|
"learning_rate": 1.1354537366548044e-05, |
|
"loss": 1.2976, |
|
"num_input_tokens_seen": 25465504, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 1.5569395017793595, |
|
"grad_norm": 2.3734166622161865, |
|
"learning_rate": 1.1076512455516016e-05, |
|
"loss": 1.2994, |
|
"num_input_tokens_seen": 25656600, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 1.5569395017793595, |
|
"eval_loss": 1.1562061309814453, |
|
"eval_runtime": 2.9007, |
|
"eval_samples_per_second": 861.851, |
|
"eval_steps_per_second": 107.904, |
|
"num_input_tokens_seen": 25656600, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 1.5680604982206405, |
|
"grad_norm": 2.8284573554992676, |
|
"learning_rate": 1.0798487544483986e-05, |
|
"loss": 1.3125, |
|
"num_input_tokens_seen": 25844216, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 1.5791814946619218, |
|
"grad_norm": 1.8697574138641357, |
|
"learning_rate": 1.0520462633451958e-05, |
|
"loss": 1.3247, |
|
"num_input_tokens_seen": 26026824, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 1.5903024911032029, |
|
"grad_norm": 2.4746646881103516, |
|
"learning_rate": 1.024243772241993e-05, |
|
"loss": 1.3295, |
|
"num_input_tokens_seen": 26211760, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 1.601423487544484, |
|
"grad_norm": 2.415778398513794, |
|
"learning_rate": 9.9644128113879e-06, |
|
"loss": 1.3191, |
|
"num_input_tokens_seen": 26393624, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 1.612544483985765, |
|
"grad_norm": 2.263882875442505, |
|
"learning_rate": 9.686387900355872e-06, |
|
"loss": 1.3136, |
|
"num_input_tokens_seen": 26576872, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 1.6236654804270463, |
|
"grad_norm": 2.436645269393921, |
|
"learning_rate": 9.408362989323843e-06, |
|
"loss": 1.286, |
|
"num_input_tokens_seen": 26763752, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 1.6347864768683276, |
|
"grad_norm": 2.739278793334961, |
|
"learning_rate": 9.130338078291816e-06, |
|
"loss": 1.2712, |
|
"num_input_tokens_seen": 26943856, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 1.6459074733096086, |
|
"grad_norm": 2.406345844268799, |
|
"learning_rate": 8.852313167259788e-06, |
|
"loss": 1.305, |
|
"num_input_tokens_seen": 27122192, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 1.6570284697508897, |
|
"grad_norm": 2.1659858226776123, |
|
"learning_rate": 8.574288256227759e-06, |
|
"loss": 1.312, |
|
"num_input_tokens_seen": 27305912, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 1.6681494661921707, |
|
"grad_norm": 2.7831106185913086, |
|
"learning_rate": 8.29626334519573e-06, |
|
"loss": 1.2997, |
|
"num_input_tokens_seen": 27485504, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 1.6792704626334518, |
|
"grad_norm": 2.9885916709899902, |
|
"learning_rate": 8.018238434163701e-06, |
|
"loss": 1.3187, |
|
"num_input_tokens_seen": 27665584, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 1.690391459074733, |
|
"grad_norm": 2.5562667846679688, |
|
"learning_rate": 7.740213523131673e-06, |
|
"loss": 1.3142, |
|
"num_input_tokens_seen": 27851448, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 1.7015124555160144, |
|
"grad_norm": 2.9897525310516357, |
|
"learning_rate": 7.462188612099645e-06, |
|
"loss": 1.3071, |
|
"num_input_tokens_seen": 28033824, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 1.7126334519572954, |
|
"grad_norm": 2.6760592460632324, |
|
"learning_rate": 7.184163701067615e-06, |
|
"loss": 1.3049, |
|
"num_input_tokens_seen": 28220000, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 1.7237544483985765, |
|
"grad_norm": 2.314532995223999, |
|
"learning_rate": 6.906138790035588e-06, |
|
"loss": 1.3124, |
|
"num_input_tokens_seen": 28403400, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 1.7348754448398576, |
|
"grad_norm": 2.2853899002075195, |
|
"learning_rate": 6.6281138790035586e-06, |
|
"loss": 1.3188, |
|
"num_input_tokens_seen": 28585168, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 1.7459964412811388, |
|
"grad_norm": 2.462369918823242, |
|
"learning_rate": 6.3500889679715306e-06, |
|
"loss": 1.3146, |
|
"num_input_tokens_seen": 28767960, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 1.75711743772242, |
|
"grad_norm": 2.626847505569458, |
|
"learning_rate": 6.072064056939502e-06, |
|
"loss": 1.3082, |
|
"num_input_tokens_seen": 28950344, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 1.7682384341637012, |
|
"grad_norm": 3.058187484741211, |
|
"learning_rate": 5.794039145907473e-06, |
|
"loss": 1.301, |
|
"num_input_tokens_seen": 29133248, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 1.7793594306049823, |
|
"grad_norm": 1.932924747467041, |
|
"learning_rate": 5.516014234875446e-06, |
|
"loss": 1.2952, |
|
"num_input_tokens_seen": 29314808, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 1.7793594306049823, |
|
"eval_loss": 1.1517876386642456, |
|
"eval_runtime": 3.0679, |
|
"eval_samples_per_second": 814.88, |
|
"eval_steps_per_second": 102.023, |
|
"num_input_tokens_seen": 29314808, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 1.7904804270462633, |
|
"grad_norm": 3.326586961746216, |
|
"learning_rate": 5.237989323843417e-06, |
|
"loss": 1.3161, |
|
"num_input_tokens_seen": 29504040, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 1.8016014234875444, |
|
"grad_norm": 3.139636754989624, |
|
"learning_rate": 4.959964412811388e-06, |
|
"loss": 1.3, |
|
"num_input_tokens_seen": 29686712, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 1.8127224199288257, |
|
"grad_norm": 2.5767743587493896, |
|
"learning_rate": 4.681939501779359e-06, |
|
"loss": 1.2934, |
|
"num_input_tokens_seen": 29868136, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 1.8238434163701067, |
|
"grad_norm": 2.1930339336395264, |
|
"learning_rate": 4.4039145907473305e-06, |
|
"loss": 1.3104, |
|
"num_input_tokens_seen": 30050016, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 1.834964412811388, |
|
"grad_norm": 2.9890389442443848, |
|
"learning_rate": 4.125889679715303e-06, |
|
"loss": 1.303, |
|
"num_input_tokens_seen": 30234304, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 1.846085409252669, |
|
"grad_norm": 2.9597034454345703, |
|
"learning_rate": 3.8478647686832745e-06, |
|
"loss": 1.3024, |
|
"num_input_tokens_seen": 30419248, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 1.8572064056939501, |
|
"grad_norm": 3.17301082611084, |
|
"learning_rate": 3.5698398576512457e-06, |
|
"loss": 1.2926, |
|
"num_input_tokens_seen": 30601776, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 1.8683274021352312, |
|
"grad_norm": 2.677340269088745, |
|
"learning_rate": 3.291814946619217e-06, |
|
"loss": 1.3184, |
|
"num_input_tokens_seen": 30783800, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 1.8794483985765125, |
|
"grad_norm": 3.5369062423706055, |
|
"learning_rate": 3.013790035587189e-06, |
|
"loss": 1.2858, |
|
"num_input_tokens_seen": 30968472, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 1.8905693950177938, |
|
"grad_norm": 2.870908737182617, |
|
"learning_rate": 2.73576512455516e-06, |
|
"loss": 1.3107, |
|
"num_input_tokens_seen": 31147072, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 1.9016903914590748, |
|
"grad_norm": 2.1105244159698486, |
|
"learning_rate": 2.457740213523132e-06, |
|
"loss": 1.2797, |
|
"num_input_tokens_seen": 31330536, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 1.9128113879003559, |
|
"grad_norm": 4.080565452575684, |
|
"learning_rate": 2.1797153024911032e-06, |
|
"loss": 1.2991, |
|
"num_input_tokens_seen": 31507752, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 1.923932384341637, |
|
"grad_norm": 3.036339044570923, |
|
"learning_rate": 1.901690391459075e-06, |
|
"loss": 1.2884, |
|
"num_input_tokens_seen": 31685504, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 1.935053380782918, |
|
"grad_norm": 2.3314597606658936, |
|
"learning_rate": 1.6236654804270462e-06, |
|
"loss": 1.2935, |
|
"num_input_tokens_seen": 31865648, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 1.9461743772241993, |
|
"grad_norm": 2.4721710681915283, |
|
"learning_rate": 1.3456405693950178e-06, |
|
"loss": 1.3124, |
|
"num_input_tokens_seen": 32046704, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 1.9572953736654806, |
|
"grad_norm": 2.4747238159179688, |
|
"learning_rate": 1.0676156583629894e-06, |
|
"loss": 1.2941, |
|
"num_input_tokens_seen": 32235800, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 1.9684163701067616, |
|
"grad_norm": 2.4421122074127197, |
|
"learning_rate": 7.89590747330961e-07, |
|
"loss": 1.3187, |
|
"num_input_tokens_seen": 32415672, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 1.9795373665480427, |
|
"grad_norm": 3.469534158706665, |
|
"learning_rate": 5.115658362989324e-07, |
|
"loss": 1.2897, |
|
"num_input_tokens_seen": 32597136, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 1.9906583629893237, |
|
"grad_norm": 3.1344103813171387, |
|
"learning_rate": 2.335409252669039e-07, |
|
"loss": 1.2902, |
|
"num_input_tokens_seen": 32782688, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"num_input_tokens_seen": 32939232, |
|
"step": 89920, |
|
"total_flos": 1.1959161056722944e+16, |
|
"train_loss": 1.358397777055082, |
|
"train_runtime": 3698.2072, |
|
"train_samples_per_second": 194.516, |
|
"train_steps_per_second": 24.314, |
|
"train_tokens_per_second": 8905.5 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 89920, |
|
"num_input_tokens_seen": 32939232, |
|
"num_train_epochs": 2, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1959161056722944e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|