{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.04294742008712191,
  "eval_steps": 20,
  "global_step": 700,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0012270691453463403,
      "grad_norm": 0.05003070831298828,
      "learning_rate": 0.00019981588314717073,
      "loss": 2.6972,
      "step": 20
    },
    {
      "epoch": 0.0012270691453463403,
      "eval_loss": 2.2967841625213623,
      "eval_runtime": 23.2641,
      "eval_samples_per_second": 4.298,
      "eval_steps_per_second": 0.559,
      "step": 20
    },
    {
      "epoch": 0.0024541382906926807,
      "grad_norm": 0.07180789858102798,
      "learning_rate": 0.00019957039401006504,
      "loss": 2.2022,
      "step": 40
    },
    {
      "epoch": 0.0024541382906926807,
      "eval_loss": 2.068006992340088,
      "eval_runtime": 23.5719,
      "eval_samples_per_second": 4.242,
      "eval_steps_per_second": 0.552,
      "step": 40
    },
    {
      "epoch": 0.003681207436039021,
      "grad_norm": 0.08049603551626205,
      "learning_rate": 0.00019932490487295938,
      "loss": 2.0529,
      "step": 60
    },
    {
      "epoch": 0.003681207436039021,
      "eval_loss": 1.9338455200195312,
      "eval_runtime": 23.1495,
      "eval_samples_per_second": 4.32,
      "eval_steps_per_second": 0.562,
      "step": 60
    },
    {
      "epoch": 0.004908276581385361,
      "grad_norm": 0.08653070032596588,
      "learning_rate": 0.00019907941573585368,
      "loss": 1.9395,
      "step": 80
    },
    {
      "epoch": 0.004908276581385361,
      "eval_loss": 1.8689138889312744,
      "eval_runtime": 23.7275,
      "eval_samples_per_second": 4.215,
      "eval_steps_per_second": 0.548,
      "step": 80
    },
    {
      "epoch": 0.006135345726731701,
      "grad_norm": 0.08481493592262268,
      "learning_rate": 0.00019883392659874802,
      "loss": 1.8773,
      "step": 100
    },
    {
      "epoch": 0.006135345726731701,
      "eval_loss": 1.8180441856384277,
      "eval_runtime": 23.5705,
      "eval_samples_per_second": 4.243,
      "eval_steps_per_second": 0.552,
      "step": 100
    },
    {
      "epoch": 0.007362414872078042,
      "grad_norm": 0.11568216979503632,
      "learning_rate": 0.00019858843746164233,
      "loss": 1.7827,
      "step": 120
    },
    {
      "epoch": 0.007362414872078042,
      "eval_loss": 1.774067997932434,
      "eval_runtime": 23.9722,
      "eval_samples_per_second": 4.172,
      "eval_steps_per_second": 0.542,
      "step": 120
    },
    {
      "epoch": 0.008589484017424381,
      "grad_norm": 0.10869361460208893,
      "learning_rate": 0.00019834294832453666,
      "loss": 1.812,
      "step": 140
    },
    {
      "epoch": 0.008589484017424381,
      "eval_loss": 1.737804889678955,
      "eval_runtime": 23.519,
      "eval_samples_per_second": 4.252,
      "eval_steps_per_second": 0.553,
      "step": 140
    },
    {
      "epoch": 0.009816553162770723,
      "grad_norm": 0.0976206362247467,
      "learning_rate": 0.00019809745918743097,
      "loss": 1.74,
      "step": 160
    },
    {
      "epoch": 0.009816553162770723,
      "eval_loss": 1.700899600982666,
      "eval_runtime": 23.2347,
      "eval_samples_per_second": 4.304,
      "eval_steps_per_second": 0.56,
      "step": 160
    },
    {
      "epoch": 0.011043622308117063,
      "grad_norm": 0.1123971939086914,
      "learning_rate": 0.00019785197005032528,
      "loss": 1.787,
      "step": 180
    },
    {
      "epoch": 0.011043622308117063,
      "eval_loss": 1.6765294075012207,
      "eval_runtime": 23.6403,
      "eval_samples_per_second": 4.23,
      "eval_steps_per_second": 0.55,
      "step": 180
    },
    {
      "epoch": 0.012270691453463402,
      "grad_norm": 0.10320968925952911,
      "learning_rate": 0.0001976064809132196,
      "loss": 1.7804,
      "step": 200
    },
    {
      "epoch": 0.012270691453463402,
      "eval_loss": 1.6563650369644165,
      "eval_runtime": 23.6381,
      "eval_samples_per_second": 4.23,
      "eval_steps_per_second": 0.55,
      "step": 200
    },
    {
      "epoch": 0.013497760598809742,
      "grad_norm": 0.14491896331310272,
      "learning_rate": 0.00019736099177611392,
      "loss": 1.7043,
      "step": 220
    },
    {
      "epoch": 0.013497760598809742,
      "eval_loss": 1.6346065998077393,
      "eval_runtime": 23.7121,
      "eval_samples_per_second": 4.217,
      "eval_steps_per_second": 0.548,
      "step": 220
    },
    {
      "epoch": 0.014724829744156084,
      "grad_norm": 0.12502990663051605,
      "learning_rate": 0.00019711550263900825,
      "loss": 1.7345,
      "step": 240
    },
    {
      "epoch": 0.014724829744156084,
      "eval_loss": 1.6147732734680176,
      "eval_runtime": 23.5936,
      "eval_samples_per_second": 4.238,
      "eval_steps_per_second": 0.551,
      "step": 240
    },
    {
      "epoch": 0.015951898889502422,
      "grad_norm": 0.1230228915810585,
      "learning_rate": 0.00019687001350190256,
      "loss": 1.7338,
      "step": 260
    },
    {
      "epoch": 0.015951898889502422,
      "eval_loss": 1.5957908630371094,
      "eval_runtime": 23.389,
      "eval_samples_per_second": 4.276,
      "eval_steps_per_second": 0.556,
      "step": 260
    },
    {
      "epoch": 0.017178968034848762,
      "grad_norm": 0.12000931799411774,
      "learning_rate": 0.00019662452436479687,
      "loss": 1.7143,
      "step": 280
    },
    {
      "epoch": 0.017178968034848762,
      "eval_loss": 1.585697889328003,
      "eval_runtime": 23.566,
      "eval_samples_per_second": 4.243,
      "eval_steps_per_second": 0.552,
      "step": 280
    },
    {
      "epoch": 0.018406037180195105,
      "grad_norm": 0.1442350149154663,
      "learning_rate": 0.00019637903522769118,
      "loss": 1.6406,
      "step": 300
    },
    {
      "epoch": 0.018406037180195105,
      "eval_loss": 1.5710804462432861,
      "eval_runtime": 23.5083,
      "eval_samples_per_second": 4.254,
      "eval_steps_per_second": 0.553,
      "step": 300
    },
    {
      "epoch": 0.019633106325541445,
      "grad_norm": 0.09555982798337936,
      "learning_rate": 0.00019613354609058549,
      "loss": 1.6213,
      "step": 320
    },
    {
      "epoch": 0.019633106325541445,
      "eval_loss": 1.5556869506835938,
      "eval_runtime": 23.5239,
      "eval_samples_per_second": 4.251,
      "eval_steps_per_second": 0.553,
      "step": 320
    },
    {
      "epoch": 0.020860175470887785,
      "grad_norm": 0.13320715725421906,
      "learning_rate": 0.00019588805695347982,
      "loss": 1.6956,
      "step": 340
    },
    {
      "epoch": 0.020860175470887785,
      "eval_loss": 1.5424914360046387,
      "eval_runtime": 23.6064,
      "eval_samples_per_second": 4.236,
      "eval_steps_per_second": 0.551,
      "step": 340
    },
    {
      "epoch": 0.022087244616234125,
      "grad_norm": 0.12061001360416412,
      "learning_rate": 0.00019564256781637413,
      "loss": 1.6589,
      "step": 360
    },
    {
      "epoch": 0.022087244616234125,
      "eval_loss": 1.528477430343628,
      "eval_runtime": 23.6796,
      "eval_samples_per_second": 4.223,
      "eval_steps_per_second": 0.549,
      "step": 360
    },
    {
      "epoch": 0.023314313761580465,
      "grad_norm": 0.14327766001224518,
      "learning_rate": 0.00019539707867926844,
      "loss": 1.5946,
      "step": 380
    },
    {
      "epoch": 0.023314313761580465,
      "eval_loss": 1.52202570438385,
      "eval_runtime": 23.6756,
      "eval_samples_per_second": 4.224,
      "eval_steps_per_second": 0.549,
      "step": 380
    },
    {
      "epoch": 0.024541382906926805,
      "grad_norm": 0.12291988730430603,
      "learning_rate": 0.00019515158954216277,
      "loss": 1.5366,
      "step": 400
    },
    {
      "epoch": 0.024541382906926805,
      "eval_loss": 1.507960319519043,
      "eval_runtime": 23.6216,
      "eval_samples_per_second": 4.233,
      "eval_steps_per_second": 0.55,
      "step": 400
    },
    {
      "epoch": 0.025768452052273145,
      "grad_norm": 0.15288175642490387,
      "learning_rate": 0.00019490610040505708,
      "loss": 1.5829,
      "step": 420
    },
    {
      "epoch": 0.025768452052273145,
      "eval_loss": 1.4994325637817383,
      "eval_runtime": 23.6368,
      "eval_samples_per_second": 4.231,
      "eval_steps_per_second": 0.55,
      "step": 420
    },
    {
      "epoch": 0.026995521197619485,
      "grad_norm": 0.13319191336631775,
      "learning_rate": 0.0001946606112679514,
      "loss": 1.5523,
      "step": 440
    },
    {
      "epoch": 0.026995521197619485,
      "eval_loss": 1.4956778287887573,
      "eval_runtime": 23.6921,
      "eval_samples_per_second": 4.221,
      "eval_steps_per_second": 0.549,
      "step": 440
    },
    {
      "epoch": 0.028222590342965825,
      "grad_norm": 0.14759239554405212,
      "learning_rate": 0.00019441512213084572,
      "loss": 1.5735,
      "step": 460
    },
    {
      "epoch": 0.028222590342965825,
      "eval_loss": 1.486402988433838,
      "eval_runtime": 23.2911,
      "eval_samples_per_second": 4.293,
      "eval_steps_per_second": 0.558,
      "step": 460
    },
    {
      "epoch": 0.029449659488312168,
      "grad_norm": 0.11428073793649673,
      "learning_rate": 0.00019416963299374006,
      "loss": 1.5788,
      "step": 480
    },
    {
      "epoch": 0.029449659488312168,
      "eval_loss": 1.4712104797363281,
      "eval_runtime": 23.4851,
      "eval_samples_per_second": 4.258,
      "eval_steps_per_second": 0.554,
      "step": 480
    },
    {
      "epoch": 0.030676728633658508,
      "grad_norm": 0.11649870127439499,
      "learning_rate": 0.00019392414385663436,
      "loss": 1.5667,
      "step": 500
    },
    {
      "epoch": 0.030676728633658508,
      "eval_loss": 1.4620152711868286,
      "eval_runtime": 23.5455,
      "eval_samples_per_second": 4.247,
      "eval_steps_per_second": 0.552,
      "step": 500
    },
    {
      "epoch": 0.031903797779004844,
      "grad_norm": 0.16019868850708008,
      "learning_rate": 0.00019367865471952867,
      "loss": 1.4778,
      "step": 520
    },
    {
      "epoch": 0.031903797779004844,
      "eval_loss": 1.4597880840301514,
      "eval_runtime": 23.624,
      "eval_samples_per_second": 4.233,
      "eval_steps_per_second": 0.55,
      "step": 520
    },
    {
      "epoch": 0.03313086692435119,
      "grad_norm": 0.1370091289281845,
      "learning_rate": 0.00019343316558242298,
      "loss": 1.5531,
      "step": 540
    },
    {
      "epoch": 0.03313086692435119,
      "eval_loss": 1.443243384361267,
      "eval_runtime": 23.5537,
      "eval_samples_per_second": 4.246,
      "eval_steps_per_second": 0.552,
      "step": 540
    },
    {
      "epoch": 0.034357936069697524,
      "grad_norm": 0.1211417093873024,
      "learning_rate": 0.0001931876764453173,
      "loss": 1.5879,
      "step": 560
    },
    {
      "epoch": 0.034357936069697524,
      "eval_loss": 1.4466437101364136,
      "eval_runtime": 23.8508,
      "eval_samples_per_second": 4.193,
      "eval_steps_per_second": 0.545,
      "step": 560
    },
    {
      "epoch": 0.03558500521504387,
      "grad_norm": 0.14397528767585754,
      "learning_rate": 0.00019294218730821162,
      "loss": 1.5352,
      "step": 580
    },
    {
      "epoch": 0.03558500521504387,
      "eval_loss": 1.4339115619659424,
      "eval_runtime": 23.649,
      "eval_samples_per_second": 4.229,
      "eval_steps_per_second": 0.55,
      "step": 580
    },
    {
      "epoch": 0.03681207436039021,
      "grad_norm": 0.12468410283327103,
      "learning_rate": 0.00019269669817110593,
      "loss": 1.5045,
      "step": 600
    },
    {
      "epoch": 0.03681207436039021,
      "eval_loss": 1.4277862310409546,
      "eval_runtime": 23.647,
      "eval_samples_per_second": 4.229,
      "eval_steps_per_second": 0.55,
      "step": 600
    },
    {
      "epoch": 0.03803914350573655,
      "grad_norm": 0.1577584445476532,
      "learning_rate": 0.00019245120903400024,
      "loss": 1.5497,
      "step": 620
    },
    {
      "epoch": 0.03803914350573655,
      "eval_loss": 1.4203659296035767,
      "eval_runtime": 23.8622,
      "eval_samples_per_second": 4.191,
      "eval_steps_per_second": 0.545,
      "step": 620
    },
    {
      "epoch": 0.03926621265108289,
      "grad_norm": 0.12410438805818558,
      "learning_rate": 0.00019220571989689457,
      "loss": 1.503,
      "step": 640
    },
    {
      "epoch": 0.03926621265108289,
      "eval_loss": 1.4154139757156372,
      "eval_runtime": 23.4706,
      "eval_samples_per_second": 4.261,
      "eval_steps_per_second": 0.554,
      "step": 640
    },
    {
      "epoch": 0.04049328179642923,
      "grad_norm": 0.13563913106918335,
      "learning_rate": 0.00019196023075978888,
      "loss": 1.4851,
      "step": 660
    },
    {
      "epoch": 0.04049328179642923,
      "eval_loss": 1.414802074432373,
      "eval_runtime": 23.3961,
      "eval_samples_per_second": 4.274,
      "eval_steps_per_second": 0.556,
      "step": 660
    },
    {
      "epoch": 0.04172035094177557,
      "grad_norm": 0.13915061950683594,
      "learning_rate": 0.00019171474162268321,
      "loss": 1.4847,
      "step": 680
    },
    {
      "epoch": 0.04172035094177557,
      "eval_loss": 1.4029760360717773,
      "eval_runtime": 23.6066,
      "eval_samples_per_second": 4.236,
      "eval_steps_per_second": 0.551,
      "step": 680
    },
    {
      "epoch": 0.04294742008712191,
      "grad_norm": 0.14418162405490875,
      "learning_rate": 0.00019146925248557752,
      "loss": 1.4724,
      "step": 700
    },
    {
      "epoch": 0.04294742008712191,
      "eval_loss": 1.4029196500778198,
      "eval_runtime": 23.5244,
      "eval_samples_per_second": 4.251,
      "eval_steps_per_second": 0.553,
      "step": 700
    }
  ],
  "logging_steps": 20,
  "max_steps": 16299,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "total_flos": 5.08603168290816e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}