|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9984, |
|
"eval_steps": 500, |
|
"global_step": 52, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0192, |
|
"grad_norm": 2.5694146156311035, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 0.5167, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0384, |
|
"grad_norm": 2.4537222385406494, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.5203, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0576, |
|
"grad_norm": 2.321495532989502, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4748, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0768, |
|
"grad_norm": 1.8380646705627441, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.5253, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 1.4723880290985107, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 0.4606, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.1152, |
|
"grad_norm": 1.1131113767623901, |
|
"learning_rate": 1e-05, |
|
"loss": 0.4019, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.1344, |
|
"grad_norm": 0.9251970648765564, |
|
"learning_rate": 9.988343845952697e-06, |
|
"loss": 0.4271, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.1536, |
|
"grad_norm": 1.1454741954803467, |
|
"learning_rate": 9.953429730181653e-06, |
|
"loss": 0.3827, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.1728, |
|
"grad_norm": 0.9471750259399414, |
|
"learning_rate": 9.895420438411616e-06, |
|
"loss": 0.3547, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.7063102126121521, |
|
"learning_rate": 9.814586436738998e-06, |
|
"loss": 0.3375, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.2112, |
|
"grad_norm": 0.6931116580963135, |
|
"learning_rate": 9.711304610594104e-06, |
|
"loss": 0.3156, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.2304, |
|
"grad_norm": 0.7782188653945923, |
|
"learning_rate": 9.586056507527266e-06, |
|
"loss": 0.3111, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.2496, |
|
"grad_norm": 0.7079610228538513, |
|
"learning_rate": 9.439426092011877e-06, |
|
"loss": 0.2994, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.2688, |
|
"grad_norm": 0.6569077968597412, |
|
"learning_rate": 9.272097022732444e-06, |
|
"loss": 0.2973, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 0.680906355381012, |
|
"learning_rate": 9.08484946505221e-06, |
|
"loss": 0.2882, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.3072, |
|
"grad_norm": 0.5492838025093079, |
|
"learning_rate": 8.8785564535221e-06, |
|
"loss": 0.2531, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.3264, |
|
"grad_norm": 0.6430338025093079, |
|
"learning_rate": 8.65417982139062e-06, |
|
"loss": 0.2819, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.3456, |
|
"grad_norm": 0.5319820046424866, |
|
"learning_rate": 8.412765716093273e-06, |
|
"loss": 0.2765, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.3648, |
|
"grad_norm": 0.4264802932739258, |
|
"learning_rate": 8.155439721630265e-06, |
|
"loss": 0.2588, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.42277446389198303, |
|
"learning_rate": 7.883401610574338e-06, |
|
"loss": 0.2572, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.4032, |
|
"grad_norm": 0.46798449754714966, |
|
"learning_rate": 7.597919750177168e-06, |
|
"loss": 0.2319, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.4224, |
|
"grad_norm": 0.4637551009654999, |
|
"learning_rate": 7.300325188655762e-06, |
|
"loss": 0.2307, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.4416, |
|
"grad_norm": 0.477437824010849, |
|
"learning_rate": 6.9920054492312086e-06, |
|
"loss": 0.2495, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.4608, |
|
"grad_norm": 0.40543755888938904, |
|
"learning_rate": 6.674398060854931e-06, |
|
"loss": 0.2272, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.4067147672176361, |
|
"learning_rate": 6.348983855785122e-06, |
|
"loss": 0.2349, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.4992, |
|
"grad_norm": 0.37864550948143005, |
|
"learning_rate": 6.0172800652631706e-06, |
|
"loss": 0.2345, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.5184, |
|
"grad_norm": 0.3472854197025299, |
|
"learning_rate": 5.680833245481234e-06, |
|
"loss": 0.2192, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.5376, |
|
"grad_norm": 0.36279794573783875, |
|
"learning_rate": 5.341212066823356e-06, |
|
"loss": 0.2212, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.5568, |
|
"grad_norm": 0.35561856627464294, |
|
"learning_rate": 5e-06, |
|
"loss": 0.2218, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.34567970037460327, |
|
"learning_rate": 4.6587879331766465e-06, |
|
"loss": 0.2157, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.5952, |
|
"grad_norm": 0.3285064101219177, |
|
"learning_rate": 4.319166754518768e-06, |
|
"loss": 0.2214, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.6144, |
|
"grad_norm": 0.28714433312416077, |
|
"learning_rate": 3.982719934736832e-06, |
|
"loss": 0.1882, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.6336, |
|
"grad_norm": 0.31616437435150146, |
|
"learning_rate": 3.6510161442148783e-06, |
|
"loss": 0.1859, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.6528, |
|
"grad_norm": 0.3225463330745697, |
|
"learning_rate": 3.3256019391450696e-06, |
|
"loss": 0.2063, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.33429035544395447, |
|
"learning_rate": 3.007994550768793e-06, |
|
"loss": 0.2043, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.6912, |
|
"grad_norm": 0.33725297451019287, |
|
"learning_rate": 2.6996748113442397e-06, |
|
"loss": 0.2168, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.7104, |
|
"grad_norm": 0.3270690143108368, |
|
"learning_rate": 2.4020802498228333e-06, |
|
"loss": 0.2076, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.7296, |
|
"grad_norm": 0.34343093633651733, |
|
"learning_rate": 2.1165983894256647e-06, |
|
"loss": 0.2022, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.7488, |
|
"grad_norm": 0.3212158977985382, |
|
"learning_rate": 1.8445602783697375e-06, |
|
"loss": 0.2043, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.4333406388759613, |
|
"learning_rate": 1.5872342839067305e-06, |
|
"loss": 0.1961, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.7872, |
|
"grad_norm": 0.3265644609928131, |
|
"learning_rate": 1.3458201786093795e-06, |
|
"loss": 0.206, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.8064, |
|
"grad_norm": 0.3291531503200531, |
|
"learning_rate": 1.1214435464779006e-06, |
|
"loss": 0.2164, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.8256, |
|
"grad_norm": 0.3214084506034851, |
|
"learning_rate": 9.151505349477901e-07, |
|
"loss": 0.1954, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.8448, |
|
"grad_norm": 0.33091598749160767, |
|
"learning_rate": 7.279029772675572e-07, |
|
"loss": 0.2029, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 0.305397629737854, |
|
"learning_rate": 5.60573907988124e-07, |
|
"loss": 0.1895, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.8832, |
|
"grad_norm": 0.3404708504676819, |
|
"learning_rate": 4.139434924727359e-07, |
|
"loss": 0.206, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.9024, |
|
"grad_norm": 0.3258639872074127, |
|
"learning_rate": 2.88695389405898e-07, |
|
"loss": 0.2152, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.9216, |
|
"grad_norm": 0.3110244870185852, |
|
"learning_rate": 1.8541356326100436e-07, |
|
"loss": 0.2068, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.9408, |
|
"grad_norm": 0.31024959683418274, |
|
"learning_rate": 1.0457956158838545e-07, |
|
"loss": 0.1988, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.3411162495613098, |
|
"learning_rate": 4.657026981834623e-08, |
|
"loss": 0.2048, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.9792, |
|
"grad_norm": 0.32382944226264954, |
|
"learning_rate": 1.1656154047303691e-08, |
|
"loss": 0.2035, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.9984, |
|
"grad_norm": 0.3142213821411133, |
|
"learning_rate": 0.0, |
|
"loss": 0.2071, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.9984, |
|
"step": 52, |
|
"total_flos": 149313677950976.0, |
|
"train_loss": 0.26948727323458743, |
|
"train_runtime": 7475.3088, |
|
"train_samples_per_second": 0.669, |
|
"train_steps_per_second": 0.007 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 52, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 149313677950976.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|