{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.20512820512820512,
  "eval_steps": 9,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0020512820512820513,
      "eval_loss": 3.666609048843384,
      "eval_runtime": 74.3332,
      "eval_samples_per_second": 11.058,
      "eval_steps_per_second": 1.386,
      "step": 1
    },
    {
      "epoch": 0.006153846153846154,
      "grad_norm": 55.59685516357422,
      "learning_rate": 1.5e-05,
      "loss": 14.9637,
      "step": 3
    },
    {
      "epoch": 0.012307692307692308,
      "grad_norm": 49.33617401123047,
      "learning_rate": 3e-05,
      "loss": 14.2659,
      "step": 6
    },
    {
      "epoch": 0.018461538461538463,
      "grad_norm": 40.35097885131836,
      "learning_rate": 4.5e-05,
      "loss": 12.0429,
      "step": 9
    },
    {
      "epoch": 0.018461538461538463,
      "eval_loss": 2.3609907627105713,
      "eval_runtime": 75.0844,
      "eval_samples_per_second": 10.948,
      "eval_steps_per_second": 1.372,
      "step": 9
    },
    {
      "epoch": 0.024615384615384615,
      "grad_norm": 30.74170684814453,
      "learning_rate": 4.993910125649561e-05,
      "loss": 8.4852,
      "step": 12
    },
    {
      "epoch": 0.03076923076923077,
      "grad_norm": 30.664113998413086,
      "learning_rate": 4.962019382530521e-05,
      "loss": 6.9901,
      "step": 15
    },
    {
      "epoch": 0.036923076923076927,
      "grad_norm": 37.1199951171875,
      "learning_rate": 4.9031542398457974e-05,
      "loss": 5.2257,
      "step": 18
    },
    {
      "epoch": 0.036923076923076927,
      "eval_loss": 1.283078670501709,
      "eval_runtime": 75.1182,
      "eval_samples_per_second": 10.943,
      "eval_steps_per_second": 1.371,
      "step": 18
    },
    {
      "epoch": 0.043076923076923075,
      "grad_norm": 25.036869049072266,
      "learning_rate": 4.817959636416969e-05,
      "loss": 5.1287,
      "step": 21
    },
    {
      "epoch": 0.04923076923076923,
      "grad_norm": 19.620019912719727,
      "learning_rate": 4.707368982147318e-05,
      "loss": 4.4763,
      "step": 24
    },
    {
      "epoch": 0.055384615384615386,
      "grad_norm": 24.405311584472656,
      "learning_rate": 4.572593931387604e-05,
      "loss": 3.8667,
      "step": 27
    },
    {
      "epoch": 0.055384615384615386,
      "eval_loss": 0.9862188100814819,
      "eval_runtime": 75.1057,
      "eval_samples_per_second": 10.945,
      "eval_steps_per_second": 1.371,
      "step": 27
    },
    {
      "epoch": 0.06153846153846154,
      "grad_norm": 22.108592987060547,
      "learning_rate": 4.415111107797445e-05,
      "loss": 4.0031,
      "step": 30
    },
    {
      "epoch": 0.06769230769230769,
      "grad_norm": 19.9141788482666,
      "learning_rate": 4.2366459261474933e-05,
      "loss": 3.8066,
      "step": 33
    },
    {
      "epoch": 0.07384615384615385,
      "grad_norm": 22.954469680786133,
      "learning_rate": 4.039153688314145e-05,
      "loss": 3.7173,
      "step": 36
    },
    {
      "epoch": 0.07384615384615385,
      "eval_loss": 0.9269228577613831,
      "eval_runtime": 75.1147,
      "eval_samples_per_second": 10.943,
      "eval_steps_per_second": 1.371,
      "step": 36
    },
    {
      "epoch": 0.08,
      "grad_norm": 24.711549758911133,
      "learning_rate": 3.824798160583012e-05,
      "loss": 3.9268,
      "step": 39
    },
    {
      "epoch": 0.08615384615384615,
      "grad_norm": 16.412107467651367,
      "learning_rate": 3.5959278669726935e-05,
      "loss": 3.9029,
      "step": 42
    },
    {
      "epoch": 0.09230769230769231,
      "grad_norm": 19.484609603881836,
      "learning_rate": 3.355050358314172e-05,
      "loss": 3.7293,
      "step": 45
    },
    {
      "epoch": 0.09230769230769231,
      "eval_loss": 0.888292133808136,
      "eval_runtime": 75.1111,
      "eval_samples_per_second": 10.944,
      "eval_steps_per_second": 1.371,
      "step": 45
    },
    {
      "epoch": 0.09846153846153846,
      "grad_norm": 17.762903213500977,
      "learning_rate": 3.104804738999169e-05,
      "loss": 3.8463,
      "step": 48
    },
    {
      "epoch": 0.10461538461538461,
      "grad_norm": 16.857402801513672,
      "learning_rate": 2.8479327524001636e-05,
      "loss": 3.2797,
      "step": 51
    },
    {
      "epoch": 0.11076923076923077,
      "grad_norm": 17.598472595214844,
      "learning_rate": 2.587248741756253e-05,
      "loss": 3.4031,
      "step": 54
    },
    {
      "epoch": 0.11076923076923077,
      "eval_loss": 0.8580312132835388,
      "eval_runtime": 75.1055,
      "eval_samples_per_second": 10.945,
      "eval_steps_per_second": 1.371,
      "step": 54
    },
    {
      "epoch": 0.11692307692307692,
      "grad_norm": 19.14990234375,
      "learning_rate": 2.3256088156396868e-05,
      "loss": 3.5648,
      "step": 57
    },
    {
      "epoch": 0.12307692307692308,
      "grad_norm": 15.191015243530273,
      "learning_rate": 2.0658795558326743e-05,
      "loss": 3.2272,
      "step": 60
    },
    {
      "epoch": 0.12923076923076923,
      "grad_norm": 16.499048233032227,
      "learning_rate": 1.8109066104575023e-05,
      "loss": 3.1709,
      "step": 63
    },
    {
      "epoch": 0.12923076923076923,
      "eval_loss": 0.8416498303413391,
      "eval_runtime": 75.1294,
      "eval_samples_per_second": 10.941,
      "eval_steps_per_second": 1.371,
      "step": 63
    },
    {
      "epoch": 0.13538461538461538,
      "grad_norm": 16.03173828125,
      "learning_rate": 1.56348351646022e-05,
      "loss": 3.1598,
      "step": 66
    },
    {
      "epoch": 0.14153846153846153,
      "grad_norm": 15.822704315185547,
      "learning_rate": 1.3263210930352737e-05,
      "loss": 3.1743,
      "step": 69
    },
    {
      "epoch": 0.1476923076923077,
      "grad_norm": 22.87495994567871,
      "learning_rate": 1.1020177413231334e-05,
      "loss": 3.7806,
      "step": 72
    },
    {
      "epoch": 0.1476923076923077,
      "eval_loss": 0.8304129838943481,
      "eval_runtime": 75.1064,
      "eval_samples_per_second": 10.944,
      "eval_steps_per_second": 1.371,
      "step": 72
    },
    {
      "epoch": 0.15384615384615385,
      "grad_norm": 17.428983688354492,
      "learning_rate": 8.930309757836517e-06,
      "loss": 3.6015,
      "step": 75
    },
    {
      "epoch": 0.16,
      "grad_norm": 19.788145065307617,
      "learning_rate": 7.016504991533726e-06,
      "loss": 3.2009,
      "step": 78
    },
    {
      "epoch": 0.16615384615384615,
      "grad_norm": 18.700437545776367,
      "learning_rate": 5.299731159831953e-06,
      "loss": 3.3891,
      "step": 81
    },
    {
      "epoch": 0.16615384615384615,
      "eval_loss": 0.8223448991775513,
      "eval_runtime": 75.1008,
      "eval_samples_per_second": 10.945,
      "eval_steps_per_second": 1.371,
      "step": 81
    },
    {
      "epoch": 0.1723076923076923,
      "grad_norm": 16.810592651367188,
      "learning_rate": 3.798797596089351e-06,
      "loss": 3.3512,
      "step": 84
    },
    {
      "epoch": 0.17846153846153845,
      "grad_norm": 17.362895965576172,
      "learning_rate": 2.5301488425208296e-06,
      "loss": 3.5883,
      "step": 87
    },
    {
      "epoch": 0.18461538461538463,
      "grad_norm": 22.981599807739258,
      "learning_rate": 1.5076844803522922e-06,
      "loss": 3.2509,
      "step": 90
    },
    {
      "epoch": 0.18461538461538463,
      "eval_loss": 0.8187195062637329,
      "eval_runtime": 75.1228,
      "eval_samples_per_second": 10.942,
      "eval_steps_per_second": 1.371,
      "step": 90
    },
    {
      "epoch": 0.19076923076923077,
      "grad_norm": 16.67460823059082,
      "learning_rate": 7.426068431000882e-07,
      "loss": 3.4548,
      "step": 93
    },
    {
      "epoch": 0.19692307692307692,
      "grad_norm": 21.03778839111328,
      "learning_rate": 2.4329828146074095e-07,
      "loss": 3.7234,
      "step": 96
    },
    {
      "epoch": 0.20307692307692307,
      "grad_norm": 17.65099334716797,
      "learning_rate": 1.522932452260595e-08,
      "loss": 3.2285,
      "step": 99
    },
    {
      "epoch": 0.20307692307692307,
      "eval_loss": 0.8180755376815796,
      "eval_runtime": 75.1032,
      "eval_samples_per_second": 10.945,
      "eval_steps_per_second": 1.371,
      "step": 99
    }
  ],
  "logging_steps": 3,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 9,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.406258997362688e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}