{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.994871794871795,
  "eval_steps": 500,
  "global_step": 657,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.045584045584045586,
      "grad_norm": 0.9747199141382675,
      "learning_rate": 5e-06,
      "loss": 0.7529,
      "step": 10
    },
    {
      "epoch": 0.09116809116809117,
      "grad_norm": 0.8738603014316104,
      "learning_rate": 5e-06,
      "loss": 0.6963,
      "step": 20
    },
    {
      "epoch": 0.13675213675213677,
      "grad_norm": 0.8150690175536751,
      "learning_rate": 5e-06,
      "loss": 0.6838,
      "step": 30
    },
    {
      "epoch": 0.18233618233618235,
      "grad_norm": 0.8040590579100562,
      "learning_rate": 5e-06,
      "loss": 0.6731,
      "step": 40
    },
    {
      "epoch": 0.22792022792022792,
      "grad_norm": 0.8097580687205038,
      "learning_rate": 5e-06,
      "loss": 0.6568,
      "step": 50
    },
    {
      "epoch": 0.27350427350427353,
      "grad_norm": 0.7017512733530653,
      "learning_rate": 5e-06,
      "loss": 0.6482,
      "step": 60
    },
    {
      "epoch": 0.3190883190883191,
      "grad_norm": 0.46658768732638123,
      "learning_rate": 5e-06,
      "loss": 0.6508,
      "step": 70
    },
    {
      "epoch": 0.3646723646723647,
      "grad_norm": 0.27624642671580113,
      "learning_rate": 5e-06,
      "loss": 0.6449,
      "step": 80
    },
    {
      "epoch": 0.41025641025641024,
      "grad_norm": 0.24917438010760184,
      "learning_rate": 5e-06,
      "loss": 0.6411,
      "step": 90
    },
    {
      "epoch": 0.45584045584045585,
      "grad_norm": 0.2581331671168845,
      "learning_rate": 5e-06,
      "loss": 0.6402,
      "step": 100
    },
    {
      "epoch": 0.5014245014245015,
      "grad_norm": 0.24037652957624636,
      "learning_rate": 5e-06,
      "loss": 0.6422,
      "step": 110
    },
    {
      "epoch": 0.5470085470085471,
      "grad_norm": 0.23269369333404952,
      "learning_rate": 5e-06,
      "loss": 0.6374,
      "step": 120
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 0.23915104514085617,
      "learning_rate": 5e-06,
      "loss": 0.6395,
      "step": 130
    },
    {
      "epoch": 0.6381766381766382,
      "grad_norm": 0.2609780887042042,
      "learning_rate": 5e-06,
      "loss": 0.6412,
      "step": 140
    },
    {
      "epoch": 0.6837606837606838,
      "grad_norm": 0.23404048980327943,
      "learning_rate": 5e-06,
      "loss": 0.6337,
      "step": 150
    },
    {
      "epoch": 0.7293447293447294,
      "grad_norm": 0.21838704952274085,
      "learning_rate": 5e-06,
      "loss": 0.6333,
      "step": 160
    },
    {
      "epoch": 0.7749287749287749,
      "grad_norm": 0.2382035774873146,
      "learning_rate": 5e-06,
      "loss": 0.633,
      "step": 170
    },
    {
      "epoch": 0.8205128205128205,
      "grad_norm": 0.2438000385852018,
      "learning_rate": 5e-06,
      "loss": 0.6386,
      "step": 180
    },
    {
      "epoch": 0.8660968660968661,
      "grad_norm": 0.22530554762484006,
      "learning_rate": 5e-06,
      "loss": 0.6345,
      "step": 190
    },
    {
      "epoch": 0.9116809116809117,
      "grad_norm": 0.2349886748386411,
      "learning_rate": 5e-06,
      "loss": 0.6408,
      "step": 200
    },
    {
      "epoch": 0.9572649572649573,
      "grad_norm": 0.25085516929265755,
      "learning_rate": 5e-06,
      "loss": 0.6322,
      "step": 210
    },
    {
      "epoch": 0.9982905982905983,
      "eval_loss": 0.6290858387947083,
      "eval_runtime": 223.0866,
      "eval_samples_per_second": 52.997,
      "eval_steps_per_second": 0.417,
      "step": 219
    },
    {
      "epoch": 1.002849002849003,
      "grad_norm": 0.4128874703264031,
      "learning_rate": 5e-06,
      "loss": 0.6672,
      "step": 220
    },
    {
      "epoch": 1.0484330484330484,
      "grad_norm": 0.2741974395463933,
      "learning_rate": 5e-06,
      "loss": 0.6126,
      "step": 230
    },
    {
      "epoch": 1.0940170940170941,
      "grad_norm": 0.24996957336799014,
      "learning_rate": 5e-06,
      "loss": 0.6122,
      "step": 240
    },
    {
      "epoch": 1.1396011396011396,
      "grad_norm": 0.2308654521968029,
      "learning_rate": 5e-06,
      "loss": 0.6147,
      "step": 250
    },
    {
      "epoch": 1.1851851851851851,
      "grad_norm": 0.21800620424831227,
      "learning_rate": 5e-06,
      "loss": 0.6104,
      "step": 260
    },
    {
      "epoch": 1.2307692307692308,
      "grad_norm": 0.22878445296983144,
      "learning_rate": 5e-06,
      "loss": 0.6109,
      "step": 270
    },
    {
      "epoch": 1.2763532763532763,
      "grad_norm": 0.22048121695978434,
      "learning_rate": 5e-06,
      "loss": 0.6137,
      "step": 280
    },
    {
      "epoch": 1.3219373219373218,
      "grad_norm": 0.22321985375553052,
      "learning_rate": 5e-06,
      "loss": 0.6102,
      "step": 290
    },
    {
      "epoch": 1.3675213675213675,
      "grad_norm": 0.23340158642086364,
      "learning_rate": 5e-06,
      "loss": 0.609,
      "step": 300
    },
    {
      "epoch": 1.413105413105413,
      "grad_norm": 0.2160885724116477,
      "learning_rate": 5e-06,
      "loss": 0.613,
      "step": 310
    },
    {
      "epoch": 1.4586894586894588,
      "grad_norm": 0.21579764874251084,
      "learning_rate": 5e-06,
      "loss": 0.6105,
      "step": 320
    },
    {
      "epoch": 1.5042735042735043,
      "grad_norm": 0.24191612198460108,
      "learning_rate": 5e-06,
      "loss": 0.6049,
      "step": 330
    },
    {
      "epoch": 1.54985754985755,
      "grad_norm": 0.2218278655422596,
      "learning_rate": 5e-06,
      "loss": 0.6081,
      "step": 340
    },
    {
      "epoch": 1.5954415954415955,
      "grad_norm": 0.2322145105092986,
      "learning_rate": 5e-06,
      "loss": 0.6129,
      "step": 350
    },
    {
      "epoch": 1.641025641025641,
      "grad_norm": 0.21658541308539972,
      "learning_rate": 5e-06,
      "loss": 0.6092,
      "step": 360
    },
    {
      "epoch": 1.6866096866096867,
      "grad_norm": 0.24054412349179422,
      "learning_rate": 5e-06,
      "loss": 0.6057,
      "step": 370
    },
    {
      "epoch": 1.7321937321937322,
      "grad_norm": 0.2725818691336268,
      "learning_rate": 5e-06,
      "loss": 0.6065,
      "step": 380
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 0.23261322507876125,
      "learning_rate": 5e-06,
      "loss": 0.6075,
      "step": 390
    },
    {
      "epoch": 1.8233618233618234,
      "grad_norm": 0.21931910039916533,
      "learning_rate": 5e-06,
      "loss": 0.6028,
      "step": 400
    },
    {
      "epoch": 1.868945868945869,
      "grad_norm": 0.2102187671028171,
      "learning_rate": 5e-06,
      "loss": 0.6003,
      "step": 410
    },
    {
      "epoch": 1.9145299145299144,
      "grad_norm": 0.22631213435723302,
      "learning_rate": 5e-06,
      "loss": 0.6067,
      "step": 420
    },
    {
      "epoch": 1.96011396011396,
      "grad_norm": 0.23696933054944608,
      "learning_rate": 5e-06,
      "loss": 0.6083,
      "step": 430
    },
    {
      "epoch": 1.9965811965811966,
      "eval_loss": 0.6215006709098816,
      "eval_runtime": 224.1659,
      "eval_samples_per_second": 52.742,
      "eval_steps_per_second": 0.415,
      "step": 438
    },
    {
      "epoch": 2.005698005698006,
      "grad_norm": 0.2323893228533647,
      "learning_rate": 5e-06,
      "loss": 0.6421,
      "step": 440
    },
    {
      "epoch": 2.051282051282051,
      "grad_norm": 0.2248336007687922,
      "learning_rate": 5e-06,
      "loss": 0.5803,
      "step": 450
    },
    {
      "epoch": 2.096866096866097,
      "grad_norm": 0.225417442071853,
      "learning_rate": 5e-06,
      "loss": 0.5869,
      "step": 460
    },
    {
      "epoch": 2.1424501424501425,
      "grad_norm": 0.2204104722476421,
      "learning_rate": 5e-06,
      "loss": 0.5845,
      "step": 470
    },
    {
      "epoch": 2.1880341880341883,
      "grad_norm": 0.2565084390638512,
      "learning_rate": 5e-06,
      "loss": 0.5837,
      "step": 480
    },
    {
      "epoch": 2.2336182336182335,
      "grad_norm": 0.2245257556826023,
      "learning_rate": 5e-06,
      "loss": 0.5828,
      "step": 490
    },
    {
      "epoch": 2.2792022792022792,
      "grad_norm": 0.22001308034422606,
      "learning_rate": 5e-06,
      "loss": 0.5831,
      "step": 500
    },
    {
      "epoch": 2.324786324786325,
      "grad_norm": 0.2270454664497775,
      "learning_rate": 5e-06,
      "loss": 0.5838,
      "step": 510
    },
    {
      "epoch": 2.3703703703703702,
      "grad_norm": 0.22740470236209073,
      "learning_rate": 5e-06,
      "loss": 0.586,
      "step": 520
    },
    {
      "epoch": 2.415954415954416,
      "grad_norm": 0.2302979493200276,
      "learning_rate": 5e-06,
      "loss": 0.5883,
      "step": 530
    },
    {
      "epoch": 2.4615384615384617,
      "grad_norm": 0.25292997039450504,
      "learning_rate": 5e-06,
      "loss": 0.5927,
      "step": 540
    },
    {
      "epoch": 2.5071225071225074,
      "grad_norm": 0.2223763921390108,
      "learning_rate": 5e-06,
      "loss": 0.5914,
      "step": 550
    },
    {
      "epoch": 2.5527065527065527,
      "grad_norm": 0.23815767371523935,
      "learning_rate": 5e-06,
      "loss": 0.5859,
      "step": 560
    },
    {
      "epoch": 2.5982905982905984,
      "grad_norm": 0.2593266359272012,
      "learning_rate": 5e-06,
      "loss": 0.5854,
      "step": 570
    },
    {
      "epoch": 2.6438746438746437,
      "grad_norm": 0.23931559412564932,
      "learning_rate": 5e-06,
      "loss": 0.5854,
      "step": 580
    },
    {
      "epoch": 2.6894586894586894,
      "grad_norm": 0.21456969497967046,
      "learning_rate": 5e-06,
      "loss": 0.5865,
      "step": 590
    },
    {
      "epoch": 2.735042735042735,
      "grad_norm": 0.23061617250879088,
      "learning_rate": 5e-06,
      "loss": 0.5862,
      "step": 600
    },
    {
      "epoch": 2.780626780626781,
      "grad_norm": 0.2088223573865155,
      "learning_rate": 5e-06,
      "loss": 0.5856,
      "step": 610
    },
    {
      "epoch": 2.826210826210826,
      "grad_norm": 0.22810405657658617,
      "learning_rate": 5e-06,
      "loss": 0.5874,
      "step": 620
    },
    {
      "epoch": 2.871794871794872,
      "grad_norm": 0.268468678439456,
      "learning_rate": 5e-06,
      "loss": 0.5889,
      "step": 630
    },
    {
      "epoch": 2.9173789173789175,
      "grad_norm": 0.23190583340922707,
      "learning_rate": 5e-06,
      "loss": 0.585,
      "step": 640
    },
    {
      "epoch": 2.962962962962963,
      "grad_norm": 0.2202766065355913,
      "learning_rate": 5e-06,
      "loss": 0.58,
      "step": 650
    },
    {
      "epoch": 2.994871794871795,
      "eval_loss": 0.6190248131752014,
      "eval_runtime": 222.4874,
      "eval_samples_per_second": 53.14,
      "eval_steps_per_second": 0.418,
      "step": 657
    },
    {
      "epoch": 2.994871794871795,
      "step": 657,
      "total_flos": 2754957111459840.0,
      "train_loss": 0.6167326612196739,
      "train_runtime": 35776.9525,
      "train_samples_per_second": 18.835,
      "train_steps_per_second": 0.018
    }
  ],
  "logging_steps": 10,
  "max_steps": 657,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2754957111459840.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}