{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 625,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08,
      "grad_norm": 16.931909561157227,
      "learning_rate": 2.53968253968254e-06,
      "loss": 17.7745,
      "step": 10
    },
    {
      "epoch": 0.16,
      "grad_norm": 20.85945701599121,
      "learning_rate": 5.7142857142857145e-06,
      "loss": 16.3129,
      "step": 20
    },
    {
      "epoch": 0.24,
      "grad_norm": 17.975467681884766,
      "learning_rate": 8.888888888888888e-06,
      "loss": 12.0213,
      "step": 30
    },
    {
      "epoch": 0.32,
      "grad_norm": 6.495634078979492,
      "learning_rate": 1.2063492063492064e-05,
      "loss": 6.7596,
      "step": 40
    },
    {
      "epoch": 0.4,
      "grad_norm": 5.885069847106934,
      "learning_rate": 1.523809523809524e-05,
      "loss": 3.8509,
      "step": 50
    },
    {
      "epoch": 0.48,
      "grad_norm": 4.575489521026611,
      "learning_rate": 1.8412698412698415e-05,
      "loss": 2.0809,
      "step": 60
    },
    {
      "epoch": 0.56,
      "grad_norm": 5.4581708908081055,
      "learning_rate": 1.999609421031453e-05,
      "loss": 1.503,
      "step": 70
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.8591735363006592,
      "learning_rate": 1.9964866196679105e-05,
      "loss": 1.2903,
      "step": 80
    },
    {
      "epoch": 0.72,
      "grad_norm": 2.716611385345459,
      "learning_rate": 1.990250772639552e-05,
      "loss": 1.174,
      "step": 90
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.5368252992630005,
      "learning_rate": 1.9809213608668188e-05,
      "loss": 1.1583,
      "step": 100
    },
    {
      "epoch": 0.88,
      "grad_norm": 1.6447538137435913,
      "learning_rate": 1.96852752963305e-05,
      "loss": 1.1363,
      "step": 110
    },
    {
      "epoch": 0.96,
      "grad_norm": 2.368715286254883,
      "learning_rate": 1.9531079975339912e-05,
      "loss": 1.1077,
      "step": 120
    },
    {
      "epoch": 1.04,
      "grad_norm": 2.465362787246704,
      "learning_rate": 1.9347109355200672e-05,
      "loss": 1.0916,
      "step": 130
    },
    {
      "epoch": 1.12,
      "grad_norm": 1.520612359046936,
      "learning_rate": 1.9133938164092942e-05,
      "loss": 1.0992,
      "step": 140
    },
    {
      "epoch": 1.2,
      "grad_norm": 1.3989702463150024,
      "learning_rate": 1.8892232353409582e-05,
      "loss": 1.0849,
      "step": 150
    },
    {
      "epoch": 1.28,
      "grad_norm": 1.8318055868148804,
      "learning_rate": 1.8622747017309676e-05,
      "loss": 1.0918,
      "step": 160
    },
    {
      "epoch": 1.3599999999999999,
      "grad_norm": 1.3566697835922241,
      "learning_rate": 1.832632403378808e-05,
      "loss": 1.0673,
      "step": 170
    },
    {
      "epoch": 1.44,
      "grad_norm": 1.595423936843872,
      "learning_rate": 1.8003889434630473e-05,
      "loss": 1.0789,
      "step": 180
    },
    {
      "epoch": 1.52,
      "grad_norm": 1.4190820455551147,
      "learning_rate": 1.765645051247007e-05,
      "loss": 1.0618,
      "step": 190
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.8037772178649902,
      "learning_rate": 1.728509267398376e-05,
      "loss": 1.0658,
      "step": 200
    },
    {
      "epoch": 1.6800000000000002,
      "grad_norm": 2.374105453491211,
      "learning_rate": 1.6890976049058267e-05,
      "loss": 1.0481,
      "step": 210
    },
    {
      "epoch": 1.76,
      "grad_norm": 2.712444543838501,
      "learning_rate": 1.6475331866519387e-05,
      "loss": 1.0856,
      "step": 220
    },
    {
      "epoch": 1.8399999999999999,
      "grad_norm": 0.846947193145752,
      "learning_rate": 1.6039458607746614e-05,
      "loss": 1.0575,
      "step": 230
    },
    {
      "epoch": 1.92,
      "grad_norm": 1.0007365942001343,
      "learning_rate": 1.558471795018936e-05,
      "loss": 1.0391,
      "step": 240
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.445178747177124,
      "learning_rate": 1.5112530513457236e-05,
      "loss": 1.0378,
      "step": 250
    },
    {
      "epoch": 2.08,
      "grad_norm": 1.1010501384735107,
      "learning_rate": 1.4624371421273823e-05,
      "loss": 1.0421,
      "step": 260
    },
    {
      "epoch": 2.16,
      "grad_norm": 1.2421690225601196,
      "learning_rate": 1.4121765693158364e-05,
      "loss": 1.0279,
      "step": 270
    },
    {
      "epoch": 2.24,
      "grad_norm": 1.585425853729248,
      "learning_rate": 1.3606283480231957e-05,
      "loss": 1.036,
      "step": 280
    },
    {
      "epoch": 2.32,
      "grad_norm": 1.0381609201431274,
      "learning_rate": 1.3079535160031598e-05,
      "loss": 1.0348,
      "step": 290
    },
    {
      "epoch": 2.4,
      "grad_norm": 1.5096782445907593,
      "learning_rate": 1.2543166305656099e-05,
      "loss": 1.0362,
      "step": 300
    },
    {
      "epoch": 2.48,
      "grad_norm": 0.7411025762557983,
      "learning_rate": 1.1998852544960266e-05,
      "loss": 1.0293,
      "step": 310
    },
    {
      "epoch": 2.56,
      "grad_norm": 1.0825903415679932,
      "learning_rate": 1.1448294325857387e-05,
      "loss": 1.0287,
      "step": 320
    },
    {
      "epoch": 2.64,
      "grad_norm": 1.187018871307373,
      "learning_rate": 1.0893211604083325e-05,
      "loss": 1.0302,
      "step": 330
    },
    {
      "epoch": 2.7199999999999998,
      "grad_norm": 1.1337244510650635,
      "learning_rate": 1.0335338470017742e-05,
      "loss": 1.0358,
      "step": 340
    },
    {
      "epoch": 2.8,
      "grad_norm": 1.3198221921920776,
      "learning_rate": 9.776417731348416e-06,
      "loss": 1.028,
      "step": 350
    },
    {
      "epoch": 2.88,
      "grad_norm": 1.04779851436615,
      "learning_rate": 9.218195468502462e-06,
      "loss": 1.0221,
      "step": 360
    },
    {
      "epoch": 2.96,
      "grad_norm": 1.228852391242981,
      "learning_rate": 8.662415579853492e-06,
      "loss": 1.0152,
      "step": 370
    },
    {
      "epoch": 3.04,
      "grad_norm": 0.8538860082626343,
      "learning_rate": 8.110814333745496e-06,
      "loss": 1.0264,
      "step": 380
    },
    {
      "epoch": 3.12,
      "grad_norm": 1.0245864391326904,
      "learning_rate": 7.56511494435318e-06,
      "loss": 1.0161,
      "step": 390
    },
    {
      "epoch": 3.2,
      "grad_norm": 0.983771800994873,
      "learning_rate": 7.027022188323716e-06,
      "loss": 1.0131,
      "step": 400
    },
    {
      "epoch": 3.2800000000000002,
      "grad_norm": 0.8150555491447449,
      "learning_rate": 6.498217079017818e-06,
      "loss": 1.0271,
      "step": 410
    },
    {
      "epoch": 3.36,
      "grad_norm": 1.072811484336853,
      "learning_rate": 5.980351614987759e-06,
      "loss": 1.0062,
      "step": 420
    },
    {
      "epoch": 3.44,
      "grad_norm": 1.0907390117645264,
      "learning_rate": 5.475043619098334e-06,
      "loss": 1.0137,
      "step": 430
    },
    {
      "epoch": 3.52,
      "grad_norm": 0.8887478113174438,
      "learning_rate": 4.983871684413363e-06,
      "loss": 0.9897,
      "step": 440
    },
    {
      "epoch": 3.6,
      "grad_norm": 0.8817629218101501,
      "learning_rate": 4.508370242636968e-06,
      "loss": 1.0112,
      "step": 450
    },
    {
      "epoch": 3.68,
      "grad_norm": 1.7927525043487549,
      "learning_rate": 4.050024770515869e-06,
      "loss": 0.9977,
      "step": 460
    },
    {
      "epoch": 3.76,
      "grad_norm": 1.2345198392868042,
      "learning_rate": 3.6102671491780393e-06,
      "loss": 0.9906,
      "step": 470
    },
    {
      "epoch": 3.84,
      "grad_norm": 0.9548463225364685,
      "learning_rate": 3.1904711909051933e-06,
      "loss": 1.0106,
      "step": 480
    },
    {
      "epoch": 3.92,
      "grad_norm": 1.2012273073196411,
      "learning_rate": 2.7919483473136678e-06,
      "loss": 0.9938,
      "step": 490
    },
    {
      "epoch": 4.0,
      "grad_norm": 1.8137465715408325,
      "learning_rate": 2.4159436123512737e-06,
      "loss": 0.9991,
      "step": 500
    },
    {
      "epoch": 4.08,
      "grad_norm": 0.9726725220680237,
      "learning_rate": 2.0636316329094317e-06,
      "loss": 0.9852,
      "step": 510
    },
    {
      "epoch": 4.16,
      "grad_norm": 0.9943102598190308,
      "learning_rate": 1.7361130392009407e-06,
      "loss": 0.9887,
      "step": 520
    },
    {
      "epoch": 4.24,
      "grad_norm": 0.8023040890693665,
      "learning_rate": 1.4344110063675143e-06,
      "loss": 0.9813,
      "step": 530
    },
    {
      "epoch": 4.32,
      "grad_norm": 0.987560510635376,
      "learning_rate": 1.1594680580585815e-06,
      "loss": 0.9934,
      "step": 540
    },
    {
      "epoch": 4.4,
      "grad_norm": 0.8008418083190918,
      "learning_rate": 9.121431219671096e-07,
      "loss": 0.9944,
      "step": 550
    },
    {
      "epoch": 4.48,
      "grad_norm": 1.4193716049194336,
      "learning_rate": 6.932088465209941e-07,
      "loss": 0.9935,
      "step": 560
    },
    {
      "epoch": 4.5600000000000005,
      "grad_norm": 1.3066496849060059,
      "learning_rate": 5.033491871127105e-07,
      "loss": 0.9825,
      "step": 570
    },
    {
      "epoch": 4.64,
      "grad_norm": 0.9865421056747437,
      "learning_rate": 3.4315726940795436e-07,
      "loss": 0.9849,
      "step": 580
    },
    {
      "epoch": 4.72,
      "grad_norm": 0.7975912690162659,
      "learning_rate": 2.1313353640827207e-07,
      "loss": 0.9803,
      "step": 590
    },
    {
      "epoch": 4.8,
      "grad_norm": 0.8560538291931152,
      "learning_rate": 1.1368418505635303e-07,
      "loss": 0.9821,
      "step": 600
    },
    {
      "epoch": 4.88,
      "grad_norm": 0.9261242747306824,
      "learning_rate": 4.5119897268023347e-08,
      "loss": 0.9878,
      "step": 610
    },
    {
      "epoch": 4.96,
      "grad_norm": 0.7569891214370728,
      "learning_rate": 7.654869355252503e-09,
      "loss": 0.9881,
      "step": 620
    }
  ],
  "logging_steps": 10,
  "max_steps": 625,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.425513594532659e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}