|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 250, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6.857266902923584, |
|
"learning_rate": 7.2000000000000005e-06, |
|
"loss": 1.585, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.489650011062622, |
|
"learning_rate": 1.5200000000000002e-05, |
|
"loss": 0.268, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 6.398056507110596, |
|
"learning_rate": 1.9984407641819812e-05, |
|
"loss": 0.2895, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.1504039317369461, |
|
"learning_rate": 1.9809551553491918e-05, |
|
"loss": 0.1859, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.2296534776687622, |
|
"learning_rate": 1.944376370237481e-05, |
|
"loss": 0.0984, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.012895800173282623, |
|
"learning_rate": 1.889416373291298e-05, |
|
"loss": 0.0668, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 2.1277153491973877, |
|
"learning_rate": 1.8171448983351284e-05, |
|
"loss": 0.1228, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.6015583276748657, |
|
"learning_rate": 1.7289686274214116e-05, |
|
"loss": 0.193, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.42995280027389526, |
|
"learning_rate": 1.6266038113644605e-05, |
|
"loss": 0.1573, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.2124607264995575, |
|
"learning_rate": 1.5120428648705716e-05, |
|
"loss": 0.0905, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.20811651647090912, |
|
"learning_rate": 1.3875155864521031e-05, |
|
"loss": 0.1169, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.7650100588798523, |
|
"learning_rate": 1.2554457579357906e-05, |
|
"loss": 0.125, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.23556451499462128, |
|
"learning_rate": 1.1184039683065014e-05, |
|
"loss": 0.0576, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.6805216073989868, |
|
"learning_rate": 9.790575801166432e-06, |
|
"loss": 0.0703, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.023868851363658905, |
|
"learning_rate": 8.401188123081653e-06, |
|
"loss": 0.049, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 1.3114885091781616, |
|
"learning_rate": 7.042919499559538e-06, |
|
"loss": 0.129, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 0.2610754668712616, |
|
"learning_rate": 5.742207084349274e-06, |
|
"loss": 0.0652, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 2.7140579223632812, |
|
"learning_rate": 4.524367765074499e-06, |
|
"loss": 0.0147, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.237959384918213, |
|
"learning_rate": 3.4131053988131947e-06, |
|
"loss": 0.0694, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.09863968938589096, |
|
"learning_rate": 2.4300494434824373e-06, |
|
"loss": 0.0786, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.6800000000000002, |
|
"grad_norm": 0.009426446631550789, |
|
"learning_rate": 1.5943339650431578e-06, |
|
"loss": 0.0301, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 1.9068654775619507, |
|
"learning_rate": 9.222252146709143e-07, |
|
"loss": 0.025, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.8399999999999999, |
|
"grad_norm": 0.03250988572835922, |
|
"learning_rate": 4.268050246793276e-07, |
|
"loss": 0.0093, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.29542025923728943, |
|
"learning_rate": 1.1771618553447217e-07, |
|
"loss": 0.0756, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.12926587462425232, |
|
"learning_rate": 9.74759906957612e-10, |
|
"loss": 0.0351, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 250, |
|
"total_flos": 3.846009907091866e+16, |
|
"train_loss": 0.16032438999414445, |
|
"train_runtime": 2680.6813, |
|
"train_samples_per_second": 0.373, |
|
"train_steps_per_second": 0.093 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 250, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.846009907091866e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|