|
{ |
|
"best_metric": 0.7168570756912231, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-300", |
|
"epoch": 0.6511123168746609, |
|
"eval_steps": 150, |
|
"global_step": 300, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002170374389582203, |
|
"eval_loss": 3.8393607139587402, |
|
"eval_runtime": 52.8023, |
|
"eval_samples_per_second": 14.715, |
|
"eval_steps_per_second": 1.856, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.02170374389582203, |
|
"grad_norm": 26.712209701538086, |
|
"learning_rate": 6e-06, |
|
"loss": 10.3362, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04340748779164406, |
|
"grad_norm": 38.7980842590332, |
|
"learning_rate": 1.2e-05, |
|
"loss": 12.0151, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06511123168746609, |
|
"grad_norm": 29.304401397705078, |
|
"learning_rate": 1.8e-05, |
|
"loss": 11.2638, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08681497558328811, |
|
"grad_norm": 39.74540710449219, |
|
"learning_rate": 2.4e-05, |
|
"loss": 9.229, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10851871947911014, |
|
"grad_norm": 65.20191955566406, |
|
"learning_rate": 3e-05, |
|
"loss": 6.7067, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.13022246337493218, |
|
"grad_norm": 15.829483985900879, |
|
"learning_rate": 2.9996479470277262e-05, |
|
"loss": 5.6606, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1519262072707542, |
|
"grad_norm": 16.72377586364746, |
|
"learning_rate": 2.9985919533659653e-05, |
|
"loss": 4.3456, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.17362995116657623, |
|
"grad_norm": 17.048402786254883, |
|
"learning_rate": 2.9968325147023267e-05, |
|
"loss": 3.5431, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.19533369506239825, |
|
"grad_norm": 23.174785614013672, |
|
"learning_rate": 2.994370456924292e-05, |
|
"loss": 3.5592, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.21703743895822028, |
|
"grad_norm": 63.94609832763672, |
|
"learning_rate": 2.9912069357315394e-05, |
|
"loss": 3.5889, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.23874118285404233, |
|
"grad_norm": 17.54082679748535, |
|
"learning_rate": 2.9873434360934543e-05, |
|
"loss": 4.0375, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.26044492674986436, |
|
"grad_norm": 13.922243118286133, |
|
"learning_rate": 2.9827817715520775e-05, |
|
"loss": 3.5151, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2821486706456864, |
|
"grad_norm": 18.978328704833984, |
|
"learning_rate": 2.977524083370823e-05, |
|
"loss": 3.1774, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3038524145415084, |
|
"grad_norm": 19.49057388305664, |
|
"learning_rate": 2.9715728395293587e-05, |
|
"loss": 3.2158, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.32555615843733043, |
|
"grad_norm": 36.420654296875, |
|
"learning_rate": 2.96493083356513e-05, |
|
"loss": 3.1129, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.32555615843733043, |
|
"eval_loss": 0.803920328617096, |
|
"eval_runtime": 53.8173, |
|
"eval_samples_per_second": 14.438, |
|
"eval_steps_per_second": 1.821, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.34725990233315246, |
|
"grad_norm": 15.815438270568848, |
|
"learning_rate": 2.9576011832620583e-05, |
|
"loss": 3.6763, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3689636462289745, |
|
"grad_norm": 17.322349548339844, |
|
"learning_rate": 2.9495873291870436e-05, |
|
"loss": 3.2852, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.3906673901247965, |
|
"grad_norm": 16.479698181152344, |
|
"learning_rate": 2.940893033074948e-05, |
|
"loss": 3.0177, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.41237113402061853, |
|
"grad_norm": 20.874675750732422, |
|
"learning_rate": 2.9315223760628224e-05, |
|
"loss": 2.676, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.43407487791644056, |
|
"grad_norm": 29.774669647216797, |
|
"learning_rate": 2.9214797567742036e-05, |
|
"loss": 3.227, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.45577862181226264, |
|
"grad_norm": 14.984698295593262, |
|
"learning_rate": 2.9107698892543862e-05, |
|
"loss": 3.4, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.47748236570808467, |
|
"grad_norm": 18.93448829650879, |
|
"learning_rate": 2.8993978007576263e-05, |
|
"loss": 2.9846, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.4991861096039067, |
|
"grad_norm": 17.96265411376953, |
|
"learning_rate": 2.8873688293873336e-05, |
|
"loss": 3.0037, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5208898534997287, |
|
"grad_norm": 23.578723907470703, |
|
"learning_rate": 2.874688621590339e-05, |
|
"loss": 2.7363, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5425935973955507, |
|
"grad_norm": 39.76424026489258, |
|
"learning_rate": 2.861363129506436e-05, |
|
"loss": 3.1817, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5642973412913728, |
|
"grad_norm": 20.42775535583496, |
|
"learning_rate": 2.847398608174417e-05, |
|
"loss": 3.2541, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5860010851871947, |
|
"grad_norm": 16.566482543945312, |
|
"learning_rate": 2.832801612595937e-05, |
|
"loss": 2.8651, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6077048290830168, |
|
"grad_norm": 16.73906707763672, |
|
"learning_rate": 2.8175789946585697e-05, |
|
"loss": 2.8237, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.6294085729788389, |
|
"grad_norm": 23.292736053466797, |
|
"learning_rate": 2.801737899919502e-05, |
|
"loss": 3.0393, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.6511123168746609, |
|
"grad_norm": 38.29425048828125, |
|
"learning_rate": 2.7852857642513838e-05, |
|
"loss": 2.7109, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6511123168746609, |
|
"eval_loss": 0.7168570756912231, |
|
"eval_runtime": 53.8613, |
|
"eval_samples_per_second": 14.426, |
|
"eval_steps_per_second": 1.819, |
|
"step": 300 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 150, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.206407486275584e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|