|
{ |
|
"best_metric": 1.8633633852005005, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-200", |
|
"epoch": 1.0021141649048626, |
|
"eval_steps": 50, |
|
"global_step": 237, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004228329809725159, |
|
"eval_loss": 2.096205472946167, |
|
"eval_runtime": 6.3468, |
|
"eval_samples_per_second": 15.756, |
|
"eval_steps_per_second": 3.939, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.042283298097251586, |
|
"grad_norm": 7.564020156860352, |
|
"learning_rate": 4.24e-05, |
|
"loss": 3.6687, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08456659619450317, |
|
"grad_norm": 5.9212799072265625, |
|
"learning_rate": 8.48e-05, |
|
"loss": 3.7253, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.12684989429175475, |
|
"grad_norm": 6.140769004821777, |
|
"learning_rate": 0.0001272, |
|
"loss": 3.7108, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.16913319238900634, |
|
"grad_norm": 8.776068687438965, |
|
"learning_rate": 0.0001696, |
|
"loss": 3.5971, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.21141649048625794, |
|
"grad_norm": 44.07782745361328, |
|
"learning_rate": 0.000212, |
|
"loss": 3.6765, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.21141649048625794, |
|
"eval_loss": 2.286642074584961, |
|
"eval_runtime": 6.5576, |
|
"eval_samples_per_second": 15.249, |
|
"eval_steps_per_second": 3.812, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.2536997885835095, |
|
"grad_norm": 5.269192695617676, |
|
"learning_rate": 0.00021050764994571535, |
|
"loss": 3.8943, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2959830866807611, |
|
"grad_norm": 5.10400390625, |
|
"learning_rate": 0.00020607262070143734, |
|
"loss": 3.5937, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.3382663847780127, |
|
"grad_norm": 6.93461275100708, |
|
"learning_rate": 0.00019881979181685374, |
|
"loss": 3.7963, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.38054968287526425, |
|
"grad_norm": 10.116086959838867, |
|
"learning_rate": 0.00018895338517082536, |
|
"loss": 3.8322, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.42283298097251587, |
|
"grad_norm": 48.350345611572266, |
|
"learning_rate": 0.00017675121458398993, |
|
"loss": 3.6459, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.42283298097251587, |
|
"eval_loss": 2.329221248626709, |
|
"eval_runtime": 6.5526, |
|
"eval_samples_per_second": 15.261, |
|
"eval_steps_per_second": 3.815, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.46511627906976744, |
|
"grad_norm": 5.142873287200928, |
|
"learning_rate": 0.0001625568632627182, |
|
"loss": 3.9402, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.507399577167019, |
|
"grad_norm": 5.514346122741699, |
|
"learning_rate": 0.0001467700093384222, |
|
"loss": 3.8567, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.5496828752642706, |
|
"grad_norm": 6.7151103019714355, |
|
"learning_rate": 0.00012983517191130775, |
|
"loss": 3.6157, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5919661733615222, |
|
"grad_norm": 7.746840000152588, |
|
"learning_rate": 0.00011222919448238718, |
|
"loss": 3.9139, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.6342494714587738, |
|
"grad_norm": 28.235681533813477, |
|
"learning_rate": 9.444781820961864e-05, |
|
"loss": 3.6412, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6342494714587738, |
|
"eval_loss": 2.091932773590088, |
|
"eval_runtime": 6.5409, |
|
"eval_samples_per_second": 15.288, |
|
"eval_steps_per_second": 3.822, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6765327695560254, |
|
"grad_norm": 5.661881923675537, |
|
"learning_rate": 7.699172305235922e-05, |
|
"loss": 3.8396, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.718816067653277, |
|
"grad_norm": 7.51119327545166, |
|
"learning_rate": 6.035242985127933e-05, |
|
"loss": 3.8834, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.7610993657505285, |
|
"grad_norm": 7.847259998321533, |
|
"learning_rate": 4.4998460306603325e-05, |
|
"loss": 3.938, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.8033826638477801, |
|
"grad_norm": 6.869880676269531, |
|
"learning_rate": 3.136214455575538e-05, |
|
"loss": 3.6543, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.8456659619450317, |
|
"grad_norm": 22.58540916442871, |
|
"learning_rate": 1.9827447816679272e-05, |
|
"loss": 3.3961, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.8456659619450317, |
|
"eval_loss": 1.8633633852005005, |
|
"eval_runtime": 6.5539, |
|
"eval_samples_per_second": 15.258, |
|
"eval_steps_per_second": 3.815, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.8879492600422833, |
|
"grad_norm": 5.115078449249268, |
|
"learning_rate": 1.0719158868709316e-05, |
|
"loss": 3.8714, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.9302325581395349, |
|
"grad_norm": 6.137579441070557, |
|
"learning_rate": 4.29374479686329e-06, |
|
"loss": 3.8645, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.9725158562367865, |
|
"grad_norm": 7.499491214752197, |
|
"learning_rate": 7.321295075359002e-07, |
|
"loss": 3.5654, |
|
"step": 230 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 237, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.371259110457344e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|