|
{ |
|
"best_metric": 8.457124710083008, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-350", |
|
"epoch": 1.0012771392081736, |
|
"eval_steps": 50, |
|
"global_step": 392, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002554278416347382, |
|
"eval_loss": 10.567898750305176, |
|
"eval_runtime": 1.5305, |
|
"eval_samples_per_second": 107.807, |
|
"eval_steps_per_second": 27.442, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.02554278416347382, |
|
"grad_norm": 0.652108371257782, |
|
"learning_rate": 4.3e-05, |
|
"loss": 10.5667, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05108556832694764, |
|
"grad_norm": 1.1399426460266113, |
|
"learning_rate": 8.6e-05, |
|
"loss": 10.5238, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07662835249042145, |
|
"grad_norm": 1.5333579778671265, |
|
"learning_rate": 0.000129, |
|
"loss": 10.225, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10217113665389528, |
|
"grad_norm": 1.1316747665405273, |
|
"learning_rate": 0.000172, |
|
"loss": 9.5687, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1277139208173691, |
|
"grad_norm": 1.0548256635665894, |
|
"learning_rate": 0.000215, |
|
"loss": 9.007, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1277139208173691, |
|
"eval_loss": 9.007640838623047, |
|
"eval_runtime": 1.5083, |
|
"eval_samples_per_second": 109.397, |
|
"eval_steps_per_second": 27.847, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1532567049808429, |
|
"grad_norm": 0.5301891565322876, |
|
"learning_rate": 0.00021454676797063256, |
|
"loss": 8.9809, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.17879948914431673, |
|
"grad_norm": 0.5254368185997009, |
|
"learning_rate": 0.00021319089363643615, |
|
"loss": 8.977, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.20434227330779056, |
|
"grad_norm": 0.5787448883056641, |
|
"learning_rate": 0.0002109438100332444, |
|
"loss": 8.8874, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.22988505747126436, |
|
"grad_norm": 0.5263362526893616, |
|
"learning_rate": 0.00020782446507290164, |
|
"loss": 8.8784, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2554278416347382, |
|
"grad_norm": 1.8147071599960327, |
|
"learning_rate": 0.00020385916177022965, |
|
"loss": 8.7811, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2554278416347382, |
|
"eval_loss": 8.785709381103516, |
|
"eval_runtime": 1.507, |
|
"eval_samples_per_second": 109.486, |
|
"eval_steps_per_second": 27.869, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.280970625798212, |
|
"grad_norm": 1.605078935623169, |
|
"learning_rate": 0.00019908133645012017, |
|
"loss": 8.8128, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3065134099616858, |
|
"grad_norm": 0.38945770263671875, |
|
"learning_rate": 0.00019353127680496004, |
|
"loss": 8.8304, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.33205619412515963, |
|
"grad_norm": 0.49945083260536194, |
|
"learning_rate": 0.00018725578217979277, |
|
"loss": 8.7196, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.35759897828863346, |
|
"grad_norm": 0.6314445734024048, |
|
"learning_rate": 0.00018030776894976716, |
|
"loss": 8.6939, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3831417624521073, |
|
"grad_norm": 1.3763868808746338, |
|
"learning_rate": 0.00017274582431741934, |
|
"loss": 8.625, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3831417624521073, |
|
"eval_loss": 8.687312126159668, |
|
"eval_runtime": 1.5279, |
|
"eval_samples_per_second": 107.991, |
|
"eval_steps_per_second": 27.489, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.4086845466155811, |
|
"grad_norm": 0.777454674243927, |
|
"learning_rate": 0.000164633712292269, |
|
"loss": 8.7205, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4342273307790549, |
|
"grad_norm": 0.593923032283783, |
|
"learning_rate": 0.0001560398360184206, |
|
"loss": 8.6939, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.45977011494252873, |
|
"grad_norm": 0.5520887970924377, |
|
"learning_rate": 0.00014703666098394352, |
|
"loss": 8.6602, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.48531289910600256, |
|
"grad_norm": 0.6347320675849915, |
|
"learning_rate": 0.00013770010397565872, |
|
"loss": 8.5483, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5108556832694764, |
|
"grad_norm": 0.9477065205574036, |
|
"learning_rate": 0.0001281088929318019, |
|
"loss": 8.5906, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5108556832694764, |
|
"eval_loss": 8.575843811035156, |
|
"eval_runtime": 1.5207, |
|
"eval_samples_per_second": 108.501, |
|
"eval_steps_per_second": 27.618, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5363984674329502, |
|
"grad_norm": 0.7455259561538696, |
|
"learning_rate": 0.00011834390309042884, |
|
"loss": 8.6157, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.561941251596424, |
|
"grad_norm": 0.8545849323272705, |
|
"learning_rate": 0.0001084874750313081, |
|
"loss": 8.5817, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5874840357598978, |
|
"grad_norm": 0.5993427634239197, |
|
"learning_rate": 9.862272036172428e-05, |
|
"loss": 8.5773, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6130268199233716, |
|
"grad_norm": 0.7996647357940674, |
|
"learning_rate": 8.883282090080499e-05, |
|
"loss": 8.4541, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6385696040868455, |
|
"grad_norm": 1.0403778553009033, |
|
"learning_rate": 7.92003272718056e-05, |
|
"loss": 8.4273, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6385696040868455, |
|
"eval_loss": 8.494529724121094, |
|
"eval_runtime": 1.5036, |
|
"eval_samples_per_second": 109.739, |
|
"eval_steps_per_second": 27.934, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6641123882503193, |
|
"grad_norm": 0.7150876522064209, |
|
"learning_rate": 6.980646281677891e-05, |
|
"loss": 8.5303, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 0.7501654624938965, |
|
"learning_rate": 6.073043870317715e-05, |
|
"loss": 8.5242, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7151979565772669, |
|
"grad_norm": 0.5660316348075867, |
|
"learning_rate": 5.20487859975602e-05, |
|
"loss": 8.4898, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7407407407407407, |
|
"grad_norm": 0.7373713850975037, |
|
"learning_rate": 4.383471033851466e-05, |
|
"loss": 8.3627, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7662835249042146, |
|
"grad_norm": 2.3701064586639404, |
|
"learning_rate": 3.6157474650325716e-05, |
|
"loss": 8.4074, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7662835249042146, |
|
"eval_loss": 8.469345092773438, |
|
"eval_runtime": 1.5048, |
|
"eval_samples_per_second": 109.646, |
|
"eval_steps_per_second": 27.91, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7918263090676884, |
|
"grad_norm": 0.7007551193237305, |
|
"learning_rate": 2.908181510249828e-05, |
|
"loss": 8.474, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8173690932311622, |
|
"grad_norm": 0.7329103350639343, |
|
"learning_rate": 2.2667395239887702e-05, |
|
"loss": 8.5448, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.842911877394636, |
|
"grad_norm": 0.5645519495010376, |
|
"learning_rate": 1.69683028863367e-05, |
|
"loss": 8.4645, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8684546615581098, |
|
"grad_norm": 0.7888035774230957, |
|
"learning_rate": 1.2032594064039719e-05, |
|
"loss": 8.4487, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8939974457215837, |
|
"grad_norm": 2.03556489944458, |
|
"learning_rate": 7.90188777440881e-06, |
|
"loss": 8.3678, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8939974457215837, |
|
"eval_loss": 8.457124710083008, |
|
"eval_runtime": 1.5108, |
|
"eval_samples_per_second": 109.217, |
|
"eval_steps_per_second": 27.801, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9195402298850575, |
|
"grad_norm": 1.0569088459014893, |
|
"learning_rate": 4.611015057339232e-06, |
|
"loss": 8.455, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9450830140485313, |
|
"grad_norm": 0.7266004085540771, |
|
"learning_rate": 2.1877252880858024e-06, |
|
"loss": 8.4595, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9706257982120051, |
|
"grad_norm": 0.9919468760490417, |
|
"learning_rate": 6.524521883204303e-07, |
|
"loss": 8.4392, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9961685823754789, |
|
"grad_norm": 1.0076446533203125, |
|
"learning_rate": 1.814152441780875e-08, |
|
"loss": 8.3541, |
|
"step": 390 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 392, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3851034999128064.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|