lesso15's picture
Training in progress, step 392, checkpoint
1a7eba3 verified
{
"best_metric": 8.457124710083008,
"best_model_checkpoint": "miner_id_24/checkpoint-350",
"epoch": 1.0012771392081736,
"eval_steps": 50,
"global_step": 392,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002554278416347382,
"eval_loss": 10.567898750305176,
"eval_runtime": 1.5305,
"eval_samples_per_second": 107.807,
"eval_steps_per_second": 27.442,
"step": 1
},
{
"epoch": 0.02554278416347382,
"grad_norm": 0.652108371257782,
"learning_rate": 4.3e-05,
"loss": 10.5667,
"step": 10
},
{
"epoch": 0.05108556832694764,
"grad_norm": 1.1399426460266113,
"learning_rate": 8.6e-05,
"loss": 10.5238,
"step": 20
},
{
"epoch": 0.07662835249042145,
"grad_norm": 1.5333579778671265,
"learning_rate": 0.000129,
"loss": 10.225,
"step": 30
},
{
"epoch": 0.10217113665389528,
"grad_norm": 1.1316747665405273,
"learning_rate": 0.000172,
"loss": 9.5687,
"step": 40
},
{
"epoch": 0.1277139208173691,
"grad_norm": 1.0548256635665894,
"learning_rate": 0.000215,
"loss": 9.007,
"step": 50
},
{
"epoch": 0.1277139208173691,
"eval_loss": 9.007640838623047,
"eval_runtime": 1.5083,
"eval_samples_per_second": 109.397,
"eval_steps_per_second": 27.847,
"step": 50
},
{
"epoch": 0.1532567049808429,
"grad_norm": 0.5301891565322876,
"learning_rate": 0.00021454676797063256,
"loss": 8.9809,
"step": 60
},
{
"epoch": 0.17879948914431673,
"grad_norm": 0.5254368185997009,
"learning_rate": 0.00021319089363643615,
"loss": 8.977,
"step": 70
},
{
"epoch": 0.20434227330779056,
"grad_norm": 0.5787448883056641,
"learning_rate": 0.0002109438100332444,
"loss": 8.8874,
"step": 80
},
{
"epoch": 0.22988505747126436,
"grad_norm": 0.5263362526893616,
"learning_rate": 0.00020782446507290164,
"loss": 8.8784,
"step": 90
},
{
"epoch": 0.2554278416347382,
"grad_norm": 1.8147071599960327,
"learning_rate": 0.00020385916177022965,
"loss": 8.7811,
"step": 100
},
{
"epoch": 0.2554278416347382,
"eval_loss": 8.785709381103516,
"eval_runtime": 1.507,
"eval_samples_per_second": 109.486,
"eval_steps_per_second": 27.869,
"step": 100
},
{
"epoch": 0.280970625798212,
"grad_norm": 1.605078935623169,
"learning_rate": 0.00019908133645012017,
"loss": 8.8128,
"step": 110
},
{
"epoch": 0.3065134099616858,
"grad_norm": 0.38945770263671875,
"learning_rate": 0.00019353127680496004,
"loss": 8.8304,
"step": 120
},
{
"epoch": 0.33205619412515963,
"grad_norm": 0.49945083260536194,
"learning_rate": 0.00018725578217979277,
"loss": 8.7196,
"step": 130
},
{
"epoch": 0.35759897828863346,
"grad_norm": 0.6314445734024048,
"learning_rate": 0.00018030776894976716,
"loss": 8.6939,
"step": 140
},
{
"epoch": 0.3831417624521073,
"grad_norm": 1.3763868808746338,
"learning_rate": 0.00017274582431741934,
"loss": 8.625,
"step": 150
},
{
"epoch": 0.3831417624521073,
"eval_loss": 8.687312126159668,
"eval_runtime": 1.5279,
"eval_samples_per_second": 107.991,
"eval_steps_per_second": 27.489,
"step": 150
},
{
"epoch": 0.4086845466155811,
"grad_norm": 0.777454674243927,
"learning_rate": 0.000164633712292269,
"loss": 8.7205,
"step": 160
},
{
"epoch": 0.4342273307790549,
"grad_norm": 0.593923032283783,
"learning_rate": 0.0001560398360184206,
"loss": 8.6939,
"step": 170
},
{
"epoch": 0.45977011494252873,
"grad_norm": 0.5520887970924377,
"learning_rate": 0.00014703666098394352,
"loss": 8.6602,
"step": 180
},
{
"epoch": 0.48531289910600256,
"grad_norm": 0.6347320675849915,
"learning_rate": 0.00013770010397565872,
"loss": 8.5483,
"step": 190
},
{
"epoch": 0.5108556832694764,
"grad_norm": 0.9477065205574036,
"learning_rate": 0.0001281088929318019,
"loss": 8.5906,
"step": 200
},
{
"epoch": 0.5108556832694764,
"eval_loss": 8.575843811035156,
"eval_runtime": 1.5207,
"eval_samples_per_second": 108.501,
"eval_steps_per_second": 27.618,
"step": 200
},
{
"epoch": 0.5363984674329502,
"grad_norm": 0.7455259561538696,
"learning_rate": 0.00011834390309042884,
"loss": 8.6157,
"step": 210
},
{
"epoch": 0.561941251596424,
"grad_norm": 0.8545849323272705,
"learning_rate": 0.0001084874750313081,
"loss": 8.5817,
"step": 220
},
{
"epoch": 0.5874840357598978,
"grad_norm": 0.5993427634239197,
"learning_rate": 9.862272036172428e-05,
"loss": 8.5773,
"step": 230
},
{
"epoch": 0.6130268199233716,
"grad_norm": 0.7996647357940674,
"learning_rate": 8.883282090080499e-05,
"loss": 8.4541,
"step": 240
},
{
"epoch": 0.6385696040868455,
"grad_norm": 1.0403778553009033,
"learning_rate": 7.92003272718056e-05,
"loss": 8.4273,
"step": 250
},
{
"epoch": 0.6385696040868455,
"eval_loss": 8.494529724121094,
"eval_runtime": 1.5036,
"eval_samples_per_second": 109.739,
"eval_steps_per_second": 27.934,
"step": 250
},
{
"epoch": 0.6641123882503193,
"grad_norm": 0.7150876522064209,
"learning_rate": 6.980646281677891e-05,
"loss": 8.5303,
"step": 260
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.7501654624938965,
"learning_rate": 6.073043870317715e-05,
"loss": 8.5242,
"step": 270
},
{
"epoch": 0.7151979565772669,
"grad_norm": 0.5660316348075867,
"learning_rate": 5.20487859975602e-05,
"loss": 8.4898,
"step": 280
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.7373713850975037,
"learning_rate": 4.383471033851466e-05,
"loss": 8.3627,
"step": 290
},
{
"epoch": 0.7662835249042146,
"grad_norm": 2.3701064586639404,
"learning_rate": 3.6157474650325716e-05,
"loss": 8.4074,
"step": 300
},
{
"epoch": 0.7662835249042146,
"eval_loss": 8.469345092773438,
"eval_runtime": 1.5048,
"eval_samples_per_second": 109.646,
"eval_steps_per_second": 27.91,
"step": 300
},
{
"epoch": 0.7918263090676884,
"grad_norm": 0.7007551193237305,
"learning_rate": 2.908181510249828e-05,
"loss": 8.474,
"step": 310
},
{
"epoch": 0.8173690932311622,
"grad_norm": 0.7329103350639343,
"learning_rate": 2.2667395239887702e-05,
"loss": 8.5448,
"step": 320
},
{
"epoch": 0.842911877394636,
"grad_norm": 0.5645519495010376,
"learning_rate": 1.69683028863367e-05,
"loss": 8.4645,
"step": 330
},
{
"epoch": 0.8684546615581098,
"grad_norm": 0.7888035774230957,
"learning_rate": 1.2032594064039719e-05,
"loss": 8.4487,
"step": 340
},
{
"epoch": 0.8939974457215837,
"grad_norm": 2.03556489944458,
"learning_rate": 7.90188777440881e-06,
"loss": 8.3678,
"step": 350
},
{
"epoch": 0.8939974457215837,
"eval_loss": 8.457124710083008,
"eval_runtime": 1.5108,
"eval_samples_per_second": 109.217,
"eval_steps_per_second": 27.801,
"step": 350
},
{
"epoch": 0.9195402298850575,
"grad_norm": 1.0569088459014893,
"learning_rate": 4.611015057339232e-06,
"loss": 8.455,
"step": 360
},
{
"epoch": 0.9450830140485313,
"grad_norm": 0.7266004085540771,
"learning_rate": 2.1877252880858024e-06,
"loss": 8.4595,
"step": 370
},
{
"epoch": 0.9706257982120051,
"grad_norm": 0.9919468760490417,
"learning_rate": 6.524521883204303e-07,
"loss": 8.4392,
"step": 380
},
{
"epoch": 0.9961685823754789,
"grad_norm": 1.0076446533203125,
"learning_rate": 1.814152441780875e-08,
"loss": 8.3541,
"step": 390
}
],
"logging_steps": 10,
"max_steps": 392,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3851034999128064.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}