lesso02's picture
Training in progress, step 500, checkpoint
d3e9899 verified
{
"best_metric": 0.7672083377838135,
"best_model_checkpoint": "miner_id_24/checkpoint-450",
"epoch": 0.17096939647803044,
"eval_steps": 50,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00034193879295606086,
"eval_loss": 4.036013603210449,
"eval_runtime": 182.0782,
"eval_samples_per_second": 6.766,
"eval_steps_per_second": 1.692,
"step": 1
},
{
"epoch": 0.0034193879295606085,
"grad_norm": 0.4218708574771881,
"learning_rate": 4.0400000000000006e-05,
"loss": 1.1371,
"step": 10
},
{
"epoch": 0.006838775859121217,
"grad_norm": 0.48933151364326477,
"learning_rate": 8.080000000000001e-05,
"loss": 1.0934,
"step": 20
},
{
"epoch": 0.010258163788681825,
"grad_norm": 11.574728012084961,
"learning_rate": 0.00012119999999999999,
"loss": 6.2024,
"step": 30
},
{
"epoch": 0.013677551718242434,
"grad_norm": 5.524878025054932,
"learning_rate": 0.00016160000000000002,
"loss": 4.9511,
"step": 40
},
{
"epoch": 0.017096939647803042,
"grad_norm": 1.5277706384658813,
"learning_rate": 0.000202,
"loss": 1.4149,
"step": 50
},
{
"epoch": 0.017096939647803042,
"eval_loss": 2.1701488494873047,
"eval_runtime": 181.9971,
"eval_samples_per_second": 6.769,
"eval_steps_per_second": 1.692,
"step": 50
},
{
"epoch": 0.02051632757736365,
"grad_norm": 0.28429192304611206,
"learning_rate": 0.00020175396907624226,
"loss": 0.8977,
"step": 60
},
{
"epoch": 0.02393571550692426,
"grad_norm": 0.2915061414241791,
"learning_rate": 0.0002010170749428986,
"loss": 0.8808,
"step": 70
},
{
"epoch": 0.027355103436484868,
"grad_norm": 9.972613334655762,
"learning_rate": 0.00019979290767411438,
"loss": 3.6864,
"step": 80
},
{
"epoch": 0.030774491366045476,
"grad_norm": 1.236708164215088,
"learning_rate": 0.0001980874312897702,
"loss": 2.9426,
"step": 90
},
{
"epoch": 0.034193879295606085,
"grad_norm": 1.045754313468933,
"learning_rate": 0.00019590895469937675,
"loss": 1.0005,
"step": 100
},
{
"epoch": 0.034193879295606085,
"eval_loss": 1.6803466081619263,
"eval_runtime": 182.4878,
"eval_samples_per_second": 6.751,
"eval_steps_per_second": 1.688,
"step": 100
},
{
"epoch": 0.0376132672251667,
"grad_norm": 0.32626786828041077,
"learning_rate": 0.0001932680912219027,
"loss": 0.8573,
"step": 110
},
{
"epoch": 0.0410326551547273,
"grad_norm": 0.27949294447898865,
"learning_rate": 0.00019017770687875164,
"loss": 0.8193,
"step": 120
},
{
"epoch": 0.044452043084287914,
"grad_norm": 5.386131763458252,
"learning_rate": 0.000186652857711799,
"loss": 2.9468,
"step": 130
},
{
"epoch": 0.04787143101384852,
"grad_norm": 0.46363380551338196,
"learning_rate": 0.00018271071643186968,
"loss": 1.296,
"step": 140
},
{
"epoch": 0.05129081894340913,
"grad_norm": 1.1684648990631104,
"learning_rate": 0.00017837048875501678,
"loss": 0.8626,
"step": 150
},
{
"epoch": 0.05129081894340913,
"eval_loss": 1.3778539896011353,
"eval_runtime": 181.9692,
"eval_samples_per_second": 6.77,
"eval_steps_per_second": 1.693,
"step": 150
},
{
"epoch": 0.054710206872969736,
"grad_norm": 0.3281489610671997,
"learning_rate": 0.00017365331983420376,
"loss": 0.8043,
"step": 160
},
{
"epoch": 0.05812959480253035,
"grad_norm": 0.2923721969127655,
"learning_rate": 0.0001685821912422447,
"loss": 0.7794,
"step": 170
},
{
"epoch": 0.06154898273209095,
"grad_norm": 6.542027950286865,
"learning_rate": 0.00016318180900789148,
"loss": 2.2373,
"step": 180
},
{
"epoch": 0.06496837066165156,
"grad_norm": 0.3875706195831299,
"learning_rate": 0.00015747848325054544,
"loss": 1.3301,
"step": 190
},
{
"epoch": 0.06838775859121217,
"grad_norm": 0.8955979943275452,
"learning_rate": 0.0001515,
"loss": 0.8364,
"step": 200
},
{
"epoch": 0.06838775859121217,
"eval_loss": 1.3041934967041016,
"eval_runtime": 182.0023,
"eval_samples_per_second": 6.769,
"eval_steps_per_second": 1.692,
"step": 200
},
{
"epoch": 0.07180714652077279,
"grad_norm": 0.32174670696258545,
"learning_rate": 0.00014527548582569683,
"loss": 0.8064,
"step": 210
},
{
"epoch": 0.0752265344503334,
"grad_norm": 0.29309213161468506,
"learning_rate": 0.00013883526593500714,
"loss": 0.7734,
"step": 220
},
{
"epoch": 0.078645922379894,
"grad_norm": 6.007425785064697,
"learning_rate": 0.0001322107164318697,
"loss": 2.042,
"step": 230
},
{
"epoch": 0.0820653103094546,
"grad_norm": 0.8852247595787048,
"learning_rate": 0.00012543411145556643,
"loss": 1.3562,
"step": 240
},
{
"epoch": 0.08548469823901522,
"grad_norm": 1.0789457559585571,
"learning_rate": 0.00011853846594435998,
"loss": 0.7506,
"step": 250
},
{
"epoch": 0.08548469823901522,
"eval_loss": 0.965094804763794,
"eval_runtime": 182.2492,
"eval_samples_per_second": 6.76,
"eval_steps_per_second": 1.69,
"step": 250
},
{
"epoch": 0.08890408616857583,
"grad_norm": 0.33002084493637085,
"learning_rate": 0.00011155737479003301,
"loss": 0.8221,
"step": 260
},
{
"epoch": 0.09232347409813643,
"grad_norm": 0.34726133942604065,
"learning_rate": 0.00010452484916695262,
"loss": 0.7565,
"step": 270
},
{
"epoch": 0.09574286202769704,
"grad_norm": 2.3043720722198486,
"learning_rate": 9.747515083304742e-05,
"loss": 1.6552,
"step": 280
},
{
"epoch": 0.09916224995725766,
"grad_norm": 3.1896228790283203,
"learning_rate": 9.044262520996702e-05,
"loss": 0.7903,
"step": 290
},
{
"epoch": 0.10258163788681826,
"grad_norm": 1.102461814880371,
"learning_rate": 8.346153405564004e-05,
"loss": 0.7895,
"step": 300
},
{
"epoch": 0.10258163788681826,
"eval_loss": 0.8672501444816589,
"eval_runtime": 181.9625,
"eval_samples_per_second": 6.771,
"eval_steps_per_second": 1.693,
"step": 300
},
{
"epoch": 0.10600102581637887,
"grad_norm": 0.28580135107040405,
"learning_rate": 7.656588854443357e-05,
"loss": 0.828,
"step": 310
},
{
"epoch": 0.10942041374593947,
"grad_norm": 0.3089773654937744,
"learning_rate": 6.978928356813031e-05,
"loss": 0.7365,
"step": 320
},
{
"epoch": 0.11283980167550009,
"grad_norm": 4.380869388580322,
"learning_rate": 6.316473406499288e-05,
"loss": 1.1384,
"step": 330
},
{
"epoch": 0.1162591896050607,
"grad_norm": 0.316327840089798,
"learning_rate": 5.672451417430317e-05,
"loss": 0.9138,
"step": 340
},
{
"epoch": 0.1196785775346213,
"grad_norm": 1.1205822229385376,
"learning_rate": 5.050000000000002e-05,
"loss": 0.6975,
"step": 350
},
{
"epoch": 0.1196785775346213,
"eval_loss": 0.8012982606887817,
"eval_runtime": 182.4497,
"eval_samples_per_second": 6.753,
"eval_steps_per_second": 1.688,
"step": 350
},
{
"epoch": 0.1230979654641819,
"grad_norm": 0.3492680490016937,
"learning_rate": 4.452151674945458e-05,
"loss": 0.7909,
"step": 360
},
{
"epoch": 0.1265173533937425,
"grad_norm": 0.31265050172805786,
"learning_rate": 3.8818190992108515e-05,
"loss": 0.6951,
"step": 370
},
{
"epoch": 0.12993674132330313,
"grad_norm": 4.880677700042725,
"learning_rate": 3.3417808757755355e-05,
"loss": 0.9059,
"step": 380
},
{
"epoch": 0.13335612925286375,
"grad_norm": 0.3902217745780945,
"learning_rate": 2.8346680165796253e-05,
"loss": 0.7231,
"step": 390
},
{
"epoch": 0.13677551718242434,
"grad_norm": 0.9335039258003235,
"learning_rate": 2.362951124498323e-05,
"loss": 0.7406,
"step": 400
},
{
"epoch": 0.13677551718242434,
"eval_loss": 0.7848264575004578,
"eval_runtime": 182.302,
"eval_samples_per_second": 6.758,
"eval_steps_per_second": 1.69,
"step": 400
},
{
"epoch": 0.14019490511198496,
"grad_norm": 0.31434395909309387,
"learning_rate": 1.928928356813032e-05,
"loss": 0.7779,
"step": 410
},
{
"epoch": 0.14361429304154558,
"grad_norm": 0.28142374753952026,
"learning_rate": 1.5347142288200977e-05,
"loss": 0.7075,
"step": 420
},
{
"epoch": 0.14703368097110617,
"grad_norm": 2.3318331241607666,
"learning_rate": 1.1822293121248375e-05,
"loss": 0.7626,
"step": 430
},
{
"epoch": 0.1504530689006668,
"grad_norm": 0.3560838997364044,
"learning_rate": 8.731908778097302e-06,
"loss": 0.8547,
"step": 440
},
{
"epoch": 0.15387245683022738,
"grad_norm": 2.5311930179595947,
"learning_rate": 6.09104530062326e-06,
"loss": 0.8116,
"step": 450
},
{
"epoch": 0.15387245683022738,
"eval_loss": 0.7672083377838135,
"eval_runtime": 182.3331,
"eval_samples_per_second": 6.757,
"eval_steps_per_second": 1.689,
"step": 450
},
{
"epoch": 0.157291844759788,
"grad_norm": 0.31098073720932007,
"learning_rate": 3.912568710229791e-06,
"loss": 0.7879,
"step": 460
},
{
"epoch": 0.16071123268934862,
"grad_norm": 6.655428886413574,
"learning_rate": 2.2070923258856255e-06,
"loss": 0.9999,
"step": 470
},
{
"epoch": 0.1641306206189092,
"grad_norm": 4.029115200042725,
"learning_rate": 9.829250571013935e-07,
"loss": 0.9647,
"step": 480
},
{
"epoch": 0.16755000854846983,
"grad_norm": 0.3827471435070038,
"learning_rate": 2.4603092375775605e-07,
"loss": 0.6842,
"step": 490
},
{
"epoch": 0.17096939647803044,
"grad_norm": 0.9424235224723816,
"learning_rate": 0.0,
"loss": 0.7972,
"step": 500
},
{
"epoch": 0.17096939647803044,
"eval_loss": 0.779399037361145,
"eval_runtime": 182.875,
"eval_samples_per_second": 6.737,
"eval_steps_per_second": 1.684,
"step": 500
}
],
"logging_steps": 10,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 1
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.674007469981696e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}