|
{ |
|
"best_metric": 0.7672083377838135, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-450", |
|
"epoch": 0.17096939647803044, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00034193879295606086, |
|
"eval_loss": 4.036013603210449, |
|
"eval_runtime": 182.0782, |
|
"eval_samples_per_second": 6.766, |
|
"eval_steps_per_second": 1.692, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0034193879295606085, |
|
"grad_norm": 0.4218708574771881, |
|
"learning_rate": 4.0400000000000006e-05, |
|
"loss": 1.1371, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.006838775859121217, |
|
"grad_norm": 0.48933151364326477, |
|
"learning_rate": 8.080000000000001e-05, |
|
"loss": 1.0934, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.010258163788681825, |
|
"grad_norm": 11.574728012084961, |
|
"learning_rate": 0.00012119999999999999, |
|
"loss": 6.2024, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.013677551718242434, |
|
"grad_norm": 5.524878025054932, |
|
"learning_rate": 0.00016160000000000002, |
|
"loss": 4.9511, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.017096939647803042, |
|
"grad_norm": 1.5277706384658813, |
|
"learning_rate": 0.000202, |
|
"loss": 1.4149, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.017096939647803042, |
|
"eval_loss": 2.1701488494873047, |
|
"eval_runtime": 181.9971, |
|
"eval_samples_per_second": 6.769, |
|
"eval_steps_per_second": 1.692, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02051632757736365, |
|
"grad_norm": 0.28429192304611206, |
|
"learning_rate": 0.00020175396907624226, |
|
"loss": 0.8977, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.02393571550692426, |
|
"grad_norm": 0.2915061414241791, |
|
"learning_rate": 0.0002010170749428986, |
|
"loss": 0.8808, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.027355103436484868, |
|
"grad_norm": 9.972613334655762, |
|
"learning_rate": 0.00019979290767411438, |
|
"loss": 3.6864, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.030774491366045476, |
|
"grad_norm": 1.236708164215088, |
|
"learning_rate": 0.0001980874312897702, |
|
"loss": 2.9426, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.034193879295606085, |
|
"grad_norm": 1.045754313468933, |
|
"learning_rate": 0.00019590895469937675, |
|
"loss": 1.0005, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.034193879295606085, |
|
"eval_loss": 1.6803466081619263, |
|
"eval_runtime": 182.4878, |
|
"eval_samples_per_second": 6.751, |
|
"eval_steps_per_second": 1.688, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0376132672251667, |
|
"grad_norm": 0.32626786828041077, |
|
"learning_rate": 0.0001932680912219027, |
|
"loss": 0.8573, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0410326551547273, |
|
"grad_norm": 0.27949294447898865, |
|
"learning_rate": 0.00019017770687875164, |
|
"loss": 0.8193, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.044452043084287914, |
|
"grad_norm": 5.386131763458252, |
|
"learning_rate": 0.000186652857711799, |
|
"loss": 2.9468, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.04787143101384852, |
|
"grad_norm": 0.46363380551338196, |
|
"learning_rate": 0.00018271071643186968, |
|
"loss": 1.296, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.05129081894340913, |
|
"grad_norm": 1.1684648990631104, |
|
"learning_rate": 0.00017837048875501678, |
|
"loss": 0.8626, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.05129081894340913, |
|
"eval_loss": 1.3778539896011353, |
|
"eval_runtime": 181.9692, |
|
"eval_samples_per_second": 6.77, |
|
"eval_steps_per_second": 1.693, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.054710206872969736, |
|
"grad_norm": 0.3281489610671997, |
|
"learning_rate": 0.00017365331983420376, |
|
"loss": 0.8043, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.05812959480253035, |
|
"grad_norm": 0.2923721969127655, |
|
"learning_rate": 0.0001685821912422447, |
|
"loss": 0.7794, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.06154898273209095, |
|
"grad_norm": 6.542027950286865, |
|
"learning_rate": 0.00016318180900789148, |
|
"loss": 2.2373, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.06496837066165156, |
|
"grad_norm": 0.3875706195831299, |
|
"learning_rate": 0.00015747848325054544, |
|
"loss": 1.3301, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.06838775859121217, |
|
"grad_norm": 0.8955979943275452, |
|
"learning_rate": 0.0001515, |
|
"loss": 0.8364, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.06838775859121217, |
|
"eval_loss": 1.3041934967041016, |
|
"eval_runtime": 182.0023, |
|
"eval_samples_per_second": 6.769, |
|
"eval_steps_per_second": 1.692, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07180714652077279, |
|
"grad_norm": 0.32174670696258545, |
|
"learning_rate": 0.00014527548582569683, |
|
"loss": 0.8064, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.0752265344503334, |
|
"grad_norm": 0.29309213161468506, |
|
"learning_rate": 0.00013883526593500714, |
|
"loss": 0.7734, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.078645922379894, |
|
"grad_norm": 6.007425785064697, |
|
"learning_rate": 0.0001322107164318697, |
|
"loss": 2.042, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.0820653103094546, |
|
"grad_norm": 0.8852247595787048, |
|
"learning_rate": 0.00012543411145556643, |
|
"loss": 1.3562, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.08548469823901522, |
|
"grad_norm": 1.0789457559585571, |
|
"learning_rate": 0.00011853846594435998, |
|
"loss": 0.7506, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.08548469823901522, |
|
"eval_loss": 0.965094804763794, |
|
"eval_runtime": 182.2492, |
|
"eval_samples_per_second": 6.76, |
|
"eval_steps_per_second": 1.69, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.08890408616857583, |
|
"grad_norm": 0.33002084493637085, |
|
"learning_rate": 0.00011155737479003301, |
|
"loss": 0.8221, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.09232347409813643, |
|
"grad_norm": 0.34726133942604065, |
|
"learning_rate": 0.00010452484916695262, |
|
"loss": 0.7565, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.09574286202769704, |
|
"grad_norm": 2.3043720722198486, |
|
"learning_rate": 9.747515083304742e-05, |
|
"loss": 1.6552, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.09916224995725766, |
|
"grad_norm": 3.1896228790283203, |
|
"learning_rate": 9.044262520996702e-05, |
|
"loss": 0.7903, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.10258163788681826, |
|
"grad_norm": 1.102461814880371, |
|
"learning_rate": 8.346153405564004e-05, |
|
"loss": 0.7895, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.10258163788681826, |
|
"eval_loss": 0.8672501444816589, |
|
"eval_runtime": 181.9625, |
|
"eval_samples_per_second": 6.771, |
|
"eval_steps_per_second": 1.693, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.10600102581637887, |
|
"grad_norm": 0.28580135107040405, |
|
"learning_rate": 7.656588854443357e-05, |
|
"loss": 0.828, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.10942041374593947, |
|
"grad_norm": 0.3089773654937744, |
|
"learning_rate": 6.978928356813031e-05, |
|
"loss": 0.7365, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.11283980167550009, |
|
"grad_norm": 4.380869388580322, |
|
"learning_rate": 6.316473406499288e-05, |
|
"loss": 1.1384, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.1162591896050607, |
|
"grad_norm": 0.316327840089798, |
|
"learning_rate": 5.672451417430317e-05, |
|
"loss": 0.9138, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.1196785775346213, |
|
"grad_norm": 1.1205822229385376, |
|
"learning_rate": 5.050000000000002e-05, |
|
"loss": 0.6975, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.1196785775346213, |
|
"eval_loss": 0.8012982606887817, |
|
"eval_runtime": 182.4497, |
|
"eval_samples_per_second": 6.753, |
|
"eval_steps_per_second": 1.688, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.1230979654641819, |
|
"grad_norm": 0.3492680490016937, |
|
"learning_rate": 4.452151674945458e-05, |
|
"loss": 0.7909, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.1265173533937425, |
|
"grad_norm": 0.31265050172805786, |
|
"learning_rate": 3.8818190992108515e-05, |
|
"loss": 0.6951, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.12993674132330313, |
|
"grad_norm": 4.880677700042725, |
|
"learning_rate": 3.3417808757755355e-05, |
|
"loss": 0.9059, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.13335612925286375, |
|
"grad_norm": 0.3902217745780945, |
|
"learning_rate": 2.8346680165796253e-05, |
|
"loss": 0.7231, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.13677551718242434, |
|
"grad_norm": 0.9335039258003235, |
|
"learning_rate": 2.362951124498323e-05, |
|
"loss": 0.7406, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.13677551718242434, |
|
"eval_loss": 0.7848264575004578, |
|
"eval_runtime": 182.302, |
|
"eval_samples_per_second": 6.758, |
|
"eval_steps_per_second": 1.69, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.14019490511198496, |
|
"grad_norm": 0.31434395909309387, |
|
"learning_rate": 1.928928356813032e-05, |
|
"loss": 0.7779, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.14361429304154558, |
|
"grad_norm": 0.28142374753952026, |
|
"learning_rate": 1.5347142288200977e-05, |
|
"loss": 0.7075, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.14703368097110617, |
|
"grad_norm": 2.3318331241607666, |
|
"learning_rate": 1.1822293121248375e-05, |
|
"loss": 0.7626, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.1504530689006668, |
|
"grad_norm": 0.3560838997364044, |
|
"learning_rate": 8.731908778097302e-06, |
|
"loss": 0.8547, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.15387245683022738, |
|
"grad_norm": 2.5311930179595947, |
|
"learning_rate": 6.09104530062326e-06, |
|
"loss": 0.8116, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.15387245683022738, |
|
"eval_loss": 0.7672083377838135, |
|
"eval_runtime": 182.3331, |
|
"eval_samples_per_second": 6.757, |
|
"eval_steps_per_second": 1.689, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.157291844759788, |
|
"grad_norm": 0.31098073720932007, |
|
"learning_rate": 3.912568710229791e-06, |
|
"loss": 0.7879, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.16071123268934862, |
|
"grad_norm": 6.655428886413574, |
|
"learning_rate": 2.2070923258856255e-06, |
|
"loss": 0.9999, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.1641306206189092, |
|
"grad_norm": 4.029115200042725, |
|
"learning_rate": 9.829250571013935e-07, |
|
"loss": 0.9647, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.16755000854846983, |
|
"grad_norm": 0.3827471435070038, |
|
"learning_rate": 2.4603092375775605e-07, |
|
"loss": 0.6842, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.17096939647803044, |
|
"grad_norm": 0.9424235224723816, |
|
"learning_rate": 0.0, |
|
"loss": 0.7972, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.17096939647803044, |
|
"eval_loss": 0.779399037361145, |
|
"eval_runtime": 182.875, |
|
"eval_samples_per_second": 6.737, |
|
"eval_steps_per_second": 1.684, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 1 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.674007469981696e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|