{ "best_metric": 0.7672083377838135, "best_model_checkpoint": "miner_id_24/checkpoint-450", "epoch": 0.17096939647803044, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00034193879295606086, "eval_loss": 4.036013603210449, "eval_runtime": 182.0782, "eval_samples_per_second": 6.766, "eval_steps_per_second": 1.692, "step": 1 }, { "epoch": 0.0034193879295606085, "grad_norm": 0.4218708574771881, "learning_rate": 4.0400000000000006e-05, "loss": 1.1371, "step": 10 }, { "epoch": 0.006838775859121217, "grad_norm": 0.48933151364326477, "learning_rate": 8.080000000000001e-05, "loss": 1.0934, "step": 20 }, { "epoch": 0.010258163788681825, "grad_norm": 11.574728012084961, "learning_rate": 0.00012119999999999999, "loss": 6.2024, "step": 30 }, { "epoch": 0.013677551718242434, "grad_norm": 5.524878025054932, "learning_rate": 0.00016160000000000002, "loss": 4.9511, "step": 40 }, { "epoch": 0.017096939647803042, "grad_norm": 1.5277706384658813, "learning_rate": 0.000202, "loss": 1.4149, "step": 50 }, { "epoch": 0.017096939647803042, "eval_loss": 2.1701488494873047, "eval_runtime": 181.9971, "eval_samples_per_second": 6.769, "eval_steps_per_second": 1.692, "step": 50 }, { "epoch": 0.02051632757736365, "grad_norm": 0.28429192304611206, "learning_rate": 0.00020175396907624226, "loss": 0.8977, "step": 60 }, { "epoch": 0.02393571550692426, "grad_norm": 0.2915061414241791, "learning_rate": 0.0002010170749428986, "loss": 0.8808, "step": 70 }, { "epoch": 0.027355103436484868, "grad_norm": 9.972613334655762, "learning_rate": 0.00019979290767411438, "loss": 3.6864, "step": 80 }, { "epoch": 0.030774491366045476, "grad_norm": 1.236708164215088, "learning_rate": 0.0001980874312897702, "loss": 2.9426, "step": 90 }, { "epoch": 0.034193879295606085, "grad_norm": 1.045754313468933, "learning_rate": 0.00019590895469937675, "loss": 1.0005, "step": 100 }, { "epoch": 0.034193879295606085, "eval_loss": 1.6803466081619263, "eval_runtime": 182.4878, "eval_samples_per_second": 6.751, "eval_steps_per_second": 1.688, "step": 100 }, { "epoch": 0.0376132672251667, "grad_norm": 0.32626786828041077, "learning_rate": 0.0001932680912219027, "loss": 0.8573, "step": 110 }, { "epoch": 0.0410326551547273, "grad_norm": 0.27949294447898865, "learning_rate": 0.00019017770687875164, "loss": 0.8193, "step": 120 }, { "epoch": 0.044452043084287914, "grad_norm": 5.386131763458252, "learning_rate": 0.000186652857711799, "loss": 2.9468, "step": 130 }, { "epoch": 0.04787143101384852, "grad_norm": 0.46363380551338196, "learning_rate": 0.00018271071643186968, "loss": 1.296, "step": 140 }, { "epoch": 0.05129081894340913, "grad_norm": 1.1684648990631104, "learning_rate": 0.00017837048875501678, "loss": 0.8626, "step": 150 }, { "epoch": 0.05129081894340913, "eval_loss": 1.3778539896011353, "eval_runtime": 181.9692, "eval_samples_per_second": 6.77, "eval_steps_per_second": 1.693, "step": 150 }, { "epoch": 0.054710206872969736, "grad_norm": 0.3281489610671997, "learning_rate": 0.00017365331983420376, "loss": 0.8043, "step": 160 }, { "epoch": 0.05812959480253035, "grad_norm": 0.2923721969127655, "learning_rate": 0.0001685821912422447, "loss": 0.7794, "step": 170 }, { "epoch": 0.06154898273209095, "grad_norm": 6.542027950286865, "learning_rate": 0.00016318180900789148, "loss": 2.2373, "step": 180 }, { "epoch": 0.06496837066165156, "grad_norm": 0.3875706195831299, "learning_rate": 0.00015747848325054544, "loss": 1.3301, "step": 190 }, { "epoch": 0.06838775859121217, "grad_norm": 0.8955979943275452, "learning_rate": 0.0001515, "loss": 0.8364, "step": 200 }, { "epoch": 0.06838775859121217, "eval_loss": 1.3041934967041016, "eval_runtime": 182.0023, "eval_samples_per_second": 6.769, "eval_steps_per_second": 1.692, "step": 200 }, { "epoch": 0.07180714652077279, "grad_norm": 0.32174670696258545, "learning_rate": 0.00014527548582569683, "loss": 0.8064, "step": 210 }, { "epoch": 0.0752265344503334, "grad_norm": 0.29309213161468506, "learning_rate": 0.00013883526593500714, "loss": 0.7734, "step": 220 }, { "epoch": 0.078645922379894, "grad_norm": 6.007425785064697, "learning_rate": 0.0001322107164318697, "loss": 2.042, "step": 230 }, { "epoch": 0.0820653103094546, "grad_norm": 0.8852247595787048, "learning_rate": 0.00012543411145556643, "loss": 1.3562, "step": 240 }, { "epoch": 0.08548469823901522, "grad_norm": 1.0789457559585571, "learning_rate": 0.00011853846594435998, "loss": 0.7506, "step": 250 }, { "epoch": 0.08548469823901522, "eval_loss": 0.965094804763794, "eval_runtime": 182.2492, "eval_samples_per_second": 6.76, "eval_steps_per_second": 1.69, "step": 250 }, { "epoch": 0.08890408616857583, "grad_norm": 0.33002084493637085, "learning_rate": 0.00011155737479003301, "loss": 0.8221, "step": 260 }, { "epoch": 0.09232347409813643, "grad_norm": 0.34726133942604065, "learning_rate": 0.00010452484916695262, "loss": 0.7565, "step": 270 }, { "epoch": 0.09574286202769704, "grad_norm": 2.3043720722198486, "learning_rate": 9.747515083304742e-05, "loss": 1.6552, "step": 280 }, { "epoch": 0.09916224995725766, "grad_norm": 3.1896228790283203, "learning_rate": 9.044262520996702e-05, "loss": 0.7903, "step": 290 }, { "epoch": 0.10258163788681826, "grad_norm": 1.102461814880371, "learning_rate": 8.346153405564004e-05, "loss": 0.7895, "step": 300 }, { "epoch": 0.10258163788681826, "eval_loss": 0.8672501444816589, "eval_runtime": 181.9625, "eval_samples_per_second": 6.771, "eval_steps_per_second": 1.693, "step": 300 }, { "epoch": 0.10600102581637887, "grad_norm": 0.28580135107040405, "learning_rate": 7.656588854443357e-05, "loss": 0.828, "step": 310 }, { "epoch": 0.10942041374593947, "grad_norm": 0.3089773654937744, "learning_rate": 6.978928356813031e-05, "loss": 0.7365, "step": 320 }, { "epoch": 0.11283980167550009, "grad_norm": 4.380869388580322, "learning_rate": 6.316473406499288e-05, "loss": 1.1384, "step": 330 }, { "epoch": 0.1162591896050607, "grad_norm": 0.316327840089798, "learning_rate": 5.672451417430317e-05, "loss": 0.9138, "step": 340 }, { "epoch": 0.1196785775346213, "grad_norm": 1.1205822229385376, "learning_rate": 5.050000000000002e-05, "loss": 0.6975, "step": 350 }, { "epoch": 0.1196785775346213, "eval_loss": 0.8012982606887817, "eval_runtime": 182.4497, "eval_samples_per_second": 6.753, "eval_steps_per_second": 1.688, "step": 350 }, { "epoch": 0.1230979654641819, "grad_norm": 0.3492680490016937, "learning_rate": 4.452151674945458e-05, "loss": 0.7909, "step": 360 }, { "epoch": 0.1265173533937425, "grad_norm": 0.31265050172805786, "learning_rate": 3.8818190992108515e-05, "loss": 0.6951, "step": 370 }, { "epoch": 0.12993674132330313, "grad_norm": 4.880677700042725, "learning_rate": 3.3417808757755355e-05, "loss": 0.9059, "step": 380 }, { "epoch": 0.13335612925286375, "grad_norm": 0.3902217745780945, "learning_rate": 2.8346680165796253e-05, "loss": 0.7231, "step": 390 }, { "epoch": 0.13677551718242434, "grad_norm": 0.9335039258003235, "learning_rate": 2.362951124498323e-05, "loss": 0.7406, "step": 400 }, { "epoch": 0.13677551718242434, "eval_loss": 0.7848264575004578, "eval_runtime": 182.302, "eval_samples_per_second": 6.758, "eval_steps_per_second": 1.69, "step": 400 }, { "epoch": 0.14019490511198496, "grad_norm": 0.31434395909309387, "learning_rate": 1.928928356813032e-05, "loss": 0.7779, "step": 410 }, { "epoch": 0.14361429304154558, "grad_norm": 0.28142374753952026, "learning_rate": 1.5347142288200977e-05, "loss": 0.7075, "step": 420 }, { "epoch": 0.14703368097110617, "grad_norm": 2.3318331241607666, "learning_rate": 1.1822293121248375e-05, "loss": 0.7626, "step": 430 }, { "epoch": 0.1504530689006668, "grad_norm": 0.3560838997364044, "learning_rate": 8.731908778097302e-06, "loss": 0.8547, "step": 440 }, { "epoch": 0.15387245683022738, "grad_norm": 2.5311930179595947, "learning_rate": 6.09104530062326e-06, "loss": 0.8116, "step": 450 }, { "epoch": 0.15387245683022738, "eval_loss": 0.7672083377838135, "eval_runtime": 182.3331, "eval_samples_per_second": 6.757, "eval_steps_per_second": 1.689, "step": 450 }, { "epoch": 0.157291844759788, "grad_norm": 0.31098073720932007, "learning_rate": 3.912568710229791e-06, "loss": 0.7879, "step": 460 }, { "epoch": 0.16071123268934862, "grad_norm": 6.655428886413574, "learning_rate": 2.2070923258856255e-06, "loss": 0.9999, "step": 470 }, { "epoch": 0.1641306206189092, "grad_norm": 4.029115200042725, "learning_rate": 9.829250571013935e-07, "loss": 0.9647, "step": 480 }, { "epoch": 0.16755000854846983, "grad_norm": 0.3827471435070038, "learning_rate": 2.4603092375775605e-07, "loss": 0.6842, "step": 490 }, { "epoch": 0.17096939647803044, "grad_norm": 0.9424235224723816, "learning_rate": 0.0, "loss": 0.7972, "step": 500 }, { "epoch": 0.17096939647803044, "eval_loss": 0.779399037361145, "eval_runtime": 182.875, "eval_samples_per_second": 6.737, "eval_steps_per_second": 1.684, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.674007469981696e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }