{ "best_metric": 0.1643463671207428, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.212630236019562, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00042526047203912394, "eval_loss": 0.6095050573348999, "eval_runtime": 72.3902, "eval_samples_per_second": 13.676, "eval_steps_per_second": 3.426, "step": 1 }, { "epoch": 0.00425260472039124, "grad_norm": 0.8989177346229553, "learning_rate": 4.2600000000000005e-05, "loss": 0.2704, "step": 10 }, { "epoch": 0.00850520944078248, "grad_norm": 0.80766361951828, "learning_rate": 8.520000000000001e-05, "loss": 0.2043, "step": 20 }, { "epoch": 0.01275781416117372, "grad_norm": 1.4762111902236938, "learning_rate": 0.0001278, "loss": 0.2043, "step": 30 }, { "epoch": 0.01701041888156496, "grad_norm": 1.0572073459625244, "learning_rate": 0.00017040000000000002, "loss": 0.2232, "step": 40 }, { "epoch": 0.0212630236019562, "grad_norm": 4.580044269561768, "learning_rate": 0.000213, "loss": 0.2669, "step": 50 }, { "epoch": 0.0212630236019562, "eval_loss": 0.2905704379081726, "eval_runtime": 72.2977, "eval_samples_per_second": 13.693, "eval_steps_per_second": 3.43, "step": 50 }, { "epoch": 0.02551562832234744, "grad_norm": 0.48462969064712524, "learning_rate": 0.00021274057135267128, "loss": 0.1732, "step": 60 }, { "epoch": 0.02976823304273868, "grad_norm": 0.7520514726638794, "learning_rate": 0.00021196354932097723, "loss": 0.1737, "step": 70 }, { "epoch": 0.03402083776312992, "grad_norm": 0.733476996421814, "learning_rate": 0.0002106727194781503, "loss": 0.1691, "step": 80 }, { "epoch": 0.03827344248352116, "grad_norm": 1.303952932357788, "learning_rate": 0.00020887437061743096, "loss": 0.2693, "step": 90 }, { "epoch": 0.0425260472039124, "grad_norm": 2.432600975036621, "learning_rate": 0.00020657726411369925, "loss": 0.2541, "step": 100 }, { "epoch": 0.0425260472039124, "eval_loss": 0.2186073362827301, "eval_runtime": 72.3365, "eval_samples_per_second": 13.686, "eval_steps_per_second": 3.428, "step": 100 }, { "epoch": 0.04677865192430364, "grad_norm": 0.3898923099040985, "learning_rate": 0.000203792591238937, "loss": 0.1503, "step": 110 }, { "epoch": 0.05103125664469488, "grad_norm": 0.5413680672645569, "learning_rate": 0.0002005339186394757, "loss": 0.1434, "step": 120 }, { "epoch": 0.05528386136508612, "grad_norm": 0.783905029296875, "learning_rate": 0.00019681712224065936, "loss": 0.1696, "step": 130 }, { "epoch": 0.05953646608547736, "grad_norm": 1.4847009181976318, "learning_rate": 0.0001926603099009319, "loss": 0.2174, "step": 140 }, { "epoch": 0.0637890708058686, "grad_norm": 2.3255372047424316, "learning_rate": 0.00018808373319217114, "loss": 0.275, "step": 150 }, { "epoch": 0.0637890708058686, "eval_loss": 0.2177441269159317, "eval_runtime": 72.4141, "eval_samples_per_second": 13.671, "eval_steps_per_second": 3.425, "step": 150 }, { "epoch": 0.06804167552625984, "grad_norm": 0.3854157626628876, "learning_rate": 0.00018310968873606635, "loss": 0.1658, "step": 160 }, { "epoch": 0.07229428024665108, "grad_norm": 0.4845024049282074, "learning_rate": 0.0001777624095772184, "loss": 0.1831, "step": 170 }, { "epoch": 0.07654688496704232, "grad_norm": 0.9749467968940735, "learning_rate": 0.0001720679471221826, "loss": 0.1933, "step": 180 }, { "epoch": 0.08079948968743356, "grad_norm": 0.8443153500556946, "learning_rate": 0.00016605404421963453, "loss": 0.2047, "step": 190 }, { "epoch": 0.0850520944078248, "grad_norm": 1.4500705003738403, "learning_rate": 0.00015975, "loss": 0.2931, "step": 200 }, { "epoch": 0.0850520944078248, "eval_loss": 0.20021717250347137, "eval_runtime": 72.6074, "eval_samples_per_second": 13.635, "eval_steps_per_second": 3.416, "step": 200 }, { "epoch": 0.08930469912821604, "grad_norm": 0.5776247978210449, "learning_rate": 0.00015318652713303674, "loss": 0.16, "step": 210 }, { "epoch": 0.09355730384860728, "grad_norm": 0.5547011494636536, "learning_rate": 0.00014639560219879464, "loss": 0.1446, "step": 220 }, { "epoch": 0.09780990856899852, "grad_norm": 0.702063798904419, "learning_rate": 0.0001394103099009319, "loss": 0.1747, "step": 230 }, { "epoch": 0.10206251328938976, "grad_norm": 0.9604615569114685, "learning_rate": 0.0001322646818813646, "loss": 0.2264, "step": 240 }, { "epoch": 0.106315118009781, "grad_norm": 2.0104331970214844, "learning_rate": 0.0001249935309215281, "loss": 0.254, "step": 250 }, { "epoch": 0.106315118009781, "eval_loss": 0.1977642923593521, "eval_runtime": 72.6924, "eval_samples_per_second": 13.619, "eval_steps_per_second": 3.412, "step": 250 }, { "epoch": 0.11056772273017224, "grad_norm": 0.3492359220981598, "learning_rate": 0.0001176322813380051, "loss": 0.1203, "step": 260 }, { "epoch": 0.11482032745056348, "grad_norm": 0.4210224449634552, "learning_rate": 0.00011021679639881638, "loss": 0.1282, "step": 270 }, { "epoch": 0.11907293217095472, "grad_norm": 0.927093505859375, "learning_rate": 0.00010278320360118368, "loss": 0.1669, "step": 280 }, { "epoch": 0.12332553689134595, "grad_norm": 1.108055830001831, "learning_rate": 9.536771866199493e-05, "loss": 0.1855, "step": 290 }, { "epoch": 0.1275781416117372, "grad_norm": 1.8096457719802856, "learning_rate": 8.800646907847192e-05, "loss": 0.2914, "step": 300 }, { "epoch": 0.1275781416117372, "eval_loss": 0.187950000166893, "eval_runtime": 72.7518, "eval_samples_per_second": 13.608, "eval_steps_per_second": 3.409, "step": 300 }, { "epoch": 0.13183074633212843, "grad_norm": 0.4479442834854126, "learning_rate": 8.07353181186354e-05, "loss": 0.124, "step": 310 }, { "epoch": 0.13608335105251967, "grad_norm": 0.41207781434059143, "learning_rate": 7.35896900990681e-05, "loss": 0.1531, "step": 320 }, { "epoch": 0.14033595577291091, "grad_norm": 0.9523985981941223, "learning_rate": 6.660439780120536e-05, "loss": 0.1732, "step": 330 }, { "epoch": 0.14458856049330215, "grad_norm": 1.1972426176071167, "learning_rate": 5.981347286696324e-05, "loss": 0.1977, "step": 340 }, { "epoch": 0.1488411652136934, "grad_norm": 1.7219096422195435, "learning_rate": 5.325000000000002e-05, "loss": 0.1944, "step": 350 }, { "epoch": 0.1488411652136934, "eval_loss": 0.17624303698539734, "eval_runtime": 72.6437, "eval_samples_per_second": 13.628, "eval_steps_per_second": 3.414, "step": 350 }, { "epoch": 0.15309376993408463, "grad_norm": 0.34478458762168884, "learning_rate": 4.6945955780365475e-05, "loss": 0.1069, "step": 360 }, { "epoch": 0.15734637465447587, "grad_norm": 0.7543050646781921, "learning_rate": 4.0932052877817393e-05, "loss": 0.1509, "step": 370 }, { "epoch": 0.1615989793748671, "grad_norm": 0.7749956846237183, "learning_rate": 3.523759042278163e-05, "loss": 0.1447, "step": 380 }, { "epoch": 0.16585158409525835, "grad_norm": 0.8737692832946777, "learning_rate": 2.989031126393367e-05, "loss": 0.2315, "step": 390 }, { "epoch": 0.1701041888156496, "grad_norm": 1.4741559028625488, "learning_rate": 2.4916266807828855e-05, "loss": 0.234, "step": 400 }, { "epoch": 0.1701041888156496, "eval_loss": 0.16768178343772888, "eval_runtime": 72.7753, "eval_samples_per_second": 13.604, "eval_steps_per_second": 3.408, "step": 400 }, { "epoch": 0.17435679353604083, "grad_norm": 0.32338669896125793, "learning_rate": 2.033969009906811e-05, "loss": 0.1204, "step": 410 }, { "epoch": 0.17860939825643207, "grad_norm": 0.5574338436126709, "learning_rate": 1.6182877759340637e-05, "loss": 0.1627, "step": 420 }, { "epoch": 0.1828620029768233, "grad_norm": 0.619001030921936, "learning_rate": 1.2466081360524275e-05, "loss": 0.1113, "step": 430 }, { "epoch": 0.18711460769721455, "grad_norm": 0.8440791368484497, "learning_rate": 9.207408761062996e-06, "loss": 0.1754, "step": 440 }, { "epoch": 0.1913672124176058, "grad_norm": 1.8180017471313477, "learning_rate": 6.422735886300764e-06, "loss": 0.2486, "step": 450 }, { "epoch": 0.1913672124176058, "eval_loss": 0.16511203348636627, "eval_runtime": 72.2907, "eval_samples_per_second": 13.695, "eval_steps_per_second": 3.431, "step": 450 }, { "epoch": 0.19561981713799703, "grad_norm": 0.3398416042327881, "learning_rate": 4.125629382569038e-06, "loss": 0.1377, "step": 460 }, { "epoch": 0.19987242185838827, "grad_norm": 0.6524055004119873, "learning_rate": 2.327280521849694e-06, "loss": 0.1348, "step": 470 }, { "epoch": 0.2041250265787795, "grad_norm": 0.5883779525756836, "learning_rate": 1.0364506790227565e-06, "loss": 0.1782, "step": 480 }, { "epoch": 0.20837763129917075, "grad_norm": 0.6829372644424438, "learning_rate": 2.5942864732872295e-07, "loss": 0.181, "step": 490 }, { "epoch": 0.212630236019562, "grad_norm": 1.7673338651657104, "learning_rate": 0.0, "loss": 0.2541, "step": 500 }, { "epoch": 0.212630236019562, "eval_loss": 0.1643463671207428, "eval_runtime": 72.4913, "eval_samples_per_second": 13.657, "eval_steps_per_second": 3.421, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1238426161656627e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }