{ "best_metric": 1.3218564987182617, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.18392495861688432, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003678499172337686, "eval_loss": 2.6280200481414795, "eval_runtime": 65.4834, "eval_samples_per_second": 17.485, "eval_steps_per_second": 4.383, "step": 1 }, { "epoch": 0.0036784991723376862, "grad_norm": 1.1980564594268799, "learning_rate": 4.0400000000000006e-05, "loss": 2.5135, "step": 10 }, { "epoch": 0.0073569983446753725, "grad_norm": 1.2913352251052856, "learning_rate": 8.080000000000001e-05, "loss": 1.9668, "step": 20 }, { "epoch": 0.011035497517013059, "grad_norm": 1.0023733377456665, "learning_rate": 0.00012119999999999999, "loss": 1.6289, "step": 30 }, { "epoch": 0.014713996689350745, "grad_norm": 1.1973117589950562, "learning_rate": 0.00016160000000000002, "loss": 1.5254, "step": 40 }, { "epoch": 0.01839249586168843, "grad_norm": 1.1962615251541138, "learning_rate": 0.000202, "loss": 1.3739, "step": 50 }, { "epoch": 0.01839249586168843, "eval_loss": 1.5840084552764893, "eval_runtime": 65.6655, "eval_samples_per_second": 17.437, "eval_steps_per_second": 4.371, "step": 50 }, { "epoch": 0.022070995034026118, "grad_norm": 0.9154727458953857, "learning_rate": 0.00020175396907624226, "loss": 1.6272, "step": 60 }, { "epoch": 0.025749494206363802, "grad_norm": 0.8820514678955078, "learning_rate": 0.0002010170749428986, "loss": 1.5373, "step": 70 }, { "epoch": 0.02942799337870149, "grad_norm": 0.9362473487854004, "learning_rate": 0.00019979290767411438, "loss": 1.4742, "step": 80 }, { "epoch": 0.033106492551039174, "grad_norm": 0.9134910702705383, "learning_rate": 0.0001980874312897702, "loss": 1.428, "step": 90 }, { "epoch": 0.03678499172337686, "grad_norm": 1.1899622678756714, "learning_rate": 0.00019590895469937675, "loss": 1.3833, "step": 100 }, { "epoch": 0.03678499172337686, "eval_loss": 1.4966574907302856, "eval_runtime": 65.9742, "eval_samples_per_second": 17.355, "eval_steps_per_second": 4.35, "step": 100 }, { "epoch": 0.04046349089571455, "grad_norm": 0.8478430509567261, "learning_rate": 0.0001932680912219027, "loss": 1.5648, "step": 110 }, { "epoch": 0.044141990068052236, "grad_norm": 0.9589736461639404, "learning_rate": 0.00019017770687875164, "loss": 1.4468, "step": 120 }, { "epoch": 0.047820489240389924, "grad_norm": 0.9376389384269714, "learning_rate": 0.000186652857711799, "loss": 1.4539, "step": 130 }, { "epoch": 0.051498988412727605, "grad_norm": 0.8931785821914673, "learning_rate": 0.00018271071643186968, "loss": 1.3959, "step": 140 }, { "epoch": 0.05517748758506529, "grad_norm": 1.1111578941345215, "learning_rate": 0.00017837048875501678, "loss": 1.3041, "step": 150 }, { "epoch": 0.05517748758506529, "eval_loss": 1.4765535593032837, "eval_runtime": 66.144, "eval_samples_per_second": 17.311, "eval_steps_per_second": 4.339, "step": 150 }, { "epoch": 0.05885598675740298, "grad_norm": 0.8667774796485901, "learning_rate": 0.00017365331983420376, "loss": 1.5348, "step": 160 }, { "epoch": 0.06253448592974066, "grad_norm": 0.8413021564483643, "learning_rate": 0.0001685821912422447, "loss": 1.5303, "step": 170 }, { "epoch": 0.06621298510207835, "grad_norm": 0.8751350045204163, "learning_rate": 0.00016318180900789148, "loss": 1.3763, "step": 180 }, { "epoch": 0.06989148427441604, "grad_norm": 0.9402979016304016, "learning_rate": 0.00015747848325054544, "loss": 1.3309, "step": 190 }, { "epoch": 0.07356998344675372, "grad_norm": 1.0289543867111206, "learning_rate": 0.0001515, "loss": 1.2805, "step": 200 }, { "epoch": 0.07356998344675372, "eval_loss": 1.4655332565307617, "eval_runtime": 67.0945, "eval_samples_per_second": 17.065, "eval_steps_per_second": 4.278, "step": 200 }, { "epoch": 0.07724848261909141, "grad_norm": 0.7757532000541687, "learning_rate": 0.00014527548582569683, "loss": 1.4872, "step": 210 }, { "epoch": 0.0809269817914291, "grad_norm": 0.8359755873680115, "learning_rate": 0.00013883526593500714, "loss": 1.4299, "step": 220 }, { "epoch": 0.08460548096376679, "grad_norm": 0.8961185812950134, "learning_rate": 0.0001322107164318697, "loss": 1.404, "step": 230 }, { "epoch": 0.08828398013610447, "grad_norm": 0.9134626984596252, "learning_rate": 0.00012543411145556643, "loss": 1.3383, "step": 240 }, { "epoch": 0.09196247930844216, "grad_norm": 1.0392701625823975, "learning_rate": 0.00011853846594435998, "loss": 1.2476, "step": 250 }, { "epoch": 0.09196247930844216, "eval_loss": 1.4170724153518677, "eval_runtime": 65.8266, "eval_samples_per_second": 17.394, "eval_steps_per_second": 4.36, "step": 250 }, { "epoch": 0.09564097848077985, "grad_norm": 0.7596132755279541, "learning_rate": 0.00011155737479003301, "loss": 1.5365, "step": 260 }, { "epoch": 0.09931947765311752, "grad_norm": 0.8383329510688782, "learning_rate": 0.00010452484916695262, "loss": 1.4248, "step": 270 }, { "epoch": 0.10299797682545521, "grad_norm": 0.8279966711997986, "learning_rate": 9.747515083304742e-05, "loss": 1.3758, "step": 280 }, { "epoch": 0.1066764759977929, "grad_norm": 0.9013005495071411, "learning_rate": 9.044262520996702e-05, "loss": 1.3529, "step": 290 }, { "epoch": 0.11035497517013058, "grad_norm": 1.153868317604065, "learning_rate": 8.346153405564004e-05, "loss": 1.2002, "step": 300 }, { "epoch": 0.11035497517013058, "eval_loss": 1.3866181373596191, "eval_runtime": 65.7201, "eval_samples_per_second": 17.422, "eval_steps_per_second": 4.367, "step": 300 }, { "epoch": 0.11403347434246827, "grad_norm": 0.7157303690910339, "learning_rate": 7.656588854443357e-05, "loss": 1.4783, "step": 310 }, { "epoch": 0.11771197351480596, "grad_norm": 0.7488118410110474, "learning_rate": 6.978928356813031e-05, "loss": 1.4637, "step": 320 }, { "epoch": 0.12139047268714365, "grad_norm": 0.7784683108329773, "learning_rate": 6.316473406499288e-05, "loss": 1.3839, "step": 330 }, { "epoch": 0.12506897185948132, "grad_norm": 0.7745943665504456, "learning_rate": 5.672451417430317e-05, "loss": 1.2398, "step": 340 }, { "epoch": 0.12874747103181902, "grad_norm": 1.1556774377822876, "learning_rate": 5.050000000000002e-05, "loss": 1.24, "step": 350 }, { "epoch": 0.12874747103181902, "eval_loss": 1.3593629598617554, "eval_runtime": 65.7086, "eval_samples_per_second": 17.425, "eval_steps_per_second": 4.368, "step": 350 }, { "epoch": 0.1324259702041567, "grad_norm": 0.7711465358734131, "learning_rate": 4.452151674945458e-05, "loss": 1.4537, "step": 360 }, { "epoch": 0.1361044693764944, "grad_norm": 0.7745826840400696, "learning_rate": 3.8818190992108515e-05, "loss": 1.4033, "step": 370 }, { "epoch": 0.13978296854883207, "grad_norm": 0.7452824115753174, "learning_rate": 3.3417808757755355e-05, "loss": 1.2894, "step": 380 }, { "epoch": 0.14346146772116977, "grad_norm": 0.8825933933258057, "learning_rate": 2.8346680165796253e-05, "loss": 1.3367, "step": 390 }, { "epoch": 0.14713996689350745, "grad_norm": 0.9986250996589661, "learning_rate": 2.362951124498323e-05, "loss": 1.2489, "step": 400 }, { "epoch": 0.14713996689350745, "eval_loss": 1.3324865102767944, "eval_runtime": 65.63, "eval_samples_per_second": 17.446, "eval_steps_per_second": 4.373, "step": 400 }, { "epoch": 0.15081846606584515, "grad_norm": 0.7403445839881897, "learning_rate": 1.928928356813032e-05, "loss": 1.4268, "step": 410 }, { "epoch": 0.15449696523818282, "grad_norm": 0.7790653109550476, "learning_rate": 1.5347142288200977e-05, "loss": 1.3724, "step": 420 }, { "epoch": 0.1581754644105205, "grad_norm": 0.8011258840560913, "learning_rate": 1.1822293121248375e-05, "loss": 1.3301, "step": 430 }, { "epoch": 0.1618539635828582, "grad_norm": 0.8102395534515381, "learning_rate": 8.731908778097302e-06, "loss": 1.2836, "step": 440 }, { "epoch": 0.16553246275519587, "grad_norm": 0.930636465549469, "learning_rate": 6.09104530062326e-06, "loss": 1.2365, "step": 450 }, { "epoch": 0.16553246275519587, "eval_loss": 1.3237847089767456, "eval_runtime": 65.7933, "eval_samples_per_second": 17.403, "eval_steps_per_second": 4.362, "step": 450 }, { "epoch": 0.16921096192753357, "grad_norm": 0.7757034301757812, "learning_rate": 3.912568710229791e-06, "loss": 1.4375, "step": 460 }, { "epoch": 0.17288946109987124, "grad_norm": 0.7832618951797485, "learning_rate": 2.2070923258856255e-06, "loss": 1.339, "step": 470 }, { "epoch": 0.17656796027220895, "grad_norm": 0.7737963795661926, "learning_rate": 9.829250571013935e-07, "loss": 1.3189, "step": 480 }, { "epoch": 0.18024645944454662, "grad_norm": 0.8774094581604004, "learning_rate": 2.4603092375775605e-07, "loss": 1.267, "step": 490 }, { "epoch": 0.18392495861688432, "grad_norm": 1.0865882635116577, "learning_rate": 0.0, "loss": 1.2489, "step": 500 }, { "epoch": 0.18392495861688432, "eval_loss": 1.3218564987182617, "eval_runtime": 65.6585, "eval_samples_per_second": 17.439, "eval_steps_per_second": 4.371, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.886804283392e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }