|
{ |
|
"best_metric": 1.3218564987182617, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.18392495861688432, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0003678499172337686, |
|
"eval_loss": 2.6280200481414795, |
|
"eval_runtime": 65.4834, |
|
"eval_samples_per_second": 17.485, |
|
"eval_steps_per_second": 4.383, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0036784991723376862, |
|
"grad_norm": 1.1980564594268799, |
|
"learning_rate": 4.0400000000000006e-05, |
|
"loss": 2.5135, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0073569983446753725, |
|
"grad_norm": 1.2913352251052856, |
|
"learning_rate": 8.080000000000001e-05, |
|
"loss": 1.9668, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.011035497517013059, |
|
"grad_norm": 1.0023733377456665, |
|
"learning_rate": 0.00012119999999999999, |
|
"loss": 1.6289, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.014713996689350745, |
|
"grad_norm": 1.1973117589950562, |
|
"learning_rate": 0.00016160000000000002, |
|
"loss": 1.5254, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.01839249586168843, |
|
"grad_norm": 1.1962615251541138, |
|
"learning_rate": 0.000202, |
|
"loss": 1.3739, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.01839249586168843, |
|
"eval_loss": 1.5840084552764893, |
|
"eval_runtime": 65.6655, |
|
"eval_samples_per_second": 17.437, |
|
"eval_steps_per_second": 4.371, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.022070995034026118, |
|
"grad_norm": 0.9154727458953857, |
|
"learning_rate": 0.00020175396907624226, |
|
"loss": 1.6272, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.025749494206363802, |
|
"grad_norm": 0.8820514678955078, |
|
"learning_rate": 0.0002010170749428986, |
|
"loss": 1.5373, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.02942799337870149, |
|
"grad_norm": 0.9362473487854004, |
|
"learning_rate": 0.00019979290767411438, |
|
"loss": 1.4742, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.033106492551039174, |
|
"grad_norm": 0.9134910702705383, |
|
"learning_rate": 0.0001980874312897702, |
|
"loss": 1.428, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.03678499172337686, |
|
"grad_norm": 1.1899622678756714, |
|
"learning_rate": 0.00019590895469937675, |
|
"loss": 1.3833, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.03678499172337686, |
|
"eval_loss": 1.4966574907302856, |
|
"eval_runtime": 65.9742, |
|
"eval_samples_per_second": 17.355, |
|
"eval_steps_per_second": 4.35, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04046349089571455, |
|
"grad_norm": 0.8478430509567261, |
|
"learning_rate": 0.0001932680912219027, |
|
"loss": 1.5648, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.044141990068052236, |
|
"grad_norm": 0.9589736461639404, |
|
"learning_rate": 0.00019017770687875164, |
|
"loss": 1.4468, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.047820489240389924, |
|
"grad_norm": 0.9376389384269714, |
|
"learning_rate": 0.000186652857711799, |
|
"loss": 1.4539, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.051498988412727605, |
|
"grad_norm": 0.8931785821914673, |
|
"learning_rate": 0.00018271071643186968, |
|
"loss": 1.3959, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.05517748758506529, |
|
"grad_norm": 1.1111578941345215, |
|
"learning_rate": 0.00017837048875501678, |
|
"loss": 1.3041, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.05517748758506529, |
|
"eval_loss": 1.4765535593032837, |
|
"eval_runtime": 66.144, |
|
"eval_samples_per_second": 17.311, |
|
"eval_steps_per_second": 4.339, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.05885598675740298, |
|
"grad_norm": 0.8667774796485901, |
|
"learning_rate": 0.00017365331983420376, |
|
"loss": 1.5348, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.06253448592974066, |
|
"grad_norm": 0.8413021564483643, |
|
"learning_rate": 0.0001685821912422447, |
|
"loss": 1.5303, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.06621298510207835, |
|
"grad_norm": 0.8751350045204163, |
|
"learning_rate": 0.00016318180900789148, |
|
"loss": 1.3763, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.06989148427441604, |
|
"grad_norm": 0.9402979016304016, |
|
"learning_rate": 0.00015747848325054544, |
|
"loss": 1.3309, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.07356998344675372, |
|
"grad_norm": 1.0289543867111206, |
|
"learning_rate": 0.0001515, |
|
"loss": 1.2805, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07356998344675372, |
|
"eval_loss": 1.4655332565307617, |
|
"eval_runtime": 67.0945, |
|
"eval_samples_per_second": 17.065, |
|
"eval_steps_per_second": 4.278, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07724848261909141, |
|
"grad_norm": 0.7757532000541687, |
|
"learning_rate": 0.00014527548582569683, |
|
"loss": 1.4872, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.0809269817914291, |
|
"grad_norm": 0.8359755873680115, |
|
"learning_rate": 0.00013883526593500714, |
|
"loss": 1.4299, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.08460548096376679, |
|
"grad_norm": 0.8961185812950134, |
|
"learning_rate": 0.0001322107164318697, |
|
"loss": 1.404, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.08828398013610447, |
|
"grad_norm": 0.9134626984596252, |
|
"learning_rate": 0.00012543411145556643, |
|
"loss": 1.3383, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.09196247930844216, |
|
"grad_norm": 1.0392701625823975, |
|
"learning_rate": 0.00011853846594435998, |
|
"loss": 1.2476, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.09196247930844216, |
|
"eval_loss": 1.4170724153518677, |
|
"eval_runtime": 65.8266, |
|
"eval_samples_per_second": 17.394, |
|
"eval_steps_per_second": 4.36, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.09564097848077985, |
|
"grad_norm": 0.7596132755279541, |
|
"learning_rate": 0.00011155737479003301, |
|
"loss": 1.5365, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.09931947765311752, |
|
"grad_norm": 0.8383329510688782, |
|
"learning_rate": 0.00010452484916695262, |
|
"loss": 1.4248, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.10299797682545521, |
|
"grad_norm": 0.8279966711997986, |
|
"learning_rate": 9.747515083304742e-05, |
|
"loss": 1.3758, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.1066764759977929, |
|
"grad_norm": 0.9013005495071411, |
|
"learning_rate": 9.044262520996702e-05, |
|
"loss": 1.3529, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.11035497517013058, |
|
"grad_norm": 1.153868317604065, |
|
"learning_rate": 8.346153405564004e-05, |
|
"loss": 1.2002, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.11035497517013058, |
|
"eval_loss": 1.3866181373596191, |
|
"eval_runtime": 65.7201, |
|
"eval_samples_per_second": 17.422, |
|
"eval_steps_per_second": 4.367, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.11403347434246827, |
|
"grad_norm": 0.7157303690910339, |
|
"learning_rate": 7.656588854443357e-05, |
|
"loss": 1.4783, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.11771197351480596, |
|
"grad_norm": 0.7488118410110474, |
|
"learning_rate": 6.978928356813031e-05, |
|
"loss": 1.4637, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.12139047268714365, |
|
"grad_norm": 0.7784683108329773, |
|
"learning_rate": 6.316473406499288e-05, |
|
"loss": 1.3839, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.12506897185948132, |
|
"grad_norm": 0.7745943665504456, |
|
"learning_rate": 5.672451417430317e-05, |
|
"loss": 1.2398, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.12874747103181902, |
|
"grad_norm": 1.1556774377822876, |
|
"learning_rate": 5.050000000000002e-05, |
|
"loss": 1.24, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.12874747103181902, |
|
"eval_loss": 1.3593629598617554, |
|
"eval_runtime": 65.7086, |
|
"eval_samples_per_second": 17.425, |
|
"eval_steps_per_second": 4.368, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.1324259702041567, |
|
"grad_norm": 0.7711465358734131, |
|
"learning_rate": 4.452151674945458e-05, |
|
"loss": 1.4537, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.1361044693764944, |
|
"grad_norm": 0.7745826840400696, |
|
"learning_rate": 3.8818190992108515e-05, |
|
"loss": 1.4033, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.13978296854883207, |
|
"grad_norm": 0.7452824115753174, |
|
"learning_rate": 3.3417808757755355e-05, |
|
"loss": 1.2894, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.14346146772116977, |
|
"grad_norm": 0.8825933933258057, |
|
"learning_rate": 2.8346680165796253e-05, |
|
"loss": 1.3367, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.14713996689350745, |
|
"grad_norm": 0.9986250996589661, |
|
"learning_rate": 2.362951124498323e-05, |
|
"loss": 1.2489, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.14713996689350745, |
|
"eval_loss": 1.3324865102767944, |
|
"eval_runtime": 65.63, |
|
"eval_samples_per_second": 17.446, |
|
"eval_steps_per_second": 4.373, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.15081846606584515, |
|
"grad_norm": 0.7403445839881897, |
|
"learning_rate": 1.928928356813032e-05, |
|
"loss": 1.4268, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.15449696523818282, |
|
"grad_norm": 0.7790653109550476, |
|
"learning_rate": 1.5347142288200977e-05, |
|
"loss": 1.3724, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.1581754644105205, |
|
"grad_norm": 0.8011258840560913, |
|
"learning_rate": 1.1822293121248375e-05, |
|
"loss": 1.3301, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.1618539635828582, |
|
"grad_norm": 0.8102395534515381, |
|
"learning_rate": 8.731908778097302e-06, |
|
"loss": 1.2836, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.16553246275519587, |
|
"grad_norm": 0.930636465549469, |
|
"learning_rate": 6.09104530062326e-06, |
|
"loss": 1.2365, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.16553246275519587, |
|
"eval_loss": 1.3237847089767456, |
|
"eval_runtime": 65.7933, |
|
"eval_samples_per_second": 17.403, |
|
"eval_steps_per_second": 4.362, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.16921096192753357, |
|
"grad_norm": 0.7757034301757812, |
|
"learning_rate": 3.912568710229791e-06, |
|
"loss": 1.4375, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.17288946109987124, |
|
"grad_norm": 0.7832618951797485, |
|
"learning_rate": 2.2070923258856255e-06, |
|
"loss": 1.339, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.17656796027220895, |
|
"grad_norm": 0.7737963795661926, |
|
"learning_rate": 9.829250571013935e-07, |
|
"loss": 1.3189, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.18024645944454662, |
|
"grad_norm": 0.8774094581604004, |
|
"learning_rate": 2.4603092375775605e-07, |
|
"loss": 1.267, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.18392495861688432, |
|
"grad_norm": 1.0865882635116577, |
|
"learning_rate": 0.0, |
|
"loss": 1.2489, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.18392495861688432, |
|
"eval_loss": 1.3218564987182617, |
|
"eval_runtime": 65.6585, |
|
"eval_samples_per_second": 17.439, |
|
"eval_steps_per_second": 4.371, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.886804283392e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|