|
{ |
|
"best_metric": 11.9102783203125, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-150", |
|
"epoch": 3.0, |
|
"eval_steps": 50, |
|
"global_step": 168, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.017857142857142856, |
|
"eval_loss": 11.9290132522583, |
|
"eval_runtime": 0.5421, |
|
"eval_samples_per_second": 175.25, |
|
"eval_steps_per_second": 44.274, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.05357142857142857, |
|
"grad_norm": 0.019959961995482445, |
|
"learning_rate": 3e-05, |
|
"loss": 11.9303, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.10714285714285714, |
|
"grad_norm": 0.022794105112552643, |
|
"learning_rate": 6e-05, |
|
"loss": 11.9305, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.16071428571428573, |
|
"grad_norm": 0.0353395938873291, |
|
"learning_rate": 9e-05, |
|
"loss": 11.9295, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.21428571428571427, |
|
"grad_norm": 0.03792329132556915, |
|
"learning_rate": 9.996046986136509e-05, |
|
"loss": 11.93, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.26785714285714285, |
|
"grad_norm": 0.03241180628538132, |
|
"learning_rate": 9.975310752612137e-05, |
|
"loss": 11.9276, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.32142857142857145, |
|
"grad_norm": 0.05070869252085686, |
|
"learning_rate": 9.936876709681668e-05, |
|
"loss": 11.9291, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 0.05852275714278221, |
|
"learning_rate": 9.880881572095256e-05, |
|
"loss": 11.9291, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.42857142857142855, |
|
"grad_norm": 0.08124187588691711, |
|
"learning_rate": 9.807524521637102e-05, |
|
"loss": 11.9278, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.48214285714285715, |
|
"grad_norm": 0.05361940711736679, |
|
"learning_rate": 9.717066498610673e-05, |
|
"loss": 11.9256, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.5357142857142857, |
|
"grad_norm": 0.0750463530421257, |
|
"learning_rate": 9.609829273641034e-05, |
|
"loss": 11.9261, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.5892857142857143, |
|
"grad_norm": 0.08543704450130463, |
|
"learning_rate": 9.486194303096062e-05, |
|
"loss": 11.9248, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.6428571428571429, |
|
"grad_norm": 0.09016852080821991, |
|
"learning_rate": 9.346601372197914e-05, |
|
"loss": 11.9243, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.6964285714285714, |
|
"grad_norm": 0.10736904293298721, |
|
"learning_rate": 9.191547030651383e-05, |
|
"loss": 11.9206, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.0793285071849823, |
|
"learning_rate": 9.021582826353824e-05, |
|
"loss": 11.9232, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.8035714285714286, |
|
"grad_norm": 0.05057210102677345, |
|
"learning_rate": 8.83731334346954e-05, |
|
"loss": 11.9215, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 0.05617088824510574, |
|
"learning_rate": 8.639394051847472e-05, |
|
"loss": 11.9211, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.8928571428571429, |
|
"eval_loss": 11.918818473815918, |
|
"eval_runtime": 0.5369, |
|
"eval_samples_per_second": 176.942, |
|
"eval_steps_per_second": 44.701, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.9107142857142857, |
|
"grad_norm": 0.06780627369880676, |
|
"learning_rate": 8.428528975432066e-05, |
|
"loss": 11.9193, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.9642857142857143, |
|
"grad_norm": 0.03634953871369362, |
|
"learning_rate": 8.2054681879611e-05, |
|
"loss": 11.9178, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.0178571428571428, |
|
"grad_norm": 0.04605906456708908, |
|
"learning_rate": 7.971005144858553e-05, |
|
"loss": 11.9182, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.0714285714285714, |
|
"grad_norm": 0.025876272469758987, |
|
"learning_rate": 7.725973860813338e-05, |
|
"loss": 11.9199, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"grad_norm": 0.04855341464281082, |
|
"learning_rate": 7.471245943083615e-05, |
|
"loss": 11.9185, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.1785714285714286, |
|
"grad_norm": 0.03910359740257263, |
|
"learning_rate": 7.20772749107956e-05, |
|
"loss": 11.9184, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.2321428571428572, |
|
"grad_norm": 0.08992303162813187, |
|
"learning_rate": 6.936355873253206e-05, |
|
"loss": 11.9154, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 1.2857142857142856, |
|
"grad_norm": 0.04321262612938881, |
|
"learning_rate": 6.65809639276034e-05, |
|
"loss": 11.919, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.3392857142857144, |
|
"grad_norm": 0.049111876636743546, |
|
"learning_rate": 6.373938853755126e-05, |
|
"loss": 11.9185, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.3928571428571428, |
|
"grad_norm": 0.0660889744758606, |
|
"learning_rate": 6.08489404053159e-05, |
|
"loss": 11.9156, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.4464285714285714, |
|
"grad_norm": 0.0920424684882164, |
|
"learning_rate": 5.791990122036075e-05, |
|
"loss": 11.9151, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.04646582156419754, |
|
"learning_rate": 5.496268994540309e-05, |
|
"loss": 11.9162, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.5535714285714286, |
|
"grad_norm": 0.053827133029699326, |
|
"learning_rate": 5.19878257548463e-05, |
|
"loss": 11.9172, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.6071428571428572, |
|
"grad_norm": 0.04865885153412819, |
|
"learning_rate": 4.900589061674649e-05, |
|
"loss": 11.9165, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.6607142857142856, |
|
"grad_norm": 0.0875491127371788, |
|
"learning_rate": 4.602749165141428e-05, |
|
"loss": 11.9132, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.7142857142857144, |
|
"grad_norm": 0.04339161515235901, |
|
"learning_rate": 4.3063223400546594e-05, |
|
"loss": 11.9123, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.7678571428571428, |
|
"grad_norm": 0.04742836579680443, |
|
"learning_rate": 4.012363014110237e-05, |
|
"loss": 11.9154, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.7857142857142856, |
|
"eval_loss": 11.913043975830078, |
|
"eval_runtime": 0.5398, |
|
"eval_samples_per_second": 175.99, |
|
"eval_steps_per_second": 44.461, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.8214285714285714, |
|
"grad_norm": 0.04398871585726738, |
|
"learning_rate": 3.721916837797627e-05, |
|
"loss": 11.9137, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 0.05781185254454613, |
|
"learning_rate": 3.436016964888865e-05, |
|
"loss": 11.9125, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.9285714285714286, |
|
"grad_norm": 0.08808522671461105, |
|
"learning_rate": 3.1556803773799614e-05, |
|
"loss": 11.9077, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.9821428571428572, |
|
"grad_norm": 0.06021308898925781, |
|
"learning_rate": 2.8819042679573617e-05, |
|
"loss": 11.9141, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 2.0357142857142856, |
|
"grad_norm": 0.048566147685050964, |
|
"learning_rate": 2.6156624928574707e-05, |
|
"loss": 11.9136, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 2.0892857142857144, |
|
"grad_norm": 0.04223039001226425, |
|
"learning_rate": 2.3579021077369046e-05, |
|
"loss": 11.9142, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 2.142857142857143, |
|
"grad_norm": 0.04601627215743065, |
|
"learning_rate": 2.1095399988757574e-05, |
|
"loss": 11.9133, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.1964285714285716, |
|
"grad_norm": 0.05840318650007248, |
|
"learning_rate": 1.8714596216972007e-05, |
|
"loss": 11.9112, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.035777896642684937, |
|
"learning_rate": 1.6445078582048155e-05, |
|
"loss": 11.9091, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 2.3035714285714284, |
|
"grad_norm": 0.037539299577474594, |
|
"learning_rate": 1.4294920045162513e-05, |
|
"loss": 11.9146, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 2.357142857142857, |
|
"grad_norm": 0.042784880846738815, |
|
"learning_rate": 1.2271768992088489e-05, |
|
"loss": 11.9117, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 2.4107142857142856, |
|
"grad_norm": 0.05495860055088997, |
|
"learning_rate": 1.038282202692129e-05, |
|
"loss": 11.9109, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.4642857142857144, |
|
"grad_norm": 0.07855169475078583, |
|
"learning_rate": 8.634798372847148e-06, |
|
"loss": 11.9069, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 2.517857142857143, |
|
"grad_norm": 0.03630689159035683, |
|
"learning_rate": 7.033915971016952e-06, |
|
"loss": 11.9139, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 2.571428571428571, |
|
"grad_norm": 0.048760075122117996, |
|
"learning_rate": 5.585869362543416e-06, |
|
"loss": 11.9124, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"grad_norm": 0.04377627745270729, |
|
"learning_rate": 4.29580943229827e-06, |
|
"loss": 11.9118, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 2.678571428571429, |
|
"grad_norm": 0.05847623199224472, |
|
"learning_rate": 3.1683250865636114e-06, |
|
"loss": 11.9095, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.678571428571429, |
|
"eval_loss": 11.9102783203125, |
|
"eval_runtime": 0.5392, |
|
"eval_samples_per_second": 176.177, |
|
"eval_steps_per_second": 44.508, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.732142857142857, |
|
"grad_norm": 0.034065768122673035, |
|
"learning_rate": 2.2074269297119587e-06, |
|
"loss": 11.9088, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 2.7857142857142856, |
|
"grad_norm": 0.04465312138199806, |
|
"learning_rate": 1.4165329979794973e-06, |
|
"loss": 11.9133, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 2.8392857142857144, |
|
"grad_norm": 0.043062131851911545, |
|
"learning_rate": 7.984566010789674e-07, |
|
"loss": 11.9122, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 2.892857142857143, |
|
"grad_norm": 0.05254960432648659, |
|
"learning_rate": 3.553963149013295e-07, |
|
"loss": 11.9094, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 2.946428571428571, |
|
"grad_norm": 0.032099399715662, |
|
"learning_rate": 8.892816090335099e-08, |
|
"loss": 11.9082, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.09737348556518555, |
|
"learning_rate": 0.0, |
|
"loss": 11.908, |
|
"step": 168 |
|
} |
|
], |
|
"logging_steps": 3, |
|
"max_steps": 168, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 808543272960.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|