|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 504, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05952380952380952, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9607843137254903e-05, |
|
"loss": 1.9698, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11904761904761904, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.9215686274509805e-05, |
|
"loss": 1.903, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.17857142857142858, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.882352941176471e-05, |
|
"loss": 1.8528, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.23809523809523808, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.843137254901961e-05, |
|
"loss": 1.865, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2976190476190476, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.80392156862745e-05, |
|
"loss": 1.9221, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.35714285714285715, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.990263847374976e-05, |
|
"loss": 1.8802, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.956656688041808e-05, |
|
"loss": 1.9587, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.47619047619047616, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.899219854168429e-05, |
|
"loss": 1.9369, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5357142857142857, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.818229479678158e-05, |
|
"loss": 1.8352, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5952380952380952, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.714074934742556e-05, |
|
"loss": 1.8348, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6547619047619048, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.587256953841317e-05, |
|
"loss": 1.9108, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.438385228425938e-05, |
|
"loss": 1.9254, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7738095238095238, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.268175475760734e-05, |
|
"loss": 1.8523, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.077445998033015e-05, |
|
"loss": 1.8635, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8928571428571429, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.86711374827494e-05, |
|
"loss": 1.891, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9523809523809523, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.638189922010529e-05, |
|
"loss": 1.8867, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.0119047619047619, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.391775095821482e-05, |
|
"loss": 1.885, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.0714285714285714, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.129053936203687e-05, |
|
"loss": 1.6391, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.130952380952381, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.851289504152201e-05, |
|
"loss": 1.6464, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.1904761904761905, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.559817182855977e-05, |
|
"loss": 1.6074, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.256038257695687e-05, |
|
"loss": 1.5893, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.3095238095238095, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.941413179409468e-05, |
|
"loss": 1.6661, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.369047619047619, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.61745454281468e-05, |
|
"loss": 1.5408, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.28571981484123e-05, |
|
"loss": 1.6856, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.4880952380952381, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.94780384683728e-05, |
|
"loss": 1.5643, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.5476190476190477, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.605331207145219e-05, |
|
"loss": 1.7048, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.6071428571428572, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.2599483708099016e-05, |
|
"loss": 1.6339, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.913315803968012e-05, |
|
"loss": 1.7719, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.7261904761904763, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.567099980973684e-05, |
|
"loss": 1.6909, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.7857142857142856, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.2229653726389765e-05, |
|
"loss": 1.4923, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.8452380952380953, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.8825664441065734e-05, |
|
"loss": 1.6585, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.9047619047619047, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.547539700825874e-05, |
|
"loss": 1.6052, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.9642857142857144, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.219495820872265e-05, |
|
"loss": 1.6084, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.0238095238095237, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.9000119114343582e-05, |
|
"loss": 1.495, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.0833333333333335, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.5906239266969805e-05, |
|
"loss": 1.379, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.142857142857143, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.2928192835717644e-05, |
|
"loss": 1.4044, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.2023809523809526, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.008029710776041e-05, |
|
"loss": 1.3729, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.261904761904762, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7376243656388923e-05, |
|
"loss": 1.4013, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.3214285714285716, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4829032517260489e-05, |
|
"loss": 1.3822, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.380952380952381, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.245090968929148e-05, |
|
"loss": 1.3588, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.4404761904761907, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0253308260664902e-05, |
|
"loss": 1.3299, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.246793442995954e-06, |
|
"loss": 1.3465, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.5595238095238093, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.4410117779103e-06, |
|
"loss": 1.2845, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.619047619047619, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.844644760229544e-06, |
|
"loss": 1.3998, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.678571428571429, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.465367100725908e-06, |
|
"loss": 1.4586, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.738095238095238, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.3098098291025273e-06, |
|
"loss": 1.3051, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.7976190476190474, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3835284145856275e-06, |
|
"loss": 1.3665, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.909760573925561e-07, |
|
"loss": 1.4534, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.9166666666666665, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.3548227947988167e-07, |
|
"loss": 1.419, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.9761904761904763, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9236917498782758e-08, |
|
"loss": 1.3451, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 504, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 1660, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.647913924224614e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|