|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.848, |
|
"eval_steps": 500, |
|
"global_step": 620, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.8025777339935303, |
|
"learning_rate": 2.2580645161290324e-06, |
|
"loss": 1.4695, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.3798772096633911, |
|
"learning_rate": 5.483870967741935e-06, |
|
"loss": 0.9564, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.1069912910461426, |
|
"learning_rate": 8.70967741935484e-06, |
|
"loss": 0.7467, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.3565958738327026, |
|
"learning_rate": 1.1935483870967743e-05, |
|
"loss": 0.7442, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.8724671006202698, |
|
"learning_rate": 1.5161290322580646e-05, |
|
"loss": 0.6401, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.8387081027030945, |
|
"learning_rate": 1.838709677419355e-05, |
|
"loss": 0.6412, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.112, |
|
"grad_norm": 1.1549522876739502, |
|
"learning_rate": 1.9996038016334953e-05, |
|
"loss": 0.5019, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.272, |
|
"grad_norm": 0.7454654574394226, |
|
"learning_rate": 1.9968952076274873e-05, |
|
"loss": 0.5679, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.432, |
|
"grad_norm": 1.2986727952957153, |
|
"learning_rate": 1.990884868158239e-05, |
|
"loss": 0.5871, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.592, |
|
"grad_norm": 1.4838392734527588, |
|
"learning_rate": 1.9817344551289796e-05, |
|
"loss": 0.5669, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.752, |
|
"grad_norm": 1.1603983640670776, |
|
"learning_rate": 1.9694729658237925e-05, |
|
"loss": 0.5716, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.912, |
|
"grad_norm": 1.4420567750930786, |
|
"learning_rate": 1.954139256400049e-05, |
|
"loss": 0.584, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.064, |
|
"grad_norm": 0.9452749490737915, |
|
"learning_rate": 1.935781918754836e-05, |
|
"loss": 0.4986, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.224, |
|
"grad_norm": 1.114253044128418, |
|
"learning_rate": 1.914459126539224e-05, |
|
"loss": 0.4908, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.384, |
|
"grad_norm": 1.864726185798645, |
|
"learning_rate": 1.8902384508083518e-05, |
|
"loss": 0.4794, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.544, |
|
"grad_norm": 1.430006980895996, |
|
"learning_rate": 1.863196645891518e-05, |
|
"loss": 0.4947, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.7039999999999997, |
|
"grad_norm": 0.9796671271324158, |
|
"learning_rate": 1.8334194061608577e-05, |
|
"loss": 0.4618, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.864, |
|
"grad_norm": 1.1556652784347534, |
|
"learning_rate": 1.8010010944693846e-05, |
|
"loss": 0.4828, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.016, |
|
"grad_norm": 0.9884580373764038, |
|
"learning_rate": 1.766044443118978e-05, |
|
"loss": 0.4485, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.176, |
|
"grad_norm": 1.4118114709854126, |
|
"learning_rate": 1.7286602283059238e-05, |
|
"loss": 0.4148, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.336, |
|
"grad_norm": 1.833937168121338, |
|
"learning_rate": 1.688966919075687e-05, |
|
"loss": 0.3848, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.496, |
|
"grad_norm": 1.2855585813522339, |
|
"learning_rate": 1.647090301899358e-05, |
|
"loss": 0.3866, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.656, |
|
"grad_norm": 1.1476826667785645, |
|
"learning_rate": 1.60316308206148e-05, |
|
"loss": 0.3989, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.816, |
|
"grad_norm": 1.1333897113800049, |
|
"learning_rate": 1.5573244631224364e-05, |
|
"loss": 0.3955, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.976, |
|
"grad_norm": 1.8493810892105103, |
|
"learning_rate": 1.5097197057880707e-05, |
|
"loss": 0.353, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 4.128, |
|
"grad_norm": 1.493438720703125, |
|
"learning_rate": 1.4604996675844586e-05, |
|
"loss": 0.3075, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 4.288, |
|
"grad_norm": 2.338547945022583, |
|
"learning_rate": 1.4098203247965876e-05, |
|
"loss": 0.346, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 4.448, |
|
"grad_norm": 1.3177692890167236, |
|
"learning_rate": 1.3578422781858994e-05, |
|
"loss": 0.3307, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 4.608, |
|
"grad_norm": 1.2539737224578857, |
|
"learning_rate": 1.3047302440530538e-05, |
|
"loss": 0.3482, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.768, |
|
"grad_norm": 1.0234341621398926, |
|
"learning_rate": 1.2506525322587207e-05, |
|
"loss": 0.3244, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.928, |
|
"grad_norm": 1.274221658706665, |
|
"learning_rate": 1.1957805128565232e-05, |
|
"loss": 0.278, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 5.08, |
|
"grad_norm": 1.353709101676941, |
|
"learning_rate": 1.1402880730283598e-05, |
|
"loss": 0.2497, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 5.24, |
|
"grad_norm": 2.2268123626708984, |
|
"learning_rate": 1.0843510660430447e-05, |
|
"loss": 0.2217, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"grad_norm": 2.0857222080230713, |
|
"learning_rate": 1.028146753984505e-05, |
|
"loss": 0.2379, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 5.5600000000000005, |
|
"grad_norm": 2.352081775665283, |
|
"learning_rate": 9.718532460154948e-06, |
|
"loss": 0.245, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 5.72, |
|
"grad_norm": 1.6790026426315308, |
|
"learning_rate": 9.156489339569555e-06, |
|
"loss": 0.2337, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"grad_norm": 0.9887088537216187, |
|
"learning_rate": 8.597119269716403e-06, |
|
"loss": 0.2297, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 6.032, |
|
"grad_norm": 1.4913004636764526, |
|
"learning_rate": 8.042194871434771e-06, |
|
"loss": 0.2222, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 6.192, |
|
"grad_norm": 1.9435575008392334, |
|
"learning_rate": 7.493474677412795e-06, |
|
"loss": 0.1506, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 6.352, |
|
"grad_norm": 1.686815619468689, |
|
"learning_rate": 6.952697559469464e-06, |
|
"loss": 0.1547, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 6.5120000000000005, |
|
"grad_norm": 1.5592477321624756, |
|
"learning_rate": 6.421577218141007e-06, |
|
"loss": 0.1359, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 6.672, |
|
"grad_norm": 2.845890760421753, |
|
"learning_rate": 5.901796752034128e-06, |
|
"loss": 0.1817, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 6.832, |
|
"grad_norm": 1.81553316116333, |
|
"learning_rate": 5.395003324155414e-06, |
|
"loss": 0.135, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 6.992, |
|
"grad_norm": 1.9674084186553955, |
|
"learning_rate": 4.902802942119293e-06, |
|
"loss": 0.1423, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 7.144, |
|
"grad_norm": 1.0333631038665771, |
|
"learning_rate": 4.426755368775637e-06, |
|
"loss": 0.0863, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 7.304, |
|
"grad_norm": 3.6444036960601807, |
|
"learning_rate": 3.9683691793852044e-06, |
|
"loss": 0.0954, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 7.464, |
|
"grad_norm": 2.4928786754608154, |
|
"learning_rate": 3.5290969810064258e-06, |
|
"loss": 0.0892, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 7.624, |
|
"grad_norm": 2.4595706462860107, |
|
"learning_rate": 3.110330809243134e-06, |
|
"loss": 0.0705, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 7.784, |
|
"grad_norm": 2.398029088973999, |
|
"learning_rate": 2.7133977169407634e-06, |
|
"loss": 0.0872, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 7.944, |
|
"grad_norm": 1.6807390451431274, |
|
"learning_rate": 2.339555568810221e-06, |
|
"loss": 0.077, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 8.096, |
|
"grad_norm": 0.9940488934516907, |
|
"learning_rate": 1.9899890553061565e-06, |
|
"loss": 0.0604, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 8.256, |
|
"grad_norm": 1.9791232347488403, |
|
"learning_rate": 1.6658059383914249e-06, |
|
"loss": 0.0446, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 8.416, |
|
"grad_norm": 0.7762503027915955, |
|
"learning_rate": 1.3680335410848211e-06, |
|
"loss": 0.0432, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 8.576, |
|
"grad_norm": 2.0330629348754883, |
|
"learning_rate": 1.097615491916485e-06, |
|
"loss": 0.0428, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 8.736, |
|
"grad_norm": 0.7456198930740356, |
|
"learning_rate": 8.554087346077633e-07, |
|
"loss": 0.0322, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 8.896, |
|
"grad_norm": 1.492835283279419, |
|
"learning_rate": 6.421808124516437e-07, |
|
"loss": 0.0402, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 9.048, |
|
"grad_norm": 0.904043972492218, |
|
"learning_rate": 4.5860743599951186e-07, |
|
"loss": 0.0288, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 9.208, |
|
"grad_norm": 0.47524094581604004, |
|
"learning_rate": 3.0527034176207724e-07, |
|
"loss": 0.029, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 9.368, |
|
"grad_norm": 1.197879433631897, |
|
"learning_rate": 1.8265544871020724e-07, |
|
"loss": 0.0222, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 9.528, |
|
"grad_norm": 0.608761727809906, |
|
"learning_rate": 9.11513184176116e-08, |
|
"loss": 0.0235, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 9.688, |
|
"grad_norm": 0.5942819714546204, |
|
"learning_rate": 3.104792372512822e-08, |
|
"loss": 0.0249, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 9.848, |
|
"grad_norm": 0.9274442791938782, |
|
"learning_rate": 2.535729828669897e-09, |
|
"loss": 0.0308, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 9.848, |
|
"step": 620, |
|
"total_flos": 4.910594218000384e+16, |
|
"train_loss": 0.3172732482754415, |
|
"train_runtime": 12792.4734, |
|
"train_samples_per_second": 0.391, |
|
"train_steps_per_second": 0.048 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 620, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.910594218000384e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|