|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 17.094017094017094, |
|
"eval_steps": 1000, |
|
"global_step": 3000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005698005698005698, |
|
"grad_norm": 6.307312488555908, |
|
"learning_rate": 0.0, |
|
"loss": 3.0911, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.2849002849002849, |
|
"grad_norm": 4.076864719390869, |
|
"learning_rate": 0.0001977977977977978, |
|
"loss": 1.4649, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5698005698005698, |
|
"grad_norm": 2.819526195526123, |
|
"learning_rate": 0.0001952952952952953, |
|
"loss": 1.1601, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8547008547008547, |
|
"grad_norm": 3.7976534366607666, |
|
"learning_rate": 0.00019279279279279282, |
|
"loss": 1.1284, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.1396011396011396, |
|
"grad_norm": 3.787907838821411, |
|
"learning_rate": 0.0001902902902902903, |
|
"loss": 0.91, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.4245014245014245, |
|
"grad_norm": 3.807788133621216, |
|
"learning_rate": 0.0001877877877877878, |
|
"loss": 0.7007, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.7094017094017095, |
|
"grad_norm": 3.6782045364379883, |
|
"learning_rate": 0.00018528528528528532, |
|
"loss": 0.6793, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.9943019943019942, |
|
"grad_norm": 3.648346424102783, |
|
"learning_rate": 0.0001827827827827828, |
|
"loss": 0.7325, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.2792022792022792, |
|
"grad_norm": 3.3711447715759277, |
|
"learning_rate": 0.00018028028028028027, |
|
"loss": 0.3936, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.564102564102564, |
|
"grad_norm": 3.362125873565674, |
|
"learning_rate": 0.00017777777777777779, |
|
"loss": 0.3844, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.849002849002849, |
|
"grad_norm": 3.8006858825683594, |
|
"learning_rate": 0.00017527527527527528, |
|
"loss": 0.4327, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.133903133903134, |
|
"grad_norm": 3.940809488296509, |
|
"learning_rate": 0.00017277277277277277, |
|
"loss": 0.3364, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.4188034188034186, |
|
"grad_norm": 3.061803102493286, |
|
"learning_rate": 0.00017027027027027028, |
|
"loss": 0.2445, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.7037037037037037, |
|
"grad_norm": 3.389284372329712, |
|
"learning_rate": 0.00016776776776776777, |
|
"loss": 0.2597, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 3.9886039886039883, |
|
"grad_norm": 3.320084810256958, |
|
"learning_rate": 0.00016526526526526526, |
|
"loss": 0.2698, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.273504273504273, |
|
"grad_norm": 2.7199738025665283, |
|
"learning_rate": 0.00016276276276276275, |
|
"loss": 0.1781, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 4.5584045584045585, |
|
"grad_norm": 3.226743459701538, |
|
"learning_rate": 0.00016026026026026027, |
|
"loss": 0.1902, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.843304843304844, |
|
"grad_norm": 4.62879753112793, |
|
"learning_rate": 0.00015775775775775776, |
|
"loss": 0.209, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 5.128205128205128, |
|
"grad_norm": 2.747284412384033, |
|
"learning_rate": 0.00015525525525525525, |
|
"loss": 0.1786, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 5.413105413105413, |
|
"grad_norm": 2.259187936782837, |
|
"learning_rate": 0.00015275275275275277, |
|
"loss": 0.1536, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 5.698005698005698, |
|
"grad_norm": 1.9622772932052612, |
|
"learning_rate": 0.00015025025025025026, |
|
"loss": 0.156, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 5.982905982905983, |
|
"grad_norm": 2.236668825149536, |
|
"learning_rate": 0.00014774774774774775, |
|
"loss": 0.1618, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 6.267806267806268, |
|
"grad_norm": 3.983106851577759, |
|
"learning_rate": 0.00014524524524524526, |
|
"loss": 0.1273, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 6.552706552706553, |
|
"grad_norm": 1.7318657636642456, |
|
"learning_rate": 0.00014274274274274275, |
|
"loss": 0.1334, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 6.837606837606837, |
|
"grad_norm": 1.885850191116333, |
|
"learning_rate": 0.00014024024024024024, |
|
"loss": 0.1329, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 7.122507122507122, |
|
"grad_norm": 4.529383659362793, |
|
"learning_rate": 0.00013773773773773776, |
|
"loss": 0.1313, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 7.407407407407407, |
|
"grad_norm": 2.217284679412842, |
|
"learning_rate": 0.00013523523523523525, |
|
"loss": 0.114, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"grad_norm": 2.4019832611083984, |
|
"learning_rate": 0.00013273273273273274, |
|
"loss": 0.1175, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 7.977207977207978, |
|
"grad_norm": 2.7499570846557617, |
|
"learning_rate": 0.00013023023023023023, |
|
"loss": 0.1229, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 8.262108262108262, |
|
"grad_norm": 1.7632925510406494, |
|
"learning_rate": 0.00012772772772772775, |
|
"loss": 0.0974, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 8.547008547008547, |
|
"grad_norm": 1.798587441444397, |
|
"learning_rate": 0.00012522522522522524, |
|
"loss": 0.1031, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 8.831908831908832, |
|
"grad_norm": 2.925875663757324, |
|
"learning_rate": 0.00012272272272272273, |
|
"loss": 0.1098, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 9.116809116809117, |
|
"grad_norm": 2.593219757080078, |
|
"learning_rate": 0.00012022022022022023, |
|
"loss": 0.0979, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 9.401709401709402, |
|
"grad_norm": 2.457099199295044, |
|
"learning_rate": 0.00011771771771771771, |
|
"loss": 0.0917, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 9.686609686609687, |
|
"grad_norm": 2.513124465942383, |
|
"learning_rate": 0.00011521521521521521, |
|
"loss": 0.0884, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 9.971509971509972, |
|
"grad_norm": 0.9835783243179321, |
|
"learning_rate": 0.00011271271271271271, |
|
"loss": 0.0891, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 10.256410256410255, |
|
"grad_norm": 1.4291648864746094, |
|
"learning_rate": 0.0001102102102102102, |
|
"loss": 0.0792, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 10.54131054131054, |
|
"grad_norm": 0.974391758441925, |
|
"learning_rate": 0.00010770770770770771, |
|
"loss": 0.0834, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 10.826210826210826, |
|
"grad_norm": 2.3604581356048584, |
|
"learning_rate": 0.0001052052052052052, |
|
"loss": 0.0829, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 11.11111111111111, |
|
"grad_norm": 1.6176166534423828, |
|
"learning_rate": 0.0001027027027027027, |
|
"loss": 0.08, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 11.396011396011396, |
|
"grad_norm": 1.9266570806503296, |
|
"learning_rate": 0.0001002002002002002, |
|
"loss": 0.0763, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 11.68091168091168, |
|
"grad_norm": 2.1315979957580566, |
|
"learning_rate": 9.76976976976977e-05, |
|
"loss": 0.0769, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 11.965811965811966, |
|
"grad_norm": 1.4953100681304932, |
|
"learning_rate": 9.51951951951952e-05, |
|
"loss": 0.0787, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 12.250712250712251, |
|
"grad_norm": 1.715878963470459, |
|
"learning_rate": 9.26926926926927e-05, |
|
"loss": 0.0699, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 12.535612535612536, |
|
"grad_norm": 0.5827972888946533, |
|
"learning_rate": 9.019019019019019e-05, |
|
"loss": 0.0738, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 12.820512820512821, |
|
"grad_norm": 0.3549967408180237, |
|
"learning_rate": 8.76876876876877e-05, |
|
"loss": 0.0743, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 13.105413105413106, |
|
"grad_norm": 2.361628532409668, |
|
"learning_rate": 8.518518518518518e-05, |
|
"loss": 0.0692, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 13.39031339031339, |
|
"grad_norm": 1.2454289197921753, |
|
"learning_rate": 8.268268268268269e-05, |
|
"loss": 0.0711, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 13.675213675213675, |
|
"grad_norm": 1.1497353315353394, |
|
"learning_rate": 8.018018018018019e-05, |
|
"loss": 0.0688, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 13.96011396011396, |
|
"grad_norm": 2.107264995574951, |
|
"learning_rate": 7.767767767767768e-05, |
|
"loss": 0.0706, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 14.245014245014245, |
|
"grad_norm": 0.4413589537143707, |
|
"learning_rate": 7.517517517517519e-05, |
|
"loss": 0.0664, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 14.52991452991453, |
|
"grad_norm": 1.1788307428359985, |
|
"learning_rate": 7.267267267267268e-05, |
|
"loss": 0.0658, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 14.814814814814815, |
|
"grad_norm": 0.5639584064483643, |
|
"learning_rate": 7.017017017017016e-05, |
|
"loss": 0.063, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 15.0997150997151, |
|
"grad_norm": 0.2644463777542114, |
|
"learning_rate": 6.766766766766767e-05, |
|
"loss": 0.0653, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 15.384615384615385, |
|
"grad_norm": 0.10696238279342651, |
|
"learning_rate": 6.516516516516516e-05, |
|
"loss": 0.0614, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 15.66951566951567, |
|
"grad_norm": 1.1413618326187134, |
|
"learning_rate": 6.266266266266266e-05, |
|
"loss": 0.0637, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 15.954415954415955, |
|
"grad_norm": 0.09536276012659073, |
|
"learning_rate": 6.016016016016016e-05, |
|
"loss": 0.0617, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 16.23931623931624, |
|
"grad_norm": 0.09245631843805313, |
|
"learning_rate": 5.765765765765766e-05, |
|
"loss": 0.06, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 16.524216524216524, |
|
"grad_norm": 1.483127474784851, |
|
"learning_rate": 5.515515515515516e-05, |
|
"loss": 0.0605, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 16.80911680911681, |
|
"grad_norm": 0.12009686231613159, |
|
"learning_rate": 5.2652652652652655e-05, |
|
"loss": 0.062, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 17.094017094017094, |
|
"grad_norm": 0.0926225408911705, |
|
"learning_rate": 5.015015015015015e-05, |
|
"loss": 0.0586, |
|
"step": 3000 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 4001, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 23, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.09931874049065e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|