|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.988458927359131,
  "eval_steps": 500,
  "global_step": 552,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05431093007467753,
      "grad_norm": 1.4870964288711548,
      "learning_rate": 0.00019997351589651408,
      "loss": 3.4965,
      "step": 10
    },
    {
      "epoch": 0.10862186014935506,
      "grad_norm": 1.784044861793518,
      "learning_rate": 0.00019967573081342103,
      "loss": 2.065,
      "step": 20
    },
    {
      "epoch": 0.1629327902240326,
      "grad_norm": 0.7305468916893005,
      "learning_rate": 0.00019904804439875633,
      "loss": 1.2421,
      "step": 30
    },
    {
      "epoch": 0.2172437202987101,
      "grad_norm": 0.6995559930801392,
      "learning_rate": 0.00019809253413499565,
      "loss": 1.093,
      "step": 40
    },
    {
      "epoch": 0.27155465037338766,
      "grad_norm": 0.6627448201179504,
      "learning_rate": 0.00019681236251822273,
      "loss": 1.0856,
      "step": 50
    },
    {
      "epoch": 0.3258655804480652,
      "grad_norm": 0.7160666584968567,
      "learning_rate": 0.00019521176659107142,
      "loss": 1.013,
      "step": 60
    },
    {
      "epoch": 0.3801765105227427,
      "grad_norm": 0.6306814551353455,
      "learning_rate": 0.0001932960439191915,
      "loss": 1.0374,
      "step": 70
    },
    {
      "epoch": 0.4344874405974202,
      "grad_norm": 0.7758208513259888,
      "learning_rate": 0.00019107153505765306,
      "loss": 0.9474,
      "step": 80
    },
    {
      "epoch": 0.48879837067209775,
      "grad_norm": 1.2394300699234009,
      "learning_rate": 0.000188545602565321,
      "loss": 0.9932,
      "step": 90
    },
    {
      "epoch": 0.5431093007467753,
      "grad_norm": 0.829031229019165,
      "learning_rate": 0.0001857266066366567,
      "loss": 0.9204,
      "step": 100
    },
    {
      "epoch": 0.5974202308214528,
      "grad_norm": 0.7629134654998779,
      "learning_rate": 0.0001826238774315995,
      "loss": 0.9457,
      "step": 110
    },
    {
      "epoch": 0.6517311608961304,
      "grad_norm": 0.8157823085784912,
      "learning_rate": 0.00017924768419510904,
      "loss": 0.8539,
      "step": 120
    },
    {
      "epoch": 0.7060420909708078,
      "grad_norm": 0.7475631237030029,
      "learning_rate": 0.0001756092012685749,
      "loss": 0.82,
      "step": 130
    },
    {
      "epoch": 0.7603530210454854,
      "grad_norm": 0.6592528223991394,
      "learning_rate": 0.000171720471105587,
      "loss": 0.8846,
      "step": 140
    },
    {
      "epoch": 0.814663951120163,
      "grad_norm": 0.6989027857780457,
      "learning_rate": 0.00016759436441447545,
      "loss": 0.8367,
      "step": 150
    },
    {
      "epoch": 0.8689748811948405,
      "grad_norm": 0.7253873348236084,
      "learning_rate": 0.00016324453755953773,
      "loss": 0.8068,
      "step": 160
    },
    {
      "epoch": 0.923285811269518,
      "grad_norm": 0.7640873193740845,
      "learning_rate": 0.00015868538736194427,
      "loss": 0.8169,
      "step": 170
    },
    {
      "epoch": 0.9775967413441955,
      "grad_norm": 0.7669989466667175,
      "learning_rate": 0.00015393200344991995,
      "loss": 0.8355,
      "step": 180
    },
    {
      "epoch": 1.0271554650373387,
      "grad_norm": 0.7532988786697388,
      "learning_rate": 0.0001490001183159105,
      "loss": 0.7339,
      "step": 190
    },
    {
      "epoch": 1.0814663951120163,
      "grad_norm": 0.7974510192871094,
      "learning_rate": 0.0001439060552460318,
      "loss": 0.8186,
      "step": 200
    },
    {
      "epoch": 1.1357773251866938,
      "grad_norm": 0.9017219543457031,
      "learning_rate": 0.0001386666742941419,
      "loss": 0.775,
      "step": 210
    },
    {
      "epoch": 1.1900882552613714,
      "grad_norm": 0.8205109238624573,
      "learning_rate": 0.00013329931647934883,
      "loss": 0.7421,
      "step": 220
    },
    {
      "epoch": 1.2443991853360488,
      "grad_norm": 0.866692066192627,
      "learning_rate": 0.0001278217463916453,
      "loss": 0.7113,
      "step": 230
    },
    {
      "epoch": 1.2987101154107263,
      "grad_norm": 0.8832337856292725,
      "learning_rate": 0.00012225209339563145,
      "loss": 0.7545,
      "step": 240
    },
    {
      "epoch": 1.353021045485404,
      "grad_norm": 1.0796443223953247,
      "learning_rate": 0.00011660879162692675,
      "loss": 0.7085,
      "step": 250
    },
    {
      "epoch": 1.4073319755600815,
      "grad_norm": 0.9231683015823364,
      "learning_rate": 0.00011091051897986678,
      "loss": 0.7168,
      "step": 260
    },
    {
      "epoch": 1.461642905634759,
      "grad_norm": 0.8881363272666931,
      "learning_rate": 0.00010517613528842097,
      "loss": 0.7606,
      "step": 270
    },
    {
      "epoch": 1.5159538357094364,
      "grad_norm": 0.8930597901344299,
      "learning_rate": 9.942461990493625e-05,
      "loss": 0.6926,
      "step": 280
    },
    {
      "epoch": 1.570264765784114,
      "grad_norm": 1.0270030498504639,
      "learning_rate": 9.367500888330545e-05,
      "loss": 0.7571,
      "step": 290
    },
    {
      "epoch": 1.6245756958587916,
      "grad_norm": 0.8959159255027771,
      "learning_rate": 8.79463319744677e-05,
      "loss": 0.7786,
      "step": 300
    },
    {
      "epoch": 1.6788866259334692,
      "grad_norm": 0.8595919013023376,
      "learning_rate": 8.225754964277018e-05,
      "loss": 0.6935,
      "step": 310
    },
    {
      "epoch": 1.7331975560081467,
      "grad_norm": 0.953175961971283,
      "learning_rate": 7.662749031165092e-05,
      "loss": 0.6901,
      "step": 320
    },
    {
      "epoch": 1.787508486082824,
      "grad_norm": 0.985431969165802,
      "learning_rate": 7.107478804634325e-05,
      "loss": 0.7101,
      "step": 330
    },
    {
      "epoch": 1.8418194161575017,
      "grad_norm": 1.0016827583312988,
      "learning_rate": 6.561782087985681e-05,
      "loss": 0.707,
      "step": 340
    },
    {
      "epoch": 1.8961303462321792,
      "grad_norm": 0.9732582569122314,
      "learning_rate": 6.02746499863599e-05,
      "loss": 0.7426,
      "step": 350
    },
    {
      "epoch": 1.9504412763068566,
      "grad_norm": 0.9253762364387512,
      "learning_rate": 5.506295990328385e-05,
      "loss": 0.7273,
      "step": 360
    },
    {
      "epoch": 2.0,
      "grad_norm": 2.792293071746826,
      "learning_rate": 5.000000000000002e-05,
      "loss": 0.7256,
      "step": 370
    },
    {
      "epoch": 2.0543109300746774,
      "grad_norm": 0.9254827499389648,
      "learning_rate": 4.510252738679136e-05,
      "loss": 0.6432,
      "step": 380
    },
    {
      "epoch": 2.108621860149355,
      "grad_norm": 1.0876941680908203,
      "learning_rate": 4.038675145307747e-05,
      "loss": 0.6256,
      "step": 390
    },
    {
      "epoch": 2.1629327902240325,
      "grad_norm": 0.916249692440033,
      "learning_rate": 3.5868280218455796e-05,
      "loss": 0.6442,
      "step": 400
    },
    {
      "epoch": 2.2172437202987103,
      "grad_norm": 0.9240853190422058,
      "learning_rate": 3.1562068674124344e-05,
      "loss": 0.5883,
      "step": 410
    },
    {
      "epoch": 2.2715546503733877,
      "grad_norm": 1.2008038759231567,
      "learning_rate": 2.7482369285662378e-05,
      "loss": 0.6987,
      "step": 420
    },
    {
      "epoch": 2.325865580448065,
      "grad_norm": 1.2723044157028198,
      "learning_rate": 2.364268482099218e-05,
      "loss": 0.708,
      "step": 430
    },
    {
      "epoch": 2.380176510522743,
      "grad_norm": 0.9695908427238464,
      "learning_rate": 2.0055723659649904e-05,
      "loss": 0.6782,
      "step": 440
    },
    {
      "epoch": 2.43448744059742,
      "grad_norm": 1.044391393661499,
      "learning_rate": 1.6733357731279377e-05,
      "loss": 0.5803,
      "step": 450
    },
    {
      "epoch": 2.4887983706720975,
      "grad_norm": 0.9964624643325806,
      "learning_rate": 1.368658322256311e-05,
      "loss": 0.6112,
      "step": 460
    },
    {
      "epoch": 2.5431093007467753,
      "grad_norm": 1.004639744758606,
      "learning_rate": 1.0925484182639467e-05,
      "loss": 0.6322,
      "step": 470
    },
    {
      "epoch": 2.5974202308214527,
      "grad_norm": 1.1456069946289062,
      "learning_rate": 8.45919914746337e-06,
      "loss": 0.5633,
      "step": 480
    },
    {
      "epoch": 2.6517311608961305,
      "grad_norm": 1.1862763166427612,
      "learning_rate": 6.2958908935752955e-06,
      "loss": 0.5859,
      "step": 490
    },
    {
      "epoch": 2.706042090970808,
      "grad_norm": 1.1233826875686646,
      "learning_rate": 4.442719421385922e-06,
      "loss": 0.6147,
      "step": 500
    },
    {
      "epoch": 2.7603530210454856,
      "grad_norm": 1.0159374475479126,
      "learning_rate": 2.905818257394799e-06,
      "loss": 0.5829,
      "step": 510
    },
    {
      "epoch": 2.814663951120163,
      "grad_norm": 1.053791880607605,
      "learning_rate": 1.6902741537767609e-06,
      "loss": 0.5938,
      "step": 520
    },
    {
      "epoch": 2.8689748811948403,
      "grad_norm": 1.0928566455841064,
      "learning_rate": 8.00110252525299e-07,
      "loss": 0.6136,
      "step": 530
    },
    {
      "epoch": 2.923285811269518,
      "grad_norm": 1.1599104404449463,
      "learning_rate": 2.382727698752474e-07,
      "loss": 0.6389,
      "step": 540
    },
    {
      "epoch": 2.9775967413441955,
      "grad_norm": 1.2020913362503052,
      "learning_rate": 6.621245075910665e-09,
      "loss": 0.6719,
      "step": 550
    }
  ],
  "logging_steps": 10,
  "max_steps": 552,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4322859040948224.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}