|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 200, |
|
"global_step": 10000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 0.05579125136137009, |
|
"eval_runtime": 37.7203, |
|
"eval_samples_per_second": 424.174, |
|
"eval_steps_per_second": 6.628, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.19435200095176697, |
|
"eval_runtime": 37.6839, |
|
"eval_samples_per_second": 424.584, |
|
"eval_steps_per_second": 6.634, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 148661.390625, |
|
"learning_rate": 4.75e-05, |
|
"loss": 0.2826, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 0.3969748318195343, |
|
"eval_runtime": 37.6897, |
|
"eval_samples_per_second": 424.519, |
|
"eval_steps_per_second": 6.633, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.6244608759880066, |
|
"eval_runtime": 37.6841, |
|
"eval_samples_per_second": 424.583, |
|
"eval_steps_per_second": 6.634, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 5462.595703125, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.8928, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 2.0544872283935547, |
|
"eval_runtime": 37.666, |
|
"eval_samples_per_second": 424.787, |
|
"eval_steps_per_second": 6.637, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 0.3789268136024475, |
|
"eval_runtime": 37.8068, |
|
"eval_samples_per_second": 423.204, |
|
"eval_steps_per_second": 6.613, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"eval_loss": 0.41200506687164307, |
|
"eval_runtime": 37.7019, |
|
"eval_samples_per_second": 424.381, |
|
"eval_steps_per_second": 6.631, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 26773808.0, |
|
"learning_rate": 4.25e-05, |
|
"loss": 0.5735, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 0.9738002419471741, |
|
"eval_runtime": 37.6855, |
|
"eval_samples_per_second": 424.567, |
|
"eval_steps_per_second": 6.634, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"eval_loss": 1.4284050464630127, |
|
"eval_runtime": 37.676, |
|
"eval_samples_per_second": 424.673, |
|
"eval_steps_per_second": 6.636, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1449586.25, |
|
"learning_rate": 4e-05, |
|
"loss": 3.2584, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 3.862783193588257, |
|
"eval_runtime": 37.86, |
|
"eval_samples_per_second": 422.609, |
|
"eval_steps_per_second": 6.603, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"eval_loss": 0.6802809834480286, |
|
"eval_runtime": 37.6736, |
|
"eval_samples_per_second": 424.701, |
|
"eval_steps_per_second": 6.636, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"eval_loss": 0.41678962111473083, |
|
"eval_runtime": 37.886, |
|
"eval_samples_per_second": 422.319, |
|
"eval_steps_per_second": 6.599, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 3005.915771484375, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 1.1454, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"eval_loss": 0.0627516433596611, |
|
"eval_runtime": 37.6977, |
|
"eval_samples_per_second": 424.429, |
|
"eval_steps_per_second": 6.632, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"eval_loss": 0.03529125079512596, |
|
"eval_runtime": 37.6892, |
|
"eval_samples_per_second": 424.525, |
|
"eval_steps_per_second": 6.633, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.08225303888320923, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.0693, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.030110161751508713, |
|
"eval_runtime": 37.7252, |
|
"eval_samples_per_second": 424.12, |
|
"eval_steps_per_second": 6.627, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"eval_loss": 0.029404468834400177, |
|
"eval_runtime": 37.6821, |
|
"eval_samples_per_second": 424.605, |
|
"eval_steps_per_second": 6.634, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"eval_loss": 0.028393320739269257, |
|
"eval_runtime": 37.9071, |
|
"eval_samples_per_second": 422.084, |
|
"eval_steps_per_second": 6.595, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.11493222415447235, |
|
"learning_rate": 3.2500000000000004e-05, |
|
"loss": 0.0299, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"eval_loss": 0.02786264941096306, |
|
"eval_runtime": 37.6844, |
|
"eval_samples_per_second": 424.578, |
|
"eval_steps_per_second": 6.634, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"eval_loss": 0.027447132393717766, |
|
"eval_runtime": 37.6815, |
|
"eval_samples_per_second": 424.611, |
|
"eval_steps_per_second": 6.635, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.09786217659711838, |
|
"learning_rate": 3e-05, |
|
"loss": 0.0287, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.02736382931470871, |
|
"eval_runtime": 37.6836, |
|
"eval_samples_per_second": 424.588, |
|
"eval_steps_per_second": 6.634, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"eval_loss": 0.02714085392653942, |
|
"eval_runtime": 37.7033, |
|
"eval_samples_per_second": 424.366, |
|
"eval_steps_per_second": 6.631, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"eval_loss": 0.026028065010905266, |
|
"eval_runtime": 37.7142, |
|
"eval_samples_per_second": 424.243, |
|
"eval_steps_per_second": 6.629, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.10962454974651337, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 0.0274, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"eval_loss": 0.026036322116851807, |
|
"eval_runtime": 37.9005, |
|
"eval_samples_per_second": 422.157, |
|
"eval_steps_per_second": 6.596, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"eval_loss": 0.026090338826179504, |
|
"eval_runtime": 37.8184, |
|
"eval_samples_per_second": 423.074, |
|
"eval_steps_per_second": 6.611, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.15162990987300873, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.0267, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.025744769722223282, |
|
"eval_runtime": 37.9279, |
|
"eval_samples_per_second": 421.853, |
|
"eval_steps_per_second": 6.591, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"eval_loss": 0.025514526292681694, |
|
"eval_runtime": 37.8847, |
|
"eval_samples_per_second": 422.334, |
|
"eval_steps_per_second": 6.599, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"eval_loss": 0.025529591366648674, |
|
"eval_runtime": 37.6892, |
|
"eval_samples_per_second": 424.525, |
|
"eval_steps_per_second": 6.633, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 0.05882571265101433, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.0263, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"eval_loss": 0.02538459375500679, |
|
"eval_runtime": 37.6706, |
|
"eval_samples_per_second": 424.735, |
|
"eval_steps_per_second": 6.636, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"eval_loss": 0.02504335343837738, |
|
"eval_runtime": 37.6763, |
|
"eval_samples_per_second": 424.671, |
|
"eval_steps_per_second": 6.635, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.09895586967468262, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0259, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 0.02500820718705654, |
|
"eval_runtime": 37.6737, |
|
"eval_samples_per_second": 424.699, |
|
"eval_steps_per_second": 6.636, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"eval_loss": 0.025201652199029922, |
|
"eval_runtime": 37.9114, |
|
"eval_samples_per_second": 422.036, |
|
"eval_steps_per_second": 6.594, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"eval_loss": 0.02534925751388073, |
|
"eval_runtime": 37.6963, |
|
"eval_samples_per_second": 424.445, |
|
"eval_steps_per_second": 6.632, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.1827753484249115, |
|
"learning_rate": 1.75e-05, |
|
"loss": 0.0256, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"eval_loss": 0.02500862069427967, |
|
"eval_runtime": 37.8861, |
|
"eval_samples_per_second": 422.318, |
|
"eval_steps_per_second": 6.599, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"eval_loss": 0.024682270362973213, |
|
"eval_runtime": 37.762, |
|
"eval_samples_per_second": 423.706, |
|
"eval_steps_per_second": 6.62, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.08037039637565613, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.0253, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 0.025552883744239807, |
|
"eval_runtime": 37.6888, |
|
"eval_samples_per_second": 424.53, |
|
"eval_steps_per_second": 6.633, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"eval_loss": 0.024695081636309624, |
|
"eval_runtime": 37.8117, |
|
"eval_samples_per_second": 423.15, |
|
"eval_steps_per_second": 6.612, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"eval_loss": 0.024455612525343895, |
|
"eval_runtime": 37.8335, |
|
"eval_samples_per_second": 422.906, |
|
"eval_steps_per_second": 6.608, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.12335552275180817, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.0251, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"eval_loss": 0.02451241761445999, |
|
"eval_runtime": 37.6804, |
|
"eval_samples_per_second": 424.624, |
|
"eval_steps_per_second": 6.635, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"eval_loss": 0.024500148370862007, |
|
"eval_runtime": 37.6703, |
|
"eval_samples_per_second": 424.738, |
|
"eval_steps_per_second": 6.637, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.18342430889606476, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0251, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 0.02456432767212391, |
|
"eval_runtime": 37.918, |
|
"eval_samples_per_second": 421.963, |
|
"eval_steps_per_second": 6.593, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 8.2, |
|
"eval_loss": 0.02438100427389145, |
|
"eval_runtime": 37.7187, |
|
"eval_samples_per_second": 424.193, |
|
"eval_steps_per_second": 6.628, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 8.4, |
|
"eval_loss": 0.02455132268369198, |
|
"eval_runtime": 37.679, |
|
"eval_samples_per_second": 424.64, |
|
"eval_steps_per_second": 6.635, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.09001246094703674, |
|
"learning_rate": 7.5e-06, |
|
"loss": 0.0252, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"eval_loss": 0.024252494797110558, |
|
"eval_runtime": 37.731, |
|
"eval_samples_per_second": 424.054, |
|
"eval_steps_per_second": 6.626, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 8.8, |
|
"eval_loss": 0.02421058714389801, |
|
"eval_runtime": 37.6831, |
|
"eval_samples_per_second": 424.593, |
|
"eval_steps_per_second": 6.634, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.07999344915151596, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0244, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 0.024233995005488396, |
|
"eval_runtime": 37.8728, |
|
"eval_samples_per_second": 422.467, |
|
"eval_steps_per_second": 6.601, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"eval_loss": 0.02422364056110382, |
|
"eval_runtime": 37.7417, |
|
"eval_samples_per_second": 423.934, |
|
"eval_steps_per_second": 6.624, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 9.4, |
|
"eval_loss": 0.024163929745554924, |
|
"eval_runtime": 37.7865, |
|
"eval_samples_per_second": 423.432, |
|
"eval_steps_per_second": 6.616, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"grad_norm": 0.10827407240867615, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.0247, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"eval_loss": 0.024232398718595505, |
|
"eval_runtime": 37.8161, |
|
"eval_samples_per_second": 423.1, |
|
"eval_steps_per_second": 6.611, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 9.8, |
|
"eval_loss": 0.02414529025554657, |
|
"eval_runtime": 37.6993, |
|
"eval_samples_per_second": 424.411, |
|
"eval_steps_per_second": 6.631, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.13070951402187347, |
|
"learning_rate": 0.0, |
|
"loss": 0.0245, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 0.024126721546053886, |
|
"eval_runtime": 37.7053, |
|
"eval_samples_per_second": 424.344, |
|
"eval_steps_per_second": 6.63, |
|
"step": 10000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 10000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 1000, |
|
"total_flos": 4.180672512e+16, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|