|
{ |
|
"best_metric": 1.0717943906784058, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.34916201117318435, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0006983240223463687, |
|
"eval_loss": 1.370945930480957, |
|
"eval_runtime": 51.907, |
|
"eval_samples_per_second": 11.617, |
|
"eval_steps_per_second": 2.909, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.006983240223463687, |
|
"grad_norm": 0.8086255788803101, |
|
"learning_rate": 4.34e-05, |
|
"loss": 1.074, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.013966480446927373, |
|
"grad_norm": 0.5126949548721313, |
|
"learning_rate": 8.68e-05, |
|
"loss": 0.9041, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02094972067039106, |
|
"grad_norm": 0.6224434971809387, |
|
"learning_rate": 0.0001302, |
|
"loss": 0.9133, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.027932960893854747, |
|
"grad_norm": 0.596337616443634, |
|
"learning_rate": 0.0001736, |
|
"loss": 0.8771, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.034916201117318434, |
|
"grad_norm": 1.162047266960144, |
|
"learning_rate": 0.000217, |
|
"loss": 2.273, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.034916201117318434, |
|
"eval_loss": 1.2856680154800415, |
|
"eval_runtime": 51.8319, |
|
"eval_samples_per_second": 11.634, |
|
"eval_steps_per_second": 2.913, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04189944134078212, |
|
"grad_norm": 0.47395944595336914, |
|
"learning_rate": 0.00021673569945319091, |
|
"loss": 0.9326, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.04888268156424581, |
|
"grad_norm": 0.45159703493118286, |
|
"learning_rate": 0.00021594408545846038, |
|
"loss": 0.864, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.055865921787709494, |
|
"grad_norm": 0.48672589659690857, |
|
"learning_rate": 0.0002146290146796179, |
|
"loss": 0.9053, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06284916201117319, |
|
"grad_norm": 0.6035557389259338, |
|
"learning_rate": 0.0002127968940093076, |
|
"loss": 0.9917, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.06983240223463687, |
|
"grad_norm": 1.352277398109436, |
|
"learning_rate": 0.00021045664935527106, |
|
"loss": 2.3177, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06983240223463687, |
|
"eval_loss": 1.2862142324447632, |
|
"eval_runtime": 51.6109, |
|
"eval_samples_per_second": 11.684, |
|
"eval_steps_per_second": 2.926, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07681564245810056, |
|
"grad_norm": 0.5764887928962708, |
|
"learning_rate": 0.00020761968215422217, |
|
"loss": 0.9639, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.08379888268156424, |
|
"grad_norm": 0.45929816365242004, |
|
"learning_rate": 0.00020429981382519356, |
|
"loss": 0.9454, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.09078212290502793, |
|
"grad_norm": 0.5289813280105591, |
|
"learning_rate": 0.00020051321843297219, |
|
"loss": 0.8689, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.09776536312849161, |
|
"grad_norm": 0.5550933480262756, |
|
"learning_rate": 0.0001962783438896818, |
|
"loss": 0.8495, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.10474860335195531, |
|
"grad_norm": 1.3829443454742432, |
|
"learning_rate": 0.0001916158220784091, |
|
"loss": 2.1619, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.10474860335195531, |
|
"eval_loss": 1.2346261739730835, |
|
"eval_runtime": 51.8194, |
|
"eval_samples_per_second": 11.637, |
|
"eval_steps_per_second": 2.914, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.11173184357541899, |
|
"grad_norm": 0.4873850345611572, |
|
"learning_rate": 0.00018654836833674362, |
|
"loss": 0.9312, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.11871508379888268, |
|
"grad_norm": 0.4591498076915741, |
|
"learning_rate": 0.0001811006707899361, |
|
"loss": 0.8553, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.12569832402234637, |
|
"grad_norm": 0.44848743081092834, |
|
"learning_rate": 0.0001752992700728339, |
|
"loss": 0.8601, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.13268156424581007, |
|
"grad_norm": 0.49805349111557007, |
|
"learning_rate": 0.00016917243002657602, |
|
"loss": 0.7799, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.13966480446927373, |
|
"grad_norm": 1.24271821975708, |
|
"learning_rate": 0.00016275, |
|
"loss": 1.9841, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.13966480446927373, |
|
"eval_loss": 1.2006936073303223, |
|
"eval_runtime": 51.8223, |
|
"eval_samples_per_second": 11.636, |
|
"eval_steps_per_second": 2.914, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.14664804469273743, |
|
"grad_norm": 0.45036932826042175, |
|
"learning_rate": 0.0001560632694266149, |
|
"loss": 0.9129, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.15363128491620112, |
|
"grad_norm": 0.4322090446949005, |
|
"learning_rate": 0.00014914481538562646, |
|
"loss": 0.9577, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.16061452513966482, |
|
"grad_norm": 0.4544154703617096, |
|
"learning_rate": 0.0001420283438896818, |
|
"loss": 0.7483, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.16759776536312848, |
|
"grad_norm": 0.6254767179489136, |
|
"learning_rate": 0.00013474852567256393, |
|
"loss": 0.961, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.17458100558659218, |
|
"grad_norm": 1.0982528924942017, |
|
"learning_rate": 0.00012734082727686196, |
|
"loss": 2.2834, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.17458100558659218, |
|
"eval_loss": 1.151829481124878, |
|
"eval_runtime": 51.8371, |
|
"eval_samples_per_second": 11.633, |
|
"eval_steps_per_second": 2.913, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.18156424581005587, |
|
"grad_norm": 0.49490973353385925, |
|
"learning_rate": 0.0001198413382645404, |
|
"loss": 0.9051, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.18854748603351956, |
|
"grad_norm": 0.419166624546051, |
|
"learning_rate": 0.00011228659539222137, |
|
"loss": 0.856, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.19553072625698323, |
|
"grad_norm": 0.44810253381729126, |
|
"learning_rate": 0.00010471340460777866, |
|
"loss": 0.772, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.20251396648044692, |
|
"grad_norm": 0.5431365370750427, |
|
"learning_rate": 9.715866173545961e-05, |
|
"loss": 0.8117, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.20949720670391062, |
|
"grad_norm": 1.623485803604126, |
|
"learning_rate": 8.965917272313806e-05, |
|
"loss": 1.9584, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.20949720670391062, |
|
"eval_loss": 1.1206214427947998, |
|
"eval_runtime": 51.6387, |
|
"eval_samples_per_second": 11.677, |
|
"eval_steps_per_second": 2.924, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2164804469273743, |
|
"grad_norm": 0.4230874478816986, |
|
"learning_rate": 8.225147432743606e-05, |
|
"loss": 0.9144, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.22346368715083798, |
|
"grad_norm": 0.41244614124298096, |
|
"learning_rate": 7.497165611031821e-05, |
|
"loss": 0.8197, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.23044692737430167, |
|
"grad_norm": 0.46321332454681396, |
|
"learning_rate": 6.785518461437353e-05, |
|
"loss": 0.8108, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.23743016759776536, |
|
"grad_norm": 0.4879332184791565, |
|
"learning_rate": 6.093673057338509e-05, |
|
"loss": 0.8025, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.24441340782122906, |
|
"grad_norm": 1.3698660135269165, |
|
"learning_rate": 5.4250000000000024e-05, |
|
"loss": 2.2907, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.24441340782122906, |
|
"eval_loss": 1.0982829332351685, |
|
"eval_runtime": 51.8499, |
|
"eval_samples_per_second": 11.63, |
|
"eval_steps_per_second": 2.912, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.25139664804469275, |
|
"grad_norm": 0.40256136655807495, |
|
"learning_rate": 4.782756997342398e-05, |
|
"loss": 0.8455, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.25837988826815644, |
|
"grad_norm": 0.4019988775253296, |
|
"learning_rate": 4.170072992716607e-05, |
|
"loss": 0.8504, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.26536312849162014, |
|
"grad_norm": 0.46585512161254883, |
|
"learning_rate": 3.5899329210063916e-05, |
|
"loss": 0.7688, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.2723463687150838, |
|
"grad_norm": 0.49366164207458496, |
|
"learning_rate": 3.045163166325637e-05, |
|
"loss": 0.9192, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.27932960893854747, |
|
"grad_norm": 1.1542916297912598, |
|
"learning_rate": 2.5384177921590895e-05, |
|
"loss": 2.3411, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.27932960893854747, |
|
"eval_loss": 1.0796287059783936, |
|
"eval_runtime": 51.6139, |
|
"eval_samples_per_second": 11.683, |
|
"eval_steps_per_second": 2.926, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.28631284916201116, |
|
"grad_norm": 0.40678611397743225, |
|
"learning_rate": 2.0721656110318213e-05, |
|
"loss": 0.8682, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.29329608938547486, |
|
"grad_norm": 0.38841360807418823, |
|
"learning_rate": 1.6486781567027783e-05, |
|
"loss": 0.797, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.30027932960893855, |
|
"grad_norm": 0.4177733361721039, |
|
"learning_rate": 1.2700186174806422e-05, |
|
"loss": 0.7461, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.30726256983240224, |
|
"grad_norm": 0.5100810527801514, |
|
"learning_rate": 9.380317845777794e-06, |
|
"loss": 0.8731, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.31424581005586594, |
|
"grad_norm": 1.1657980680465698, |
|
"learning_rate": 6.543350644728947e-06, |
|
"loss": 2.3673, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.31424581005586594, |
|
"eval_loss": 1.072546362876892, |
|
"eval_runtime": 51.6984, |
|
"eval_samples_per_second": 11.664, |
|
"eval_steps_per_second": 2.921, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.32122905027932963, |
|
"grad_norm": 0.3935563862323761, |
|
"learning_rate": 4.2031059906924e-06, |
|
"loss": 0.8383, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.32821229050279327, |
|
"grad_norm": 0.3855022192001343, |
|
"learning_rate": 2.3709853203820825e-06, |
|
"loss": 0.7788, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.33519553072625696, |
|
"grad_norm": 0.4132440984249115, |
|
"learning_rate": 1.0559145415396157e-06, |
|
"loss": 0.7524, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.34217877094972066, |
|
"grad_norm": 0.4790831208229065, |
|
"learning_rate": 2.643005468090745e-07, |
|
"loss": 0.8183, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.34916201117318435, |
|
"grad_norm": 1.258919596672058, |
|
"learning_rate": 0.0, |
|
"loss": 2.1397, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.34916201117318435, |
|
"eval_loss": 1.0717943906784058, |
|
"eval_runtime": 52.0261, |
|
"eval_samples_per_second": 11.59, |
|
"eval_steps_per_second": 2.902, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.408558478917632e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|