|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 29073, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05159426271798576, |
|
"grad_norm": 7.558670997619629, |
|
"learning_rate": 2.5736389684813753e-06, |
|
"loss": 0.7519, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.10318852543597153, |
|
"grad_norm": 6.3421125411987305, |
|
"learning_rate": 5.15243553008596e-06, |
|
"loss": 0.3291, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.15478278815395727, |
|
"grad_norm": 7.725200176239014, |
|
"learning_rate": 7.720916905444127e-06, |
|
"loss": 0.2716, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.20637705087194305, |
|
"grad_norm": 6.253057479858398, |
|
"learning_rate": 8.917008196721311e-06, |
|
"loss": 0.2444, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.2579713135899288, |
|
"grad_norm": 8.090571403503418, |
|
"learning_rate": 8.752341920374708e-06, |
|
"loss": 0.2057, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.30956557630791454, |
|
"grad_norm": 5.437821865081787, |
|
"learning_rate": 8.587675644028104e-06, |
|
"loss": 0.1938, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.3611598390259003, |
|
"grad_norm": 3.594510316848755, |
|
"learning_rate": 8.423009367681498e-06, |
|
"loss": 0.1888, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.4127541017438861, |
|
"grad_norm": 1.5619558095932007, |
|
"learning_rate": 8.258343091334895e-06, |
|
"loss": 0.1681, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.46434836446187183, |
|
"grad_norm": 5.857831001281738, |
|
"learning_rate": 8.093676814988291e-06, |
|
"loss": 0.1581, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.5159426271798576, |
|
"grad_norm": 2.4653804302215576, |
|
"learning_rate": 7.929010538641687e-06, |
|
"loss": 0.1469, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.5675368898978433, |
|
"grad_norm": 8.388437271118164, |
|
"learning_rate": 7.764673594847776e-06, |
|
"loss": 0.1416, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.6191311526158291, |
|
"grad_norm": 0.739997386932373, |
|
"learning_rate": 7.600007318501171e-06, |
|
"loss": 0.143, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.6707254153338149, |
|
"grad_norm": 4.071037292480469, |
|
"learning_rate": 7.435341042154568e-06, |
|
"loss": 0.1309, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.7223196780518006, |
|
"grad_norm": 1.3754676580429077, |
|
"learning_rate": 7.270674765807963e-06, |
|
"loss": 0.1311, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.7739139407697864, |
|
"grad_norm": 7.085391998291016, |
|
"learning_rate": 7.1060084894613585e-06, |
|
"loss": 0.1194, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.8255082034877722, |
|
"grad_norm": 3.828031539916992, |
|
"learning_rate": 6.941342213114754e-06, |
|
"loss": 0.1128, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.8771024662057579, |
|
"grad_norm": 6.162450790405273, |
|
"learning_rate": 6.77667593676815e-06, |
|
"loss": 0.115, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.9286967289237437, |
|
"grad_norm": 3.1122217178344727, |
|
"learning_rate": 6.612009660421546e-06, |
|
"loss": 0.1105, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.9802909916417294, |
|
"grad_norm": 9.108379364013672, |
|
"learning_rate": 6.447343384074942e-06, |
|
"loss": 0.1054, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_dev_accuracy": 0.9035151515151515, |
|
"eval_dev_loss": 0.38524121046066284, |
|
"eval_dev_runtime": 24.3219, |
|
"eval_dev_samples_per_second": 678.401, |
|
"eval_dev_steps_per_second": 42.431, |
|
"step": 9691 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_test_accuracy": 0.9203636363636364, |
|
"eval_test_loss": 0.31113502383232117, |
|
"eval_test_runtime": 24.5354, |
|
"eval_test_samples_per_second": 672.497, |
|
"eval_test_steps_per_second": 42.062, |
|
"step": 9691 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_verified_test_accuracy": 0.963639301874596, |
|
"eval_verified_test_loss": 0.13967397809028625, |
|
"eval_verified_test_runtime": 18.4171, |
|
"eval_verified_test_samples_per_second": 671.986, |
|
"eval_verified_test_steps_per_second": 42.026, |
|
"step": 9691 |
|
}, |
|
{ |
|
"epoch": 1.0318852543597152, |
|
"grad_norm": 0.7660722136497498, |
|
"learning_rate": 6.2826771077283374e-06, |
|
"loss": 0.095, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.083479517077701, |
|
"grad_norm": 4.1145124435424805, |
|
"learning_rate": 6.118010831381733e-06, |
|
"loss": 0.0881, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.1350737797956867, |
|
"grad_norm": 0.3238352835178375, |
|
"learning_rate": 5.953344555035129e-06, |
|
"loss": 0.0771, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.1866680425136724, |
|
"grad_norm": 5.4124908447265625, |
|
"learning_rate": 5.788678278688525e-06, |
|
"loss": 0.0841, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.2382623052316581, |
|
"grad_norm": 13.157797813415527, |
|
"learning_rate": 5.624012002341921e-06, |
|
"loss": 0.0861, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.289856567949644, |
|
"grad_norm": 1.3548617362976074, |
|
"learning_rate": 5.4593457259953155e-06, |
|
"loss": 0.0852, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.3414508306676298, |
|
"grad_norm": 2.0701634883880615, |
|
"learning_rate": 5.294679449648712e-06, |
|
"loss": 0.088, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.3930450933856156, |
|
"grad_norm": 0.24185556173324585, |
|
"learning_rate": 5.1303425058548e-06, |
|
"loss": 0.076, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.4446393561036013, |
|
"grad_norm": 1.3316065073013306, |
|
"learning_rate": 4.965676229508197e-06, |
|
"loss": 0.0769, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.496233618821587, |
|
"grad_norm": 2.571403741836548, |
|
"learning_rate": 4.801339285714286e-06, |
|
"loss": 0.0775, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.5478278815395727, |
|
"grad_norm": 2.0419647693634033, |
|
"learning_rate": 4.636673009367682e-06, |
|
"loss": 0.0757, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.5994221442575585, |
|
"grad_norm": 4.6739912033081055, |
|
"learning_rate": 4.472006733021077e-06, |
|
"loss": 0.0775, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.6510164069755442, |
|
"grad_norm": 7.1135053634643555, |
|
"learning_rate": 4.307340456674473e-06, |
|
"loss": 0.0728, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.70261066969353, |
|
"grad_norm": 4.576988220214844, |
|
"learning_rate": 4.142674180327869e-06, |
|
"loss": 0.0774, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.7542049324115159, |
|
"grad_norm": 0.0673130601644516, |
|
"learning_rate": 3.978337236533958e-06, |
|
"loss": 0.0636, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.8057991951295016, |
|
"grad_norm": 8.489474296569824, |
|
"learning_rate": 3.8136709601873536e-06, |
|
"loss": 0.0692, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.8573934578474873, |
|
"grad_norm": 13.07606315612793, |
|
"learning_rate": 3.6490046838407495e-06, |
|
"loss": 0.0689, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.9089877205654733, |
|
"grad_norm": 0.5132156610488892, |
|
"learning_rate": 3.4843384074941454e-06, |
|
"loss": 0.0668, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.960581983283459, |
|
"grad_norm": 6.281818389892578, |
|
"learning_rate": 3.3196721311475413e-06, |
|
"loss": 0.0705, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_dev_accuracy": 0.8983636363636364, |
|
"eval_dev_loss": 0.5292544364929199, |
|
"eval_dev_runtime": 24.2247, |
|
"eval_dev_samples_per_second": 681.122, |
|
"eval_dev_steps_per_second": 42.601, |
|
"step": 19382 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_test_accuracy": 0.9152727272727272, |
|
"eval_test_loss": 0.4211724102497101, |
|
"eval_test_runtime": 24.492, |
|
"eval_test_samples_per_second": 673.688, |
|
"eval_test_steps_per_second": 42.136, |
|
"step": 19382 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_verified_test_accuracy": 0.9569327731092437, |
|
"eval_verified_test_loss": 0.1964138001203537, |
|
"eval_verified_test_runtime": 18.3946, |
|
"eval_verified_test_samples_per_second": 672.807, |
|
"eval_verified_test_steps_per_second": 42.078, |
|
"step": 19382 |
|
}, |
|
{ |
|
"epoch": 2.0121762460014447, |
|
"grad_norm": 0.1685587465763092, |
|
"learning_rate": 3.15533518735363e-06, |
|
"loss": 0.0604, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.0637705087194305, |
|
"grad_norm": 3.583531141281128, |
|
"learning_rate": 2.990668911007026e-06, |
|
"loss": 0.0486, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.115364771437416, |
|
"grad_norm": 2.067493200302124, |
|
"learning_rate": 2.8260026346604216e-06, |
|
"loss": 0.0446, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 2.166959034155402, |
|
"grad_norm": 0.8109681010246277, |
|
"learning_rate": 2.6616656908665106e-06, |
|
"loss": 0.043, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.2185532968733876, |
|
"grad_norm": 1.798444151878357, |
|
"learning_rate": 2.4969994145199065e-06, |
|
"loss": 0.051, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 2.2701475595913734, |
|
"grad_norm": 0.02468039281666279, |
|
"learning_rate": 2.332333138173302e-06, |
|
"loss": 0.0475, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.321741822309359, |
|
"grad_norm": 6.017990589141846, |
|
"learning_rate": 2.167666861826698e-06, |
|
"loss": 0.0473, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 2.373336085027345, |
|
"grad_norm": 0.12943503260612488, |
|
"learning_rate": 2.0030005854800936e-06, |
|
"loss": 0.0469, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.4249303477453306, |
|
"grad_norm": 0.710267961025238, |
|
"learning_rate": 1.8386636416861827e-06, |
|
"loss": 0.0482, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 2.4765246104633163, |
|
"grad_norm": 4.638703346252441, |
|
"learning_rate": 1.6739973653395785e-06, |
|
"loss": 0.0554, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.5281188731813025, |
|
"grad_norm": 1.667284607887268, |
|
"learning_rate": 1.5096604215456673e-06, |
|
"loss": 0.042, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 2.579713135899288, |
|
"grad_norm": 2.6058883666992188, |
|
"learning_rate": 1.3449941451990634e-06, |
|
"loss": 0.0487, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.631307398617274, |
|
"grad_norm": 1.2941348552703857, |
|
"learning_rate": 1.180327868852459e-06, |
|
"loss": 0.0482, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 2.6829016613352596, |
|
"grad_norm": 0.13713909685611725, |
|
"learning_rate": 1.0156615925058547e-06, |
|
"loss": 0.047, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 2.7344959240532454, |
|
"grad_norm": 6.215357303619385, |
|
"learning_rate": 8.509953161592506e-07, |
|
"loss": 0.0408, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 2.786090186771231, |
|
"grad_norm": 0.08112023025751114, |
|
"learning_rate": 6.863290398126464e-07, |
|
"loss": 0.0419, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 2.837684449489217, |
|
"grad_norm": 1.3868869543075562, |
|
"learning_rate": 5.216627634660422e-07, |
|
"loss": 0.0449, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 2.8892787122072026, |
|
"grad_norm": 0.07935940474271774, |
|
"learning_rate": 3.56996487119438e-07, |
|
"loss": 0.048, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 2.9408729749251883, |
|
"grad_norm": 0.05485483631491661, |
|
"learning_rate": 1.9233021077283373e-07, |
|
"loss": 0.0393, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 2.992467237643174, |
|
"grad_norm": 4.398229598999023, |
|
"learning_rate": 2.7663934426229508e-08, |
|
"loss": 0.0446, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_dev_accuracy": 0.8943030303030303, |
|
"eval_dev_loss": 0.6738013625144958, |
|
"eval_dev_runtime": 24.3063, |
|
"eval_dev_samples_per_second": 678.836, |
|
"eval_dev_steps_per_second": 42.458, |
|
"step": 29073 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_test_accuracy": 0.9123030303030303, |
|
"eval_test_loss": 0.5465541481971741, |
|
"eval_test_runtime": 24.5738, |
|
"eval_test_samples_per_second": 671.448, |
|
"eval_test_steps_per_second": 41.996, |
|
"step": 29073 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_verified_test_accuracy": 0.9536199095022625, |
|
"eval_verified_test_loss": 0.27511948347091675, |
|
"eval_verified_test_runtime": 18.4409, |
|
"eval_verified_test_samples_per_second": 671.117, |
|
"eval_verified_test_steps_per_second": 41.972, |
|
"step": 29073 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 29073, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"total_flos": 5.067736824879187e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|