deberta-v3-dnli / trainer_state.json (commit b852d47, "add model", by Panzy18)
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 29073,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05159426271798576,
"grad_norm": 7.558670997619629,
"learning_rate": 2.5736389684813753e-06,
"loss": 0.7519,
"step": 500
},
{
"epoch": 0.10318852543597153,
"grad_norm": 6.3421125411987305,
"learning_rate": 5.15243553008596e-06,
"loss": 0.3291,
"step": 1000
},
{
"epoch": 0.15478278815395727,
"grad_norm": 7.725200176239014,
"learning_rate": 7.720916905444127e-06,
"loss": 0.2716,
"step": 1500
},
{
"epoch": 0.20637705087194305,
"grad_norm": 6.253057479858398,
"learning_rate": 8.917008196721311e-06,
"loss": 0.2444,
"step": 2000
},
{
"epoch": 0.2579713135899288,
"grad_norm": 8.090571403503418,
"learning_rate": 8.752341920374708e-06,
"loss": 0.2057,
"step": 2500
},
{
"epoch": 0.30956557630791454,
"grad_norm": 5.437821865081787,
"learning_rate": 8.587675644028104e-06,
"loss": 0.1938,
"step": 3000
},
{
"epoch": 0.3611598390259003,
"grad_norm": 3.594510316848755,
"learning_rate": 8.423009367681498e-06,
"loss": 0.1888,
"step": 3500
},
{
"epoch": 0.4127541017438861,
"grad_norm": 1.5619558095932007,
"learning_rate": 8.258343091334895e-06,
"loss": 0.1681,
"step": 4000
},
{
"epoch": 0.46434836446187183,
"grad_norm": 5.857831001281738,
"learning_rate": 8.093676814988291e-06,
"loss": 0.1581,
"step": 4500
},
{
"epoch": 0.5159426271798576,
"grad_norm": 2.4653804302215576,
"learning_rate": 7.929010538641687e-06,
"loss": 0.1469,
"step": 5000
},
{
"epoch": 0.5675368898978433,
"grad_norm": 8.388437271118164,
"learning_rate": 7.764673594847776e-06,
"loss": 0.1416,
"step": 5500
},
{
"epoch": 0.6191311526158291,
"grad_norm": 0.739997386932373,
"learning_rate": 7.600007318501171e-06,
"loss": 0.143,
"step": 6000
},
{
"epoch": 0.6707254153338149,
"grad_norm": 4.071037292480469,
"learning_rate": 7.435341042154568e-06,
"loss": 0.1309,
"step": 6500
},
{
"epoch": 0.7223196780518006,
"grad_norm": 1.3754676580429077,
"learning_rate": 7.270674765807963e-06,
"loss": 0.1311,
"step": 7000
},
{
"epoch": 0.7739139407697864,
"grad_norm": 7.085391998291016,
"learning_rate": 7.1060084894613585e-06,
"loss": 0.1194,
"step": 7500
},
{
"epoch": 0.8255082034877722,
"grad_norm": 3.828031539916992,
"learning_rate": 6.941342213114754e-06,
"loss": 0.1128,
"step": 8000
},
{
"epoch": 0.8771024662057579,
"grad_norm": 6.162450790405273,
"learning_rate": 6.77667593676815e-06,
"loss": 0.115,
"step": 8500
},
{
"epoch": 0.9286967289237437,
"grad_norm": 3.1122217178344727,
"learning_rate": 6.612009660421546e-06,
"loss": 0.1105,
"step": 9000
},
{
"epoch": 0.9802909916417294,
"grad_norm": 9.108379364013672,
"learning_rate": 6.447343384074942e-06,
"loss": 0.1054,
"step": 9500
},
{
"epoch": 1.0,
"eval_dev_accuracy": 0.9035151515151515,
"eval_dev_loss": 0.38524121046066284,
"eval_dev_runtime": 24.3219,
"eval_dev_samples_per_second": 678.401,
"eval_dev_steps_per_second": 42.431,
"step": 9691
},
{
"epoch": 1.0,
"eval_test_accuracy": 0.9203636363636364,
"eval_test_loss": 0.31113502383232117,
"eval_test_runtime": 24.5354,
"eval_test_samples_per_second": 672.497,
"eval_test_steps_per_second": 42.062,
"step": 9691
},
{
"epoch": 1.0,
"eval_verified_test_accuracy": 0.963639301874596,
"eval_verified_test_loss": 0.13967397809028625,
"eval_verified_test_runtime": 18.4171,
"eval_verified_test_samples_per_second": 671.986,
"eval_verified_test_steps_per_second": 42.026,
"step": 9691
},
{
"epoch": 1.0318852543597152,
"grad_norm": 0.7660722136497498,
"learning_rate": 6.2826771077283374e-06,
"loss": 0.095,
"step": 10000
},
{
"epoch": 1.083479517077701,
"grad_norm": 4.1145124435424805,
"learning_rate": 6.118010831381733e-06,
"loss": 0.0881,
"step": 10500
},
{
"epoch": 1.1350737797956867,
"grad_norm": 0.3238352835178375,
"learning_rate": 5.953344555035129e-06,
"loss": 0.0771,
"step": 11000
},
{
"epoch": 1.1866680425136724,
"grad_norm": 5.4124908447265625,
"learning_rate": 5.788678278688525e-06,
"loss": 0.0841,
"step": 11500
},
{
"epoch": 1.2382623052316581,
"grad_norm": 13.157797813415527,
"learning_rate": 5.624012002341921e-06,
"loss": 0.0861,
"step": 12000
},
{
"epoch": 1.289856567949644,
"grad_norm": 1.3548617362976074,
"learning_rate": 5.4593457259953155e-06,
"loss": 0.0852,
"step": 12500
},
{
"epoch": 1.3414508306676298,
"grad_norm": 2.0701634883880615,
"learning_rate": 5.294679449648712e-06,
"loss": 0.088,
"step": 13000
},
{
"epoch": 1.3930450933856156,
"grad_norm": 0.24185556173324585,
"learning_rate": 5.1303425058548e-06,
"loss": 0.076,
"step": 13500
},
{
"epoch": 1.4446393561036013,
"grad_norm": 1.3316065073013306,
"learning_rate": 4.965676229508197e-06,
"loss": 0.0769,
"step": 14000
},
{
"epoch": 1.496233618821587,
"grad_norm": 2.571403741836548,
"learning_rate": 4.801339285714286e-06,
"loss": 0.0775,
"step": 14500
},
{
"epoch": 1.5478278815395727,
"grad_norm": 2.0419647693634033,
"learning_rate": 4.636673009367682e-06,
"loss": 0.0757,
"step": 15000
},
{
"epoch": 1.5994221442575585,
"grad_norm": 4.6739912033081055,
"learning_rate": 4.472006733021077e-06,
"loss": 0.0775,
"step": 15500
},
{
"epoch": 1.6510164069755442,
"grad_norm": 7.1135053634643555,
"learning_rate": 4.307340456674473e-06,
"loss": 0.0728,
"step": 16000
},
{
"epoch": 1.70261066969353,
"grad_norm": 4.576988220214844,
"learning_rate": 4.142674180327869e-06,
"loss": 0.0774,
"step": 16500
},
{
"epoch": 1.7542049324115159,
"grad_norm": 0.0673130601644516,
"learning_rate": 3.978337236533958e-06,
"loss": 0.0636,
"step": 17000
},
{
"epoch": 1.8057991951295016,
"grad_norm": 8.489474296569824,
"learning_rate": 3.8136709601873536e-06,
"loss": 0.0692,
"step": 17500
},
{
"epoch": 1.8573934578474873,
"grad_norm": 13.07606315612793,
"learning_rate": 3.6490046838407495e-06,
"loss": 0.0689,
"step": 18000
},
{
"epoch": 1.9089877205654733,
"grad_norm": 0.5132156610488892,
"learning_rate": 3.4843384074941454e-06,
"loss": 0.0668,
"step": 18500
},
{
"epoch": 1.960581983283459,
"grad_norm": 6.281818389892578,
"learning_rate": 3.3196721311475413e-06,
"loss": 0.0705,
"step": 19000
},
{
"epoch": 2.0,
"eval_dev_accuracy": 0.8983636363636364,
"eval_dev_loss": 0.5292544364929199,
"eval_dev_runtime": 24.2247,
"eval_dev_samples_per_second": 681.122,
"eval_dev_steps_per_second": 42.601,
"step": 19382
},
{
"epoch": 2.0,
"eval_test_accuracy": 0.9152727272727272,
"eval_test_loss": 0.4211724102497101,
"eval_test_runtime": 24.492,
"eval_test_samples_per_second": 673.688,
"eval_test_steps_per_second": 42.136,
"step": 19382
},
{
"epoch": 2.0,
"eval_verified_test_accuracy": 0.9569327731092437,
"eval_verified_test_loss": 0.1964138001203537,
"eval_verified_test_runtime": 18.3946,
"eval_verified_test_samples_per_second": 672.807,
"eval_verified_test_steps_per_second": 42.078,
"step": 19382
},
{
"epoch": 2.0121762460014447,
"grad_norm": 0.1685587465763092,
"learning_rate": 3.15533518735363e-06,
"loss": 0.0604,
"step": 19500
},
{
"epoch": 2.0637705087194305,
"grad_norm": 3.583531141281128,
"learning_rate": 2.990668911007026e-06,
"loss": 0.0486,
"step": 20000
},
{
"epoch": 2.115364771437416,
"grad_norm": 2.067493200302124,
"learning_rate": 2.8260026346604216e-06,
"loss": 0.0446,
"step": 20500
},
{
"epoch": 2.166959034155402,
"grad_norm": 0.8109681010246277,
"learning_rate": 2.6616656908665106e-06,
"loss": 0.043,
"step": 21000
},
{
"epoch": 2.2185532968733876,
"grad_norm": 1.798444151878357,
"learning_rate": 2.4969994145199065e-06,
"loss": 0.051,
"step": 21500
},
{
"epoch": 2.2701475595913734,
"grad_norm": 0.02468039281666279,
"learning_rate": 2.332333138173302e-06,
"loss": 0.0475,
"step": 22000
},
{
"epoch": 2.321741822309359,
"grad_norm": 6.017990589141846,
"learning_rate": 2.167666861826698e-06,
"loss": 0.0473,
"step": 22500
},
{
"epoch": 2.373336085027345,
"grad_norm": 0.12943503260612488,
"learning_rate": 2.0030005854800936e-06,
"loss": 0.0469,
"step": 23000
},
{
"epoch": 2.4249303477453306,
"grad_norm": 0.710267961025238,
"learning_rate": 1.8386636416861827e-06,
"loss": 0.0482,
"step": 23500
},
{
"epoch": 2.4765246104633163,
"grad_norm": 4.638703346252441,
"learning_rate": 1.6739973653395785e-06,
"loss": 0.0554,
"step": 24000
},
{
"epoch": 2.5281188731813025,
"grad_norm": 1.667284607887268,
"learning_rate": 1.5096604215456673e-06,
"loss": 0.042,
"step": 24500
},
{
"epoch": 2.579713135899288,
"grad_norm": 2.6058883666992188,
"learning_rate": 1.3449941451990634e-06,
"loss": 0.0487,
"step": 25000
},
{
"epoch": 2.631307398617274,
"grad_norm": 1.2941348552703857,
"learning_rate": 1.180327868852459e-06,
"loss": 0.0482,
"step": 25500
},
{
"epoch": 2.6829016613352596,
"grad_norm": 0.13713909685611725,
"learning_rate": 1.0156615925058547e-06,
"loss": 0.047,
"step": 26000
},
{
"epoch": 2.7344959240532454,
"grad_norm": 6.215357303619385,
"learning_rate": 8.509953161592506e-07,
"loss": 0.0408,
"step": 26500
},
{
"epoch": 2.786090186771231,
"grad_norm": 0.08112023025751114,
"learning_rate": 6.863290398126464e-07,
"loss": 0.0419,
"step": 27000
},
{
"epoch": 2.837684449489217,
"grad_norm": 1.3868869543075562,
"learning_rate": 5.216627634660422e-07,
"loss": 0.0449,
"step": 27500
},
{
"epoch": 2.8892787122072026,
"grad_norm": 0.07935940474271774,
"learning_rate": 3.56996487119438e-07,
"loss": 0.048,
"step": 28000
},
{
"epoch": 2.9408729749251883,
"grad_norm": 0.05485483631491661,
"learning_rate": 1.9233021077283373e-07,
"loss": 0.0393,
"step": 28500
},
{
"epoch": 2.992467237643174,
"grad_norm": 4.398229598999023,
"learning_rate": 2.7663934426229508e-08,
"loss": 0.0446,
"step": 29000
},
{
"epoch": 3.0,
"eval_dev_accuracy": 0.8943030303030303,
"eval_dev_loss": 0.6738013625144958,
"eval_dev_runtime": 24.3063,
"eval_dev_samples_per_second": 678.836,
"eval_dev_steps_per_second": 42.458,
"step": 29073
},
{
"epoch": 3.0,
"eval_test_accuracy": 0.9123030303030303,
"eval_test_loss": 0.5465541481971741,
"eval_test_runtime": 24.5738,
"eval_test_samples_per_second": 671.448,
"eval_test_steps_per_second": 41.996,
"step": 29073
},
{
"epoch": 3.0,
"eval_verified_test_accuracy": 0.9536199095022625,
"eval_verified_test_loss": 0.27511948347091675,
"eval_verified_test_runtime": 18.4409,
"eval_verified_test_samples_per_second": 671.117,
"eval_verified_test_steps_per_second": 41.972,
"step": 29073
}
],
"logging_steps": 500,
"max_steps": 29073,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 5.067736824879187e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
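
The `log_history` array above mixes two kinds of entries: training-loss records (with `loss`, `grad_norm`, `learning_rate`) logged every 500 steps, and per-epoch evaluation records (with `eval_dev_*`, `eval_test_*`, `eval_verified_test_*`). The sketch below shows one way to read this file and separate the two; it is a minimal example, assuming the file is saved locally as `trainer_state.json` and that `matplotlib` is available for plotting (both assumptions, not part of the original artifact).

```python
import json
import matplotlib.pyplot as plt

# Path is an assumption; point it at the checkpoint directory if needed,
# e.g. "deberta-v3-dnli/trainer_state.json".
with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry a "loss" key; epoch-end eval entries carry
# "eval_dev_accuracy" / "eval_test_accuracy" instead.
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
dev = [(e["epoch"], e["eval_dev_accuracy"])
       for e in state["log_history"] if "eval_dev_accuracy" in e]

for epoch, acc in dev:
    print(f"epoch {epoch:.0f}: dev accuracy {acc:.4f}")

steps, losses = zip(*train)
plt.plot(steps, losses, label="train loss")
plt.xlabel("global step")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss_curve.png")
```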