{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 29073, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05159426271798576, "grad_norm": 7.558670997619629, "learning_rate": 2.5736389684813753e-06, "loss": 0.7519, "step": 500 }, { "epoch": 0.10318852543597153, "grad_norm": 6.3421125411987305, "learning_rate": 5.15243553008596e-06, "loss": 0.3291, "step": 1000 }, { "epoch": 0.15478278815395727, "grad_norm": 7.725200176239014, "learning_rate": 7.720916905444127e-06, "loss": 0.2716, "step": 1500 }, { "epoch": 0.20637705087194305, "grad_norm": 6.253057479858398, "learning_rate": 8.917008196721311e-06, "loss": 0.2444, "step": 2000 }, { "epoch": 0.2579713135899288, "grad_norm": 8.090571403503418, "learning_rate": 8.752341920374708e-06, "loss": 0.2057, "step": 2500 }, { "epoch": 0.30956557630791454, "grad_norm": 5.437821865081787, "learning_rate": 8.587675644028104e-06, "loss": 0.1938, "step": 3000 }, { "epoch": 0.3611598390259003, "grad_norm": 3.594510316848755, "learning_rate": 8.423009367681498e-06, "loss": 0.1888, "step": 3500 }, { "epoch": 0.4127541017438861, "grad_norm": 1.5619558095932007, "learning_rate": 8.258343091334895e-06, "loss": 0.1681, "step": 4000 }, { "epoch": 0.46434836446187183, "grad_norm": 5.857831001281738, "learning_rate": 8.093676814988291e-06, "loss": 0.1581, "step": 4500 }, { "epoch": 0.5159426271798576, "grad_norm": 2.4653804302215576, "learning_rate": 7.929010538641687e-06, "loss": 0.1469, "step": 5000 }, { "epoch": 0.5675368898978433, "grad_norm": 8.388437271118164, "learning_rate": 7.764673594847776e-06, "loss": 0.1416, "step": 5500 }, { "epoch": 0.6191311526158291, "grad_norm": 0.739997386932373, "learning_rate": 7.600007318501171e-06, "loss": 0.143, "step": 6000 }, { "epoch": 0.6707254153338149, "grad_norm": 4.071037292480469, "learning_rate": 7.435341042154568e-06, "loss": 0.1309, "step": 6500 }, { "epoch": 0.7223196780518006, "grad_norm": 1.3754676580429077, "learning_rate": 7.270674765807963e-06, "loss": 0.1311, "step": 7000 }, { "epoch": 0.7739139407697864, "grad_norm": 7.085391998291016, "learning_rate": 7.1060084894613585e-06, "loss": 0.1194, "step": 7500 }, { "epoch": 0.8255082034877722, "grad_norm": 3.828031539916992, "learning_rate": 6.941342213114754e-06, "loss": 0.1128, "step": 8000 }, { "epoch": 0.8771024662057579, "grad_norm": 6.162450790405273, "learning_rate": 6.77667593676815e-06, "loss": 0.115, "step": 8500 }, { "epoch": 0.9286967289237437, "grad_norm": 3.1122217178344727, "learning_rate": 6.612009660421546e-06, "loss": 0.1105, "step": 9000 }, { "epoch": 0.9802909916417294, "grad_norm": 9.108379364013672, "learning_rate": 6.447343384074942e-06, "loss": 0.1054, "step": 9500 }, { "epoch": 1.0, "eval_dev_accuracy": 0.9035151515151515, "eval_dev_loss": 0.38524121046066284, "eval_dev_runtime": 24.3219, "eval_dev_samples_per_second": 678.401, "eval_dev_steps_per_second": 42.431, "step": 9691 }, { "epoch": 1.0, "eval_test_accuracy": 0.9203636363636364, "eval_test_loss": 0.31113502383232117, "eval_test_runtime": 24.5354, "eval_test_samples_per_second": 672.497, "eval_test_steps_per_second": 42.062, "step": 9691 }, { "epoch": 1.0, "eval_verified_test_accuracy": 0.963639301874596, "eval_verified_test_loss": 0.13967397809028625, "eval_verified_test_runtime": 18.4171, "eval_verified_test_samples_per_second": 671.986, "eval_verified_test_steps_per_second": 42.026, "step": 9691 }, { "epoch": 1.0318852543597152, "grad_norm": 0.7660722136497498, "learning_rate": 6.2826771077283374e-06, "loss": 0.095, "step": 10000 }, { "epoch": 1.083479517077701, "grad_norm": 4.1145124435424805, "learning_rate": 6.118010831381733e-06, "loss": 0.0881, "step": 10500 }, { "epoch": 1.1350737797956867, "grad_norm": 0.3238352835178375, "learning_rate": 5.953344555035129e-06, "loss": 0.0771, "step": 11000 }, { "epoch": 1.1866680425136724, "grad_norm": 5.4124908447265625, "learning_rate": 5.788678278688525e-06, "loss": 0.0841, "step": 11500 }, { "epoch": 1.2382623052316581, "grad_norm": 13.157797813415527, "learning_rate": 5.624012002341921e-06, "loss": 0.0861, "step": 12000 }, { "epoch": 1.289856567949644, "grad_norm": 1.3548617362976074, "learning_rate": 5.4593457259953155e-06, "loss": 0.0852, "step": 12500 }, { "epoch": 1.3414508306676298, "grad_norm": 2.0701634883880615, "learning_rate": 5.294679449648712e-06, "loss": 0.088, "step": 13000 }, { "epoch": 1.3930450933856156, "grad_norm": 0.24185556173324585, "learning_rate": 5.1303425058548e-06, "loss": 0.076, "step": 13500 }, { "epoch": 1.4446393561036013, "grad_norm": 1.3316065073013306, "learning_rate": 4.965676229508197e-06, "loss": 0.0769, "step": 14000 }, { "epoch": 1.496233618821587, "grad_norm": 2.571403741836548, "learning_rate": 4.801339285714286e-06, "loss": 0.0775, "step": 14500 }, { "epoch": 1.5478278815395727, "grad_norm": 2.0419647693634033, "learning_rate": 4.636673009367682e-06, "loss": 0.0757, "step": 15000 }, { "epoch": 1.5994221442575585, "grad_norm": 4.6739912033081055, "learning_rate": 4.472006733021077e-06, "loss": 0.0775, "step": 15500 }, { "epoch": 1.6510164069755442, "grad_norm": 7.1135053634643555, "learning_rate": 4.307340456674473e-06, "loss": 0.0728, "step": 16000 }, { "epoch": 1.70261066969353, "grad_norm": 4.576988220214844, "learning_rate": 4.142674180327869e-06, "loss": 0.0774, "step": 16500 }, { "epoch": 1.7542049324115159, "grad_norm": 0.0673130601644516, "learning_rate": 3.978337236533958e-06, "loss": 0.0636, "step": 17000 }, { "epoch": 1.8057991951295016, "grad_norm": 8.489474296569824, "learning_rate": 3.8136709601873536e-06, "loss": 0.0692, "step": 17500 }, { "epoch": 1.8573934578474873, "grad_norm": 13.07606315612793, "learning_rate": 3.6490046838407495e-06, "loss": 0.0689, "step": 18000 }, { "epoch": 1.9089877205654733, "grad_norm": 0.5132156610488892, "learning_rate": 3.4843384074941454e-06, "loss": 0.0668, "step": 18500 }, { "epoch": 1.960581983283459, "grad_norm": 6.281818389892578, "learning_rate": 3.3196721311475413e-06, "loss": 0.0705, "step": 19000 }, { "epoch": 2.0, "eval_dev_accuracy": 0.8983636363636364, "eval_dev_loss": 0.5292544364929199, "eval_dev_runtime": 24.2247, "eval_dev_samples_per_second": 681.122, "eval_dev_steps_per_second": 42.601, "step": 19382 }, { "epoch": 2.0, "eval_test_accuracy": 0.9152727272727272, "eval_test_loss": 0.4211724102497101, "eval_test_runtime": 24.492, "eval_test_samples_per_second": 673.688, "eval_test_steps_per_second": 42.136, "step": 19382 }, { "epoch": 2.0, "eval_verified_test_accuracy": 0.9569327731092437, "eval_verified_test_loss": 0.1964138001203537, "eval_verified_test_runtime": 18.3946, "eval_verified_test_samples_per_second": 672.807, "eval_verified_test_steps_per_second": 42.078, "step": 19382 }, { "epoch": 2.0121762460014447, "grad_norm": 0.1685587465763092, "learning_rate": 3.15533518735363e-06, "loss": 0.0604, "step": 19500 }, { "epoch": 2.0637705087194305, "grad_norm": 3.583531141281128, "learning_rate": 2.990668911007026e-06, "loss": 0.0486, "step": 20000 }, { "epoch": 2.115364771437416, "grad_norm": 2.067493200302124, "learning_rate": 2.8260026346604216e-06, "loss": 0.0446, "step": 20500 }, { "epoch": 2.166959034155402, "grad_norm": 0.8109681010246277, "learning_rate": 2.6616656908665106e-06, "loss": 0.043, "step": 21000 }, { "epoch": 2.2185532968733876, "grad_norm": 1.798444151878357, "learning_rate": 2.4969994145199065e-06, "loss": 0.051, "step": 21500 }, { "epoch": 2.2701475595913734, "grad_norm": 0.02468039281666279, "learning_rate": 2.332333138173302e-06, "loss": 0.0475, "step": 22000 }, { "epoch": 2.321741822309359, "grad_norm": 6.017990589141846, "learning_rate": 2.167666861826698e-06, "loss": 0.0473, "step": 22500 }, { "epoch": 2.373336085027345, "grad_norm": 0.12943503260612488, "learning_rate": 2.0030005854800936e-06, "loss": 0.0469, "step": 23000 }, { "epoch": 2.4249303477453306, "grad_norm": 0.710267961025238, "learning_rate": 1.8386636416861827e-06, "loss": 0.0482, "step": 23500 }, { "epoch": 2.4765246104633163, "grad_norm": 4.638703346252441, "learning_rate": 1.6739973653395785e-06, "loss": 0.0554, "step": 24000 }, { "epoch": 2.5281188731813025, "grad_norm": 1.667284607887268, "learning_rate": 1.5096604215456673e-06, "loss": 0.042, "step": 24500 }, { "epoch": 2.579713135899288, "grad_norm": 2.6058883666992188, "learning_rate": 1.3449941451990634e-06, "loss": 0.0487, "step": 25000 }, { "epoch": 2.631307398617274, "grad_norm": 1.2941348552703857, "learning_rate": 1.180327868852459e-06, "loss": 0.0482, "step": 25500 }, { "epoch": 2.6829016613352596, "grad_norm": 0.13713909685611725, "learning_rate": 1.0156615925058547e-06, "loss": 0.047, "step": 26000 }, { "epoch": 2.7344959240532454, "grad_norm": 6.215357303619385, "learning_rate": 8.509953161592506e-07, "loss": 0.0408, "step": 26500 }, { "epoch": 2.786090186771231, "grad_norm": 0.08112023025751114, "learning_rate": 6.863290398126464e-07, "loss": 0.0419, "step": 27000 }, { "epoch": 2.837684449489217, "grad_norm": 1.3868869543075562, "learning_rate": 5.216627634660422e-07, "loss": 0.0449, "step": 27500 }, { "epoch": 2.8892787122072026, "grad_norm": 0.07935940474271774, "learning_rate": 3.56996487119438e-07, "loss": 0.048, "step": 28000 }, { "epoch": 2.9408729749251883, "grad_norm": 0.05485483631491661, "learning_rate": 1.9233021077283373e-07, "loss": 0.0393, "step": 28500 }, { "epoch": 2.992467237643174, "grad_norm": 4.398229598999023, "learning_rate": 2.7663934426229508e-08, "loss": 0.0446, "step": 29000 }, { "epoch": 3.0, "eval_dev_accuracy": 0.8943030303030303, "eval_dev_loss": 0.6738013625144958, "eval_dev_runtime": 24.3063, "eval_dev_samples_per_second": 678.836, "eval_dev_steps_per_second": 42.458, "step": 29073 }, { "epoch": 3.0, "eval_test_accuracy": 0.9123030303030303, "eval_test_loss": 0.5465541481971741, "eval_test_runtime": 24.5738, "eval_test_samples_per_second": 671.448, "eval_test_steps_per_second": 41.996, "step": 29073 }, { "epoch": 3.0, "eval_verified_test_accuracy": 0.9536199095022625, "eval_verified_test_loss": 0.27511948347091675, "eval_verified_test_runtime": 18.4409, "eval_verified_test_samples_per_second": 671.117, "eval_verified_test_steps_per_second": 41.972, "step": 29073 } ], "logging_steps": 500, "max_steps": 29073, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 5.067736824879187e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }