|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.5918910920390648, |
|
"eval_steps": 200, |
|
"global_step": 2000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0002959455460195324, |
|
"eval_loss": 10.376261711120605, |
|
"eval_runtime": 10.819, |
|
"eval_samples_per_second": 138.829, |
|
"eval_steps_per_second": 34.754, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.002959455460195324, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 10.3804, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.005918910920390648, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 10.3767, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.008878366380585973, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 4.8e-05, |
|
"loss": 10.3754, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.011837821840781295, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 10.3767, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.01479727730097662, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 8e-05, |
|
"loss": 10.3722, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.017756732761171946, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 9.6e-05, |
|
"loss": 10.3804, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.020716188221367268, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.00011200000000000001, |
|
"loss": 10.3739, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.02367564368156259, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.00012800000000000002, |
|
"loss": 10.3736, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.026635099141757917, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.000144, |
|
"loss": 10.3643, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.02959455460195324, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.00016, |
|
"loss": 10.364, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.032554010062148565, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00017600000000000002, |
|
"loss": 10.3561, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.03551346552234389, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.000192, |
|
"loss": 10.3211, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.03847292098253921, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.0001999978128380225, |
|
"loss": 10.2582, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.041432376442734536, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0001999803161162393, |
|
"loss": 10.172, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.04439183190292986, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.00019994532573409262, |
|
"loss": 10.1033, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.04735128736312518, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00019989284781388617, |
|
"loss": 10.0041, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.05031074282332051, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00019982289153773646, |
|
"loss": 9.9331, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.053270198283515834, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.00019973546914596623, |
|
"loss": 9.8548, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.05622965374371116, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.00019963059593496268, |
|
"loss": 9.7692, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.05918910920390648, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.00019950829025450114, |
|
"loss": 9.7054, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05918910920390648, |
|
"eval_loss": 9.686193466186523, |
|
"eval_runtime": 20.1405, |
|
"eval_samples_per_second": 74.576, |
|
"eval_steps_per_second": 18.669, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.062148564664101805, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.0001993685735045343, |
|
"loss": 9.6486, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.06510802012429713, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0001992114701314478, |
|
"loss": 9.6029, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.06806747558449246, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.000199037007623783, |
|
"loss": 9.5554, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.07102693104468778, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00019884521650742715, |
|
"loss": 9.4941, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.0739863865048831, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 0.00019863613034027224, |
|
"loss": 9.508, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.07694584196507842, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0001984097857063434, |
|
"loss": 9.3502, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.07990529742527375, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0001981662222093976, |
|
"loss": 9.3473, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.08286475288546907, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00019790548246599447, |
|
"loss": 9.2955, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.0858242083456644, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00019762761209803927, |
|
"loss": 9.2712, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.08878366380585972, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.0001973326597248006, |
|
"loss": 9.2969, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.09174311926605505, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.00019702067695440332, |
|
"loss": 9.1616, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.09470257472625036, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.00019669171837479873, |
|
"loss": 9.1605, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.09766203018644569, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.00019634584154421317, |
|
"loss": 9.1402, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.10062148564664102, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.00019598310698107702, |
|
"loss": 9.0839, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.10358094110683634, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.00019560357815343577, |
|
"loss": 9.0709, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.10654039656703167, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.00019520732146784491, |
|
"loss": 9.0372, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.109499852027227, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.0001947944062577507, |
|
"loss": 9.0209, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.11245930748742232, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00019436490477135878, |
|
"loss": 8.9724, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.11541876294761765, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.00019391889215899299, |
|
"loss": 9.0212, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.11837821840781296, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 0.0001934564464599461, |
|
"loss": 8.9091, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.11837821840781296, |
|
"eval_loss": 8.961220741271973, |
|
"eval_runtime": 13.0065, |
|
"eval_samples_per_second": 115.48, |
|
"eval_steps_per_second": 28.909, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.12133767386800828, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.00019297764858882514, |
|
"loss": 8.9547, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.12429712932820361, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.00019248258232139388, |
|
"loss": 8.9394, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.12725658478839894, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00019197133427991436, |
|
"loss": 8.9748, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.13021604024859426, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00019144399391799043, |
|
"loss": 8.9198, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.1331754957087896, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.00019090065350491626, |
|
"loss": 8.8904, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.1361349511689849, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.0001903414081095315, |
|
"loss": 8.8971, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.13909440662918024, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.00018976635558358722, |
|
"loss": 8.84, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.14205386208937557, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.00018917559654462474, |
|
"loss": 8.838, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.1450133175495709, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00018856923435837022, |
|
"loss": 8.7761, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.1479727730097662, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.0001879473751206489, |
|
"loss": 8.8421, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.15093222846996152, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00018731012763882133, |
|
"loss": 8.7691, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.15389168393015684, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.00018665760341274505, |
|
"loss": 8.7749, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.15685113939035217, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00018598991661526572, |
|
"loss": 8.79, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.1598105948505475, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00018530718407223974, |
|
"loss": 8.8742, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.16277005031074282, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.00018460952524209355, |
|
"loss": 8.7845, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.16572950577093815, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.00018389706219492147, |
|
"loss": 8.8165, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.16868896123113347, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.00018316991959112716, |
|
"loss": 8.7024, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.1716484166913288, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00018242822465961176, |
|
"loss": 8.7764, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.17460787215152412, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00018167210717551224, |
|
"loss": 8.7501, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.17756732761171945, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 8.7257, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.17756732761171945, |
|
"eval_loss": 8.762685775756836, |
|
"eval_runtime": 18.9408, |
|
"eval_samples_per_second": 79.3, |
|
"eval_steps_per_second": 19.851, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.18052678307191478, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00018011713624460608, |
|
"loss": 8.7709, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.1834862385321101, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00017931855487268782, |
|
"loss": 8.7334, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.18644569399230543, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0001785060950503568, |
|
"loss": 8.824, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.18940514945250073, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00017767989893455698, |
|
"loss": 8.6731, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.19236460491269605, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.00017684011108568592, |
|
"loss": 8.7669, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.19532406037289138, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.00017598687844230088, |
|
"loss": 8.6911, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.1982835158330867, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.00017512035029540885, |
|
"loss": 8.6932, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.20124297129328203, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.000174240678262345, |
|
"loss": 8.71, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.20420242675347736, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.000173348016260244, |
|
"loss": 8.7219, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.20716188221367268, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.00017244252047910892, |
|
"loss": 8.6973, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.210121337673868, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.00017152434935448256, |
|
"loss": 8.6743, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.21308079313406333, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.0001705936635397259, |
|
"loss": 8.7094, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.21604024859425866, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.00016965062587790823, |
|
"loss": 8.7353, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.218999704054454, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.00016869540137331445, |
|
"loss": 8.6939, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.2219591595146493, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.00016772815716257412, |
|
"loss": 8.7202, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.22491861497484464, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00016674906248541726, |
|
"loss": 8.6779, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.22787807043503996, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00016575828865506245, |
|
"loss": 8.6627, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.2308375258952353, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.0001647560090282419, |
|
"loss": 8.7348, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.2337969813554306, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.000163742398974869, |
|
"loss": 8.7236, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.23675643681562591, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 0.0001627176358473537, |
|
"loss": 8.7416, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.23675643681562591, |
|
"eval_loss": 8.710856437683105, |
|
"eval_runtime": 16.7859, |
|
"eval_samples_per_second": 89.48, |
|
"eval_steps_per_second": 22.4, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.23971589227582124, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.0001616818989495711, |
|
"loss": 8.7235, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.24267534773601657, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.00016063536950548826, |
|
"loss": 8.7121, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.2456348031962119, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.0001595782306274553, |
|
"loss": 8.741, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.24859425865640722, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00015851066728416618, |
|
"loss": 8.6978, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.25155371411660254, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.00015743286626829437, |
|
"loss": 8.7496, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.25451316957679787, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.00015634501616380967, |
|
"loss": 8.6913, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.2574726250369932, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00015524730731298134, |
|
"loss": 8.6728, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.2604320804971885, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.0001541399317830738, |
|
"loss": 8.6724, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.26339153595738385, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.0001530230833327405, |
|
"loss": 8.763, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.2663509914175792, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.00015189695737812152, |
|
"loss": 8.6008, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2693104468777745, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.0001507617509586517, |
|
"loss": 8.7395, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.2722699023379698, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.00014961766270258422, |
|
"loss": 8.6413, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.27522935779816515, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00014846489279223652, |
|
"loss": 8.7083, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.2781888132583605, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.0001473036429289641, |
|
"loss": 8.6829, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.2811482687185558, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.0001461341162978688, |
|
"loss": 8.6955, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.28410772417875113, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.00014495651753224705, |
|
"loss": 8.6962, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.28706717963894646, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00014377105267778518, |
|
"loss": 8.7166, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.2900266350991418, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.00014257792915650728, |
|
"loss": 8.6469, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.2929860905593371, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.00014137735573048233, |
|
"loss": 8.6999, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.2959455460195324, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.00014016954246529696, |
|
"loss": 8.5944, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2959455460195324, |
|
"eval_loss": 8.698212623596191, |
|
"eval_runtime": 13.7844, |
|
"eval_samples_per_second": 108.964, |
|
"eval_steps_per_second": 27.277, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2989050014797277, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.00013895470069330004, |
|
"loss": 8.7432, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.30186445693992303, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.00013773304297662559, |
|
"loss": 8.6772, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.30482391240011836, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00013650478307000057, |
|
"loss": 8.73, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.3077833678603137, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00013527013588334415, |
|
"loss": 8.7362, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.310742823320509, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.00013402931744416433, |
|
"loss": 8.6947, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.31370227878070434, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.00013278254485975976, |
|
"loss": 8.6919, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.31666173424089966, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.00013153003627923218, |
|
"loss": 8.7202, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.319621189701095, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00013027201085531634, |
|
"loss": 8.7236, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.3225806451612903, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.00012900868870603503, |
|
"loss": 8.7817, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.32554010062148564, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.00012774029087618446, |
|
"loss": 8.8011, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.32849955608168097, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.00012646703929865817, |
|
"loss": 8.687, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.3314590115418763, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.00012518915675561483, |
|
"loss": 8.6354, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.3344184670020716, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.00012390686683949798, |
|
"loss": 8.6407, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.33737792246226694, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.00012262039391391404, |
|
"loss": 8.6823, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.34033737792246227, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 0.0001213299630743747, |
|
"loss": 8.7369, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.3432968333826576, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.00012003580010891213, |
|
"loss": 8.6849, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.3462562888428529, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00011873813145857249, |
|
"loss": 8.6571, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.34921574430304825, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00011743718417779517, |
|
"loss": 8.7425, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.3521751997632436, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.00011613318589468511, |
|
"loss": 8.6455, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.3551346552234389, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.0001148263647711842, |
|
"loss": 8.673, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3551346552234389, |
|
"eval_loss": 8.696282386779785, |
|
"eval_runtime": 20.1067, |
|
"eval_samples_per_second": 74.702, |
|
"eval_steps_per_second": 18.7, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3580941106836342, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.0001135169494631497, |
|
"loss": 8.6666, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.36105356614382955, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.00011220516908034601, |
|
"loss": 8.6954, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.3640130216040249, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00011089125314635726, |
|
"loss": 8.7236, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.3669724770642202, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.00010957543155842702, |
|
"loss": 8.7772, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.36993193252441553, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.00010825793454723325, |
|
"loss": 8.7222, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.37289138798461086, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00010693899263660441, |
|
"loss": 8.6544, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.3758508434448062, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.00010561883660318455, |
|
"loss": 8.6639, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.37881029890500145, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00010429769743605407, |
|
"loss": 8.6819, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.3817697543651968, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.00010297580629631325, |
|
"loss": 8.6511, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.3847292098253921, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.00010165339447663587, |
|
"loss": 8.6257, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.38768866528558743, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00010033069336079952, |
|
"loss": 8.7457, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.39064812074578276, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 9.900793438320037e-05, |
|
"loss": 8.6771, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.3936075762059781, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 9.768534898835862e-05, |
|
"loss": 8.6776, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.3965670316661734, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 9.636316859042259e-05, |
|
"loss": 8.6742, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.39952648712636873, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 9.504162453267777e-05, |
|
"loss": 8.6419, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.40248594258656406, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 9.372094804706867e-05, |
|
"loss": 8.7098, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.4054453980467594, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 9.24013702137397e-05, |
|
"loss": 8.6633, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.4084048535069547, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 9.108312192060298e-05, |
|
"loss": 8.687, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.41136430896715004, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 8.97664338229395e-05, |
|
"loss": 8.713, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.41432376442734536, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 8.845153630304139e-05, |
|
"loss": 8.7511, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.41432376442734536, |
|
"eval_loss": 8.69721794128418, |
|
"eval_runtime": 8.7762, |
|
"eval_samples_per_second": 171.145, |
|
"eval_steps_per_second": 42.843, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.4172832198875407, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 8.713865942990141e-05, |
|
"loss": 8.6655, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.420242675347736, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 8.582803291895758e-05, |
|
"loss": 8.6978, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.42320213080793134, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 8.451988609189987e-05, |
|
"loss": 8.7573, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.42616158626812667, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 8.321444783654524e-05, |
|
"loss": 8.6963, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.429121041728322, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 8.191194656678904e-05, |
|
"loss": 8.7627, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.4320804971885173, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 8.061261018263919e-05, |
|
"loss": 8.6564, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.43503995264871265, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 7.931666603034033e-05, |
|
"loss": 8.6641, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.437999408108908, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 7.80243408625947e-05, |
|
"loss": 8.6646, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.4409588635691033, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 7.673586079888698e-05, |
|
"loss": 8.7323, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.4439183190292986, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 7.54514512859201e-05, |
|
"loss": 8.6167, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.44687777448949395, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 7.417133705816837e-05, |
|
"loss": 8.6929, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.4498372299496893, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 7.289574209855559e-05, |
|
"loss": 8.6871, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.4527966854098846, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 7.16248895992645e-05, |
|
"loss": 8.6881, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.45575614087007993, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 7.035900192268464e-05, |
|
"loss": 8.6753, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.45871559633027525, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 6.909830056250527e-05, |
|
"loss": 8.7056, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.4616750517904706, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 6.784300610496048e-05, |
|
"loss": 8.706, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.46463450725066585, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 6.65933381902329e-05, |
|
"loss": 8.6888, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.4675939627108612, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 6.534951547402322e-05, |
|
"loss": 8.7158, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.4705534181710565, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 6.411175558929152e-05, |
|
"loss": 8.728, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.47351287363125183, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 6.28802751081779e-05, |
|
"loss": 8.729, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.47351287363125183, |
|
"eval_loss": 8.696118354797363, |
|
"eval_runtime": 19.8643, |
|
"eval_samples_per_second": 75.613, |
|
"eval_steps_per_second": 18.928, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.47647232909144716, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 6.165528950410884e-05, |
|
"loss": 8.6937, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.4794317845516425, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 6.0437013114095195e-05, |
|
"loss": 8.6631, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.4823912400118378, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 5.922565910122967e-05, |
|
"loss": 8.696, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.48535069547203313, |
|
"grad_norm": 0.75, |
|
"learning_rate": 5.8021439417389444e-05, |
|
"loss": 8.6176, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.48831015093222846, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 5.6824564766150726e-05, |
|
"loss": 8.7082, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.4912696063924238, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 5.563524456592163e-05, |
|
"loss": 8.6952, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.4942290618526191, |
|
"grad_norm": 0.5, |
|
"learning_rate": 5.4453686913300074e-05, |
|
"loss": 8.678, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.49718851731281444, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 5.328009854666303e-05, |
|
"loss": 8.6815, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.5001479727730098, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 5.2114684809993044e-05, |
|
"loss": 8.6626, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.5031074282332051, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 5.095764961694922e-05, |
|
"loss": 8.7641, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.5060668836934004, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 4.980919541518796e-05, |
|
"loss": 8.6364, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.5090263391535957, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 4.866952315094088e-05, |
|
"loss": 8.689, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.511985794613791, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 4.753883223385467e-05, |
|
"loss": 8.7382, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.5149452500739864, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 4.6417320502100316e-05, |
|
"loss": 8.6902, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.5179047055341817, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 4.530518418775733e-05, |
|
"loss": 8.6841, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.520864160994377, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 4.4202617882478405e-05, |
|
"loss": 8.708, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.5238236164545723, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 4.310981450344189e-05, |
|
"loss": 8.6534, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.5267830719147677, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 4.2026965259596666e-05, |
|
"loss": 8.6607, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.529742527374963, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 4.0954259618206295e-05, |
|
"loss": 8.6611, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.5327019828351583, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 3.9891885271697496e-05, |
|
"loss": 8.6325, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5327019828351583, |
|
"eval_loss": 8.694791793823242, |
|
"eval_runtime": 14.7074, |
|
"eval_samples_per_second": 102.126, |
|
"eval_steps_per_second": 25.565, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5356614382953536, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 3.884002810481958e-05, |
|
"loss": 8.6837, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.538620893755549, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 3.779887216211995e-05, |
|
"loss": 8.6631, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.5415803492157443, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 3.676859961574162e-05, |
|
"loss": 8.6576, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.5445398046759397, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 3.574939073354838e-05, |
|
"loss": 8.7047, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.5474992601361349, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 3.4741423847583134e-05, |
|
"loss": 8.7234, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.5504587155963303, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 3.3744875322865034e-05, |
|
"loss": 8.7565, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.5534181710565256, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 3.275991952653054e-05, |
|
"loss": 8.6812, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.556377626516721, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 3.178672879732435e-05, |
|
"loss": 8.7074, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.5593370819769162, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 3.0825473415445074e-05, |
|
"loss": 8.6826, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.5622965374371116, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 2.9876321572751144e-05, |
|
"loss": 8.7359, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.5652559928973069, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 2.8939439343332086e-05, |
|
"loss": 8.6599, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.5682154483575023, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 2.8014990654450325e-05, |
|
"loss": 8.631, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.5711749038176975, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 2.7103137257858868e-05, |
|
"loss": 8.6579, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.5741343592778929, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 2.6204038701499056e-05, |
|
"loss": 8.7039, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.5770938147380882, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 2.5317852301584643e-05, |
|
"loss": 8.6511, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.5800532701982836, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 2.4444733115075823e-05, |
|
"loss": 8.6733, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.5830127256584788, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 2.3584833912548888e-05, |
|
"loss": 8.6404, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.5859721811186742, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 2.2738305151465645e-05, |
|
"loss": 8.6784, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.5889316365788695, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 2.190529494984782e-05, |
|
"loss": 8.6897, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.5918910920390648, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 2.1085949060360654e-05, |
|
"loss": 8.6338, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5918910920390648, |
|
"eval_loss": 8.694610595703125, |
|
"eval_runtime": 15.9292, |
|
"eval_samples_per_second": 94.292, |
|
"eval_steps_per_second": 23.604, |
|
"step": 2000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 400, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 51272680734720.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|