|
{ |
|
"best_metric": 0.019898300990462303, |
|
"best_model_checkpoint": "./vit-spoof/checkpoint-2600", |
|
"epoch": 4.0, |
|
"eval_steps": 100, |
|
"global_step": 3004, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.013315579227696404, |
|
"grad_norm": 2.9065542221069336, |
|
"learning_rate": 0.00019933422103861519, |
|
"loss": 0.4683, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02663115845539281, |
|
"grad_norm": 5.029272556304932, |
|
"learning_rate": 0.00019866844207723036, |
|
"loss": 0.4784, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03994673768308921, |
|
"grad_norm": 1.0903401374816895, |
|
"learning_rate": 0.00019800266311584554, |
|
"loss": 0.2297, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05326231691078562, |
|
"grad_norm": 13.801852226257324, |
|
"learning_rate": 0.00019733688415446071, |
|
"loss": 0.3551, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.06657789613848203, |
|
"grad_norm": 7.049527645111084, |
|
"learning_rate": 0.0001966711051930759, |
|
"loss": 0.1737, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07989347536617843, |
|
"grad_norm": 4.438915729522705, |
|
"learning_rate": 0.00019600532623169107, |
|
"loss": 0.4895, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.09320905459387484, |
|
"grad_norm": 2.351810932159424, |
|
"learning_rate": 0.00019533954727030627, |
|
"loss": 0.1397, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.10652463382157124, |
|
"grad_norm": 0.2668318748474121, |
|
"learning_rate": 0.00019467376830892145, |
|
"loss": 0.2286, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.11984021304926765, |
|
"grad_norm": 0.18876244127750397, |
|
"learning_rate": 0.00019400798934753662, |
|
"loss": 0.0623, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.13315579227696406, |
|
"grad_norm": 0.08632949739694595, |
|
"learning_rate": 0.00019334221038615183, |
|
"loss": 0.0682, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.13315579227696406, |
|
"eval_accuracy": 0.9505988023952096, |
|
"eval_loss": 0.16598451137542725, |
|
"eval_runtime": 18.9844, |
|
"eval_samples_per_second": 70.374, |
|
"eval_steps_per_second": 8.797, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.14647137150466044, |
|
"grad_norm": 13.015838623046875, |
|
"learning_rate": 0.000192676431424767, |
|
"loss": 0.1278, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.15978695073235685, |
|
"grad_norm": 0.0676693394780159, |
|
"learning_rate": 0.00019201065246338218, |
|
"loss": 0.048, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.17310252996005326, |
|
"grad_norm": 0.08669548481702805, |
|
"learning_rate": 0.00019134487350199735, |
|
"loss": 0.0652, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.18641810918774968, |
|
"grad_norm": 2.5868160724639893, |
|
"learning_rate": 0.00019067909454061253, |
|
"loss": 0.1912, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.19973368841544606, |
|
"grad_norm": 0.046874042600393295, |
|
"learning_rate": 0.0001900133155792277, |
|
"loss": 0.0425, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.21304926764314247, |
|
"grad_norm": 0.7398509383201599, |
|
"learning_rate": 0.00018934753661784288, |
|
"loss": 0.0862, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.22636484687083888, |
|
"grad_norm": 1.8706042766571045, |
|
"learning_rate": 0.00018868175765645806, |
|
"loss": 0.4261, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2396804260985353, |
|
"grad_norm": 8.126788139343262, |
|
"learning_rate": 0.00018801597869507323, |
|
"loss": 0.2851, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2529960053262317, |
|
"grad_norm": 0.4172131419181824, |
|
"learning_rate": 0.0001873501997336884, |
|
"loss": 0.1001, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2663115845539281, |
|
"grad_norm": 0.6365886926651001, |
|
"learning_rate": 0.00018668442077230361, |
|
"loss": 0.0711, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2663115845539281, |
|
"eval_accuracy": 0.9805389221556886, |
|
"eval_loss": 0.07567165791988373, |
|
"eval_runtime": 19.0052, |
|
"eval_samples_per_second": 70.297, |
|
"eval_steps_per_second": 8.787, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2796271637816245, |
|
"grad_norm": 0.057595234364271164, |
|
"learning_rate": 0.0001860186418109188, |
|
"loss": 0.1514, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.2929427430093209, |
|
"grad_norm": 8.712677955627441, |
|
"learning_rate": 0.00018535286284953397, |
|
"loss": 0.2079, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3062583222370173, |
|
"grad_norm": 0.33387067914009094, |
|
"learning_rate": 0.00018468708388814914, |
|
"loss": 0.0374, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.3195739014647137, |
|
"grad_norm": 12.936437606811523, |
|
"learning_rate": 0.00018402130492676432, |
|
"loss": 0.154, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.33288948069241014, |
|
"grad_norm": 0.08830924332141876, |
|
"learning_rate": 0.0001833555259653795, |
|
"loss": 0.0712, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.34620505992010653, |
|
"grad_norm": 0.13204559683799744, |
|
"learning_rate": 0.00018268974700399467, |
|
"loss": 0.1426, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.3595206391478029, |
|
"grad_norm": 0.2291734367609024, |
|
"learning_rate": 0.00018202396804260987, |
|
"loss": 0.0472, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.37283621837549935, |
|
"grad_norm": 0.054096538573503494, |
|
"learning_rate": 0.00018135818908122505, |
|
"loss": 0.086, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.38615179760319573, |
|
"grad_norm": 4.264747619628906, |
|
"learning_rate": 0.00018069241011984023, |
|
"loss": 0.1287, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.3994673768308921, |
|
"grad_norm": 0.15629783272743225, |
|
"learning_rate": 0.0001800266311584554, |
|
"loss": 0.0112, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3994673768308921, |
|
"eval_accuracy": 0.9550898203592815, |
|
"eval_loss": 0.13134559988975525, |
|
"eval_runtime": 18.8497, |
|
"eval_samples_per_second": 70.876, |
|
"eval_steps_per_second": 8.86, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.41278295605858856, |
|
"grad_norm": 4.84935188293457, |
|
"learning_rate": 0.00017936085219707058, |
|
"loss": 0.117, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.42609853528628494, |
|
"grad_norm": 0.580627977848053, |
|
"learning_rate": 0.00017869507323568575, |
|
"loss": 0.1188, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.4394141145139814, |
|
"grad_norm": 4.605671405792236, |
|
"learning_rate": 0.00017802929427430096, |
|
"loss": 0.1802, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.45272969374167776, |
|
"grad_norm": 0.16331535577774048, |
|
"learning_rate": 0.00017736351531291613, |
|
"loss": 0.1645, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.46604527296937415, |
|
"grad_norm": 9.795440673828125, |
|
"learning_rate": 0.0001766977363515313, |
|
"loss": 0.1133, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4793608521970706, |
|
"grad_norm": 1.2079029083251953, |
|
"learning_rate": 0.00017603195739014649, |
|
"loss": 0.06, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.49267643142476697, |
|
"grad_norm": 0.02436252497136593, |
|
"learning_rate": 0.00017536617842876166, |
|
"loss": 0.0093, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5059920106524634, |
|
"grad_norm": 0.19958335161209106, |
|
"learning_rate": 0.00017470039946737684, |
|
"loss": 0.0398, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.5193075898801598, |
|
"grad_norm": 0.8796051740646362, |
|
"learning_rate": 0.00017403462050599201, |
|
"loss": 0.1084, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.5326231691078562, |
|
"grad_norm": 11.185724258422852, |
|
"learning_rate": 0.0001733688415446072, |
|
"loss": 0.0544, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5326231691078562, |
|
"eval_accuracy": 0.9850299401197605, |
|
"eval_loss": 0.05561767518520355, |
|
"eval_runtime": 19.0907, |
|
"eval_samples_per_second": 69.982, |
|
"eval_steps_per_second": 8.748, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5459387483355526, |
|
"grad_norm": 0.045906443148851395, |
|
"learning_rate": 0.00017270306258322237, |
|
"loss": 0.049, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.559254327563249, |
|
"grad_norm": 0.03049567900598049, |
|
"learning_rate": 0.00017203728362183754, |
|
"loss": 0.0573, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5725699067909454, |
|
"grad_norm": 8.935633659362793, |
|
"learning_rate": 0.00017137150466045272, |
|
"loss": 0.324, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5858854860186418, |
|
"grad_norm": 0.19179384410381317, |
|
"learning_rate": 0.00017070572569906792, |
|
"loss": 0.0121, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.5992010652463382, |
|
"grad_norm": 3.0050430297851562, |
|
"learning_rate": 0.0001700399467376831, |
|
"loss": 0.0361, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6125166444740346, |
|
"grad_norm": 0.0664597898721695, |
|
"learning_rate": 0.00016937416777629827, |
|
"loss": 0.1106, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.625832223701731, |
|
"grad_norm": 5.098622798919678, |
|
"learning_rate": 0.00016870838881491348, |
|
"loss": 0.1938, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.6391478029294274, |
|
"grad_norm": 4.372994899749756, |
|
"learning_rate": 0.00016804260985352865, |
|
"loss": 0.0807, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.6524633821571239, |
|
"grad_norm": 0.28381890058517456, |
|
"learning_rate": 0.00016737683089214383, |
|
"loss": 0.0463, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.6657789613848203, |
|
"grad_norm": 0.8805880546569824, |
|
"learning_rate": 0.000166711051930759, |
|
"loss": 0.0557, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6657789613848203, |
|
"eval_accuracy": 0.968562874251497, |
|
"eval_loss": 0.09653697907924652, |
|
"eval_runtime": 18.9994, |
|
"eval_samples_per_second": 70.318, |
|
"eval_steps_per_second": 8.79, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6790945406125166, |
|
"grad_norm": 0.3464203178882599, |
|
"learning_rate": 0.00016604527296937418, |
|
"loss": 0.1231, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.6924101198402131, |
|
"grad_norm": 0.10989736020565033, |
|
"learning_rate": 0.00016537949400798936, |
|
"loss": 0.0666, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.7057256990679095, |
|
"grad_norm": 2.2785890102386475, |
|
"learning_rate": 0.00016471371504660453, |
|
"loss": 0.1636, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.7190412782956058, |
|
"grad_norm": 0.07484255731105804, |
|
"learning_rate": 0.0001640479360852197, |
|
"loss": 0.0288, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.7323568575233023, |
|
"grad_norm": 0.02765025570988655, |
|
"learning_rate": 0.0001633821571238349, |
|
"loss": 0.0514, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.7456724367509987, |
|
"grad_norm": 0.07284240424633026, |
|
"learning_rate": 0.00016271637816245006, |
|
"loss": 0.0547, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.758988015978695, |
|
"grad_norm": 0.07628186792135239, |
|
"learning_rate": 0.00016205059920106524, |
|
"loss": 0.0239, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.7723035952063915, |
|
"grad_norm": 0.051849640905857086, |
|
"learning_rate": 0.00016138482023968042, |
|
"loss": 0.0568, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.7856191744340879, |
|
"grad_norm": 0.12565506994724274, |
|
"learning_rate": 0.00016071904127829562, |
|
"loss": 0.0568, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.7989347536617842, |
|
"grad_norm": 0.022169604897499084, |
|
"learning_rate": 0.0001600532623169108, |
|
"loss": 0.016, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7989347536617842, |
|
"eval_accuracy": 0.9827844311377245, |
|
"eval_loss": 0.05710091441869736, |
|
"eval_runtime": 19.0923, |
|
"eval_samples_per_second": 69.976, |
|
"eval_steps_per_second": 8.747, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8122503328894807, |
|
"grad_norm": 8.000443458557129, |
|
"learning_rate": 0.00015938748335552597, |
|
"loss": 0.0764, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.8255659121171771, |
|
"grad_norm": 4.5883965492248535, |
|
"learning_rate": 0.00015872170439414115, |
|
"loss": 0.1149, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.8388814913448736, |
|
"grad_norm": 0.1661900281906128, |
|
"learning_rate": 0.00015805592543275632, |
|
"loss": 0.0243, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.8521970705725699, |
|
"grad_norm": 2.3490681648254395, |
|
"learning_rate": 0.00015739014647137153, |
|
"loss": 0.1185, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.8655126498002663, |
|
"grad_norm": 0.3317602276802063, |
|
"learning_rate": 0.0001567243675099867, |
|
"loss": 0.0235, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.8788282290279628, |
|
"grad_norm": 0.038634102791547775, |
|
"learning_rate": 0.00015605858854860188, |
|
"loss": 0.0774, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.8921438082556591, |
|
"grad_norm": 0.05563969910144806, |
|
"learning_rate": 0.00015539280958721705, |
|
"loss": 0.0812, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.9054593874833555, |
|
"grad_norm": 0.029876109212636948, |
|
"learning_rate": 0.00015472703062583223, |
|
"loss": 0.0314, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.918774966711052, |
|
"grad_norm": 1.0358515977859497, |
|
"learning_rate": 0.0001540612516644474, |
|
"loss": 0.0538, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.9320905459387483, |
|
"grad_norm": 0.21664029359817505, |
|
"learning_rate": 0.00015339547270306258, |
|
"loss": 0.0382, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.9320905459387483, |
|
"eval_accuracy": 0.9925149700598802, |
|
"eval_loss": 0.029164018109440804, |
|
"eval_runtime": 19.0562, |
|
"eval_samples_per_second": 70.108, |
|
"eval_steps_per_second": 8.764, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.9454061251664447, |
|
"grad_norm": 0.24551372230052948, |
|
"learning_rate": 0.00015272969374167776, |
|
"loss": 0.0445, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.9587217043941412, |
|
"grad_norm": 0.024771912023425102, |
|
"learning_rate": 0.00015206391478029296, |
|
"loss": 0.0366, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.9720372836218375, |
|
"grad_norm": 0.027900271117687225, |
|
"learning_rate": 0.00015139813581890814, |
|
"loss": 0.0044, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.9853528628495339, |
|
"grad_norm": 0.023675180971622467, |
|
"learning_rate": 0.00015073235685752331, |
|
"loss": 0.0191, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.9986684420772304, |
|
"grad_norm": 0.1361338347196579, |
|
"learning_rate": 0.0001500665778961385, |
|
"loss": 0.0496, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.0119840213049267, |
|
"grad_norm": 0.016286784783005714, |
|
"learning_rate": 0.00014940079893475367, |
|
"loss": 0.0658, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.0252996005326231, |
|
"grad_norm": 0.04202214255928993, |
|
"learning_rate": 0.00014873501997336884, |
|
"loss": 0.074, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.0386151797603196, |
|
"grad_norm": 0.10847309976816177, |
|
"learning_rate": 0.00014806924101198402, |
|
"loss": 0.0871, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.051930758988016, |
|
"grad_norm": 0.049633290618658066, |
|
"learning_rate": 0.0001474034620505992, |
|
"loss": 0.0098, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.0652463382157125, |
|
"grad_norm": 0.029612813144922256, |
|
"learning_rate": 0.00014673768308921437, |
|
"loss": 0.0281, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.0652463382157125, |
|
"eval_accuracy": 0.9782934131736527, |
|
"eval_loss": 0.07025627046823502, |
|
"eval_runtime": 18.7557, |
|
"eval_samples_per_second": 71.232, |
|
"eval_steps_per_second": 8.904, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.078561917443409, |
|
"grad_norm": 15.133095741271973, |
|
"learning_rate": 0.00014607190412782957, |
|
"loss": 0.1002, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.0918774966711051, |
|
"grad_norm": 0.051363505423069, |
|
"learning_rate": 0.00014540612516644475, |
|
"loss": 0.0365, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.1051930758988016, |
|
"grad_norm": 0.03931460902094841, |
|
"learning_rate": 0.00014474034620505993, |
|
"loss": 0.0033, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.118508655126498, |
|
"grad_norm": 0.02210480161011219, |
|
"learning_rate": 0.0001440745672436751, |
|
"loss": 0.0695, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.1318242343541944, |
|
"grad_norm": 1.2839746475219727, |
|
"learning_rate": 0.0001434087882822903, |
|
"loss": 0.0675, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.1451398135818909, |
|
"grad_norm": 0.31109488010406494, |
|
"learning_rate": 0.00014274300932090548, |
|
"loss": 0.0486, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.158455392809587, |
|
"grad_norm": 1.5788602828979492, |
|
"learning_rate": 0.00014207723035952066, |
|
"loss": 0.0325, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.1717709720372835, |
|
"grad_norm": 0.03321152552962303, |
|
"learning_rate": 0.00014141145139813583, |
|
"loss": 0.0796, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.18508655126498, |
|
"grad_norm": 12.448341369628906, |
|
"learning_rate": 0.000140745672436751, |
|
"loss": 0.0515, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.1984021304926764, |
|
"grad_norm": 0.1457500457763672, |
|
"learning_rate": 0.0001400798934753662, |
|
"loss": 0.023, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.1984021304926764, |
|
"eval_accuracy": 0.9932634730538922, |
|
"eval_loss": 0.02668871358036995, |
|
"eval_runtime": 18.662, |
|
"eval_samples_per_second": 71.589, |
|
"eval_steps_per_second": 8.949, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.2117177097203728, |
|
"grad_norm": 0.14423052966594696, |
|
"learning_rate": 0.00013941411451398136, |
|
"loss": 0.08, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.2250332889480693, |
|
"grad_norm": 0.055785391479730606, |
|
"learning_rate": 0.00013874833555259654, |
|
"loss": 0.0445, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.2383488681757657, |
|
"grad_norm": 0.0294383205473423, |
|
"learning_rate": 0.00013808255659121172, |
|
"loss": 0.0414, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.2516644474034622, |
|
"grad_norm": 0.0800587385892868, |
|
"learning_rate": 0.0001374167776298269, |
|
"loss": 0.003, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.2649800266311584, |
|
"grad_norm": 0.08012738078832626, |
|
"learning_rate": 0.00013675099866844207, |
|
"loss": 0.0042, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.2782956058588548, |
|
"grad_norm": 0.022230001166462898, |
|
"learning_rate": 0.00013608521970705724, |
|
"loss": 0.0361, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.2916111850865513, |
|
"grad_norm": 0.014080135151743889, |
|
"learning_rate": 0.00013541944074567242, |
|
"loss": 0.004, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.3049267643142477, |
|
"grad_norm": 0.01428599376231432, |
|
"learning_rate": 0.00013475366178428762, |
|
"loss": 0.0016, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.3182423435419441, |
|
"grad_norm": 0.01126811746507883, |
|
"learning_rate": 0.0001340878828229028, |
|
"loss": 0.0014, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.3315579227696404, |
|
"grad_norm": 0.016759509220719337, |
|
"learning_rate": 0.00013342210386151798, |
|
"loss": 0.0695, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.3315579227696404, |
|
"eval_accuracy": 0.9887724550898204, |
|
"eval_loss": 0.043992191553115845, |
|
"eval_runtime": 18.8129, |
|
"eval_samples_per_second": 71.015, |
|
"eval_steps_per_second": 8.877, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.3448735019973368, |
|
"grad_norm": 12.30389404296875, |
|
"learning_rate": 0.00013275632490013318, |
|
"loss": 0.0344, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.3581890812250332, |
|
"grad_norm": 0.019324608147144318, |
|
"learning_rate": 0.00013209054593874836, |
|
"loss": 0.1322, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.3715046604527297, |
|
"grad_norm": 0.08052018284797668, |
|
"learning_rate": 0.00013142476697736353, |
|
"loss": 0.0878, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.3848202396804261, |
|
"grad_norm": 0.015013271011412144, |
|
"learning_rate": 0.0001307589880159787, |
|
"loss": 0.0176, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.3981358189081226, |
|
"grad_norm": 1.5079394578933716, |
|
"learning_rate": 0.00013009320905459388, |
|
"loss": 0.0757, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.411451398135819, |
|
"grad_norm": 1.3230667114257812, |
|
"learning_rate": 0.00012942743009320906, |
|
"loss": 0.054, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.4247669773635154, |
|
"grad_norm": 0.025523126125335693, |
|
"learning_rate": 0.00012876165113182424, |
|
"loss": 0.0439, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.4380825565912116, |
|
"grad_norm": 4.955082893371582, |
|
"learning_rate": 0.0001280958721704394, |
|
"loss": 0.0196, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.451398135818908, |
|
"grad_norm": 0.07307817041873932, |
|
"learning_rate": 0.0001274300932090546, |
|
"loss": 0.0637, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.4647137150466045, |
|
"grad_norm": 0.016766542568802834, |
|
"learning_rate": 0.00012676431424766976, |
|
"loss": 0.0215, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.4647137150466045, |
|
"eval_accuracy": 0.9910179640718563, |
|
"eval_loss": 0.031917981803417206, |
|
"eval_runtime": 18.9879, |
|
"eval_samples_per_second": 70.361, |
|
"eval_steps_per_second": 8.795, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.478029294274301, |
|
"grad_norm": 0.010708771646022797, |
|
"learning_rate": 0.00012609853528628497, |
|
"loss": 0.0036, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.4913448735019974, |
|
"grad_norm": 0.010356970131397247, |
|
"learning_rate": 0.00012543275632490014, |
|
"loss": 0.0016, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.5046604527296936, |
|
"grad_norm": 0.010580988600850105, |
|
"learning_rate": 0.00012476697736351532, |
|
"loss": 0.0354, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.51797603195739, |
|
"grad_norm": 0.009461847133934498, |
|
"learning_rate": 0.0001241011984021305, |
|
"loss": 0.0011, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.5312916111850865, |
|
"grad_norm": 0.011837669648230076, |
|
"learning_rate": 0.00012343541944074567, |
|
"loss": 0.0434, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.544607190412783, |
|
"grad_norm": 0.01360328495502472, |
|
"learning_rate": 0.00012276964047936085, |
|
"loss": 0.0013, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.5579227696404794, |
|
"grad_norm": 0.010293539613485336, |
|
"learning_rate": 0.00012210386151797602, |
|
"loss": 0.0013, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.5712383488681758, |
|
"grad_norm": 0.011664211750030518, |
|
"learning_rate": 0.00012143808255659121, |
|
"loss": 0.0013, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.5845539280958723, |
|
"grad_norm": 0.03968218341469765, |
|
"learning_rate": 0.00012077230359520639, |
|
"loss": 0.0067, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.5978695073235687, |
|
"grad_norm": 1.3615704774856567, |
|
"learning_rate": 0.00012010652463382157, |
|
"loss": 0.0236, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.5978695073235687, |
|
"eval_accuracy": 0.9880239520958084, |
|
"eval_loss": 0.06240475922822952, |
|
"eval_runtime": 19.0358, |
|
"eval_samples_per_second": 70.184, |
|
"eval_steps_per_second": 8.773, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.6111850865512651, |
|
"grad_norm": 0.3500171899795532, |
|
"learning_rate": 0.00011944074567243674, |
|
"loss": 0.0198, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.6245006657789614, |
|
"grad_norm": 0.007627115119248629, |
|
"learning_rate": 0.00011877496671105193, |
|
"loss": 0.0016, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.6378162450066578, |
|
"grad_norm": 0.07766377925872803, |
|
"learning_rate": 0.00011810918774966711, |
|
"loss": 0.001, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.6511318242343542, |
|
"grad_norm": 0.008079525083303452, |
|
"learning_rate": 0.00011744340878828231, |
|
"loss": 0.001, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.6644474034620504, |
|
"grad_norm": 14.718483924865723, |
|
"learning_rate": 0.00011677762982689749, |
|
"loss": 0.0543, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.6777629826897469, |
|
"grad_norm": 3.2829227447509766, |
|
"learning_rate": 0.00011611185086551266, |
|
"loss": 0.0409, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.6910785619174433, |
|
"grad_norm": 0.8293498158454895, |
|
"learning_rate": 0.00011544607190412784, |
|
"loss": 0.0632, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.7043941411451398, |
|
"grad_norm": 0.15176649391651154, |
|
"learning_rate": 0.00011478029294274302, |
|
"loss": 0.0056, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.7177097203728362, |
|
"grad_norm": 0.015873638913035393, |
|
"learning_rate": 0.00011411451398135819, |
|
"loss": 0.0032, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.7310252996005326, |
|
"grad_norm": 0.011890141293406487, |
|
"learning_rate": 0.00011344873501997337, |
|
"loss": 0.0021, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.7310252996005326, |
|
"eval_accuracy": 0.9895209580838323, |
|
"eval_loss": 0.05610267445445061, |
|
"eval_runtime": 18.9536, |
|
"eval_samples_per_second": 70.488, |
|
"eval_steps_per_second": 8.811, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.744340878828229, |
|
"grad_norm": 0.009211612865328789, |
|
"learning_rate": 0.00011278295605858856, |
|
"loss": 0.0014, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.7576564580559255, |
|
"grad_norm": 0.015255720354616642, |
|
"learning_rate": 0.00011211717709720373, |
|
"loss": 0.0088, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.770972037283622, |
|
"grad_norm": 0.009559585712850094, |
|
"learning_rate": 0.00011145139813581891, |
|
"loss": 0.0203, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.7842876165113184, |
|
"grad_norm": 39.935569763183594, |
|
"learning_rate": 0.00011078561917443409, |
|
"loss": 0.0302, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.7976031957390146, |
|
"grad_norm": 0.010259670205414295, |
|
"learning_rate": 0.00011011984021304926, |
|
"loss": 0.0721, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.810918774966711, |
|
"grad_norm": 0.013963967561721802, |
|
"learning_rate": 0.00010945406125166447, |
|
"loss": 0.0358, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.8242343541944075, |
|
"grad_norm": 0.2606169283390045, |
|
"learning_rate": 0.00010878828229027964, |
|
"loss": 0.0179, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.8375499334221037, |
|
"grad_norm": 21.295629501342773, |
|
"learning_rate": 0.00010812250332889482, |
|
"loss": 0.0677, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.8508655126498001, |
|
"grad_norm": 0.014277924783527851, |
|
"learning_rate": 0.00010745672436751, |
|
"loss": 0.0074, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.8641810918774966, |
|
"grad_norm": 0.007295672316104174, |
|
"learning_rate": 0.00010679094540612517, |
|
"loss": 0.0218, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.8641810918774966, |
|
"eval_accuracy": 0.9820359281437125, |
|
"eval_loss": 0.07637928426265717, |
|
"eval_runtime": 18.728, |
|
"eval_samples_per_second": 71.337, |
|
"eval_steps_per_second": 8.917, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.877496671105193, |
|
"grad_norm": 0.12700679898262024, |
|
"learning_rate": 0.00010612516644474036, |
|
"loss": 0.0066, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.8908122503328895, |
|
"grad_norm": 0.5147867202758789, |
|
"learning_rate": 0.00010545938748335554, |
|
"loss": 0.0024, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.904127829560586, |
|
"grad_norm": 0.007088396232575178, |
|
"learning_rate": 0.00010479360852197071, |
|
"loss": 0.0016, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.9174434087882823, |
|
"grad_norm": 0.07631803303956985, |
|
"learning_rate": 0.00010412782956058589, |
|
"loss": 0.0016, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.9307589880159788, |
|
"grad_norm": 0.02045373059809208, |
|
"learning_rate": 0.00010346205059920106, |
|
"loss": 0.0062, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.9440745672436752, |
|
"grad_norm": 0.034513406455516815, |
|
"learning_rate": 0.00010279627163781624, |
|
"loss": 0.0418, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.9573901464713717, |
|
"grad_norm": 0.006663164123892784, |
|
"learning_rate": 0.00010213049267643142, |
|
"loss": 0.0355, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.9707057256990679, |
|
"grad_norm": 0.15765492618083954, |
|
"learning_rate": 0.0001014647137150466, |
|
"loss": 0.0708, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.9840213049267643, |
|
"grad_norm": 4.438633441925049, |
|
"learning_rate": 0.0001007989347536618, |
|
"loss": 0.0889, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.9973368841544608, |
|
"grad_norm": 0.006967665161937475, |
|
"learning_rate": 0.00010013315579227697, |
|
"loss": 0.0102, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.9973368841544608, |
|
"eval_accuracy": 0.9910179640718563, |
|
"eval_loss": 0.034723859280347824, |
|
"eval_runtime": 18.7741, |
|
"eval_samples_per_second": 71.162, |
|
"eval_steps_per_second": 8.895, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.010652463382157, |
|
"grad_norm": 0.09451789408922195, |
|
"learning_rate": 9.946737683089215e-05, |
|
"loss": 0.0049, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.0239680426098534, |
|
"grad_norm": 0.008569799363613129, |
|
"learning_rate": 9.880159786950732e-05, |
|
"loss": 0.0173, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.03728362183755, |
|
"grad_norm": 0.3698938488960266, |
|
"learning_rate": 9.813581890812251e-05, |
|
"loss": 0.0013, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.0505992010652463, |
|
"grad_norm": 0.016449321061372757, |
|
"learning_rate": 9.747003994673769e-05, |
|
"loss": 0.0015, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.0639147802929427, |
|
"grad_norm": 0.06819990277290344, |
|
"learning_rate": 9.680426098535287e-05, |
|
"loss": 0.001, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.077230359520639, |
|
"grad_norm": 0.006248504854738712, |
|
"learning_rate": 9.613848202396804e-05, |
|
"loss": 0.0011, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.0905459387483356, |
|
"grad_norm": 0.005318405572324991, |
|
"learning_rate": 9.547270306258322e-05, |
|
"loss": 0.0277, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.103861517976032, |
|
"grad_norm": 0.02135417051613331, |
|
"learning_rate": 9.480692410119841e-05, |
|
"loss": 0.0007, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.1171770972037285, |
|
"grad_norm": 13.763138771057129, |
|
"learning_rate": 9.414114513981358e-05, |
|
"loss": 0.0163, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.130492676431425, |
|
"grad_norm": 3.230034112930298, |
|
"learning_rate": 9.347536617842877e-05, |
|
"loss": 0.0668, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.130492676431425, |
|
"eval_accuracy": 0.9745508982035929, |
|
"eval_loss": 0.1236691027879715, |
|
"eval_runtime": 18.9343, |
|
"eval_samples_per_second": 70.56, |
|
"eval_steps_per_second": 8.82, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.1438082556591214, |
|
"grad_norm": 0.0055719888769090176, |
|
"learning_rate": 9.280958721704395e-05, |
|
"loss": 0.001, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.157123834886818, |
|
"grad_norm": 0.007174134254455566, |
|
"learning_rate": 9.214380825565913e-05, |
|
"loss": 0.0353, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.170439414114514, |
|
"grad_norm": 0.005854015704244375, |
|
"learning_rate": 9.14780292942743e-05, |
|
"loss": 0.0011, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.1837549933422102, |
|
"grad_norm": 0.03955959528684616, |
|
"learning_rate": 9.081225033288948e-05, |
|
"loss": 0.0082, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.1970705725699067, |
|
"grad_norm": 0.007123819552361965, |
|
"learning_rate": 9.014647137150465e-05, |
|
"loss": 0.0009, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.210386151797603, |
|
"grad_norm": 0.15122471749782562, |
|
"learning_rate": 8.948069241011984e-05, |
|
"loss": 0.0025, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.2237017310252996, |
|
"grad_norm": 0.0059026069939136505, |
|
"learning_rate": 8.881491344873502e-05, |
|
"loss": 0.0211, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.237017310252996, |
|
"grad_norm": 0.005344110075384378, |
|
"learning_rate": 8.814913448735021e-05, |
|
"loss": 0.0215, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.2503328894806924, |
|
"grad_norm": 0.007038838230073452, |
|
"learning_rate": 8.748335552596539e-05, |
|
"loss": 0.0012, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 2.263648468708389, |
|
"grad_norm": 0.004509239457547665, |
|
"learning_rate": 8.681757656458056e-05, |
|
"loss": 0.0022, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.263648468708389, |
|
"eval_accuracy": 0.9872754491017964, |
|
"eval_loss": 0.0549810491502285, |
|
"eval_runtime": 19.1819, |
|
"eval_samples_per_second": 69.649, |
|
"eval_steps_per_second": 8.706, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.2769640479360853, |
|
"grad_norm": 0.04736039415001869, |
|
"learning_rate": 8.615179760319574e-05, |
|
"loss": 0.0015, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.2902796271637818, |
|
"grad_norm": 6.543766498565674, |
|
"learning_rate": 8.548601864181093e-05, |
|
"loss": 0.0414, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.303595206391478, |
|
"grad_norm": 0.004299995955079794, |
|
"learning_rate": 8.48202396804261e-05, |
|
"loss": 0.0008, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 2.316910785619174, |
|
"grad_norm": 0.005761590786278248, |
|
"learning_rate": 8.415446071904128e-05, |
|
"loss": 0.0586, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.3302263648468706, |
|
"grad_norm": 0.007165808696299791, |
|
"learning_rate": 8.348868175765646e-05, |
|
"loss": 0.0007, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.343541944074567, |
|
"grad_norm": 0.0066890171729028225, |
|
"learning_rate": 8.282290279627163e-05, |
|
"loss": 0.0374, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.3568575233022635, |
|
"grad_norm": 0.005914334207773209, |
|
"learning_rate": 8.215712383488682e-05, |
|
"loss": 0.0009, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.37017310252996, |
|
"grad_norm": 0.009822635911405087, |
|
"learning_rate": 8.1491344873502e-05, |
|
"loss": 0.0364, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.3834886817576564, |
|
"grad_norm": 0.008548504672944546, |
|
"learning_rate": 8.082556591211719e-05, |
|
"loss": 0.043, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 2.396804260985353, |
|
"grad_norm": 0.007220366504043341, |
|
"learning_rate": 8.015978695073236e-05, |
|
"loss": 0.0156, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.396804260985353, |
|
"eval_accuracy": 0.9895209580838323, |
|
"eval_loss": 0.03286071494221687, |
|
"eval_runtime": 19.1709, |
|
"eval_samples_per_second": 69.689, |
|
"eval_steps_per_second": 8.711, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.4101198402130493, |
|
"grad_norm": 0.006717954762279987, |
|
"learning_rate": 7.949400798934754e-05, |
|
"loss": 0.0009, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 2.4234354194407457, |
|
"grad_norm": 0.035514891147613525, |
|
"learning_rate": 7.882822902796272e-05, |
|
"loss": 0.0009, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.436750998668442, |
|
"grad_norm": 0.0049186451360583305, |
|
"learning_rate": 7.81624500665779e-05, |
|
"loss": 0.0046, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.4500665778961386, |
|
"grad_norm": 0.03821125254034996, |
|
"learning_rate": 7.749667110519307e-05, |
|
"loss": 0.0069, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.463382157123835, |
|
"grad_norm": 3.3322513103485107, |
|
"learning_rate": 7.683089214380826e-05, |
|
"loss": 0.0022, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.4766977363515315, |
|
"grad_norm": 0.004409165121614933, |
|
"learning_rate": 7.616511318242345e-05, |
|
"loss": 0.0023, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.490013315579228, |
|
"grad_norm": 0.004281247034668922, |
|
"learning_rate": 7.549933422103862e-05, |
|
"loss": 0.0258, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 2.5033288948069243, |
|
"grad_norm": 0.012024540454149246, |
|
"learning_rate": 7.48335552596538e-05, |
|
"loss": 0.0006, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.5166444740346208, |
|
"grad_norm": 0.004248355980962515, |
|
"learning_rate": 7.416777629826898e-05, |
|
"loss": 0.0005, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 2.5299600532623168, |
|
"grad_norm": 0.004529563244432211, |
|
"learning_rate": 7.350199733688415e-05, |
|
"loss": 0.0006, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.5299600532623168, |
|
"eval_accuracy": 0.9947604790419161, |
|
"eval_loss": 0.02548411302268505, |
|
"eval_runtime": 18.997, |
|
"eval_samples_per_second": 70.327, |
|
"eval_steps_per_second": 8.791, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.543275632490013, |
|
"grad_norm": 0.00455145537853241, |
|
"learning_rate": 7.283621837549934e-05, |
|
"loss": 0.0005, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 2.5565912117177096, |
|
"grad_norm": 0.004023432265967131, |
|
"learning_rate": 7.217043941411452e-05, |
|
"loss": 0.0005, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.569906790945406, |
|
"grad_norm": 0.004490096587687731, |
|
"learning_rate": 7.15046604527297e-05, |
|
"loss": 0.0007, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 2.5832223701731025, |
|
"grad_norm": 0.0075076790526509285, |
|
"learning_rate": 7.083888149134487e-05, |
|
"loss": 0.0012, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 2.596537949400799, |
|
"grad_norm": 0.0037563215009868145, |
|
"learning_rate": 7.017310252996006e-05, |
|
"loss": 0.0004, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.6098535286284954, |
|
"grad_norm": 0.0038872328586876392, |
|
"learning_rate": 6.950732356857524e-05, |
|
"loss": 0.0405, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 2.623169107856192, |
|
"grad_norm": 0.005599846597760916, |
|
"learning_rate": 6.884154460719041e-05, |
|
"loss": 0.0006, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 2.6364846870838883, |
|
"grad_norm": 0.003918104339390993, |
|
"learning_rate": 6.81757656458056e-05, |
|
"loss": 0.0009, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 2.6498002663115847, |
|
"grad_norm": 0.0037299636751413345, |
|
"learning_rate": 6.750998668442078e-05, |
|
"loss": 0.0004, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 2.6631158455392807, |
|
"grad_norm": 0.00351333268918097, |
|
"learning_rate": 6.684420772303596e-05, |
|
"loss": 0.0004, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.6631158455392807, |
|
"eval_accuracy": 0.9910179640718563, |
|
"eval_loss": 0.044503793120384216, |
|
"eval_runtime": 19.2349, |
|
"eval_samples_per_second": 69.457, |
|
"eval_steps_per_second": 8.682, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.676431424766977, |
|
"grad_norm": 0.0037041015457361937, |
|
"learning_rate": 6.617842876165113e-05, |
|
"loss": 0.0005, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 2.6897470039946736, |
|
"grad_norm": 0.0033594798296689987, |
|
"learning_rate": 6.551264980026631e-05, |
|
"loss": 0.0004, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 2.70306258322237, |
|
"grad_norm": 0.0033981057349592447, |
|
"learning_rate": 6.484687083888148e-05, |
|
"loss": 0.0004, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 2.7163781624500665, |
|
"grad_norm": 0.006549084093421698, |
|
"learning_rate": 6.418109187749667e-05, |
|
"loss": 0.0005, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 2.729693741677763, |
|
"grad_norm": 0.0034731528721749783, |
|
"learning_rate": 6.351531291611186e-05, |
|
"loss": 0.0004, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.7430093209054593, |
|
"grad_norm": 0.003631744533777237, |
|
"learning_rate": 6.284953395472704e-05, |
|
"loss": 0.0004, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 2.756324900133156, |
|
"grad_norm": 0.00373335974290967, |
|
"learning_rate": 6.218375499334222e-05, |
|
"loss": 0.0004, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 2.7696404793608522, |
|
"grad_norm": 0.003112897975370288, |
|
"learning_rate": 6.151797603195739e-05, |
|
"loss": 0.0004, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 2.7829560585885487, |
|
"grad_norm": 0.0032353100832551718, |
|
"learning_rate": 6.085219707057257e-05, |
|
"loss": 0.0221, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 2.796271637816245, |
|
"grad_norm": 0.003041923977434635, |
|
"learning_rate": 6.018641810918775e-05, |
|
"loss": 0.0011, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.796271637816245, |
|
"eval_accuracy": 0.9805389221556886, |
|
"eval_loss": 0.09204710274934769, |
|
"eval_runtime": 18.9463, |
|
"eval_samples_per_second": 70.515, |
|
"eval_steps_per_second": 8.814, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.8095872170439415, |
|
"grad_norm": 0.003456325735896826, |
|
"learning_rate": 5.9520639147802933e-05, |
|
"loss": 0.0416, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 2.822902796271638, |
|
"grad_norm": 0.004117018077522516, |
|
"learning_rate": 5.8854860186418116e-05, |
|
"loss": 0.0004, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 2.8362183754993344, |
|
"grad_norm": 0.00375298666767776, |
|
"learning_rate": 5.818908122503329e-05, |
|
"loss": 0.0004, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 2.849533954727031, |
|
"grad_norm": 4.614629745483398, |
|
"learning_rate": 5.752330226364847e-05, |
|
"loss": 0.0117, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 2.8628495339547273, |
|
"grad_norm": 0.006098424084484577, |
|
"learning_rate": 5.685752330226365e-05, |
|
"loss": 0.001, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.8761651131824233, |
|
"grad_norm": 0.20828425884246826, |
|
"learning_rate": 5.619174434087883e-05, |
|
"loss": 0.0007, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 2.8894806924101197, |
|
"grad_norm": 0.0036955540999770164, |
|
"learning_rate": 5.552596537949402e-05, |
|
"loss": 0.0004, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 2.902796271637816, |
|
"grad_norm": 0.003313822206109762, |
|
"learning_rate": 5.4860186418109194e-05, |
|
"loss": 0.0004, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 2.9161118508655126, |
|
"grad_norm": 0.0031216980423778296, |
|
"learning_rate": 5.419440745672437e-05, |
|
"loss": 0.0007, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 2.929427430093209, |
|
"grad_norm": 0.002954344032332301, |
|
"learning_rate": 5.352862849533955e-05, |
|
"loss": 0.0004, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.929427430093209, |
|
"eval_accuracy": 0.9917664670658682, |
|
"eval_loss": 0.04039301723241806, |
|
"eval_runtime": 18.9671, |
|
"eval_samples_per_second": 70.438, |
|
"eval_steps_per_second": 8.805, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.9427430093209055, |
|
"grad_norm": 107.526123046875, |
|
"learning_rate": 5.286284953395473e-05, |
|
"loss": 0.0429, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 2.956058588548602, |
|
"grad_norm": 0.0032062442041933537, |
|
"learning_rate": 5.2197070572569905e-05, |
|
"loss": 0.0004, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 2.9693741677762984, |
|
"grad_norm": 0.2084268033504486, |
|
"learning_rate": 5.153129161118508e-05, |
|
"loss": 0.015, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 2.982689747003995, |
|
"grad_norm": 0.004362909123301506, |
|
"learning_rate": 5.086551264980027e-05, |
|
"loss": 0.0005, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 2.996005326231691, |
|
"grad_norm": 0.0029051396995782852, |
|
"learning_rate": 5.0199733688415454e-05, |
|
"loss": 0.0008, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 3.0093209054593877, |
|
"grad_norm": 0.0038269031792879105, |
|
"learning_rate": 4.953395472703063e-05, |
|
"loss": 0.0007, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 3.0226364846870837, |
|
"grad_norm": 0.0031527807004749775, |
|
"learning_rate": 4.8868175765645806e-05, |
|
"loss": 0.0004, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 3.03595206391478, |
|
"grad_norm": 0.0027656673919409513, |
|
"learning_rate": 4.820239680426098e-05, |
|
"loss": 0.0003, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 3.0492676431424766, |
|
"grad_norm": 0.0027427240274846554, |
|
"learning_rate": 4.753661784287617e-05, |
|
"loss": 0.0004, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 3.062583222370173, |
|
"grad_norm": 0.004418679978698492, |
|
"learning_rate": 4.687083888149135e-05, |
|
"loss": 0.0003, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.062583222370173, |
|
"eval_accuracy": 0.9917664670658682, |
|
"eval_loss": 0.037005726248025894, |
|
"eval_runtime": 18.852, |
|
"eval_samples_per_second": 70.868, |
|
"eval_steps_per_second": 8.858, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.0758988015978694, |
|
"grad_norm": 0.003011292079463601, |
|
"learning_rate": 4.6205059920106524e-05, |
|
"loss": 0.0005, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 3.089214380825566, |
|
"grad_norm": 0.002671412192285061, |
|
"learning_rate": 4.553928095872171e-05, |
|
"loss": 0.0003, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 3.1025299600532623, |
|
"grad_norm": 0.002712644636631012, |
|
"learning_rate": 4.487350199733688e-05, |
|
"loss": 0.0003, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 3.1158455392809588, |
|
"grad_norm": 0.00504153361544013, |
|
"learning_rate": 4.4207723035952066e-05, |
|
"loss": 0.0003, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 3.129161118508655, |
|
"grad_norm": 0.003144737333059311, |
|
"learning_rate": 4.354194407456725e-05, |
|
"loss": 0.0003, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 3.1424766977363516, |
|
"grad_norm": 0.002637204946950078, |
|
"learning_rate": 4.2876165113182425e-05, |
|
"loss": 0.0003, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 3.155792276964048, |
|
"grad_norm": 0.0026598216500133276, |
|
"learning_rate": 4.22103861517976e-05, |
|
"loss": 0.0003, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 3.1691078561917445, |
|
"grad_norm": 0.002664501080289483, |
|
"learning_rate": 4.154460719041279e-05, |
|
"loss": 0.0003, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 3.182423435419441, |
|
"grad_norm": 0.0027599988970905542, |
|
"learning_rate": 4.087882822902797e-05, |
|
"loss": 0.0003, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 3.195739014647137, |
|
"grad_norm": 0.002599115017801523, |
|
"learning_rate": 4.021304926764314e-05, |
|
"loss": 0.0003, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.195739014647137, |
|
"eval_accuracy": 0.9917664670658682, |
|
"eval_loss": 0.03919079899787903, |
|
"eval_runtime": 18.8696, |
|
"eval_samples_per_second": 70.802, |
|
"eval_steps_per_second": 8.85, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.2090545938748334, |
|
"grad_norm": 0.0025887268129736185, |
|
"learning_rate": 3.954727030625832e-05, |
|
"loss": 0.0163, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 3.22237017310253, |
|
"grad_norm": 0.0027751659508794546, |
|
"learning_rate": 3.88814913448735e-05, |
|
"loss": 0.0003, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 3.2356857523302263, |
|
"grad_norm": 0.002749471226707101, |
|
"learning_rate": 3.8215712383488685e-05, |
|
"loss": 0.0003, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 3.2490013315579227, |
|
"grad_norm": 0.002491929102689028, |
|
"learning_rate": 3.754993342210386e-05, |
|
"loss": 0.0012, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 3.262316910785619, |
|
"grad_norm": 0.0031888161320239305, |
|
"learning_rate": 3.6884154460719044e-05, |
|
"loss": 0.0003, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 3.2756324900133156, |
|
"grad_norm": 0.004251156002283096, |
|
"learning_rate": 3.621837549933422e-05, |
|
"loss": 0.0003, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 3.288948069241012, |
|
"grad_norm": 0.0027103093452751637, |
|
"learning_rate": 3.55525965379494e-05, |
|
"loss": 0.0004, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 3.3022636484687085, |
|
"grad_norm": 0.0026021709199994802, |
|
"learning_rate": 3.4886817576564586e-05, |
|
"loss": 0.0003, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 3.315579227696405, |
|
"grad_norm": 0.002954940777271986, |
|
"learning_rate": 3.422103861517976e-05, |
|
"loss": 0.0003, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 3.3288948069241013, |
|
"grad_norm": 0.0026502846740186214, |
|
"learning_rate": 3.355525965379494e-05, |
|
"loss": 0.0003, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.3288948069241013, |
|
"eval_accuracy": 0.9932634730538922, |
|
"eval_loss": 0.03057168610394001, |
|
"eval_runtime": 19.2139, |
|
"eval_samples_per_second": 69.533, |
|
"eval_steps_per_second": 8.692, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.3422103861517978, |
|
"grad_norm": 0.0026077169459313154, |
|
"learning_rate": 3.288948069241012e-05, |
|
"loss": 0.0003, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 3.3555259653794938, |
|
"grad_norm": 0.002518726047128439, |
|
"learning_rate": 3.2223701731025304e-05, |
|
"loss": 0.0003, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 3.36884154460719, |
|
"grad_norm": 0.002446199534460902, |
|
"learning_rate": 3.155792276964048e-05, |
|
"loss": 0.0003, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 3.3821571238348866, |
|
"grad_norm": 0.01649678498506546, |
|
"learning_rate": 3.0892143808255656e-05, |
|
"loss": 0.0003, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 3.395472703062583, |
|
"grad_norm": 0.002374894917011261, |
|
"learning_rate": 3.0226364846870843e-05, |
|
"loss": 0.0003, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 3.4087882822902795, |
|
"grad_norm": 0.0023247618228197098, |
|
"learning_rate": 2.956058588548602e-05, |
|
"loss": 0.0003, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 3.422103861517976, |
|
"grad_norm": 0.0022955993190407753, |
|
"learning_rate": 2.88948069241012e-05, |
|
"loss": 0.0003, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 3.4354194407456724, |
|
"grad_norm": 0.0023930929601192474, |
|
"learning_rate": 2.822902796271638e-05, |
|
"loss": 0.0194, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 3.448735019973369, |
|
"grad_norm": 0.0025017596781253815, |
|
"learning_rate": 2.756324900133156e-05, |
|
"loss": 0.0003, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 3.4620505992010653, |
|
"grad_norm": 0.0025145343970507383, |
|
"learning_rate": 2.6897470039946737e-05, |
|
"loss": 0.0003, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.4620505992010653, |
|
"eval_accuracy": 0.9955089820359282, |
|
"eval_loss": 0.019898300990462303, |
|
"eval_runtime": 19.1238, |
|
"eval_samples_per_second": 69.861, |
|
"eval_steps_per_second": 8.733, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.4753661784287617, |
|
"grad_norm": 0.0024944059550762177, |
|
"learning_rate": 2.623169107856192e-05, |
|
"loss": 0.0003, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 3.488681757656458, |
|
"grad_norm": 0.002342221327126026, |
|
"learning_rate": 2.55659121171771e-05, |
|
"loss": 0.0005, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 3.5019973368841546, |
|
"grad_norm": 0.03469083085656166, |
|
"learning_rate": 2.4900133155792276e-05, |
|
"loss": 0.0003, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 3.515312916111851, |
|
"grad_norm": 0.0023358033504337072, |
|
"learning_rate": 2.423435419440746e-05, |
|
"loss": 0.0003, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 3.5286284953395475, |
|
"grad_norm": 0.0022878609597682953, |
|
"learning_rate": 2.3568575233022638e-05, |
|
"loss": 0.0003, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 3.541944074567244, |
|
"grad_norm": 0.0026905399281531572, |
|
"learning_rate": 2.2902796271637818e-05, |
|
"loss": 0.0003, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 3.5552596537949404, |
|
"grad_norm": 0.0026699311565607786, |
|
"learning_rate": 2.2237017310252997e-05, |
|
"loss": 0.0003, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 3.5685752330226364, |
|
"grad_norm": 0.0023570535704493523, |
|
"learning_rate": 2.1571238348868177e-05, |
|
"loss": 0.0003, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 3.581890812250333, |
|
"grad_norm": 0.0024163059424608946, |
|
"learning_rate": 2.0905459387483356e-05, |
|
"loss": 0.0003, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 3.5952063914780292, |
|
"grad_norm": 0.002381423255428672, |
|
"learning_rate": 2.0239680426098536e-05, |
|
"loss": 0.0003, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 3.5952063914780292, |
|
"eval_accuracy": 0.9947604790419161, |
|
"eval_loss": 0.024509532377123833, |
|
"eval_runtime": 19.0731, |
|
"eval_samples_per_second": 70.046, |
|
"eval_steps_per_second": 8.756, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 3.6085219707057257, |
|
"grad_norm": 0.0022262686397880316, |
|
"learning_rate": 1.9573901464713715e-05, |
|
"loss": 0.0003, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 3.621837549933422, |
|
"grad_norm": 0.0022425020579248667, |
|
"learning_rate": 1.8908122503328895e-05, |
|
"loss": 0.0003, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 3.6351531291611185, |
|
"grad_norm": 0.0022420468740165234, |
|
"learning_rate": 1.8242343541944078e-05, |
|
"loss": 0.0003, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 3.648468708388815, |
|
"grad_norm": 0.0021876932587474585, |
|
"learning_rate": 1.7576564580559254e-05, |
|
"loss": 0.0003, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 3.6617842876165114, |
|
"grad_norm": 0.0023544211871922016, |
|
"learning_rate": 1.6910785619174437e-05, |
|
"loss": 0.0002, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 3.675099866844208, |
|
"grad_norm": 0.002206595614552498, |
|
"learning_rate": 1.6245006657789616e-05, |
|
"loss": 0.0004, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 3.688415446071904, |
|
"grad_norm": 0.002322066342458129, |
|
"learning_rate": 1.5579227696404792e-05, |
|
"loss": 0.0002, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 3.7017310252996003, |
|
"grad_norm": 0.0022114135790616274, |
|
"learning_rate": 1.4913448735019975e-05, |
|
"loss": 0.0003, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 3.7150466045272967, |
|
"grad_norm": 0.0022223354317247868, |
|
"learning_rate": 1.4247669773635153e-05, |
|
"loss": 0.0004, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 3.728362183754993, |
|
"grad_norm": 0.002155436435714364, |
|
"learning_rate": 1.3581890812250334e-05, |
|
"loss": 0.0003, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.728362183754993, |
|
"eval_accuracy": 0.9947604790419161, |
|
"eval_loss": 0.025128666311502457, |
|
"eval_runtime": 18.7023, |
|
"eval_samples_per_second": 71.435, |
|
"eval_steps_per_second": 8.929, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.7416777629826896, |
|
"grad_norm": 0.0022344435565173626, |
|
"learning_rate": 1.2916111850865514e-05, |
|
"loss": 0.0003, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 3.754993342210386, |
|
"grad_norm": 0.04637530446052551, |
|
"learning_rate": 1.2250332889480692e-05, |
|
"loss": 0.0003, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 3.7683089214380825, |
|
"grad_norm": 0.002139828633517027, |
|
"learning_rate": 1.1584553928095873e-05, |
|
"loss": 0.0003, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 3.781624500665779, |
|
"grad_norm": 0.0020825008396059275, |
|
"learning_rate": 1.0918774966711052e-05, |
|
"loss": 0.0003, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 3.7949400798934754, |
|
"grad_norm": 0.0020064804702997208, |
|
"learning_rate": 1.0252996005326232e-05, |
|
"loss": 0.0003, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 3.808255659121172, |
|
"grad_norm": 0.002234045183286071, |
|
"learning_rate": 9.587217043941411e-06, |
|
"loss": 0.0002, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 3.8215712383488682, |
|
"grad_norm": 0.0021161320619285107, |
|
"learning_rate": 8.921438082556593e-06, |
|
"loss": 0.0003, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 3.8348868175765647, |
|
"grad_norm": 0.0022624481935054064, |
|
"learning_rate": 8.255659121171772e-06, |
|
"loss": 0.0002, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 3.848202396804261, |
|
"grad_norm": 0.0020989372860640287, |
|
"learning_rate": 7.589880159786951e-06, |
|
"loss": 0.0002, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 3.8615179760319576, |
|
"grad_norm": 0.002201867988333106, |
|
"learning_rate": 6.92410119840213e-06, |
|
"loss": 0.0002, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 3.8615179760319576, |
|
"eval_accuracy": 0.9947604790419161, |
|
"eval_loss": 0.025865301489830017, |
|
"eval_runtime": 18.6017, |
|
"eval_samples_per_second": 71.821, |
|
"eval_steps_per_second": 8.978, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 3.874833555259654, |
|
"grad_norm": 0.002228468656539917, |
|
"learning_rate": 6.258322237017311e-06, |
|
"loss": 0.0002, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 3.8881491344873504, |
|
"grad_norm": 0.0022046160884201527, |
|
"learning_rate": 5.59254327563249e-06, |
|
"loss": 0.0003, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 3.9014647137150464, |
|
"grad_norm": 0.002218181500211358, |
|
"learning_rate": 4.92676431424767e-06, |
|
"loss": 0.0002, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 3.914780292942743, |
|
"grad_norm": 0.001987821888178587, |
|
"learning_rate": 4.26098535286285e-06, |
|
"loss": 0.0002, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 3.9280958721704393, |
|
"grad_norm": 0.0021837118547409773, |
|
"learning_rate": 3.5952063914780293e-06, |
|
"loss": 0.0002, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 3.9414114513981358, |
|
"grad_norm": 0.002148882020264864, |
|
"learning_rate": 2.9294274300932092e-06, |
|
"loss": 0.0002, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 3.954727030625832, |
|
"grad_norm": 0.002326791873201728, |
|
"learning_rate": 2.2636484687083888e-06, |
|
"loss": 0.0002, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 3.9680426098535286, |
|
"grad_norm": 0.001993759535253048, |
|
"learning_rate": 1.5978695073235687e-06, |
|
"loss": 0.0002, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 3.981358189081225, |
|
"grad_norm": 0.0020522773265838623, |
|
"learning_rate": 9.320905459387485e-07, |
|
"loss": 0.0002, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 3.9946737683089215, |
|
"grad_norm": 0.0021599766332656145, |
|
"learning_rate": 2.6631158455392814e-07, |
|
"loss": 0.0002, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.9946737683089215, |
|
"eval_accuracy": 0.9947604790419161, |
|
"eval_loss": 0.02588406577706337, |
|
"eval_runtime": 18.7868, |
|
"eval_samples_per_second": 71.114, |
|
"eval_steps_per_second": 8.889, |
|
"step": 3000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3004, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.7242650208772915e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|