{ "best_metric": 0.019898300990462303, "best_model_checkpoint": "./vit-spoof/checkpoint-2600", "epoch": 4.0, "eval_steps": 100, "global_step": 3004, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013315579227696404, "grad_norm": 2.9065542221069336, "learning_rate": 0.00019933422103861519, "loss": 0.4683, "step": 10 }, { "epoch": 0.02663115845539281, "grad_norm": 5.029272556304932, "learning_rate": 0.00019866844207723036, "loss": 0.4784, "step": 20 }, { "epoch": 0.03994673768308921, "grad_norm": 1.0903401374816895, "learning_rate": 0.00019800266311584554, "loss": 0.2297, "step": 30 }, { "epoch": 0.05326231691078562, "grad_norm": 13.801852226257324, "learning_rate": 0.00019733688415446071, "loss": 0.3551, "step": 40 }, { "epoch": 0.06657789613848203, "grad_norm": 7.049527645111084, "learning_rate": 0.0001966711051930759, "loss": 0.1737, "step": 50 }, { "epoch": 0.07989347536617843, "grad_norm": 4.438915729522705, "learning_rate": 0.00019600532623169107, "loss": 0.4895, "step": 60 }, { "epoch": 0.09320905459387484, "grad_norm": 2.351810932159424, "learning_rate": 0.00019533954727030627, "loss": 0.1397, "step": 70 }, { "epoch": 0.10652463382157124, "grad_norm": 0.2668318748474121, "learning_rate": 0.00019467376830892145, "loss": 0.2286, "step": 80 }, { "epoch": 0.11984021304926765, "grad_norm": 0.18876244127750397, "learning_rate": 0.00019400798934753662, "loss": 0.0623, "step": 90 }, { "epoch": 0.13315579227696406, "grad_norm": 0.08632949739694595, "learning_rate": 0.00019334221038615183, "loss": 0.0682, "step": 100 }, { "epoch": 0.13315579227696406, "eval_accuracy": 0.9505988023952096, "eval_loss": 0.16598451137542725, "eval_runtime": 18.9844, "eval_samples_per_second": 70.374, "eval_steps_per_second": 8.797, "step": 100 }, { "epoch": 0.14647137150466044, "grad_norm": 13.015838623046875, "learning_rate": 0.000192676431424767, "loss": 0.1278, "step": 110 }, { "epoch": 0.15978695073235685, "grad_norm": 0.0676693394780159, "learning_rate": 0.00019201065246338218, "loss": 0.048, "step": 120 }, { "epoch": 0.17310252996005326, "grad_norm": 0.08669548481702805, "learning_rate": 0.00019134487350199735, "loss": 0.0652, "step": 130 }, { "epoch": 0.18641810918774968, "grad_norm": 2.5868160724639893, "learning_rate": 0.00019067909454061253, "loss": 0.1912, "step": 140 }, { "epoch": 0.19973368841544606, "grad_norm": 0.046874042600393295, "learning_rate": 0.0001900133155792277, "loss": 0.0425, "step": 150 }, { "epoch": 0.21304926764314247, "grad_norm": 0.7398509383201599, "learning_rate": 0.00018934753661784288, "loss": 0.0862, "step": 160 }, { "epoch": 0.22636484687083888, "grad_norm": 1.8706042766571045, "learning_rate": 0.00018868175765645806, "loss": 0.4261, "step": 170 }, { "epoch": 0.2396804260985353, "grad_norm": 8.126788139343262, "learning_rate": 0.00018801597869507323, "loss": 0.2851, "step": 180 }, { "epoch": 0.2529960053262317, "grad_norm": 0.4172131419181824, "learning_rate": 0.0001873501997336884, "loss": 0.1001, "step": 190 }, { "epoch": 0.2663115845539281, "grad_norm": 0.6365886926651001, "learning_rate": 0.00018668442077230361, "loss": 0.0711, "step": 200 }, { "epoch": 0.2663115845539281, "eval_accuracy": 0.9805389221556886, "eval_loss": 0.07567165791988373, "eval_runtime": 19.0052, "eval_samples_per_second": 70.297, "eval_steps_per_second": 8.787, "step": 200 }, { "epoch": 0.2796271637816245, "grad_norm": 0.057595234364271164, "learning_rate": 0.0001860186418109188, "loss": 0.1514, "step": 210 }, { "epoch": 0.2929427430093209, "grad_norm": 8.712677955627441, "learning_rate": 0.00018535286284953397, "loss": 0.2079, "step": 220 }, { "epoch": 0.3062583222370173, "grad_norm": 0.33387067914009094, "learning_rate": 0.00018468708388814914, "loss": 0.0374, "step": 230 }, { "epoch": 0.3195739014647137, "grad_norm": 12.936437606811523, "learning_rate": 0.00018402130492676432, "loss": 0.154, "step": 240 }, { "epoch": 0.33288948069241014, "grad_norm": 0.08830924332141876, "learning_rate": 0.0001833555259653795, "loss": 0.0712, "step": 250 }, { "epoch": 0.34620505992010653, "grad_norm": 0.13204559683799744, "learning_rate": 0.00018268974700399467, "loss": 0.1426, "step": 260 }, { "epoch": 0.3595206391478029, "grad_norm": 0.2291734367609024, "learning_rate": 0.00018202396804260987, "loss": 0.0472, "step": 270 }, { "epoch": 0.37283621837549935, "grad_norm": 0.054096538573503494, "learning_rate": 0.00018135818908122505, "loss": 0.086, "step": 280 }, { "epoch": 0.38615179760319573, "grad_norm": 4.264747619628906, "learning_rate": 0.00018069241011984023, "loss": 0.1287, "step": 290 }, { "epoch": 0.3994673768308921, "grad_norm": 0.15629783272743225, "learning_rate": 0.0001800266311584554, "loss": 0.0112, "step": 300 }, { "epoch": 0.3994673768308921, "eval_accuracy": 0.9550898203592815, "eval_loss": 0.13134559988975525, "eval_runtime": 18.8497, "eval_samples_per_second": 70.876, "eval_steps_per_second": 8.86, "step": 300 }, { "epoch": 0.41278295605858856, "grad_norm": 4.84935188293457, "learning_rate": 0.00017936085219707058, "loss": 0.117, "step": 310 }, { "epoch": 0.42609853528628494, "grad_norm": 0.580627977848053, "learning_rate": 0.00017869507323568575, "loss": 0.1188, "step": 320 }, { "epoch": 0.4394141145139814, "grad_norm": 4.605671405792236, "learning_rate": 0.00017802929427430096, "loss": 0.1802, "step": 330 }, { "epoch": 0.45272969374167776, "grad_norm": 0.16331535577774048, "learning_rate": 0.00017736351531291613, "loss": 0.1645, "step": 340 }, { "epoch": 0.46604527296937415, "grad_norm": 9.795440673828125, "learning_rate": 0.0001766977363515313, "loss": 0.1133, "step": 350 }, { "epoch": 0.4793608521970706, "grad_norm": 1.2079029083251953, "learning_rate": 0.00017603195739014649, "loss": 0.06, "step": 360 }, { "epoch": 0.49267643142476697, "grad_norm": 0.02436252497136593, "learning_rate": 0.00017536617842876166, "loss": 0.0093, "step": 370 }, { "epoch": 0.5059920106524634, "grad_norm": 0.19958335161209106, "learning_rate": 0.00017470039946737684, "loss": 0.0398, "step": 380 }, { "epoch": 0.5193075898801598, "grad_norm": 0.8796051740646362, "learning_rate": 0.00017403462050599201, "loss": 0.1084, "step": 390 }, { "epoch": 0.5326231691078562, "grad_norm": 11.185724258422852, "learning_rate": 0.0001733688415446072, "loss": 0.0544, "step": 400 }, { "epoch": 0.5326231691078562, "eval_accuracy": 0.9850299401197605, "eval_loss": 0.05561767518520355, "eval_runtime": 19.0907, "eval_samples_per_second": 69.982, "eval_steps_per_second": 8.748, "step": 400 }, { "epoch": 0.5459387483355526, "grad_norm": 0.045906443148851395, "learning_rate": 0.00017270306258322237, "loss": 0.049, "step": 410 }, { "epoch": 0.559254327563249, "grad_norm": 0.03049567900598049, "learning_rate": 0.00017203728362183754, "loss": 0.0573, "step": 420 }, { "epoch": 0.5725699067909454, "grad_norm": 8.935633659362793, "learning_rate": 0.00017137150466045272, "loss": 0.324, "step": 430 }, { "epoch": 0.5858854860186418, "grad_norm": 0.19179384410381317, "learning_rate": 0.00017070572569906792, "loss": 0.0121, "step": 440 }, { "epoch": 0.5992010652463382, "grad_norm": 3.0050430297851562, "learning_rate": 0.0001700399467376831, "loss": 0.0361, "step": 450 }, { "epoch": 0.6125166444740346, "grad_norm": 0.0664597898721695, "learning_rate": 0.00016937416777629827, "loss": 0.1106, "step": 460 }, { "epoch": 0.625832223701731, "grad_norm": 5.098622798919678, "learning_rate": 0.00016870838881491348, "loss": 0.1938, "step": 470 }, { "epoch": 0.6391478029294274, "grad_norm": 4.372994899749756, "learning_rate": 0.00016804260985352865, "loss": 0.0807, "step": 480 }, { "epoch": 0.6524633821571239, "grad_norm": 0.28381890058517456, "learning_rate": 0.00016737683089214383, "loss": 0.0463, "step": 490 }, { "epoch": 0.6657789613848203, "grad_norm": 0.8805880546569824, "learning_rate": 0.000166711051930759, "loss": 0.0557, "step": 500 }, { "epoch": 0.6657789613848203, "eval_accuracy": 0.968562874251497, "eval_loss": 0.09653697907924652, "eval_runtime": 18.9994, "eval_samples_per_second": 70.318, "eval_steps_per_second": 8.79, "step": 500 }, { "epoch": 0.6790945406125166, "grad_norm": 0.3464203178882599, "learning_rate": 0.00016604527296937418, "loss": 0.1231, "step": 510 }, { "epoch": 0.6924101198402131, "grad_norm": 0.10989736020565033, "learning_rate": 0.00016537949400798936, "loss": 0.0666, "step": 520 }, { "epoch": 0.7057256990679095, "grad_norm": 2.2785890102386475, "learning_rate": 0.00016471371504660453, "loss": 0.1636, "step": 530 }, { "epoch": 0.7190412782956058, "grad_norm": 0.07484255731105804, "learning_rate": 0.0001640479360852197, "loss": 0.0288, "step": 540 }, { "epoch": 0.7323568575233023, "grad_norm": 0.02765025570988655, "learning_rate": 0.0001633821571238349, "loss": 0.0514, "step": 550 }, { "epoch": 0.7456724367509987, "grad_norm": 0.07284240424633026, "learning_rate": 0.00016271637816245006, "loss": 0.0547, "step": 560 }, { "epoch": 0.758988015978695, "grad_norm": 0.07628186792135239, "learning_rate": 0.00016205059920106524, "loss": 0.0239, "step": 570 }, { "epoch": 0.7723035952063915, "grad_norm": 0.051849640905857086, "learning_rate": 0.00016138482023968042, "loss": 0.0568, "step": 580 }, { "epoch": 0.7856191744340879, "grad_norm": 0.12565506994724274, "learning_rate": 0.00016071904127829562, "loss": 0.0568, "step": 590 }, { "epoch": 0.7989347536617842, "grad_norm": 0.022169604897499084, "learning_rate": 0.0001600532623169108, "loss": 0.016, "step": 600 }, { "epoch": 0.7989347536617842, "eval_accuracy": 0.9827844311377245, "eval_loss": 0.05710091441869736, "eval_runtime": 19.0923, "eval_samples_per_second": 69.976, "eval_steps_per_second": 8.747, "step": 600 }, { "epoch": 0.8122503328894807, "grad_norm": 8.000443458557129, "learning_rate": 0.00015938748335552597, "loss": 0.0764, "step": 610 }, { "epoch": 0.8255659121171771, "grad_norm": 4.5883965492248535, "learning_rate": 0.00015872170439414115, "loss": 0.1149, "step": 620 }, { "epoch": 0.8388814913448736, "grad_norm": 0.1661900281906128, "learning_rate": 0.00015805592543275632, "loss": 0.0243, "step": 630 }, { "epoch": 0.8521970705725699, "grad_norm": 2.3490681648254395, "learning_rate": 0.00015739014647137153, "loss": 0.1185, "step": 640 }, { "epoch": 0.8655126498002663, "grad_norm": 0.3317602276802063, "learning_rate": 0.0001567243675099867, "loss": 0.0235, "step": 650 }, { "epoch": 0.8788282290279628, "grad_norm": 0.038634102791547775, "learning_rate": 0.00015605858854860188, "loss": 0.0774, "step": 660 }, { "epoch": 0.8921438082556591, "grad_norm": 0.05563969910144806, "learning_rate": 0.00015539280958721705, "loss": 0.0812, "step": 670 }, { "epoch": 0.9054593874833555, "grad_norm": 0.029876109212636948, "learning_rate": 0.00015472703062583223, "loss": 0.0314, "step": 680 }, { "epoch": 0.918774966711052, "grad_norm": 1.0358515977859497, "learning_rate": 0.0001540612516644474, "loss": 0.0538, "step": 690 }, { "epoch": 0.9320905459387483, "grad_norm": 0.21664029359817505, "learning_rate": 0.00015339547270306258, "loss": 0.0382, "step": 700 }, { "epoch": 0.9320905459387483, "eval_accuracy": 0.9925149700598802, "eval_loss": 0.029164018109440804, "eval_runtime": 19.0562, "eval_samples_per_second": 70.108, "eval_steps_per_second": 8.764, "step": 700 }, { "epoch": 0.9454061251664447, "grad_norm": 0.24551372230052948, "learning_rate": 0.00015272969374167776, "loss": 0.0445, "step": 710 }, { "epoch": 0.9587217043941412, "grad_norm": 0.024771912023425102, "learning_rate": 0.00015206391478029296, "loss": 0.0366, "step": 720 }, { "epoch": 0.9720372836218375, "grad_norm": 0.027900271117687225, "learning_rate": 0.00015139813581890814, "loss": 0.0044, "step": 730 }, { "epoch": 0.9853528628495339, "grad_norm": 0.023675180971622467, "learning_rate": 0.00015073235685752331, "loss": 0.0191, "step": 740 }, { "epoch": 0.9986684420772304, "grad_norm": 0.1361338347196579, "learning_rate": 0.0001500665778961385, "loss": 0.0496, "step": 750 }, { "epoch": 1.0119840213049267, "grad_norm": 0.016286784783005714, "learning_rate": 0.00014940079893475367, "loss": 0.0658, "step": 760 }, { "epoch": 1.0252996005326231, "grad_norm": 0.04202214255928993, "learning_rate": 0.00014873501997336884, "loss": 0.074, "step": 770 }, { "epoch": 1.0386151797603196, "grad_norm": 0.10847309976816177, "learning_rate": 0.00014806924101198402, "loss": 0.0871, "step": 780 }, { "epoch": 1.051930758988016, "grad_norm": 0.049633290618658066, "learning_rate": 0.0001474034620505992, "loss": 0.0098, "step": 790 }, { "epoch": 1.0652463382157125, "grad_norm": 0.029612813144922256, "learning_rate": 0.00014673768308921437, "loss": 0.0281, "step": 800 }, { "epoch": 1.0652463382157125, "eval_accuracy": 0.9782934131736527, "eval_loss": 0.07025627046823502, "eval_runtime": 18.7557, "eval_samples_per_second": 71.232, "eval_steps_per_second": 8.904, "step": 800 }, { "epoch": 1.078561917443409, "grad_norm": 15.133095741271973, "learning_rate": 0.00014607190412782957, "loss": 0.1002, "step": 810 }, { "epoch": 1.0918774966711051, "grad_norm": 0.051363505423069, "learning_rate": 0.00014540612516644475, "loss": 0.0365, "step": 820 }, { "epoch": 1.1051930758988016, "grad_norm": 0.03931460902094841, "learning_rate": 0.00014474034620505993, "loss": 0.0033, "step": 830 }, { "epoch": 1.118508655126498, "grad_norm": 0.02210480161011219, "learning_rate": 0.0001440745672436751, "loss": 0.0695, "step": 840 }, { "epoch": 1.1318242343541944, "grad_norm": 1.2839746475219727, "learning_rate": 0.0001434087882822903, "loss": 0.0675, "step": 850 }, { "epoch": 1.1451398135818909, "grad_norm": 0.31109488010406494, "learning_rate": 0.00014274300932090548, "loss": 0.0486, "step": 860 }, { "epoch": 1.158455392809587, "grad_norm": 1.5788602828979492, "learning_rate": 0.00014207723035952066, "loss": 0.0325, "step": 870 }, { "epoch": 1.1717709720372835, "grad_norm": 0.03321152552962303, "learning_rate": 0.00014141145139813583, "loss": 0.0796, "step": 880 }, { "epoch": 1.18508655126498, "grad_norm": 12.448341369628906, "learning_rate": 0.000140745672436751, "loss": 0.0515, "step": 890 }, { "epoch": 1.1984021304926764, "grad_norm": 0.1457500457763672, "learning_rate": 0.0001400798934753662, "loss": 0.023, "step": 900 }, { "epoch": 1.1984021304926764, "eval_accuracy": 0.9932634730538922, "eval_loss": 0.02668871358036995, "eval_runtime": 18.662, "eval_samples_per_second": 71.589, "eval_steps_per_second": 8.949, "step": 900 }, { "epoch": 1.2117177097203728, "grad_norm": 0.14423052966594696, "learning_rate": 0.00013941411451398136, "loss": 0.08, "step": 910 }, { "epoch": 1.2250332889480693, "grad_norm": 0.055785391479730606, "learning_rate": 0.00013874833555259654, "loss": 0.0445, "step": 920 }, { "epoch": 1.2383488681757657, "grad_norm": 0.0294383205473423, "learning_rate": 0.00013808255659121172, "loss": 0.0414, "step": 930 }, { "epoch": 1.2516644474034622, "grad_norm": 0.0800587385892868, "learning_rate": 0.0001374167776298269, "loss": 0.003, "step": 940 }, { "epoch": 1.2649800266311584, "grad_norm": 0.08012738078832626, "learning_rate": 0.00013675099866844207, "loss": 0.0042, "step": 950 }, { "epoch": 1.2782956058588548, "grad_norm": 0.022230001166462898, "learning_rate": 0.00013608521970705724, "loss": 0.0361, "step": 960 }, { "epoch": 1.2916111850865513, "grad_norm": 0.014080135151743889, "learning_rate": 0.00013541944074567242, "loss": 0.004, "step": 970 }, { "epoch": 1.3049267643142477, "grad_norm": 0.01428599376231432, "learning_rate": 0.00013475366178428762, "loss": 0.0016, "step": 980 }, { "epoch": 1.3182423435419441, "grad_norm": 0.01126811746507883, "learning_rate": 0.0001340878828229028, "loss": 0.0014, "step": 990 }, { "epoch": 1.3315579227696404, "grad_norm": 0.016759509220719337, "learning_rate": 0.00013342210386151798, "loss": 0.0695, "step": 1000 }, { "epoch": 1.3315579227696404, "eval_accuracy": 0.9887724550898204, "eval_loss": 0.043992191553115845, "eval_runtime": 18.8129, "eval_samples_per_second": 71.015, "eval_steps_per_second": 8.877, "step": 1000 }, { "epoch": 1.3448735019973368, "grad_norm": 12.30389404296875, "learning_rate": 0.00013275632490013318, "loss": 0.0344, "step": 1010 }, { "epoch": 1.3581890812250332, "grad_norm": 0.019324608147144318, "learning_rate": 0.00013209054593874836, "loss": 0.1322, "step": 1020 }, { "epoch": 1.3715046604527297, "grad_norm": 0.08052018284797668, "learning_rate": 0.00013142476697736353, "loss": 0.0878, "step": 1030 }, { "epoch": 1.3848202396804261, "grad_norm": 0.015013271011412144, "learning_rate": 0.0001307589880159787, "loss": 0.0176, "step": 1040 }, { "epoch": 1.3981358189081226, "grad_norm": 1.5079394578933716, "learning_rate": 0.00013009320905459388, "loss": 0.0757, "step": 1050 }, { "epoch": 1.411451398135819, "grad_norm": 1.3230667114257812, "learning_rate": 0.00012942743009320906, "loss": 0.054, "step": 1060 }, { "epoch": 1.4247669773635154, "grad_norm": 0.025523126125335693, "learning_rate": 0.00012876165113182424, "loss": 0.0439, "step": 1070 }, { "epoch": 1.4380825565912116, "grad_norm": 4.955082893371582, "learning_rate": 0.0001280958721704394, "loss": 0.0196, "step": 1080 }, { "epoch": 1.451398135818908, "grad_norm": 0.07307817041873932, "learning_rate": 0.0001274300932090546, "loss": 0.0637, "step": 1090 }, { "epoch": 1.4647137150466045, "grad_norm": 0.016766542568802834, "learning_rate": 0.00012676431424766976, "loss": 0.0215, "step": 1100 }, { "epoch": 1.4647137150466045, "eval_accuracy": 0.9910179640718563, "eval_loss": 0.031917981803417206, "eval_runtime": 18.9879, "eval_samples_per_second": 70.361, "eval_steps_per_second": 8.795, "step": 1100 }, { "epoch": 1.478029294274301, "grad_norm": 0.010708771646022797, "learning_rate": 0.00012609853528628497, "loss": 0.0036, "step": 1110 }, { "epoch": 1.4913448735019974, "grad_norm": 0.010356970131397247, "learning_rate": 0.00012543275632490014, "loss": 0.0016, "step": 1120 }, { "epoch": 1.5046604527296936, "grad_norm": 0.010580988600850105, "learning_rate": 0.00012476697736351532, "loss": 0.0354, "step": 1130 }, { "epoch": 1.51797603195739, "grad_norm": 0.009461847133934498, "learning_rate": 0.0001241011984021305, "loss": 0.0011, "step": 1140 }, { "epoch": 1.5312916111850865, "grad_norm": 0.011837669648230076, "learning_rate": 0.00012343541944074567, "loss": 0.0434, "step": 1150 }, { "epoch": 1.544607190412783, "grad_norm": 0.01360328495502472, "learning_rate": 0.00012276964047936085, "loss": 0.0013, "step": 1160 }, { "epoch": 1.5579227696404794, "grad_norm": 0.010293539613485336, "learning_rate": 0.00012210386151797602, "loss": 0.0013, "step": 1170 }, { "epoch": 1.5712383488681758, "grad_norm": 0.011664211750030518, "learning_rate": 0.00012143808255659121, "loss": 0.0013, "step": 1180 }, { "epoch": 1.5845539280958723, "grad_norm": 0.03968218341469765, "learning_rate": 0.00012077230359520639, "loss": 0.0067, "step": 1190 }, { "epoch": 1.5978695073235687, "grad_norm": 1.3615704774856567, "learning_rate": 0.00012010652463382157, "loss": 0.0236, "step": 1200 }, { "epoch": 1.5978695073235687, "eval_accuracy": 0.9880239520958084, "eval_loss": 0.06240475922822952, "eval_runtime": 19.0358, "eval_samples_per_second": 70.184, "eval_steps_per_second": 8.773, "step": 1200 }, { "epoch": 1.6111850865512651, "grad_norm": 0.3500171899795532, "learning_rate": 0.00011944074567243674, "loss": 0.0198, "step": 1210 }, { "epoch": 1.6245006657789614, "grad_norm": 0.007627115119248629, "learning_rate": 0.00011877496671105193, "loss": 0.0016, "step": 1220 }, { "epoch": 1.6378162450066578, "grad_norm": 0.07766377925872803, "learning_rate": 0.00011810918774966711, "loss": 0.001, "step": 1230 }, { "epoch": 1.6511318242343542, "grad_norm": 0.008079525083303452, "learning_rate": 0.00011744340878828231, "loss": 0.001, "step": 1240 }, { "epoch": 1.6644474034620504, "grad_norm": 14.718483924865723, "learning_rate": 0.00011677762982689749, "loss": 0.0543, "step": 1250 }, { "epoch": 1.6777629826897469, "grad_norm": 3.2829227447509766, "learning_rate": 0.00011611185086551266, "loss": 0.0409, "step": 1260 }, { "epoch": 1.6910785619174433, "grad_norm": 0.8293498158454895, "learning_rate": 0.00011544607190412784, "loss": 0.0632, "step": 1270 }, { "epoch": 1.7043941411451398, "grad_norm": 0.15176649391651154, "learning_rate": 0.00011478029294274302, "loss": 0.0056, "step": 1280 }, { "epoch": 1.7177097203728362, "grad_norm": 0.015873638913035393, "learning_rate": 0.00011411451398135819, "loss": 0.0032, "step": 1290 }, { "epoch": 1.7310252996005326, "grad_norm": 0.011890141293406487, "learning_rate": 0.00011344873501997337, "loss": 0.0021, "step": 1300 }, { "epoch": 1.7310252996005326, "eval_accuracy": 0.9895209580838323, "eval_loss": 0.05610267445445061, "eval_runtime": 18.9536, "eval_samples_per_second": 70.488, "eval_steps_per_second": 8.811, "step": 1300 }, { "epoch": 1.744340878828229, "grad_norm": 0.009211612865328789, "learning_rate": 0.00011278295605858856, "loss": 0.0014, "step": 1310 }, { "epoch": 1.7576564580559255, "grad_norm": 0.015255720354616642, "learning_rate": 0.00011211717709720373, "loss": 0.0088, "step": 1320 }, { "epoch": 1.770972037283622, "grad_norm": 0.009559585712850094, "learning_rate": 0.00011145139813581891, "loss": 0.0203, "step": 1330 }, { "epoch": 1.7842876165113184, "grad_norm": 39.935569763183594, "learning_rate": 0.00011078561917443409, "loss": 0.0302, "step": 1340 }, { "epoch": 1.7976031957390146, "grad_norm": 0.010259670205414295, "learning_rate": 0.00011011984021304926, "loss": 0.0721, "step": 1350 }, { "epoch": 1.810918774966711, "grad_norm": 0.013963967561721802, "learning_rate": 0.00010945406125166447, "loss": 0.0358, "step": 1360 }, { "epoch": 1.8242343541944075, "grad_norm": 0.2606169283390045, "learning_rate": 0.00010878828229027964, "loss": 0.0179, "step": 1370 }, { "epoch": 1.8375499334221037, "grad_norm": 21.295629501342773, "learning_rate": 0.00010812250332889482, "loss": 0.0677, "step": 1380 }, { "epoch": 1.8508655126498001, "grad_norm": 0.014277924783527851, "learning_rate": 0.00010745672436751, "loss": 0.0074, "step": 1390 }, { "epoch": 1.8641810918774966, "grad_norm": 0.007295672316104174, "learning_rate": 0.00010679094540612517, "loss": 0.0218, "step": 1400 }, { "epoch": 1.8641810918774966, "eval_accuracy": 0.9820359281437125, "eval_loss": 0.07637928426265717, "eval_runtime": 18.728, "eval_samples_per_second": 71.337, "eval_steps_per_second": 8.917, "step": 1400 }, { "epoch": 1.877496671105193, "grad_norm": 0.12700679898262024, "learning_rate": 0.00010612516644474036, "loss": 0.0066, "step": 1410 }, { "epoch": 1.8908122503328895, "grad_norm": 0.5147867202758789, "learning_rate": 0.00010545938748335554, "loss": 0.0024, "step": 1420 }, { "epoch": 1.904127829560586, "grad_norm": 0.007088396232575178, "learning_rate": 0.00010479360852197071, "loss": 0.0016, "step": 1430 }, { "epoch": 1.9174434087882823, "grad_norm": 0.07631803303956985, "learning_rate": 0.00010412782956058589, "loss": 0.0016, "step": 1440 }, { "epoch": 1.9307589880159788, "grad_norm": 0.02045373059809208, "learning_rate": 0.00010346205059920106, "loss": 0.0062, "step": 1450 }, { "epoch": 1.9440745672436752, "grad_norm": 0.034513406455516815, "learning_rate": 0.00010279627163781624, "loss": 0.0418, "step": 1460 }, { "epoch": 1.9573901464713717, "grad_norm": 0.006663164123892784, "learning_rate": 0.00010213049267643142, "loss": 0.0355, "step": 1470 }, { "epoch": 1.9707057256990679, "grad_norm": 0.15765492618083954, "learning_rate": 0.0001014647137150466, "loss": 0.0708, "step": 1480 }, { "epoch": 1.9840213049267643, "grad_norm": 4.438633441925049, "learning_rate": 0.0001007989347536618, "loss": 0.0889, "step": 1490 }, { "epoch": 1.9973368841544608, "grad_norm": 0.006967665161937475, "learning_rate": 0.00010013315579227697, "loss": 0.0102, "step": 1500 }, { "epoch": 1.9973368841544608, "eval_accuracy": 0.9910179640718563, "eval_loss": 0.034723859280347824, "eval_runtime": 18.7741, "eval_samples_per_second": 71.162, "eval_steps_per_second": 8.895, "step": 1500 }, { "epoch": 2.010652463382157, "grad_norm": 0.09451789408922195, "learning_rate": 9.946737683089215e-05, "loss": 0.0049, "step": 1510 }, { "epoch": 2.0239680426098534, "grad_norm": 0.008569799363613129, "learning_rate": 9.880159786950732e-05, "loss": 0.0173, "step": 1520 }, { "epoch": 2.03728362183755, "grad_norm": 0.3698938488960266, "learning_rate": 9.813581890812251e-05, "loss": 0.0013, "step": 1530 }, { "epoch": 2.0505992010652463, "grad_norm": 0.016449321061372757, "learning_rate": 9.747003994673769e-05, "loss": 0.0015, "step": 1540 }, { "epoch": 2.0639147802929427, "grad_norm": 0.06819990277290344, "learning_rate": 9.680426098535287e-05, "loss": 0.001, "step": 1550 }, { "epoch": 2.077230359520639, "grad_norm": 0.006248504854738712, "learning_rate": 9.613848202396804e-05, "loss": 0.0011, "step": 1560 }, { "epoch": 2.0905459387483356, "grad_norm": 0.005318405572324991, "learning_rate": 9.547270306258322e-05, "loss": 0.0277, "step": 1570 }, { "epoch": 2.103861517976032, "grad_norm": 0.02135417051613331, "learning_rate": 9.480692410119841e-05, "loss": 0.0007, "step": 1580 }, { "epoch": 2.1171770972037285, "grad_norm": 13.763138771057129, "learning_rate": 9.414114513981358e-05, "loss": 0.0163, "step": 1590 }, { "epoch": 2.130492676431425, "grad_norm": 3.230034112930298, "learning_rate": 9.347536617842877e-05, "loss": 0.0668, "step": 1600 }, { "epoch": 2.130492676431425, "eval_accuracy": 0.9745508982035929, "eval_loss": 0.1236691027879715, "eval_runtime": 18.9343, "eval_samples_per_second": 70.56, "eval_steps_per_second": 8.82, "step": 1600 }, { "epoch": 2.1438082556591214, "grad_norm": 0.0055719888769090176, "learning_rate": 9.280958721704395e-05, "loss": 0.001, "step": 1610 }, { "epoch": 2.157123834886818, "grad_norm": 0.007174134254455566, "learning_rate": 9.214380825565913e-05, "loss": 0.0353, "step": 1620 }, { "epoch": 2.170439414114514, "grad_norm": 0.005854015704244375, "learning_rate": 9.14780292942743e-05, "loss": 0.0011, "step": 1630 }, { "epoch": 2.1837549933422102, "grad_norm": 0.03955959528684616, "learning_rate": 9.081225033288948e-05, "loss": 0.0082, "step": 1640 }, { "epoch": 2.1970705725699067, "grad_norm": 0.007123819552361965, "learning_rate": 9.014647137150465e-05, "loss": 0.0009, "step": 1650 }, { "epoch": 2.210386151797603, "grad_norm": 0.15122471749782562, "learning_rate": 8.948069241011984e-05, "loss": 0.0025, "step": 1660 }, { "epoch": 2.2237017310252996, "grad_norm": 0.0059026069939136505, "learning_rate": 8.881491344873502e-05, "loss": 0.0211, "step": 1670 }, { "epoch": 2.237017310252996, "grad_norm": 0.005344110075384378, "learning_rate": 8.814913448735021e-05, "loss": 0.0215, "step": 1680 }, { "epoch": 2.2503328894806924, "grad_norm": 0.007038838230073452, "learning_rate": 8.748335552596539e-05, "loss": 0.0012, "step": 1690 }, { "epoch": 2.263648468708389, "grad_norm": 0.004509239457547665, "learning_rate": 8.681757656458056e-05, "loss": 0.0022, "step": 1700 }, { "epoch": 2.263648468708389, "eval_accuracy": 0.9872754491017964, "eval_loss": 0.0549810491502285, "eval_runtime": 19.1819, "eval_samples_per_second": 69.649, "eval_steps_per_second": 8.706, "step": 1700 }, { "epoch": 2.2769640479360853, "grad_norm": 0.04736039415001869, "learning_rate": 8.615179760319574e-05, "loss": 0.0015, "step": 1710 }, { "epoch": 2.2902796271637818, "grad_norm": 6.543766498565674, "learning_rate": 8.548601864181093e-05, "loss": 0.0414, "step": 1720 }, { "epoch": 2.303595206391478, "grad_norm": 0.004299995955079794, "learning_rate": 8.48202396804261e-05, "loss": 0.0008, "step": 1730 }, { "epoch": 2.316910785619174, "grad_norm": 0.005761590786278248, "learning_rate": 8.415446071904128e-05, "loss": 0.0586, "step": 1740 }, { "epoch": 2.3302263648468706, "grad_norm": 0.007165808696299791, "learning_rate": 8.348868175765646e-05, "loss": 0.0007, "step": 1750 }, { "epoch": 2.343541944074567, "grad_norm": 0.0066890171729028225, "learning_rate": 8.282290279627163e-05, "loss": 0.0374, "step": 1760 }, { "epoch": 2.3568575233022635, "grad_norm": 0.005914334207773209, "learning_rate": 8.215712383488682e-05, "loss": 0.0009, "step": 1770 }, { "epoch": 2.37017310252996, "grad_norm": 0.009822635911405087, "learning_rate": 8.1491344873502e-05, "loss": 0.0364, "step": 1780 }, { "epoch": 2.3834886817576564, "grad_norm": 0.008548504672944546, "learning_rate": 8.082556591211719e-05, "loss": 0.043, "step": 1790 }, { "epoch": 2.396804260985353, "grad_norm": 0.007220366504043341, "learning_rate": 8.015978695073236e-05, "loss": 0.0156, "step": 1800 }, { "epoch": 2.396804260985353, "eval_accuracy": 0.9895209580838323, "eval_loss": 0.03286071494221687, "eval_runtime": 19.1709, "eval_samples_per_second": 69.689, "eval_steps_per_second": 8.711, "step": 1800 }, { "epoch": 2.4101198402130493, "grad_norm": 0.006717954762279987, "learning_rate": 7.949400798934754e-05, "loss": 0.0009, "step": 1810 }, { "epoch": 2.4234354194407457, "grad_norm": 0.035514891147613525, "learning_rate": 7.882822902796272e-05, "loss": 0.0009, "step": 1820 }, { "epoch": 2.436750998668442, "grad_norm": 0.0049186451360583305, "learning_rate": 7.81624500665779e-05, "loss": 0.0046, "step": 1830 }, { "epoch": 2.4500665778961386, "grad_norm": 0.03821125254034996, "learning_rate": 7.749667110519307e-05, "loss": 0.0069, "step": 1840 }, { "epoch": 2.463382157123835, "grad_norm": 3.3322513103485107, "learning_rate": 7.683089214380826e-05, "loss": 0.0022, "step": 1850 }, { "epoch": 2.4766977363515315, "grad_norm": 0.004409165121614933, "learning_rate": 7.616511318242345e-05, "loss": 0.0023, "step": 1860 }, { "epoch": 2.490013315579228, "grad_norm": 0.004281247034668922, "learning_rate": 7.549933422103862e-05, "loss": 0.0258, "step": 1870 }, { "epoch": 2.5033288948069243, "grad_norm": 0.012024540454149246, "learning_rate": 7.48335552596538e-05, "loss": 0.0006, "step": 1880 }, { "epoch": 2.5166444740346208, "grad_norm": 0.004248355980962515, "learning_rate": 7.416777629826898e-05, "loss": 0.0005, "step": 1890 }, { "epoch": 2.5299600532623168, "grad_norm": 0.004529563244432211, "learning_rate": 7.350199733688415e-05, "loss": 0.0006, "step": 1900 }, { "epoch": 2.5299600532623168, "eval_accuracy": 0.9947604790419161, "eval_loss": 0.02548411302268505, "eval_runtime": 18.997, "eval_samples_per_second": 70.327, "eval_steps_per_second": 8.791, "step": 1900 }, { "epoch": 2.543275632490013, "grad_norm": 0.00455145537853241, "learning_rate": 7.283621837549934e-05, "loss": 0.0005, "step": 1910 }, { "epoch": 2.5565912117177096, "grad_norm": 0.004023432265967131, "learning_rate": 7.217043941411452e-05, "loss": 0.0005, "step": 1920 }, { "epoch": 2.569906790945406, "grad_norm": 0.004490096587687731, "learning_rate": 7.15046604527297e-05, "loss": 0.0007, "step": 1930 }, { "epoch": 2.5832223701731025, "grad_norm": 0.0075076790526509285, "learning_rate": 7.083888149134487e-05, "loss": 0.0012, "step": 1940 }, { "epoch": 2.596537949400799, "grad_norm": 0.0037563215009868145, "learning_rate": 7.017310252996006e-05, "loss": 0.0004, "step": 1950 }, { "epoch": 2.6098535286284954, "grad_norm": 0.0038872328586876392, "learning_rate": 6.950732356857524e-05, "loss": 0.0405, "step": 1960 }, { "epoch": 2.623169107856192, "grad_norm": 0.005599846597760916, "learning_rate": 6.884154460719041e-05, "loss": 0.0006, "step": 1970 }, { "epoch": 2.6364846870838883, "grad_norm": 0.003918104339390993, "learning_rate": 6.81757656458056e-05, "loss": 0.0009, "step": 1980 }, { "epoch": 2.6498002663115847, "grad_norm": 0.0037299636751413345, "learning_rate": 6.750998668442078e-05, "loss": 0.0004, "step": 1990 }, { "epoch": 2.6631158455392807, "grad_norm": 0.00351333268918097, "learning_rate": 6.684420772303596e-05, "loss": 0.0004, "step": 2000 }, { "epoch": 2.6631158455392807, "eval_accuracy": 0.9910179640718563, "eval_loss": 0.044503793120384216, "eval_runtime": 19.2349, "eval_samples_per_second": 69.457, "eval_steps_per_second": 8.682, "step": 2000 }, { "epoch": 2.676431424766977, "grad_norm": 0.0037041015457361937, "learning_rate": 6.617842876165113e-05, "loss": 0.0005, "step": 2010 }, { "epoch": 2.6897470039946736, "grad_norm": 0.0033594798296689987, "learning_rate": 6.551264980026631e-05, "loss": 0.0004, "step": 2020 }, { "epoch": 2.70306258322237, "grad_norm": 0.0033981057349592447, "learning_rate": 6.484687083888148e-05, "loss": 0.0004, "step": 2030 }, { "epoch": 2.7163781624500665, "grad_norm": 0.006549084093421698, "learning_rate": 6.418109187749667e-05, "loss": 0.0005, "step": 2040 }, { "epoch": 2.729693741677763, "grad_norm": 0.0034731528721749783, "learning_rate": 6.351531291611186e-05, "loss": 0.0004, "step": 2050 }, { "epoch": 2.7430093209054593, "grad_norm": 0.003631744533777237, "learning_rate": 6.284953395472704e-05, "loss": 0.0004, "step": 2060 }, { "epoch": 2.756324900133156, "grad_norm": 0.00373335974290967, "learning_rate": 6.218375499334222e-05, "loss": 0.0004, "step": 2070 }, { "epoch": 2.7696404793608522, "grad_norm": 0.003112897975370288, "learning_rate": 6.151797603195739e-05, "loss": 0.0004, "step": 2080 }, { "epoch": 2.7829560585885487, "grad_norm": 0.0032353100832551718, "learning_rate": 6.085219707057257e-05, "loss": 0.0221, "step": 2090 }, { "epoch": 2.796271637816245, "grad_norm": 0.003041923977434635, "learning_rate": 6.018641810918775e-05, "loss": 0.0011, "step": 2100 }, { "epoch": 2.796271637816245, "eval_accuracy": 0.9805389221556886, "eval_loss": 0.09204710274934769, "eval_runtime": 18.9463, "eval_samples_per_second": 70.515, "eval_steps_per_second": 8.814, "step": 2100 }, { "epoch": 2.8095872170439415, "grad_norm": 0.003456325735896826, "learning_rate": 5.9520639147802933e-05, "loss": 0.0416, "step": 2110 }, { "epoch": 2.822902796271638, "grad_norm": 0.004117018077522516, "learning_rate": 5.8854860186418116e-05, "loss": 0.0004, "step": 2120 }, { "epoch": 2.8362183754993344, "grad_norm": 0.00375298666767776, "learning_rate": 5.818908122503329e-05, "loss": 0.0004, "step": 2130 }, { "epoch": 2.849533954727031, "grad_norm": 4.614629745483398, "learning_rate": 5.752330226364847e-05, "loss": 0.0117, "step": 2140 }, { "epoch": 2.8628495339547273, "grad_norm": 0.006098424084484577, "learning_rate": 5.685752330226365e-05, "loss": 0.001, "step": 2150 }, { "epoch": 2.8761651131824233, "grad_norm": 0.20828425884246826, "learning_rate": 5.619174434087883e-05, "loss": 0.0007, "step": 2160 }, { "epoch": 2.8894806924101197, "grad_norm": 0.0036955540999770164, "learning_rate": 5.552596537949402e-05, "loss": 0.0004, "step": 2170 }, { "epoch": 2.902796271637816, "grad_norm": 0.003313822206109762, "learning_rate": 5.4860186418109194e-05, "loss": 0.0004, "step": 2180 }, { "epoch": 2.9161118508655126, "grad_norm": 0.0031216980423778296, "learning_rate": 5.419440745672437e-05, "loss": 0.0007, "step": 2190 }, { "epoch": 2.929427430093209, "grad_norm": 0.002954344032332301, "learning_rate": 5.352862849533955e-05, "loss": 0.0004, "step": 2200 }, { "epoch": 2.929427430093209, "eval_accuracy": 0.9917664670658682, "eval_loss": 0.04039301723241806, "eval_runtime": 18.9671, "eval_samples_per_second": 70.438, "eval_steps_per_second": 8.805, "step": 2200 }, { "epoch": 2.9427430093209055, "grad_norm": 107.526123046875, "learning_rate": 5.286284953395473e-05, "loss": 0.0429, "step": 2210 }, { "epoch": 2.956058588548602, "grad_norm": 0.0032062442041933537, "learning_rate": 5.2197070572569905e-05, "loss": 0.0004, "step": 2220 }, { "epoch": 2.9693741677762984, "grad_norm": 0.2084268033504486, "learning_rate": 5.153129161118508e-05, "loss": 0.015, "step": 2230 }, { "epoch": 2.982689747003995, "grad_norm": 0.004362909123301506, "learning_rate": 5.086551264980027e-05, "loss": 0.0005, "step": 2240 }, { "epoch": 2.996005326231691, "grad_norm": 0.0029051396995782852, "learning_rate": 5.0199733688415454e-05, "loss": 0.0008, "step": 2250 }, { "epoch": 3.0093209054593877, "grad_norm": 0.0038269031792879105, "learning_rate": 4.953395472703063e-05, "loss": 0.0007, "step": 2260 }, { "epoch": 3.0226364846870837, "grad_norm": 0.0031527807004749775, "learning_rate": 4.8868175765645806e-05, "loss": 0.0004, "step": 2270 }, { "epoch": 3.03595206391478, "grad_norm": 0.0027656673919409513, "learning_rate": 4.820239680426098e-05, "loss": 0.0003, "step": 2280 }, { "epoch": 3.0492676431424766, "grad_norm": 0.0027427240274846554, "learning_rate": 4.753661784287617e-05, "loss": 0.0004, "step": 2290 }, { "epoch": 3.062583222370173, "grad_norm": 0.004418679978698492, "learning_rate": 4.687083888149135e-05, "loss": 0.0003, "step": 2300 }, { "epoch": 3.062583222370173, "eval_accuracy": 0.9917664670658682, "eval_loss": 0.037005726248025894, "eval_runtime": 18.852, "eval_samples_per_second": 70.868, "eval_steps_per_second": 8.858, "step": 2300 }, { "epoch": 3.0758988015978694, "grad_norm": 0.003011292079463601, "learning_rate": 4.6205059920106524e-05, "loss": 0.0005, "step": 2310 }, { "epoch": 3.089214380825566, "grad_norm": 0.002671412192285061, "learning_rate": 4.553928095872171e-05, "loss": 0.0003, "step": 2320 }, { "epoch": 3.1025299600532623, "grad_norm": 0.002712644636631012, "learning_rate": 4.487350199733688e-05, "loss": 0.0003, "step": 2330 }, { "epoch": 3.1158455392809588, "grad_norm": 0.00504153361544013, "learning_rate": 4.4207723035952066e-05, "loss": 0.0003, "step": 2340 }, { "epoch": 3.129161118508655, "grad_norm": 0.003144737333059311, "learning_rate": 4.354194407456725e-05, "loss": 0.0003, "step": 2350 }, { "epoch": 3.1424766977363516, "grad_norm": 0.002637204946950078, "learning_rate": 4.2876165113182425e-05, "loss": 0.0003, "step": 2360 }, { "epoch": 3.155792276964048, "grad_norm": 0.0026598216500133276, "learning_rate": 4.22103861517976e-05, "loss": 0.0003, "step": 2370 }, { "epoch": 3.1691078561917445, "grad_norm": 0.002664501080289483, "learning_rate": 4.154460719041279e-05, "loss": 0.0003, "step": 2380 }, { "epoch": 3.182423435419441, "grad_norm": 0.0027599988970905542, "learning_rate": 4.087882822902797e-05, "loss": 0.0003, "step": 2390 }, { "epoch": 3.195739014647137, "grad_norm": 0.002599115017801523, "learning_rate": 4.021304926764314e-05, "loss": 0.0003, "step": 2400 }, { "epoch": 3.195739014647137, "eval_accuracy": 0.9917664670658682, "eval_loss": 0.03919079899787903, "eval_runtime": 18.8696, "eval_samples_per_second": 70.802, "eval_steps_per_second": 8.85, "step": 2400 }, { "epoch": 3.2090545938748334, "grad_norm": 0.0025887268129736185, "learning_rate": 3.954727030625832e-05, "loss": 0.0163, "step": 2410 }, { "epoch": 3.22237017310253, "grad_norm": 0.0027751659508794546, "learning_rate": 3.88814913448735e-05, "loss": 0.0003, "step": 2420 }, { "epoch": 3.2356857523302263, "grad_norm": 0.002749471226707101, "learning_rate": 3.8215712383488685e-05, "loss": 0.0003, "step": 2430 }, { "epoch": 3.2490013315579227, "grad_norm": 0.002491929102689028, "learning_rate": 3.754993342210386e-05, "loss": 0.0012, "step": 2440 }, { "epoch": 3.262316910785619, "grad_norm": 0.0031888161320239305, "learning_rate": 3.6884154460719044e-05, "loss": 0.0003, "step": 2450 }, { "epoch": 3.2756324900133156, "grad_norm": 0.004251156002283096, "learning_rate": 3.621837549933422e-05, "loss": 0.0003, "step": 2460 }, { "epoch": 3.288948069241012, "grad_norm": 0.0027103093452751637, "learning_rate": 3.55525965379494e-05, "loss": 0.0004, "step": 2470 }, { "epoch": 3.3022636484687085, "grad_norm": 0.0026021709199994802, "learning_rate": 3.4886817576564586e-05, "loss": 0.0003, "step": 2480 }, { "epoch": 3.315579227696405, "grad_norm": 0.002954940777271986, "learning_rate": 3.422103861517976e-05, "loss": 0.0003, "step": 2490 }, { "epoch": 3.3288948069241013, "grad_norm": 0.0026502846740186214, "learning_rate": 3.355525965379494e-05, "loss": 0.0003, "step": 2500 }, { "epoch": 3.3288948069241013, "eval_accuracy": 0.9932634730538922, "eval_loss": 0.03057168610394001, "eval_runtime": 19.2139, "eval_samples_per_second": 69.533, "eval_steps_per_second": 8.692, "step": 2500 }, { "epoch": 3.3422103861517978, "grad_norm": 0.0026077169459313154, "learning_rate": 3.288948069241012e-05, "loss": 0.0003, "step": 2510 }, { "epoch": 3.3555259653794938, "grad_norm": 0.002518726047128439, "learning_rate": 3.2223701731025304e-05, "loss": 0.0003, "step": 2520 }, { "epoch": 3.36884154460719, "grad_norm": 0.002446199534460902, "learning_rate": 3.155792276964048e-05, "loss": 0.0003, "step": 2530 }, { "epoch": 3.3821571238348866, "grad_norm": 0.01649678498506546, "learning_rate": 3.0892143808255656e-05, "loss": 0.0003, "step": 2540 }, { "epoch": 3.395472703062583, "grad_norm": 0.002374894917011261, "learning_rate": 3.0226364846870843e-05, "loss": 0.0003, "step": 2550 }, { "epoch": 3.4087882822902795, "grad_norm": 0.0023247618228197098, "learning_rate": 2.956058588548602e-05, "loss": 0.0003, "step": 2560 }, { "epoch": 3.422103861517976, "grad_norm": 0.0022955993190407753, "learning_rate": 2.88948069241012e-05, "loss": 0.0003, "step": 2570 }, { "epoch": 3.4354194407456724, "grad_norm": 0.0023930929601192474, "learning_rate": 2.822902796271638e-05, "loss": 0.0194, "step": 2580 }, { "epoch": 3.448735019973369, "grad_norm": 0.0025017596781253815, "learning_rate": 2.756324900133156e-05, "loss": 0.0003, "step": 2590 }, { "epoch": 3.4620505992010653, "grad_norm": 0.0025145343970507383, "learning_rate": 2.6897470039946737e-05, "loss": 0.0003, "step": 2600 }, { "epoch": 3.4620505992010653, "eval_accuracy": 0.9955089820359282, "eval_loss": 0.019898300990462303, "eval_runtime": 19.1238, "eval_samples_per_second": 69.861, "eval_steps_per_second": 8.733, "step": 2600 }, { "epoch": 3.4753661784287617, "grad_norm": 0.0024944059550762177, "learning_rate": 2.623169107856192e-05, "loss": 0.0003, "step": 2610 }, { "epoch": 3.488681757656458, "grad_norm": 0.002342221327126026, "learning_rate": 2.55659121171771e-05, "loss": 0.0005, "step": 2620 }, { "epoch": 3.5019973368841546, "grad_norm": 0.03469083085656166, "learning_rate": 2.4900133155792276e-05, "loss": 0.0003, "step": 2630 }, { "epoch": 3.515312916111851, "grad_norm": 0.0023358033504337072, "learning_rate": 2.423435419440746e-05, "loss": 0.0003, "step": 2640 }, { "epoch": 3.5286284953395475, "grad_norm": 0.0022878609597682953, "learning_rate": 2.3568575233022638e-05, "loss": 0.0003, "step": 2650 }, { "epoch": 3.541944074567244, "grad_norm": 0.0026905399281531572, "learning_rate": 2.2902796271637818e-05, "loss": 0.0003, "step": 2660 }, { "epoch": 3.5552596537949404, "grad_norm": 0.0026699311565607786, "learning_rate": 2.2237017310252997e-05, "loss": 0.0003, "step": 2670 }, { "epoch": 3.5685752330226364, "grad_norm": 0.0023570535704493523, "learning_rate": 2.1571238348868177e-05, "loss": 0.0003, "step": 2680 }, { "epoch": 3.581890812250333, "grad_norm": 0.0024163059424608946, "learning_rate": 2.0905459387483356e-05, "loss": 0.0003, "step": 2690 }, { "epoch": 3.5952063914780292, "grad_norm": 0.002381423255428672, "learning_rate": 2.0239680426098536e-05, "loss": 0.0003, "step": 2700 }, { "epoch": 3.5952063914780292, "eval_accuracy": 0.9947604790419161, "eval_loss": 0.024509532377123833, "eval_runtime": 19.0731, "eval_samples_per_second": 70.046, "eval_steps_per_second": 8.756, "step": 2700 }, { "epoch": 3.6085219707057257, "grad_norm": 0.0022262686397880316, "learning_rate": 1.9573901464713715e-05, "loss": 0.0003, "step": 2710 }, { "epoch": 3.621837549933422, "grad_norm": 0.0022425020579248667, "learning_rate": 1.8908122503328895e-05, "loss": 0.0003, "step": 2720 }, { "epoch": 3.6351531291611185, "grad_norm": 0.0022420468740165234, "learning_rate": 1.8242343541944078e-05, "loss": 0.0003, "step": 2730 }, { "epoch": 3.648468708388815, "grad_norm": 0.0021876932587474585, "learning_rate": 1.7576564580559254e-05, "loss": 0.0003, "step": 2740 }, { "epoch": 3.6617842876165114, "grad_norm": 0.0023544211871922016, "learning_rate": 1.6910785619174437e-05, "loss": 0.0002, "step": 2750 }, { "epoch": 3.675099866844208, "grad_norm": 0.002206595614552498, "learning_rate": 1.6245006657789616e-05, "loss": 0.0004, "step": 2760 }, { "epoch": 3.688415446071904, "grad_norm": 0.002322066342458129, "learning_rate": 1.5579227696404792e-05, "loss": 0.0002, "step": 2770 }, { "epoch": 3.7017310252996003, "grad_norm": 0.0022114135790616274, "learning_rate": 1.4913448735019975e-05, "loss": 0.0003, "step": 2780 }, { "epoch": 3.7150466045272967, "grad_norm": 0.0022223354317247868, "learning_rate": 1.4247669773635153e-05, "loss": 0.0004, "step": 2790 }, { "epoch": 3.728362183754993, "grad_norm": 0.002155436435714364, "learning_rate": 1.3581890812250334e-05, "loss": 0.0003, "step": 2800 }, { "epoch": 3.728362183754993, "eval_accuracy": 0.9947604790419161, "eval_loss": 0.025128666311502457, "eval_runtime": 18.7023, "eval_samples_per_second": 71.435, "eval_steps_per_second": 8.929, "step": 2800 }, { "epoch": 3.7416777629826896, "grad_norm": 0.0022344435565173626, "learning_rate": 1.2916111850865514e-05, "loss": 0.0003, "step": 2810 }, { "epoch": 3.754993342210386, "grad_norm": 0.04637530446052551, "learning_rate": 1.2250332889480692e-05, "loss": 0.0003, "step": 2820 }, { "epoch": 3.7683089214380825, "grad_norm": 0.002139828633517027, "learning_rate": 1.1584553928095873e-05, "loss": 0.0003, "step": 2830 }, { "epoch": 3.781624500665779, "grad_norm": 0.0020825008396059275, "learning_rate": 1.0918774966711052e-05, "loss": 0.0003, "step": 2840 }, { "epoch": 3.7949400798934754, "grad_norm": 0.0020064804702997208, "learning_rate": 1.0252996005326232e-05, "loss": 0.0003, "step": 2850 }, { "epoch": 3.808255659121172, "grad_norm": 0.002234045183286071, "learning_rate": 9.587217043941411e-06, "loss": 0.0002, "step": 2860 }, { "epoch": 3.8215712383488682, "grad_norm": 0.0021161320619285107, "learning_rate": 8.921438082556593e-06, "loss": 0.0003, "step": 2870 }, { "epoch": 3.8348868175765647, "grad_norm": 0.0022624481935054064, "learning_rate": 8.255659121171772e-06, "loss": 0.0002, "step": 2880 }, { "epoch": 3.848202396804261, "grad_norm": 0.0020989372860640287, "learning_rate": 7.589880159786951e-06, "loss": 0.0002, "step": 2890 }, { "epoch": 3.8615179760319576, "grad_norm": 0.002201867988333106, "learning_rate": 6.92410119840213e-06, "loss": 0.0002, "step": 2900 }, { "epoch": 3.8615179760319576, "eval_accuracy": 0.9947604790419161, "eval_loss": 0.025865301489830017, "eval_runtime": 18.6017, "eval_samples_per_second": 71.821, "eval_steps_per_second": 8.978, "step": 2900 }, { "epoch": 3.874833555259654, "grad_norm": 0.002228468656539917, "learning_rate": 6.258322237017311e-06, "loss": 0.0002, "step": 2910 }, { "epoch": 3.8881491344873504, "grad_norm": 0.0022046160884201527, "learning_rate": 5.59254327563249e-06, "loss": 0.0003, "step": 2920 }, { "epoch": 3.9014647137150464, "grad_norm": 0.002218181500211358, "learning_rate": 4.92676431424767e-06, "loss": 0.0002, "step": 2930 }, { "epoch": 3.914780292942743, "grad_norm": 0.001987821888178587, "learning_rate": 4.26098535286285e-06, "loss": 0.0002, "step": 2940 }, { "epoch": 3.9280958721704393, "grad_norm": 0.0021837118547409773, "learning_rate": 3.5952063914780293e-06, "loss": 0.0002, "step": 2950 }, { "epoch": 3.9414114513981358, "grad_norm": 0.002148882020264864, "learning_rate": 2.9294274300932092e-06, "loss": 0.0002, "step": 2960 }, { "epoch": 3.954727030625832, "grad_norm": 0.002326791873201728, "learning_rate": 2.2636484687083888e-06, "loss": 0.0002, "step": 2970 }, { "epoch": 3.9680426098535286, "grad_norm": 0.001993759535253048, "learning_rate": 1.5978695073235687e-06, "loss": 0.0002, "step": 2980 }, { "epoch": 3.981358189081225, "grad_norm": 0.0020522773265838623, "learning_rate": 9.320905459387485e-07, "loss": 0.0002, "step": 2990 }, { "epoch": 3.9946737683089215, "grad_norm": 0.0021599766332656145, "learning_rate": 2.6631158455392814e-07, "loss": 0.0002, "step": 3000 }, { "epoch": 3.9946737683089215, "eval_accuracy": 0.9947604790419161, "eval_loss": 0.02588406577706337, "eval_runtime": 18.7868, "eval_samples_per_second": 71.114, "eval_steps_per_second": 8.889, "step": 3000 } ], "logging_steps": 10, "max_steps": 3004, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.7242650208772915e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }