diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,87070 @@ +{ + "best_metric": 0.80437648, + "best_model_checkpoint": "/qlgy0912/llm_sft_output/qwen2_5-7b/v0-20240927-140411/checkpoint-76000", + "epoch": 1.994779743070036, + "eval_steps": 500, + "global_step": 85500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "acc": 0.69601125, + "epoch": 2.333075722888931e-05, + "grad_norm": 65.0, + "learning_rate": 3.888024883359254e-09, + "loss": 1.2215333, + "memory(GiB)": 101.91, + "step": 1, + "train_speed(iter/s)": 0.081993 + }, + { + "acc": 0.6643041, + "epoch": 0.0002333075722888931, + "grad_norm": 52.25, + "learning_rate": 3.888024883359254e-08, + "loss": 1.42277463, + "memory(GiB)": 103.58, + "step": 10, + "train_speed(iter/s)": 0.306137 + }, + { + "acc": 0.64442787, + "epoch": 0.0004666151445777862, + "grad_norm": 63.25, + "learning_rate": 7.776049766718508e-08, + "loss": 1.4146143, + "memory(GiB)": 106.04, + "step": 20, + "train_speed(iter/s)": 0.340883 + }, + { + "acc": 0.6630681, + "epoch": 0.0006999227168666793, + "grad_norm": 58.25, + "learning_rate": 1.1664074650077761e-07, + "loss": 1.34018517, + "memory(GiB)": 106.04, + "step": 30, + "train_speed(iter/s)": 0.358391 + }, + { + "acc": 0.65009575, + "epoch": 0.0009332302891555724, + "grad_norm": 54.5, + "learning_rate": 1.5552099533437016e-07, + "loss": 1.45347633, + "memory(GiB)": 106.17, + "step": 40, + "train_speed(iter/s)": 0.375122 + }, + { + "acc": 0.6623395, + "epoch": 0.0011665378614444655, + "grad_norm": 28.25, + "learning_rate": 1.944012441679627e-07, + "loss": 1.3650135, + "memory(GiB)": 107.6, + "step": 50, + "train_speed(iter/s)": 0.374611 + }, + { + "acc": 0.65479803, + "epoch": 0.0013998454337333585, + "grad_norm": 192.0, + "learning_rate": 2.3328149300155523e-07, + "loss": 1.39007854, + "memory(GiB)": 107.6, + "step": 60, + "train_speed(iter/s)": 0.384975 + }, + { + "acc": 0.67412581, + "epoch": 0.0016331530060222517, + "grad_norm": 42.75, + "learning_rate": 2.721617418351478e-07, + "loss": 1.3340167, + "memory(GiB)": 107.6, + "step": 70, + "train_speed(iter/s)": 0.392443 + }, + { + "acc": 0.64180751, + "epoch": 0.0018664605783111448, + "grad_norm": 46.0, + "learning_rate": 3.110419906687403e-07, + "loss": 1.44007759, + "memory(GiB)": 107.6, + "step": 80, + "train_speed(iter/s)": 0.400513 + }, + { + "acc": 0.65562725, + "epoch": 0.0020997681506000378, + "grad_norm": 169.0, + "learning_rate": 3.4992223950233286e-07, + "loss": 1.377672, + "memory(GiB)": 107.6, + "step": 90, + "train_speed(iter/s)": 0.404057 + }, + { + "acc": 0.65258265, + "epoch": 0.002333075722888931, + "grad_norm": 201.0, + "learning_rate": 3.888024883359254e-07, + "loss": 1.40249634, + "memory(GiB)": 107.73, + "step": 100, + "train_speed(iter/s)": 0.404893 + }, + { + "acc": 0.66373754, + "epoch": 0.0025663832951778242, + "grad_norm": 44.5, + "learning_rate": 4.2768273716951787e-07, + "loss": 1.3577816, + "memory(GiB)": 107.73, + "step": 110, + "train_speed(iter/s)": 0.406581 + }, + { + "acc": 0.66515074, + "epoch": 0.002799690867466717, + "grad_norm": 69.5, + "learning_rate": 4.6656298600311046e-07, + "loss": 1.33736687, + "memory(GiB)": 107.73, + "step": 120, + "train_speed(iter/s)": 0.406629 + }, + { + "acc": 0.6434556, + "epoch": 0.0030329984397556103, + "grad_norm": 39.75, + "learning_rate": 5.054432348367029e-07, + "loss": 1.45937138, + "memory(GiB)": 107.73, + "step": 130, + "train_speed(iter/s)": 0.409521 + }, + { + "acc": 0.63647475, + "epoch": 0.0032663060120445035, + "grad_norm": 43.5, + "learning_rate": 5.443234836702956e-07, + "loss": 1.45315819, + "memory(GiB)": 107.73, + "step": 140, + "train_speed(iter/s)": 0.409846 + }, + { + "acc": 0.66497622, + "epoch": 0.0034996135843333967, + "grad_norm": 18.875, + "learning_rate": 5.832037325038881e-07, + "loss": 1.33231993, + "memory(GiB)": 107.73, + "step": 150, + "train_speed(iter/s)": 0.41013 + }, + { + "acc": 0.67984552, + "epoch": 0.0037329211566222895, + "grad_norm": 64.0, + "learning_rate": 6.220839813374806e-07, + "loss": 1.25875092, + "memory(GiB)": 109.26, + "step": 160, + "train_speed(iter/s)": 0.409049 + }, + { + "acc": 0.66304803, + "epoch": 0.003966228728911183, + "grad_norm": 41.25, + "learning_rate": 6.609642301710731e-07, + "loss": 1.35497437, + "memory(GiB)": 109.26, + "step": 170, + "train_speed(iter/s)": 0.408629 + }, + { + "acc": 0.6695015, + "epoch": 0.0041995363012000755, + "grad_norm": 24.125, + "learning_rate": 6.998444790046657e-07, + "loss": 1.32009106, + "memory(GiB)": 109.26, + "step": 180, + "train_speed(iter/s)": 0.408194 + }, + { + "acc": 0.64377413, + "epoch": 0.004432843873488969, + "grad_norm": 189.0, + "learning_rate": 7.387247278382582e-07, + "loss": 1.45081196, + "memory(GiB)": 109.26, + "step": 190, + "train_speed(iter/s)": 0.40967 + }, + { + "acc": 0.66846848, + "epoch": 0.004666151445777862, + "grad_norm": 32.25, + "learning_rate": 7.776049766718508e-07, + "loss": 1.34669209, + "memory(GiB)": 109.26, + "step": 200, + "train_speed(iter/s)": 0.409516 + }, + { + "acc": 0.66021504, + "epoch": 0.004899459018066755, + "grad_norm": 22.25, + "learning_rate": 8.164852255054432e-07, + "loss": 1.3446023, + "memory(GiB)": 109.26, + "step": 210, + "train_speed(iter/s)": 0.40949 + }, + { + "acc": 0.67646599, + "epoch": 0.0051327665903556485, + "grad_norm": 22.625, + "learning_rate": 8.553654743390357e-07, + "loss": 1.24624052, + "memory(GiB)": 109.26, + "step": 220, + "train_speed(iter/s)": 0.408686 + }, + { + "acc": 0.67372646, + "epoch": 0.005366074162644541, + "grad_norm": 187.0, + "learning_rate": 8.942457231726284e-07, + "loss": 1.31202698, + "memory(GiB)": 109.26, + "step": 230, + "train_speed(iter/s)": 0.408586 + }, + { + "acc": 0.67337198, + "epoch": 0.005599381734933434, + "grad_norm": 87.0, + "learning_rate": 9.331259720062209e-07, + "loss": 1.32102585, + "memory(GiB)": 109.26, + "step": 240, + "train_speed(iter/s)": 0.409543 + }, + { + "acc": 0.68346677, + "epoch": 0.005832689307222328, + "grad_norm": 19.125, + "learning_rate": 9.720062208398133e-07, + "loss": 1.25821629, + "memory(GiB)": 109.26, + "step": 250, + "train_speed(iter/s)": 0.409961 + }, + { + "acc": 0.6582798, + "epoch": 0.0060659968795112205, + "grad_norm": 27.0, + "learning_rate": 1.0108864696734059e-06, + "loss": 1.37746887, + "memory(GiB)": 109.26, + "step": 260, + "train_speed(iter/s)": 0.410468 + }, + { + "acc": 0.65754652, + "epoch": 0.006299304451800113, + "grad_norm": 30.375, + "learning_rate": 1.0497667185069986e-06, + "loss": 1.34862213, + "memory(GiB)": 109.64, + "step": 270, + "train_speed(iter/s)": 0.410747 + }, + { + "acc": 0.67330213, + "epoch": 0.006532612024089007, + "grad_norm": 49.0, + "learning_rate": 1.0886469673405912e-06, + "loss": 1.3070693, + "memory(GiB)": 109.64, + "step": 280, + "train_speed(iter/s)": 0.411653 + }, + { + "acc": 0.68029757, + "epoch": 0.0067659195963779, + "grad_norm": 34.5, + "learning_rate": 1.1275272161741837e-06, + "loss": 1.24388609, + "memory(GiB)": 109.64, + "step": 290, + "train_speed(iter/s)": 0.411959 + }, + { + "acc": 0.69043927, + "epoch": 0.006999227168666793, + "grad_norm": 22.875, + "learning_rate": 1.1664074650077762e-06, + "loss": 1.24611559, + "memory(GiB)": 109.77, + "step": 300, + "train_speed(iter/s)": 0.411594 + }, + { + "acc": 0.68904781, + "epoch": 0.007232534740955686, + "grad_norm": 12.75, + "learning_rate": 1.2052877138413686e-06, + "loss": 1.20471973, + "memory(GiB)": 109.77, + "step": 310, + "train_speed(iter/s)": 0.410459 + }, + { + "acc": 0.6853941, + "epoch": 0.007465842313244579, + "grad_norm": 24.875, + "learning_rate": 1.2441679626749613e-06, + "loss": 1.23733406, + "memory(GiB)": 109.77, + "step": 320, + "train_speed(iter/s)": 0.411169 + }, + { + "acc": 0.69481087, + "epoch": 0.007699149885533473, + "grad_norm": 85.0, + "learning_rate": 1.2830482115085538e-06, + "loss": 1.23310585, + "memory(GiB)": 109.77, + "step": 330, + "train_speed(iter/s)": 0.411728 + }, + { + "acc": 0.69631901, + "epoch": 0.007932457457822365, + "grad_norm": 35.0, + "learning_rate": 1.3219284603421462e-06, + "loss": 1.18948269, + "memory(GiB)": 109.77, + "step": 340, + "train_speed(iter/s)": 0.412145 + }, + { + "acc": 0.66458092, + "epoch": 0.00816576503011126, + "grad_norm": 17.625, + "learning_rate": 1.360808709175739e-06, + "loss": 1.3200798, + "memory(GiB)": 109.77, + "step": 350, + "train_speed(iter/s)": 0.411958 + }, + { + "acc": 0.68819141, + "epoch": 0.008399072602400151, + "grad_norm": 32.5, + "learning_rate": 1.3996889580093314e-06, + "loss": 1.22151852, + "memory(GiB)": 109.77, + "step": 360, + "train_speed(iter/s)": 0.412697 + }, + { + "acc": 0.68463111, + "epoch": 0.008632380174689045, + "grad_norm": 38.0, + "learning_rate": 1.4385692068429238e-06, + "loss": 1.26127262, + "memory(GiB)": 109.77, + "step": 370, + "train_speed(iter/s)": 0.413014 + }, + { + "acc": 0.66406565, + "epoch": 0.008865687746977938, + "grad_norm": 21.75, + "learning_rate": 1.4774494556765165e-06, + "loss": 1.33156033, + "memory(GiB)": 109.77, + "step": 380, + "train_speed(iter/s)": 0.412519 + }, + { + "acc": 0.67032399, + "epoch": 0.00909899531926683, + "grad_norm": 22.875, + "learning_rate": 1.5163297045101088e-06, + "loss": 1.27387772, + "memory(GiB)": 109.77, + "step": 390, + "train_speed(iter/s)": 0.412242 + }, + { + "acc": 0.68596916, + "epoch": 0.009332302891555724, + "grad_norm": 15.0, + "learning_rate": 1.5552099533437016e-06, + "loss": 1.23029289, + "memory(GiB)": 109.77, + "step": 400, + "train_speed(iter/s)": 0.41303 + }, + { + "acc": 0.68200741, + "epoch": 0.009565610463844618, + "grad_norm": 19.75, + "learning_rate": 1.594090202177294e-06, + "loss": 1.23422031, + "memory(GiB)": 109.77, + "step": 410, + "train_speed(iter/s)": 0.413378 + }, + { + "acc": 0.68170118, + "epoch": 0.00979891803613351, + "grad_norm": 23.875, + "learning_rate": 1.6329704510108864e-06, + "loss": 1.24763041, + "memory(GiB)": 109.77, + "step": 420, + "train_speed(iter/s)": 0.413317 + }, + { + "acc": 0.68019838, + "epoch": 0.010032225608422403, + "grad_norm": 12.5625, + "learning_rate": 1.6718506998444792e-06, + "loss": 1.25324621, + "memory(GiB)": 109.77, + "step": 430, + "train_speed(iter/s)": 0.413645 + }, + { + "acc": 0.68601837, + "epoch": 0.010265533180711297, + "grad_norm": 22.125, + "learning_rate": 1.7107309486780715e-06, + "loss": 1.21224289, + "memory(GiB)": 109.77, + "step": 440, + "train_speed(iter/s)": 0.413557 + }, + { + "acc": 0.70243044, + "epoch": 0.010498840753000189, + "grad_norm": 32.0, + "learning_rate": 1.7496111975116642e-06, + "loss": 1.15849266, + "memory(GiB)": 109.77, + "step": 450, + "train_speed(iter/s)": 0.412938 + }, + { + "acc": 0.68498225, + "epoch": 0.010732148325289083, + "grad_norm": 21.0, + "learning_rate": 1.7884914463452568e-06, + "loss": 1.22883425, + "memory(GiB)": 109.77, + "step": 460, + "train_speed(iter/s)": 0.413629 + }, + { + "acc": 0.71473951, + "epoch": 0.010965455897577976, + "grad_norm": 27.75, + "learning_rate": 1.8273716951788493e-06, + "loss": 1.14569626, + "memory(GiB)": 109.77, + "step": 470, + "train_speed(iter/s)": 0.413969 + }, + { + "acc": 0.68733125, + "epoch": 0.011198763469866868, + "grad_norm": 17.75, + "learning_rate": 1.8662519440124418e-06, + "loss": 1.21703568, + "memory(GiB)": 109.77, + "step": 480, + "train_speed(iter/s)": 0.414002 + }, + { + "acc": 0.68442454, + "epoch": 0.011432071042155762, + "grad_norm": 29.875, + "learning_rate": 1.9051321928460342e-06, + "loss": 1.21695004, + "memory(GiB)": 109.77, + "step": 490, + "train_speed(iter/s)": 0.414733 + }, + { + "acc": 0.69048681, + "epoch": 0.011665378614444655, + "grad_norm": 39.5, + "learning_rate": 1.9440124416796267e-06, + "loss": 1.17031765, + "memory(GiB)": 109.77, + "step": 500, + "train_speed(iter/s)": 0.414989 + }, + { + "epoch": 0.011665378614444655, + "eval_acc": 0.6771181596993516, + "eval_loss": 1.170393943786621, + "eval_runtime": 1270.1553, + "eval_samples_per_second": 28.336, + "eval_steps_per_second": 14.168, + "step": 500 + }, + { + "acc": 0.68483567, + "epoch": 0.011898686186733547, + "grad_norm": 24.375, + "learning_rate": 1.9828926905132194e-06, + "loss": 1.21425552, + "memory(GiB)": 112.37, + "step": 510, + "train_speed(iter/s)": 0.20271 + }, + { + "acc": 0.71478772, + "epoch": 0.012131993759022441, + "grad_norm": 40.75, + "learning_rate": 2.0217729393468118e-06, + "loss": 1.08218079, + "memory(GiB)": 112.37, + "step": 520, + "train_speed(iter/s)": 0.204723 + }, + { + "acc": 0.69291697, + "epoch": 0.012365301331311335, + "grad_norm": 13.3125, + "learning_rate": 2.0606531881804045e-06, + "loss": 1.18659, + "memory(GiB)": 112.37, + "step": 530, + "train_speed(iter/s)": 0.206783 + }, + { + "acc": 0.70176544, + "epoch": 0.012598608903600227, + "grad_norm": 10.8125, + "learning_rate": 2.0995334370139973e-06, + "loss": 1.15439968, + "memory(GiB)": 112.37, + "step": 540, + "train_speed(iter/s)": 0.208567 + }, + { + "acc": 0.70656576, + "epoch": 0.01283191647588912, + "grad_norm": 24.75, + "learning_rate": 2.1384136858475896e-06, + "loss": 1.13584023, + "memory(GiB)": 112.37, + "step": 550, + "train_speed(iter/s)": 0.210478 + }, + { + "acc": 0.69827976, + "epoch": 0.013065224048178014, + "grad_norm": 18.375, + "learning_rate": 2.1772939346811823e-06, + "loss": 1.16824875, + "memory(GiB)": 112.37, + "step": 560, + "train_speed(iter/s)": 0.212251 + }, + { + "acc": 0.69722204, + "epoch": 0.013298531620466908, + "grad_norm": 20.625, + "learning_rate": 2.2161741835147746e-06, + "loss": 1.17354527, + "memory(GiB)": 112.37, + "step": 570, + "train_speed(iter/s)": 0.214144 + }, + { + "acc": 0.69391127, + "epoch": 0.0135318391927558, + "grad_norm": 12.25, + "learning_rate": 2.2550544323483674e-06, + "loss": 1.18383684, + "memory(GiB)": 112.37, + "step": 580, + "train_speed(iter/s)": 0.215836 + }, + { + "acc": 0.7008687, + "epoch": 0.013765146765044693, + "grad_norm": 21.125, + "learning_rate": 2.2939346811819597e-06, + "loss": 1.13180761, + "memory(GiB)": 112.37, + "step": 590, + "train_speed(iter/s)": 0.217548 + }, + { + "acc": 0.70401969, + "epoch": 0.013998454337333587, + "grad_norm": 14.1875, + "learning_rate": 2.3328149300155525e-06, + "loss": 1.14089346, + "memory(GiB)": 112.37, + "step": 600, + "train_speed(iter/s)": 0.219209 + }, + { + "acc": 0.67588034, + "epoch": 0.014231761909622479, + "grad_norm": 152.0, + "learning_rate": 2.3716951788491448e-06, + "loss": 1.282197, + "memory(GiB)": 112.37, + "step": 610, + "train_speed(iter/s)": 0.220998 + }, + { + "acc": 0.71631866, + "epoch": 0.014465069481911372, + "grad_norm": 10.0, + "learning_rate": 2.410575427682737e-06, + "loss": 1.09193068, + "memory(GiB)": 112.37, + "step": 620, + "train_speed(iter/s)": 0.22271 + }, + { + "acc": 0.70892181, + "epoch": 0.014698377054200266, + "grad_norm": 21.625, + "learning_rate": 2.44945567651633e-06, + "loss": 1.10817814, + "memory(GiB)": 112.37, + "step": 630, + "train_speed(iter/s)": 0.224373 + }, + { + "acc": 0.71971693, + "epoch": 0.014931684626489158, + "grad_norm": 22.75, + "learning_rate": 2.4883359253499226e-06, + "loss": 1.06725435, + "memory(GiB)": 112.37, + "step": 640, + "train_speed(iter/s)": 0.226002 + }, + { + "acc": 0.6958858, + "epoch": 0.015164992198778052, + "grad_norm": 13.5, + "learning_rate": 2.527216174183515e-06, + "loss": 1.14539852, + "memory(GiB)": 112.37, + "step": 650, + "train_speed(iter/s)": 0.227501 + }, + { + "acc": 0.7064106, + "epoch": 0.015398299771066945, + "grad_norm": 14.875, + "learning_rate": 2.5660964230171077e-06, + "loss": 1.09452753, + "memory(GiB)": 112.37, + "step": 660, + "train_speed(iter/s)": 0.229076 + }, + { + "acc": 0.68724928, + "epoch": 0.01563160734335584, + "grad_norm": 10.125, + "learning_rate": 2.6049766718507004e-06, + "loss": 1.17633553, + "memory(GiB)": 112.37, + "step": 670, + "train_speed(iter/s)": 0.230642 + }, + { + "acc": 0.71898904, + "epoch": 0.01586491491564473, + "grad_norm": 20.25, + "learning_rate": 2.6438569206842923e-06, + "loss": 1.08943672, + "memory(GiB)": 112.37, + "step": 680, + "train_speed(iter/s)": 0.232146 + }, + { + "acc": 0.70866594, + "epoch": 0.016098222487933623, + "grad_norm": 41.25, + "learning_rate": 2.682737169517885e-06, + "loss": 1.1266736, + "memory(GiB)": 112.37, + "step": 690, + "train_speed(iter/s)": 0.233723 + }, + { + "acc": 0.69511156, + "epoch": 0.01633153006022252, + "grad_norm": 16.75, + "learning_rate": 2.721617418351478e-06, + "loss": 1.15227604, + "memory(GiB)": 112.37, + "step": 700, + "train_speed(iter/s)": 0.235191 + }, + { + "acc": 0.69382067, + "epoch": 0.01656483763251141, + "grad_norm": 13.875, + "learning_rate": 2.76049766718507e-06, + "loss": 1.16943235, + "memory(GiB)": 112.37, + "step": 710, + "train_speed(iter/s)": 0.236655 + }, + { + "acc": 0.70306535, + "epoch": 0.016798145204800302, + "grad_norm": 55.5, + "learning_rate": 2.799377916018663e-06, + "loss": 1.11830854, + "memory(GiB)": 112.5, + "step": 720, + "train_speed(iter/s)": 0.238078 + }, + { + "acc": 0.7088171, + "epoch": 0.017031452777089198, + "grad_norm": 13.875, + "learning_rate": 2.838258164852255e-06, + "loss": 1.11293411, + "memory(GiB)": 112.5, + "step": 730, + "train_speed(iter/s)": 0.239489 + }, + { + "acc": 0.73532681, + "epoch": 0.01726476034937809, + "grad_norm": 15.375, + "learning_rate": 2.8771384136858475e-06, + "loss": 1.01635103, + "memory(GiB)": 112.5, + "step": 740, + "train_speed(iter/s)": 0.240802 + }, + { + "acc": 0.73056784, + "epoch": 0.01749806792166698, + "grad_norm": 8.6875, + "learning_rate": 2.9160186625194403e-06, + "loss": 1.03644848, + "memory(GiB)": 112.5, + "step": 750, + "train_speed(iter/s)": 0.24216 + }, + { + "acc": 0.71434994, + "epoch": 0.017731375493955877, + "grad_norm": 14.0625, + "learning_rate": 2.954898911353033e-06, + "loss": 1.09710608, + "memory(GiB)": 112.5, + "step": 760, + "train_speed(iter/s)": 0.24334 + }, + { + "acc": 0.71459484, + "epoch": 0.01796468306624477, + "grad_norm": 11.125, + "learning_rate": 2.9937791601866257e-06, + "loss": 1.10940857, + "memory(GiB)": 112.5, + "step": 770, + "train_speed(iter/s)": 0.24472 + }, + { + "acc": 0.72359238, + "epoch": 0.01819799063853366, + "grad_norm": 10.6875, + "learning_rate": 3.0326594090202176e-06, + "loss": 1.06084309, + "memory(GiB)": 112.5, + "step": 780, + "train_speed(iter/s)": 0.246022 + }, + { + "acc": 0.71572514, + "epoch": 0.018431298210822556, + "grad_norm": 11.8125, + "learning_rate": 3.0715396578538104e-06, + "loss": 1.07304935, + "memory(GiB)": 112.5, + "step": 790, + "train_speed(iter/s)": 0.247402 + }, + { + "acc": 0.73548241, + "epoch": 0.018664605783111448, + "grad_norm": 9.625, + "learning_rate": 3.110419906687403e-06, + "loss": 1.0010951, + "memory(GiB)": 112.5, + "step": 800, + "train_speed(iter/s)": 0.248693 + }, + { + "acc": 0.74713941, + "epoch": 0.01889791335540034, + "grad_norm": 58.0, + "learning_rate": 3.1493001555209955e-06, + "loss": 0.96250553, + "memory(GiB)": 112.5, + "step": 810, + "train_speed(iter/s)": 0.249902 + }, + { + "acc": 0.72171469, + "epoch": 0.019131220927689235, + "grad_norm": 25.0, + "learning_rate": 3.188180404354588e-06, + "loss": 1.06542034, + "memory(GiB)": 112.5, + "step": 820, + "train_speed(iter/s)": 0.251022 + }, + { + "acc": 0.72154565, + "epoch": 0.019364528499978127, + "grad_norm": 10.375, + "learning_rate": 3.2270606531881805e-06, + "loss": 1.06000462, + "memory(GiB)": 112.5, + "step": 830, + "train_speed(iter/s)": 0.252274 + }, + { + "acc": 0.72352486, + "epoch": 0.01959783607226702, + "grad_norm": 15.6875, + "learning_rate": 3.265940902021773e-06, + "loss": 1.07526264, + "memory(GiB)": 112.5, + "step": 840, + "train_speed(iter/s)": 0.253342 + }, + { + "acc": 0.70478067, + "epoch": 0.019831143644555915, + "grad_norm": 30.75, + "learning_rate": 3.3048211508553656e-06, + "loss": 1.0926899, + "memory(GiB)": 112.5, + "step": 850, + "train_speed(iter/s)": 0.254406 + }, + { + "acc": 0.7308609, + "epoch": 0.020064451216844807, + "grad_norm": 9.375, + "learning_rate": 3.3437013996889583e-06, + "loss": 1.01046867, + "memory(GiB)": 112.5, + "step": 860, + "train_speed(iter/s)": 0.255641 + }, + { + "acc": 0.69665556, + "epoch": 0.0202977587891337, + "grad_norm": 29.125, + "learning_rate": 3.382581648522551e-06, + "loss": 1.16045341, + "memory(GiB)": 112.5, + "step": 870, + "train_speed(iter/s)": 0.256795 + }, + { + "acc": 0.72442212, + "epoch": 0.020531066361422594, + "grad_norm": 19.875, + "learning_rate": 3.421461897356143e-06, + "loss": 1.05415115, + "memory(GiB)": 112.5, + "step": 880, + "train_speed(iter/s)": 0.2579 + }, + { + "acc": 0.71560249, + "epoch": 0.020764373933711486, + "grad_norm": 17.5, + "learning_rate": 3.4603421461897357e-06, + "loss": 1.08642292, + "memory(GiB)": 112.5, + "step": 890, + "train_speed(iter/s)": 0.259042 + }, + { + "acc": 0.7090786, + "epoch": 0.020997681506000378, + "grad_norm": 20.5, + "learning_rate": 3.4992223950233285e-06, + "loss": 1.12236042, + "memory(GiB)": 112.5, + "step": 900, + "train_speed(iter/s)": 0.260196 + }, + { + "acc": 0.72494149, + "epoch": 0.021230989078289273, + "grad_norm": 14.125, + "learning_rate": 3.5381026438569212e-06, + "loss": 1.06427841, + "memory(GiB)": 112.5, + "step": 910, + "train_speed(iter/s)": 0.261275 + }, + { + "acc": 0.73181691, + "epoch": 0.021464296650578165, + "grad_norm": 10.3125, + "learning_rate": 3.5769828926905135e-06, + "loss": 1.00808945, + "memory(GiB)": 112.5, + "step": 920, + "train_speed(iter/s)": 0.26246 + }, + { + "acc": 0.70418386, + "epoch": 0.021697604222867057, + "grad_norm": 9.0625, + "learning_rate": 3.615863141524106e-06, + "loss": 1.1433609, + "memory(GiB)": 112.5, + "step": 930, + "train_speed(iter/s)": 0.263556 + }, + { + "acc": 0.72886324, + "epoch": 0.021930911795155952, + "grad_norm": 6.9375, + "learning_rate": 3.6547433903576986e-06, + "loss": 1.02939377, + "memory(GiB)": 112.5, + "step": 940, + "train_speed(iter/s)": 0.264494 + }, + { + "acc": 0.71419945, + "epoch": 0.022164219367444844, + "grad_norm": 16.625, + "learning_rate": 3.693623639191291e-06, + "loss": 1.06986065, + "memory(GiB)": 112.5, + "step": 950, + "train_speed(iter/s)": 0.265441 + }, + { + "acc": 0.73321857, + "epoch": 0.022397526939733736, + "grad_norm": 14.3125, + "learning_rate": 3.7325038880248837e-06, + "loss": 1.01707897, + "memory(GiB)": 112.5, + "step": 960, + "train_speed(iter/s)": 0.266415 + }, + { + "acc": 0.70803647, + "epoch": 0.02263083451202263, + "grad_norm": 22.5, + "learning_rate": 3.7713841368584764e-06, + "loss": 1.15186424, + "memory(GiB)": 112.5, + "step": 970, + "train_speed(iter/s)": 0.267359 + }, + { + "acc": 0.7166934, + "epoch": 0.022864142084311524, + "grad_norm": 11.5, + "learning_rate": 3.8102643856920683e-06, + "loss": 1.08353691, + "memory(GiB)": 112.5, + "step": 980, + "train_speed(iter/s)": 0.268331 + }, + { + "acc": 0.75482187, + "epoch": 0.023097449656600415, + "grad_norm": 11.125, + "learning_rate": 3.849144634525661e-06, + "loss": 0.92038155, + "memory(GiB)": 112.5, + "step": 990, + "train_speed(iter/s)": 0.269254 + }, + { + "acc": 0.72072606, + "epoch": 0.02333075722888931, + "grad_norm": 17.375, + "learning_rate": 3.888024883359253e-06, + "loss": 1.032094, + "memory(GiB)": 112.5, + "step": 1000, + "train_speed(iter/s)": 0.270211 + }, + { + "epoch": 0.02333075722888931, + "eval_acc": 0.6944079715955378, + "eval_loss": 1.03887140750885, + "eval_runtime": 1269.799, + "eval_samples_per_second": 28.344, + "eval_steps_per_second": 14.172, + "step": 1000 + }, + { + "acc": 0.73672218, + "epoch": 0.023564064801178203, + "grad_norm": 11.6875, + "learning_rate": 3.9269051321928466e-06, + "loss": 0.97621746, + "memory(GiB)": 112.5, + "step": 1010, + "train_speed(iter/s)": 0.201516 + }, + { + "acc": 0.72829971, + "epoch": 0.023797372373467095, + "grad_norm": 12.625, + "learning_rate": 3.965785381026439e-06, + "loss": 1.04721756, + "memory(GiB)": 112.5, + "step": 1020, + "train_speed(iter/s)": 0.202516 + }, + { + "acc": 0.74315853, + "epoch": 0.02403067994575599, + "grad_norm": 12.0625, + "learning_rate": 4.004665629860031e-06, + "loss": 0.96440945, + "memory(GiB)": 112.5, + "step": 1030, + "train_speed(iter/s)": 0.203553 + }, + { + "acc": 0.71699724, + "epoch": 0.024263987518044882, + "grad_norm": 10.4375, + "learning_rate": 4.0435458786936235e-06, + "loss": 1.05050545, + "memory(GiB)": 112.5, + "step": 1040, + "train_speed(iter/s)": 0.20455 + }, + { + "acc": 0.73913536, + "epoch": 0.024497295090333774, + "grad_norm": 56.75, + "learning_rate": 4.082426127527217e-06, + "loss": 0.98256474, + "memory(GiB)": 112.5, + "step": 1050, + "train_speed(iter/s)": 0.205477 + }, + { + "acc": 0.72385454, + "epoch": 0.02473060266262267, + "grad_norm": 10.1875, + "learning_rate": 4.121306376360809e-06, + "loss": 1.03298187, + "memory(GiB)": 112.5, + "step": 1060, + "train_speed(iter/s)": 0.206437 + }, + { + "acc": 0.73873882, + "epoch": 0.02496391023491156, + "grad_norm": 13.625, + "learning_rate": 4.160186625194401e-06, + "loss": 1.0152791, + "memory(GiB)": 112.5, + "step": 1070, + "train_speed(iter/s)": 0.207325 + }, + { + "acc": 0.73708954, + "epoch": 0.025197217807200453, + "grad_norm": 12.3125, + "learning_rate": 4.1990668740279945e-06, + "loss": 0.98759651, + "memory(GiB)": 112.5, + "step": 1080, + "train_speed(iter/s)": 0.208333 + }, + { + "acc": 0.71818414, + "epoch": 0.02543052537948935, + "grad_norm": 65.0, + "learning_rate": 4.237947122861587e-06, + "loss": 1.0364994, + "memory(GiB)": 112.5, + "step": 1090, + "train_speed(iter/s)": 0.209286 + }, + { + "acc": 0.7241076, + "epoch": 0.02566383295177824, + "grad_norm": 16.5, + "learning_rate": 4.276827371695179e-06, + "loss": 1.03190594, + "memory(GiB)": 112.5, + "step": 1100, + "train_speed(iter/s)": 0.210209 + }, + { + "acc": 0.72726188, + "epoch": 0.025897140524067136, + "grad_norm": 18.25, + "learning_rate": 4.3157076205287715e-06, + "loss": 1.02791977, + "memory(GiB)": 112.5, + "step": 1110, + "train_speed(iter/s)": 0.211143 + }, + { + "acc": 0.73693681, + "epoch": 0.026130448096356028, + "grad_norm": 8.125, + "learning_rate": 4.354587869362365e-06, + "loss": 1.01167622, + "memory(GiB)": 112.5, + "step": 1120, + "train_speed(iter/s)": 0.212056 + }, + { + "acc": 0.73370867, + "epoch": 0.02636375566864492, + "grad_norm": 8.3125, + "learning_rate": 4.393468118195957e-06, + "loss": 1.00605249, + "memory(GiB)": 112.5, + "step": 1130, + "train_speed(iter/s)": 0.212952 + }, + { + "acc": 0.73210607, + "epoch": 0.026597063240933815, + "grad_norm": 15.8125, + "learning_rate": 4.432348367029549e-06, + "loss": 0.99587374, + "memory(GiB)": 112.5, + "step": 1140, + "train_speed(iter/s)": 0.213882 + }, + { + "acc": 0.73727956, + "epoch": 0.026830370813222707, + "grad_norm": 15.5, + "learning_rate": 4.471228615863142e-06, + "loss": 0.9840374, + "memory(GiB)": 112.5, + "step": 1150, + "train_speed(iter/s)": 0.214805 + }, + { + "acc": 0.73756843, + "epoch": 0.0270636783855116, + "grad_norm": 13.6875, + "learning_rate": 4.510108864696735e-06, + "loss": 1.0077363, + "memory(GiB)": 112.5, + "step": 1160, + "train_speed(iter/s)": 0.215706 + }, + { + "acc": 0.71364336, + "epoch": 0.027296985957800494, + "grad_norm": 9.375, + "learning_rate": 4.548989113530327e-06, + "loss": 1.07981796, + "memory(GiB)": 112.5, + "step": 1170, + "train_speed(iter/s)": 0.216546 + }, + { + "acc": 0.7402174, + "epoch": 0.027530293530089386, + "grad_norm": 6.34375, + "learning_rate": 4.587869362363919e-06, + "loss": 0.96098614, + "memory(GiB)": 112.5, + "step": 1180, + "train_speed(iter/s)": 0.217392 + }, + { + "acc": 0.72905722, + "epoch": 0.02776360110237828, + "grad_norm": 12.6875, + "learning_rate": 4.626749611197512e-06, + "loss": 1.03549309, + "memory(GiB)": 112.5, + "step": 1190, + "train_speed(iter/s)": 0.218326 + }, + { + "acc": 0.71565552, + "epoch": 0.027996908674667174, + "grad_norm": 6.21875, + "learning_rate": 4.665629860031105e-06, + "loss": 1.08136559, + "memory(GiB)": 112.5, + "step": 1200, + "train_speed(iter/s)": 0.2192 + }, + { + "acc": 0.75236349, + "epoch": 0.028230216246956066, + "grad_norm": 8.9375, + "learning_rate": 4.704510108864697e-06, + "loss": 0.95540047, + "memory(GiB)": 112.5, + "step": 1210, + "train_speed(iter/s)": 0.220119 + }, + { + "acc": 0.72307405, + "epoch": 0.028463523819244958, + "grad_norm": 10.875, + "learning_rate": 4.7433903576982896e-06, + "loss": 1.01828804, + "memory(GiB)": 112.5, + "step": 1220, + "train_speed(iter/s)": 0.221032 + }, + { + "acc": 0.71835833, + "epoch": 0.028696831391533853, + "grad_norm": 11.75, + "learning_rate": 4.782270606531883e-06, + "loss": 1.05052891, + "memory(GiB)": 112.5, + "step": 1230, + "train_speed(iter/s)": 0.221873 + }, + { + "acc": 0.72516937, + "epoch": 0.028930138963822745, + "grad_norm": 8.9375, + "learning_rate": 4.821150855365474e-06, + "loss": 1.02222395, + "memory(GiB)": 112.5, + "step": 1240, + "train_speed(iter/s)": 0.222722 + }, + { + "acc": 0.72957277, + "epoch": 0.029163446536111637, + "grad_norm": 10.1875, + "learning_rate": 4.860031104199067e-06, + "loss": 1.01317539, + "memory(GiB)": 112.5, + "step": 1250, + "train_speed(iter/s)": 0.223525 + }, + { + "acc": 0.74022655, + "epoch": 0.029396754108400532, + "grad_norm": 8.875, + "learning_rate": 4.89891135303266e-06, + "loss": 0.95849752, + "memory(GiB)": 112.5, + "step": 1260, + "train_speed(iter/s)": 0.224384 + }, + { + "acc": 0.7379168, + "epoch": 0.029630061680689424, + "grad_norm": 9.1875, + "learning_rate": 4.937791601866253e-06, + "loss": 0.96695614, + "memory(GiB)": 112.5, + "step": 1270, + "train_speed(iter/s)": 0.225256 + }, + { + "acc": 0.7323555, + "epoch": 0.029863369252978316, + "grad_norm": 8.125, + "learning_rate": 4.976671850699845e-06, + "loss": 1.0258522, + "memory(GiB)": 112.5, + "step": 1280, + "train_speed(iter/s)": 0.226036 + }, + { + "acc": 0.74023056, + "epoch": 0.03009667682526721, + "grad_norm": 11.1875, + "learning_rate": 5.0155520995334375e-06, + "loss": 0.97798119, + "memory(GiB)": 112.5, + "step": 1290, + "train_speed(iter/s)": 0.226849 + }, + { + "acc": 0.74629755, + "epoch": 0.030329984397556103, + "grad_norm": 10.0625, + "learning_rate": 5.05443234836703e-06, + "loss": 0.95969715, + "memory(GiB)": 112.5, + "step": 1300, + "train_speed(iter/s)": 0.227679 + }, + { + "acc": 0.74427795, + "epoch": 0.030563291969844995, + "grad_norm": 11.75, + "learning_rate": 5.093312597200622e-06, + "loss": 0.94915905, + "memory(GiB)": 112.5, + "step": 1310, + "train_speed(iter/s)": 0.228474 + }, + { + "acc": 0.73095655, + "epoch": 0.03079659954213389, + "grad_norm": 12.375, + "learning_rate": 5.132192846034215e-06, + "loss": 1.00967436, + "memory(GiB)": 112.5, + "step": 1320, + "train_speed(iter/s)": 0.229243 + }, + { + "acc": 0.74197416, + "epoch": 0.031029907114422783, + "grad_norm": 10.0625, + "learning_rate": 5.171073094867808e-06, + "loss": 0.95410166, + "memory(GiB)": 112.5, + "step": 1330, + "train_speed(iter/s)": 0.230016 + }, + { + "acc": 0.72437105, + "epoch": 0.03126321468671168, + "grad_norm": 9.6875, + "learning_rate": 5.209953343701401e-06, + "loss": 1.02081079, + "memory(GiB)": 112.5, + "step": 1340, + "train_speed(iter/s)": 0.230814 + }, + { + "acc": 0.71890373, + "epoch": 0.03149652225900057, + "grad_norm": 7.25, + "learning_rate": 5.248833592534993e-06, + "loss": 1.05639267, + "memory(GiB)": 112.5, + "step": 1350, + "train_speed(iter/s)": 0.231593 + }, + { + "acc": 0.73581686, + "epoch": 0.03172982983128946, + "grad_norm": 26.375, + "learning_rate": 5.287713841368585e-06, + "loss": 0.99747047, + "memory(GiB)": 112.5, + "step": 1360, + "train_speed(iter/s)": 0.232336 + }, + { + "acc": 0.73829999, + "epoch": 0.031963137403578354, + "grad_norm": 13.9375, + "learning_rate": 5.326594090202177e-06, + "loss": 0.97000275, + "memory(GiB)": 112.5, + "step": 1370, + "train_speed(iter/s)": 0.233092 + }, + { + "acc": 0.73752613, + "epoch": 0.032196444975867246, + "grad_norm": 9.25, + "learning_rate": 5.36547433903577e-06, + "loss": 0.97096643, + "memory(GiB)": 112.5, + "step": 1380, + "train_speed(iter/s)": 0.23385 + }, + { + "acc": 0.73803453, + "epoch": 0.03242975254815614, + "grad_norm": 8.0, + "learning_rate": 5.404354587869362e-06, + "loss": 0.97089777, + "memory(GiB)": 112.5, + "step": 1390, + "train_speed(iter/s)": 0.234563 + }, + { + "acc": 0.73658538, + "epoch": 0.03266306012044504, + "grad_norm": 12.5625, + "learning_rate": 5.443234836702956e-06, + "loss": 0.99308605, + "memory(GiB)": 112.5, + "step": 1400, + "train_speed(iter/s)": 0.235338 + }, + { + "acc": 0.71785216, + "epoch": 0.03289636769273393, + "grad_norm": 8.3125, + "learning_rate": 5.482115085536548e-06, + "loss": 1.07402363, + "memory(GiB)": 112.5, + "step": 1410, + "train_speed(iter/s)": 0.236091 + }, + { + "acc": 0.74998498, + "epoch": 0.03312967526502282, + "grad_norm": 8.25, + "learning_rate": 5.52099533437014e-06, + "loss": 0.93102665, + "memory(GiB)": 112.5, + "step": 1420, + "train_speed(iter/s)": 0.236812 + }, + { + "acc": 0.7163455, + "epoch": 0.03336298283731171, + "grad_norm": 13.5, + "learning_rate": 5.559875583203733e-06, + "loss": 1.08192539, + "memory(GiB)": 112.5, + "step": 1430, + "train_speed(iter/s)": 0.237497 + }, + { + "acc": 0.74807477, + "epoch": 0.033596290409600604, + "grad_norm": 17.125, + "learning_rate": 5.598755832037326e-06, + "loss": 0.94381409, + "memory(GiB)": 112.5, + "step": 1440, + "train_speed(iter/s)": 0.238196 + }, + { + "acc": 0.72739019, + "epoch": 0.033829597981889496, + "grad_norm": 9.0, + "learning_rate": 5.637636080870919e-06, + "loss": 1.02746716, + "memory(GiB)": 112.5, + "step": 1450, + "train_speed(iter/s)": 0.238917 + }, + { + "acc": 0.72795725, + "epoch": 0.034062905554178395, + "grad_norm": 9.1875, + "learning_rate": 5.67651632970451e-06, + "loss": 1.00883026, + "memory(GiB)": 112.5, + "step": 1460, + "train_speed(iter/s)": 0.239627 + }, + { + "acc": 0.74547176, + "epoch": 0.03429621312646729, + "grad_norm": 20.125, + "learning_rate": 5.715396578538103e-06, + "loss": 0.94491329, + "memory(GiB)": 112.5, + "step": 1470, + "train_speed(iter/s)": 0.240273 + }, + { + "acc": 0.76093078, + "epoch": 0.03452952069875618, + "grad_norm": 9.4375, + "learning_rate": 5.754276827371695e-06, + "loss": 0.89414454, + "memory(GiB)": 112.5, + "step": 1480, + "train_speed(iter/s)": 0.240967 + }, + { + "acc": 0.71505122, + "epoch": 0.03476282827104507, + "grad_norm": 11.6875, + "learning_rate": 5.793157076205288e-06, + "loss": 1.07852478, + "memory(GiB)": 112.5, + "step": 1490, + "train_speed(iter/s)": 0.241618 + }, + { + "acc": 0.73277426, + "epoch": 0.03499613584333396, + "grad_norm": 11.875, + "learning_rate": 5.8320373250388805e-06, + "loss": 0.99803505, + "memory(GiB)": 112.5, + "step": 1500, + "train_speed(iter/s)": 0.242283 + }, + { + "epoch": 0.03499613584333396, + "eval_acc": 0.7026027121066317, + "eval_loss": 0.9811294078826904, + "eval_runtime": 1269.74, + "eval_samples_per_second": 28.345, + "eval_steps_per_second": 14.173, + "step": 1500 + }, + { + "acc": 0.73681936, + "epoch": 0.03522944341562286, + "grad_norm": 7.21875, + "learning_rate": 5.870917573872474e-06, + "loss": 1.00402145, + "memory(GiB)": 112.5, + "step": 1510, + "train_speed(iter/s)": 0.201315 + }, + { + "acc": 0.75371494, + "epoch": 0.035462750987911754, + "grad_norm": 13.0625, + "learning_rate": 5.909797822706066e-06, + "loss": 0.91483002, + "memory(GiB)": 112.5, + "step": 1520, + "train_speed(iter/s)": 0.201999 + }, + { + "acc": 0.73148508, + "epoch": 0.035696058560200646, + "grad_norm": 7.3125, + "learning_rate": 5.948678071539658e-06, + "loss": 1.00952492, + "memory(GiB)": 112.5, + "step": 1530, + "train_speed(iter/s)": 0.202708 + }, + { + "acc": 0.74506388, + "epoch": 0.03592936613248954, + "grad_norm": 10.3125, + "learning_rate": 5.9875583203732515e-06, + "loss": 0.94095707, + "memory(GiB)": 112.5, + "step": 1540, + "train_speed(iter/s)": 0.20341 + }, + { + "acc": 0.7385951, + "epoch": 0.03616267370477843, + "grad_norm": 9.9375, + "learning_rate": 6.026438569206844e-06, + "loss": 0.96724091, + "memory(GiB)": 114.08, + "step": 1550, + "train_speed(iter/s)": 0.204074 + }, + { + "acc": 0.73922119, + "epoch": 0.03639598127706732, + "grad_norm": 17.375, + "learning_rate": 6.065318818040435e-06, + "loss": 0.96251907, + "memory(GiB)": 114.08, + "step": 1560, + "train_speed(iter/s)": 0.204769 + }, + { + "acc": 0.72654696, + "epoch": 0.03662928884935622, + "grad_norm": 10.5625, + "learning_rate": 6.1041990668740285e-06, + "loss": 1.03361921, + "memory(GiB)": 114.08, + "step": 1570, + "train_speed(iter/s)": 0.20549 + }, + { + "acc": 0.72640285, + "epoch": 0.03686259642164511, + "grad_norm": 17.25, + "learning_rate": 6.143079315707621e-06, + "loss": 1.04433689, + "memory(GiB)": 114.08, + "step": 1580, + "train_speed(iter/s)": 0.20617 + }, + { + "acc": 0.75693703, + "epoch": 0.037095903993934004, + "grad_norm": 7.75, + "learning_rate": 6.181959564541213e-06, + "loss": 0.89104633, + "memory(GiB)": 114.08, + "step": 1590, + "train_speed(iter/s)": 0.206865 + }, + { + "acc": 0.74660778, + "epoch": 0.037329211566222896, + "grad_norm": 13.375, + "learning_rate": 6.220839813374806e-06, + "loss": 0.92481155, + "memory(GiB)": 114.08, + "step": 1600, + "train_speed(iter/s)": 0.207546 + }, + { + "acc": 0.72870965, + "epoch": 0.03756251913851179, + "grad_norm": 7.21875, + "learning_rate": 6.259720062208399e-06, + "loss": 1.03226433, + "memory(GiB)": 114.08, + "step": 1610, + "train_speed(iter/s)": 0.208219 + }, + { + "acc": 0.72618661, + "epoch": 0.03779582671080068, + "grad_norm": 9.0, + "learning_rate": 6.298600311041991e-06, + "loss": 1.02390461, + "memory(GiB)": 114.08, + "step": 1620, + "train_speed(iter/s)": 0.208905 + }, + { + "acc": 0.72518511, + "epoch": 0.03802913428308958, + "grad_norm": 11.0625, + "learning_rate": 6.337480559875584e-06, + "loss": 1.02645645, + "memory(GiB)": 114.08, + "step": 1630, + "train_speed(iter/s)": 0.209512 + }, + { + "acc": 0.74411564, + "epoch": 0.03826244185537847, + "grad_norm": 7.28125, + "learning_rate": 6.376360808709176e-06, + "loss": 0.95807343, + "memory(GiB)": 114.08, + "step": 1640, + "train_speed(iter/s)": 0.21016 + }, + { + "acc": 0.73625383, + "epoch": 0.03849574942766736, + "grad_norm": 8.625, + "learning_rate": 6.4152410575427696e-06, + "loss": 0.96755209, + "memory(GiB)": 114.08, + "step": 1650, + "train_speed(iter/s)": 0.210758 + }, + { + "acc": 0.72957053, + "epoch": 0.038729056999956255, + "grad_norm": 9.625, + "learning_rate": 6.454121306376361e-06, + "loss": 0.99150848, + "memory(GiB)": 114.08, + "step": 1660, + "train_speed(iter/s)": 0.211372 + }, + { + "acc": 0.74594126, + "epoch": 0.038962364572245146, + "grad_norm": 9.1875, + "learning_rate": 6.493001555209953e-06, + "loss": 0.92405319, + "memory(GiB)": 114.08, + "step": 1670, + "train_speed(iter/s)": 0.212017 + }, + { + "acc": 0.73622656, + "epoch": 0.03919567214453404, + "grad_norm": 10.3125, + "learning_rate": 6.531881804043546e-06, + "loss": 0.98413754, + "memory(GiB)": 114.08, + "step": 1680, + "train_speed(iter/s)": 0.212643 + }, + { + "acc": 0.72955575, + "epoch": 0.03942897971682294, + "grad_norm": 11.125, + "learning_rate": 6.570762052877139e-06, + "loss": 1.01020985, + "memory(GiB)": 114.08, + "step": 1690, + "train_speed(iter/s)": 0.21329 + }, + { + "acc": 0.74046488, + "epoch": 0.03966228728911183, + "grad_norm": 10.1875, + "learning_rate": 6.609642301710731e-06, + "loss": 0.96516933, + "memory(GiB)": 114.08, + "step": 1700, + "train_speed(iter/s)": 0.21388 + }, + { + "acc": 0.74028692, + "epoch": 0.03989559486140072, + "grad_norm": 7.15625, + "learning_rate": 6.648522550544324e-06, + "loss": 0.97764168, + "memory(GiB)": 114.08, + "step": 1710, + "train_speed(iter/s)": 0.214504 + }, + { + "acc": 0.74346085, + "epoch": 0.04012890243368961, + "grad_norm": 7.1875, + "learning_rate": 6.687402799377917e-06, + "loss": 0.95602665, + "memory(GiB)": 114.08, + "step": 1720, + "train_speed(iter/s)": 0.215093 + }, + { + "acc": 0.74011574, + "epoch": 0.040362210005978505, + "grad_norm": 18.0, + "learning_rate": 6.726283048211509e-06, + "loss": 0.96684675, + "memory(GiB)": 114.08, + "step": 1730, + "train_speed(iter/s)": 0.215721 + }, + { + "acc": 0.73707952, + "epoch": 0.0405955175782674, + "grad_norm": 8.0, + "learning_rate": 6.765163297045102e-06, + "loss": 0.97277508, + "memory(GiB)": 114.08, + "step": 1740, + "train_speed(iter/s)": 0.216341 + }, + { + "acc": 0.72721863, + "epoch": 0.040828825150556296, + "grad_norm": 7.6875, + "learning_rate": 6.8040435458786945e-06, + "loss": 0.99633102, + "memory(GiB)": 114.08, + "step": 1750, + "train_speed(iter/s)": 0.216947 + }, + { + "acc": 0.76437578, + "epoch": 0.04106213272284519, + "grad_norm": 6.78125, + "learning_rate": 6.842923794712286e-06, + "loss": 0.86832066, + "memory(GiB)": 114.08, + "step": 1760, + "train_speed(iter/s)": 0.217536 + }, + { + "acc": 0.73662996, + "epoch": 0.04129544029513408, + "grad_norm": 7.1875, + "learning_rate": 6.881804043545879e-06, + "loss": 1.00305567, + "memory(GiB)": 114.08, + "step": 1770, + "train_speed(iter/s)": 0.218134 + }, + { + "acc": 0.73254414, + "epoch": 0.04152874786742297, + "grad_norm": 7.71875, + "learning_rate": 6.9206842923794715e-06, + "loss": 1.02277117, + "memory(GiB)": 114.08, + "step": 1780, + "train_speed(iter/s)": 0.218699 + }, + { + "acc": 0.74180479, + "epoch": 0.04176205543971186, + "grad_norm": 9.5, + "learning_rate": 6.959564541213064e-06, + "loss": 0.97406406, + "memory(GiB)": 114.08, + "step": 1790, + "train_speed(iter/s)": 0.219286 + }, + { + "acc": 0.75128908, + "epoch": 0.041995363012000755, + "grad_norm": 7.4375, + "learning_rate": 6.998444790046657e-06, + "loss": 0.92827549, + "memory(GiB)": 114.08, + "step": 1800, + "train_speed(iter/s)": 0.219875 + }, + { + "acc": 0.75414939, + "epoch": 0.042228670584289654, + "grad_norm": 8.6875, + "learning_rate": 7.037325038880249e-06, + "loss": 0.88612537, + "memory(GiB)": 114.08, + "step": 1810, + "train_speed(iter/s)": 0.220455 + }, + { + "acc": 0.74286633, + "epoch": 0.042461978156578546, + "grad_norm": 9.4375, + "learning_rate": 7.0762052877138424e-06, + "loss": 0.95695734, + "memory(GiB)": 114.08, + "step": 1820, + "train_speed(iter/s)": 0.221039 + }, + { + "acc": 0.72784061, + "epoch": 0.04269528572886744, + "grad_norm": 9.6875, + "learning_rate": 7.115085536547435e-06, + "loss": 1.02230368, + "memory(GiB)": 114.08, + "step": 1830, + "train_speed(iter/s)": 0.221625 + }, + { + "acc": 0.72988882, + "epoch": 0.04292859330115633, + "grad_norm": 14.8125, + "learning_rate": 7.153965785381027e-06, + "loss": 0.99553108, + "memory(GiB)": 114.08, + "step": 1840, + "train_speed(iter/s)": 0.222188 + }, + { + "acc": 0.74705415, + "epoch": 0.04316190087344522, + "grad_norm": 7.1875, + "learning_rate": 7.19284603421462e-06, + "loss": 0.95047102, + "memory(GiB)": 114.08, + "step": 1850, + "train_speed(iter/s)": 0.222809 + }, + { + "acc": 0.71940756, + "epoch": 0.043395208445734114, + "grad_norm": 6.53125, + "learning_rate": 7.231726283048212e-06, + "loss": 1.04833097, + "memory(GiB)": 114.08, + "step": 1860, + "train_speed(iter/s)": 0.223337 + }, + { + "acc": 0.73792143, + "epoch": 0.04362851601802301, + "grad_norm": 10.6875, + "learning_rate": 7.270606531881804e-06, + "loss": 0.97211323, + "memory(GiB)": 114.08, + "step": 1870, + "train_speed(iter/s)": 0.223914 + }, + { + "acc": 0.723668, + "epoch": 0.043861823590311905, + "grad_norm": 5.8125, + "learning_rate": 7.309486780715397e-06, + "loss": 1.0364295, + "memory(GiB)": 114.08, + "step": 1880, + "train_speed(iter/s)": 0.224504 + }, + { + "acc": 0.72725635, + "epoch": 0.0440951311626008, + "grad_norm": 8.0, + "learning_rate": 7.3483670295489895e-06, + "loss": 1.04018497, + "memory(GiB)": 114.08, + "step": 1890, + "train_speed(iter/s)": 0.225068 + }, + { + "acc": 0.75151863, + "epoch": 0.04432843873488969, + "grad_norm": 6.3125, + "learning_rate": 7.387247278382582e-06, + "loss": 0.9029954, + "memory(GiB)": 114.08, + "step": 1900, + "train_speed(iter/s)": 0.225623 + }, + { + "acc": 0.77624922, + "epoch": 0.04456174630717858, + "grad_norm": 10.8125, + "learning_rate": 7.426127527216175e-06, + "loss": 0.82208271, + "memory(GiB)": 114.08, + "step": 1910, + "train_speed(iter/s)": 0.226166 + }, + { + "acc": 0.7336926, + "epoch": 0.04479505387946747, + "grad_norm": 7.1875, + "learning_rate": 7.465007776049767e-06, + "loss": 0.98131351, + "memory(GiB)": 114.08, + "step": 1920, + "train_speed(iter/s)": 0.226693 + }, + { + "acc": 0.73093643, + "epoch": 0.04502836145175637, + "grad_norm": 8.0, + "learning_rate": 7.5038880248833605e-06, + "loss": 0.9974226, + "memory(GiB)": 114.08, + "step": 1930, + "train_speed(iter/s)": 0.227229 + }, + { + "acc": 0.73827853, + "epoch": 0.04526166902404526, + "grad_norm": 9.0, + "learning_rate": 7.542768273716953e-06, + "loss": 0.95264168, + "memory(GiB)": 114.08, + "step": 1940, + "train_speed(iter/s)": 0.227762 + }, + { + "acc": 0.74395685, + "epoch": 0.045494976596334155, + "grad_norm": 11.125, + "learning_rate": 7.581648522550545e-06, + "loss": 0.95124683, + "memory(GiB)": 114.08, + "step": 1950, + "train_speed(iter/s)": 0.228247 + }, + { + "acc": 0.74083662, + "epoch": 0.04572828416862305, + "grad_norm": 7.59375, + "learning_rate": 7.620528771384137e-06, + "loss": 0.97542019, + "memory(GiB)": 114.08, + "step": 1960, + "train_speed(iter/s)": 0.228772 + }, + { + "acc": 0.7502737, + "epoch": 0.04596159174091194, + "grad_norm": 5.96875, + "learning_rate": 7.659409020217729e-06, + "loss": 0.92420015, + "memory(GiB)": 114.08, + "step": 1970, + "train_speed(iter/s)": 0.229254 + }, + { + "acc": 0.75313225, + "epoch": 0.04619489931320083, + "grad_norm": 20.75, + "learning_rate": 7.698289269051322e-06, + "loss": 0.90695429, + "memory(GiB)": 114.08, + "step": 1980, + "train_speed(iter/s)": 0.229765 + }, + { + "acc": 0.74232001, + "epoch": 0.04642820688548973, + "grad_norm": 8.375, + "learning_rate": 7.737169517884915e-06, + "loss": 0.97582893, + "memory(GiB)": 114.08, + "step": 1990, + "train_speed(iter/s)": 0.230283 + }, + { + "acc": 0.757305, + "epoch": 0.04666151445777862, + "grad_norm": 6.40625, + "learning_rate": 7.776049766718507e-06, + "loss": 0.91142721, + "memory(GiB)": 114.08, + "step": 2000, + "train_speed(iter/s)": 0.230798 + }, + { + "epoch": 0.04666151445777862, + "eval_acc": 0.709365603036719, + "eval_loss": 0.9482490420341492, + "eval_runtime": 1270.2818, + "eval_samples_per_second": 28.333, + "eval_steps_per_second": 14.167, + "step": 2000 + }, + { + "acc": 0.73707428, + "epoch": 0.046894822030067514, + "grad_norm": 10.1875, + "learning_rate": 7.8149300155521e-06, + "loss": 0.97233877, + "memory(GiB)": 114.08, + "step": 2010, + "train_speed(iter/s)": 0.201446 + }, + { + "acc": 0.74674578, + "epoch": 0.047128129602356406, + "grad_norm": 11.125, + "learning_rate": 7.853810264385693e-06, + "loss": 0.93988552, + "memory(GiB)": 114.08, + "step": 2020, + "train_speed(iter/s)": 0.201937 + }, + { + "acc": 0.74976053, + "epoch": 0.0473614371746453, + "grad_norm": 8.0, + "learning_rate": 7.892690513219286e-06, + "loss": 0.91554279, + "memory(GiB)": 114.08, + "step": 2030, + "train_speed(iter/s)": 0.202447 + }, + { + "acc": 0.74252758, + "epoch": 0.04759474474693419, + "grad_norm": 11.625, + "learning_rate": 7.931570762052878e-06, + "loss": 0.93326302, + "memory(GiB)": 114.08, + "step": 2040, + "train_speed(iter/s)": 0.202943 + }, + { + "acc": 0.72653966, + "epoch": 0.04782805231922309, + "grad_norm": 10.125, + "learning_rate": 7.970451010886471e-06, + "loss": 1.01006451, + "memory(GiB)": 114.08, + "step": 2050, + "train_speed(iter/s)": 0.203443 + }, + { + "acc": 0.75554819, + "epoch": 0.04806135989151198, + "grad_norm": 5.875, + "learning_rate": 8.009331259720062e-06, + "loss": 0.90173092, + "memory(GiB)": 114.08, + "step": 2060, + "train_speed(iter/s)": 0.203953 + }, + { + "acc": 0.7355751, + "epoch": 0.04829466746380087, + "grad_norm": 6.78125, + "learning_rate": 8.048211508553656e-06, + "loss": 0.98420334, + "memory(GiB)": 114.08, + "step": 2070, + "train_speed(iter/s)": 0.204457 + }, + { + "acc": 0.74058089, + "epoch": 0.048527975036089764, + "grad_norm": 8.8125, + "learning_rate": 8.087091757387247e-06, + "loss": 0.96377907, + "memory(GiB)": 114.08, + "step": 2080, + "train_speed(iter/s)": 0.204931 + }, + { + "acc": 0.75542016, + "epoch": 0.048761282608378656, + "grad_norm": 7.125, + "learning_rate": 8.12597200622084e-06, + "loss": 0.90528049, + "memory(GiB)": 114.08, + "step": 2090, + "train_speed(iter/s)": 0.205464 + }, + { + "acc": 0.74561777, + "epoch": 0.04899459018066755, + "grad_norm": 9.6875, + "learning_rate": 8.164852255054433e-06, + "loss": 0.94761057, + "memory(GiB)": 114.08, + "step": 2100, + "train_speed(iter/s)": 0.205929 + }, + { + "acc": 0.75310192, + "epoch": 0.04922789775295645, + "grad_norm": 7.0625, + "learning_rate": 8.203732503888025e-06, + "loss": 0.90652037, + "memory(GiB)": 114.08, + "step": 2110, + "train_speed(iter/s)": 0.206417 + }, + { + "acc": 0.74553957, + "epoch": 0.04946120532524534, + "grad_norm": 8.8125, + "learning_rate": 8.242612752721618e-06, + "loss": 0.94698086, + "memory(GiB)": 114.08, + "step": 2120, + "train_speed(iter/s)": 0.206878 + }, + { + "acc": 0.74139624, + "epoch": 0.04969451289753423, + "grad_norm": 5.78125, + "learning_rate": 8.281493001555211e-06, + "loss": 0.9554759, + "memory(GiB)": 114.08, + "step": 2130, + "train_speed(iter/s)": 0.207363 + }, + { + "acc": 0.75223551, + "epoch": 0.04992782046982312, + "grad_norm": 7.1875, + "learning_rate": 8.320373250388803e-06, + "loss": 0.91746941, + "memory(GiB)": 114.08, + "step": 2140, + "train_speed(iter/s)": 0.20786 + }, + { + "acc": 0.75747862, + "epoch": 0.050161128042112015, + "grad_norm": 8.1875, + "learning_rate": 8.359253499222396e-06, + "loss": 0.90032187, + "memory(GiB)": 114.08, + "step": 2150, + "train_speed(iter/s)": 0.208349 + }, + { + "acc": 0.74738655, + "epoch": 0.050394435614400906, + "grad_norm": 8.0, + "learning_rate": 8.398133748055989e-06, + "loss": 0.95068922, + "memory(GiB)": 114.08, + "step": 2160, + "train_speed(iter/s)": 0.208799 + }, + { + "acc": 0.74726391, + "epoch": 0.050627743186689805, + "grad_norm": 10.875, + "learning_rate": 8.43701399688958e-06, + "loss": 0.92888641, + "memory(GiB)": 114.08, + "step": 2170, + "train_speed(iter/s)": 0.209331 + }, + { + "acc": 0.7601017, + "epoch": 0.0508610507589787, + "grad_norm": 6.59375, + "learning_rate": 8.475894245723174e-06, + "loss": 0.88537483, + "memory(GiB)": 114.08, + "step": 2180, + "train_speed(iter/s)": 0.209829 + }, + { + "acc": 0.74622045, + "epoch": 0.05109435833126759, + "grad_norm": 5.90625, + "learning_rate": 8.514774494556765e-06, + "loss": 0.9415596, + "memory(GiB)": 114.08, + "step": 2190, + "train_speed(iter/s)": 0.210313 + }, + { + "acc": 0.74102707, + "epoch": 0.05132766590355648, + "grad_norm": 13.25, + "learning_rate": 8.553654743390358e-06, + "loss": 0.970403, + "memory(GiB)": 114.08, + "step": 2200, + "train_speed(iter/s)": 0.210819 + }, + { + "acc": 0.74637518, + "epoch": 0.05156097347584537, + "grad_norm": 6.59375, + "learning_rate": 8.592534992223951e-06, + "loss": 0.93611584, + "memory(GiB)": 114.08, + "step": 2210, + "train_speed(iter/s)": 0.211296 + }, + { + "acc": 0.72973704, + "epoch": 0.05179428104813427, + "grad_norm": 13.0625, + "learning_rate": 8.631415241057543e-06, + "loss": 0.99803753, + "memory(GiB)": 114.08, + "step": 2220, + "train_speed(iter/s)": 0.211789 + }, + { + "acc": 0.70917277, + "epoch": 0.052027588620423164, + "grad_norm": 8.25, + "learning_rate": 8.670295489891136e-06, + "loss": 1.06907005, + "memory(GiB)": 114.08, + "step": 2230, + "train_speed(iter/s)": 0.212285 + }, + { + "acc": 0.74743881, + "epoch": 0.052260896192712056, + "grad_norm": 21.0, + "learning_rate": 8.70917573872473e-06, + "loss": 0.9445673, + "memory(GiB)": 114.08, + "step": 2240, + "train_speed(iter/s)": 0.212758 + }, + { + "acc": 0.74076357, + "epoch": 0.05249420376500095, + "grad_norm": 5.21875, + "learning_rate": 8.74805598755832e-06, + "loss": 0.97673054, + "memory(GiB)": 114.08, + "step": 2250, + "train_speed(iter/s)": 0.213229 + }, + { + "acc": 0.74062457, + "epoch": 0.05272751133728984, + "grad_norm": 5.96875, + "learning_rate": 8.786936236391914e-06, + "loss": 0.96884727, + "memory(GiB)": 114.08, + "step": 2260, + "train_speed(iter/s)": 0.213684 + }, + { + "acc": 0.72948203, + "epoch": 0.05296081890957873, + "grad_norm": 6.0625, + "learning_rate": 8.825816485225505e-06, + "loss": 1.01084576, + "memory(GiB)": 114.08, + "step": 2270, + "train_speed(iter/s)": 0.214158 + }, + { + "acc": 0.75192738, + "epoch": 0.05319412648186763, + "grad_norm": 6.15625, + "learning_rate": 8.864696734059099e-06, + "loss": 0.89396305, + "memory(GiB)": 114.08, + "step": 2280, + "train_speed(iter/s)": 0.214642 + }, + { + "acc": 0.76000366, + "epoch": 0.05342743405415652, + "grad_norm": 6.90625, + "learning_rate": 8.903576982892692e-06, + "loss": 0.90859699, + "memory(GiB)": 114.08, + "step": 2290, + "train_speed(iter/s)": 0.215099 + }, + { + "acc": 0.7458251, + "epoch": 0.053660741626445414, + "grad_norm": 7.3125, + "learning_rate": 8.942457231726283e-06, + "loss": 0.97098904, + "memory(GiB)": 114.08, + "step": 2300, + "train_speed(iter/s)": 0.21551 + }, + { + "acc": 0.73837004, + "epoch": 0.053894049198734306, + "grad_norm": 5.8125, + "learning_rate": 8.981337480559876e-06, + "loss": 0.95030222, + "memory(GiB)": 114.08, + "step": 2310, + "train_speed(iter/s)": 0.215929 + }, + { + "acc": 0.75400095, + "epoch": 0.0541273567710232, + "grad_norm": 8.1875, + "learning_rate": 9.02021772939347e-06, + "loss": 0.90010729, + "memory(GiB)": 114.08, + "step": 2320, + "train_speed(iter/s)": 0.216371 + }, + { + "acc": 0.75261707, + "epoch": 0.05436066434331209, + "grad_norm": 6.4375, + "learning_rate": 9.059097978227061e-06, + "loss": 0.94032478, + "memory(GiB)": 114.08, + "step": 2330, + "train_speed(iter/s)": 0.216778 + }, + { + "acc": 0.7515697, + "epoch": 0.05459397191560099, + "grad_norm": 5.34375, + "learning_rate": 9.097978227060654e-06, + "loss": 0.94090843, + "memory(GiB)": 114.08, + "step": 2340, + "train_speed(iter/s)": 0.217239 + }, + { + "acc": 0.7397305, + "epoch": 0.05482727948788988, + "grad_norm": 10.0625, + "learning_rate": 9.136858475894247e-06, + "loss": 0.97343073, + "memory(GiB)": 114.08, + "step": 2350, + "train_speed(iter/s)": 0.217667 + }, + { + "acc": 0.74915609, + "epoch": 0.05506058706017877, + "grad_norm": 6.65625, + "learning_rate": 9.175738724727839e-06, + "loss": 0.921453, + "memory(GiB)": 114.08, + "step": 2360, + "train_speed(iter/s)": 0.218073 + }, + { + "acc": 0.7468821, + "epoch": 0.055293894632467665, + "grad_norm": 9.75, + "learning_rate": 9.21461897356143e-06, + "loss": 0.95533428, + "memory(GiB)": 114.08, + "step": 2370, + "train_speed(iter/s)": 0.218486 + }, + { + "acc": 0.73546376, + "epoch": 0.05552720220475656, + "grad_norm": 6.96875, + "learning_rate": 9.253499222395023e-06, + "loss": 0.99596453, + "memory(GiB)": 114.08, + "step": 2380, + "train_speed(iter/s)": 0.218898 + }, + { + "acc": 0.73259478, + "epoch": 0.05576050977704545, + "grad_norm": 7.96875, + "learning_rate": 9.292379471228617e-06, + "loss": 0.99712763, + "memory(GiB)": 114.08, + "step": 2390, + "train_speed(iter/s)": 0.219283 + }, + { + "acc": 0.73952942, + "epoch": 0.05599381734933435, + "grad_norm": 5.71875, + "learning_rate": 9.33125972006221e-06, + "loss": 0.94039307, + "memory(GiB)": 114.08, + "step": 2400, + "train_speed(iter/s)": 0.21973 + }, + { + "acc": 0.74226012, + "epoch": 0.05622712492162324, + "grad_norm": 6.125, + "learning_rate": 9.370139968895801e-06, + "loss": 0.93415909, + "memory(GiB)": 114.08, + "step": 2410, + "train_speed(iter/s)": 0.220139 + }, + { + "acc": 0.75117826, + "epoch": 0.05646043249391213, + "grad_norm": 24.625, + "learning_rate": 9.409020217729394e-06, + "loss": 0.89564247, + "memory(GiB)": 114.08, + "step": 2420, + "train_speed(iter/s)": 0.220555 + }, + { + "acc": 0.74033728, + "epoch": 0.05669374006620102, + "grad_norm": 10.4375, + "learning_rate": 9.447900466562988e-06, + "loss": 0.96549892, + "memory(GiB)": 114.08, + "step": 2430, + "train_speed(iter/s)": 0.220978 + }, + { + "acc": 0.76307154, + "epoch": 0.056927047638489915, + "grad_norm": 17.375, + "learning_rate": 9.486780715396579e-06, + "loss": 0.86378517, + "memory(GiB)": 114.08, + "step": 2440, + "train_speed(iter/s)": 0.221411 + }, + { + "acc": 0.73024807, + "epoch": 0.05716035521077881, + "grad_norm": 12.6875, + "learning_rate": 9.525660964230172e-06, + "loss": 1.01558571, + "memory(GiB)": 114.08, + "step": 2450, + "train_speed(iter/s)": 0.221845 + }, + { + "acc": 0.75076675, + "epoch": 0.057393662783067706, + "grad_norm": 7.1875, + "learning_rate": 9.564541213063765e-06, + "loss": 0.91901255, + "memory(GiB)": 114.08, + "step": 2460, + "train_speed(iter/s)": 0.222309 + }, + { + "acc": 0.76636009, + "epoch": 0.0576269703553566, + "grad_norm": 4.5, + "learning_rate": 9.603421461897357e-06, + "loss": 0.85223923, + "memory(GiB)": 114.08, + "step": 2470, + "train_speed(iter/s)": 0.222694 + }, + { + "acc": 0.75716038, + "epoch": 0.05786027792764549, + "grad_norm": 6.5625, + "learning_rate": 9.642301710730948e-06, + "loss": 0.90162163, + "memory(GiB)": 114.08, + "step": 2480, + "train_speed(iter/s)": 0.223102 + }, + { + "acc": 0.74022632, + "epoch": 0.05809358549993438, + "grad_norm": 7.46875, + "learning_rate": 9.681181959564542e-06, + "loss": 0.96824207, + "memory(GiB)": 114.08, + "step": 2490, + "train_speed(iter/s)": 0.223508 + }, + { + "acc": 0.75365782, + "epoch": 0.058326893072223274, + "grad_norm": 6.75, + "learning_rate": 9.720062208398135e-06, + "loss": 0.93858395, + "memory(GiB)": 114.08, + "step": 2500, + "train_speed(iter/s)": 0.223925 + }, + { + "epoch": 0.058326893072223274, + "eval_acc": 0.7143739537577124, + "eval_loss": 0.9248969554901123, + "eval_runtime": 1269.8474, + "eval_samples_per_second": 28.343, + "eval_steps_per_second": 14.172, + "step": 2500 + }, + { + "acc": 0.7417676, + "epoch": 0.058560200644512166, + "grad_norm": 6.03125, + "learning_rate": 9.758942457231726e-06, + "loss": 0.95499773, + "memory(GiB)": 114.08, + "step": 2510, + "train_speed(iter/s)": 0.201204 + }, + { + "acc": 0.74519234, + "epoch": 0.058793508216801064, + "grad_norm": 8.75, + "learning_rate": 9.79782270606532e-06, + "loss": 0.94459839, + "memory(GiB)": 114.08, + "step": 2520, + "train_speed(iter/s)": 0.201609 + }, + { + "acc": 0.75776386, + "epoch": 0.059026815789089956, + "grad_norm": 4.96875, + "learning_rate": 9.836702954898913e-06, + "loss": 0.88159065, + "memory(GiB)": 114.08, + "step": 2530, + "train_speed(iter/s)": 0.201997 + }, + { + "acc": 0.75585461, + "epoch": 0.05926012336137885, + "grad_norm": 8.625, + "learning_rate": 9.875583203732506e-06, + "loss": 0.89656048, + "memory(GiB)": 114.08, + "step": 2540, + "train_speed(iter/s)": 0.202388 + }, + { + "acc": 0.74211607, + "epoch": 0.05949343093366774, + "grad_norm": 15.3125, + "learning_rate": 9.914463452566097e-06, + "loss": 0.94759617, + "memory(GiB)": 114.08, + "step": 2550, + "train_speed(iter/s)": 0.202765 + }, + { + "acc": 0.74785175, + "epoch": 0.05972673850595663, + "grad_norm": 6.9375, + "learning_rate": 9.95334370139969e-06, + "loss": 0.92067432, + "memory(GiB)": 114.08, + "step": 2560, + "train_speed(iter/s)": 0.203186 + }, + { + "acc": 0.73263016, + "epoch": 0.059960046078245524, + "grad_norm": 7.5625, + "learning_rate": 9.992223950233282e-06, + "loss": 0.99488621, + "memory(GiB)": 114.08, + "step": 2570, + "train_speed(iter/s)": 0.203615 + }, + { + "acc": 0.74043674, + "epoch": 0.06019335365053442, + "grad_norm": 7.8125, + "learning_rate": 9.999999771600465e-06, + "loss": 0.96276875, + "memory(GiB)": 114.08, + "step": 2580, + "train_speed(iter/s)": 0.204022 + }, + { + "acc": 0.78092365, + "epoch": 0.060426661222823315, + "grad_norm": 6.6875, + "learning_rate": 9.999998843727385e-06, + "loss": 0.79039125, + "memory(GiB)": 114.08, + "step": 2590, + "train_speed(iter/s)": 0.204452 + }, + { + "acc": 0.76478825, + "epoch": 0.06065996879511221, + "grad_norm": 5.5, + "learning_rate": 9.999997202105923e-06, + "loss": 0.85824947, + "memory(GiB)": 114.08, + "step": 2600, + "train_speed(iter/s)": 0.204846 + }, + { + "acc": 0.77252517, + "epoch": 0.0608932763674011, + "grad_norm": 7.1875, + "learning_rate": 9.999994846736312e-06, + "loss": 0.83852539, + "memory(GiB)": 114.08, + "step": 2610, + "train_speed(iter/s)": 0.20524 + }, + { + "acc": 0.73587317, + "epoch": 0.06112658393968999, + "grad_norm": 16.5, + "learning_rate": 9.99999177761889e-06, + "loss": 0.97813015, + "memory(GiB)": 114.08, + "step": 2620, + "train_speed(iter/s)": 0.205637 + }, + { + "acc": 0.75071974, + "epoch": 0.06135989151197888, + "grad_norm": 4.71875, + "learning_rate": 9.999987994754094e-06, + "loss": 0.92447548, + "memory(GiB)": 114.08, + "step": 2630, + "train_speed(iter/s)": 0.206024 + }, + { + "acc": 0.74091039, + "epoch": 0.06159319908426778, + "grad_norm": 6.21875, + "learning_rate": 9.999983498142464e-06, + "loss": 0.95536308, + "memory(GiB)": 114.08, + "step": 2640, + "train_speed(iter/s)": 0.206403 + }, + { + "acc": 0.74781256, + "epoch": 0.06182650665655667, + "grad_norm": 14.75, + "learning_rate": 9.999978287784642e-06, + "loss": 0.9136961, + "memory(GiB)": 114.08, + "step": 2650, + "train_speed(iter/s)": 0.206807 + }, + { + "acc": 0.74666529, + "epoch": 0.062059814228845565, + "grad_norm": 5.8125, + "learning_rate": 9.999972363681371e-06, + "loss": 0.90506544, + "memory(GiB)": 114.08, + "step": 2660, + "train_speed(iter/s)": 0.207192 + }, + { + "acc": 0.76904163, + "epoch": 0.06229312180113446, + "grad_norm": 5.84375, + "learning_rate": 9.9999657258335e-06, + "loss": 0.85019245, + "memory(GiB)": 114.08, + "step": 2670, + "train_speed(iter/s)": 0.207572 + }, + { + "acc": 0.7517498, + "epoch": 0.06252642937342336, + "grad_norm": 8.125, + "learning_rate": 9.999958374241974e-06, + "loss": 0.91046963, + "memory(GiB)": 114.08, + "step": 2680, + "train_speed(iter/s)": 0.207944 + }, + { + "acc": 0.76082239, + "epoch": 0.06275973694571224, + "grad_norm": 6.4375, + "learning_rate": 9.99995030890784e-06, + "loss": 0.87537355, + "memory(GiB)": 114.08, + "step": 2690, + "train_speed(iter/s)": 0.208348 + }, + { + "acc": 0.75649023, + "epoch": 0.06299304451800114, + "grad_norm": 6.90625, + "learning_rate": 9.999941529832254e-06, + "loss": 0.9041748, + "memory(GiB)": 114.08, + "step": 2700, + "train_speed(iter/s)": 0.208755 + }, + { + "acc": 0.7486876, + "epoch": 0.06322635209029003, + "grad_norm": 8.4375, + "learning_rate": 9.999932037016466e-06, + "loss": 0.94257088, + "memory(GiB)": 114.08, + "step": 2710, + "train_speed(iter/s)": 0.209144 + }, + { + "acc": 0.76688719, + "epoch": 0.06345965966257892, + "grad_norm": 15.0, + "learning_rate": 9.999921830461833e-06, + "loss": 0.88587494, + "memory(GiB)": 114.08, + "step": 2720, + "train_speed(iter/s)": 0.20952 + }, + { + "acc": 0.76675873, + "epoch": 0.06369296723486782, + "grad_norm": 4.90625, + "learning_rate": 9.99991091016981e-06, + "loss": 0.84941301, + "memory(GiB)": 114.08, + "step": 2730, + "train_speed(iter/s)": 0.209885 + }, + { + "acc": 0.76298923, + "epoch": 0.06392627480715671, + "grad_norm": 7.125, + "learning_rate": 9.99989927614196e-06, + "loss": 0.87009039, + "memory(GiB)": 114.08, + "step": 2740, + "train_speed(iter/s)": 0.210263 + }, + { + "acc": 0.75599327, + "epoch": 0.0641595823794456, + "grad_norm": 6.71875, + "learning_rate": 9.999886928379939e-06, + "loss": 0.88294525, + "memory(GiB)": 114.08, + "step": 2750, + "train_speed(iter/s)": 0.2106 + }, + { + "acc": 0.76202002, + "epoch": 0.06439288995173449, + "grad_norm": 7.5625, + "learning_rate": 9.99987386688551e-06, + "loss": 0.87443285, + "memory(GiB)": 114.08, + "step": 2760, + "train_speed(iter/s)": 0.210993 + }, + { + "acc": 0.74971361, + "epoch": 0.06462619752402339, + "grad_norm": 5.375, + "learning_rate": 9.99986009166054e-06, + "loss": 0.90866261, + "memory(GiB)": 114.08, + "step": 2770, + "train_speed(iter/s)": 0.211356 + }, + { + "acc": 0.75157547, + "epoch": 0.06485950509631228, + "grad_norm": 6.40625, + "learning_rate": 9.999845602706995e-06, + "loss": 0.89404106, + "memory(GiB)": 114.08, + "step": 2780, + "train_speed(iter/s)": 0.211714 + }, + { + "acc": 0.7517457, + "epoch": 0.06509281266860117, + "grad_norm": 7.9375, + "learning_rate": 9.999830400026941e-06, + "loss": 0.93447304, + "memory(GiB)": 114.08, + "step": 2790, + "train_speed(iter/s)": 0.212101 + }, + { + "acc": 0.74939804, + "epoch": 0.06532612024089007, + "grad_norm": 5.375, + "learning_rate": 9.999814483622552e-06, + "loss": 0.90546379, + "memory(GiB)": 114.08, + "step": 2800, + "train_speed(iter/s)": 0.212463 + }, + { + "acc": 0.74818916, + "epoch": 0.06555942781317896, + "grad_norm": 8.625, + "learning_rate": 9.999797853496097e-06, + "loss": 0.91956005, + "memory(GiB)": 114.08, + "step": 2810, + "train_speed(iter/s)": 0.212815 + }, + { + "acc": 0.74374294, + "epoch": 0.06579273538546786, + "grad_norm": 6.40625, + "learning_rate": 9.999780509649952e-06, + "loss": 0.95786781, + "memory(GiB)": 114.08, + "step": 2820, + "train_speed(iter/s)": 0.213177 + }, + { + "acc": 0.74510412, + "epoch": 0.06602604295775674, + "grad_norm": 6.125, + "learning_rate": 9.99976245208659e-06, + "loss": 0.96016006, + "memory(GiB)": 114.08, + "step": 2830, + "train_speed(iter/s)": 0.213555 + }, + { + "acc": 0.75915327, + "epoch": 0.06625935053004564, + "grad_norm": 7.03125, + "learning_rate": 9.99974368080859e-06, + "loss": 0.8857523, + "memory(GiB)": 114.08, + "step": 2840, + "train_speed(iter/s)": 0.213924 + }, + { + "acc": 0.74505186, + "epoch": 0.06649265810233454, + "grad_norm": 7.125, + "learning_rate": 9.999724195818634e-06, + "loss": 0.949786, + "memory(GiB)": 114.08, + "step": 2850, + "train_speed(iter/s)": 0.21428 + }, + { + "acc": 0.76169367, + "epoch": 0.06672596567462342, + "grad_norm": 7.5, + "learning_rate": 9.999703997119501e-06, + "loss": 0.85612011, + "memory(GiB)": 114.08, + "step": 2860, + "train_speed(iter/s)": 0.214653 + }, + { + "acc": 0.75327644, + "epoch": 0.06695927324691232, + "grad_norm": 7.625, + "learning_rate": 9.999683084714074e-06, + "loss": 0.8901247, + "memory(GiB)": 114.09, + "step": 2870, + "train_speed(iter/s)": 0.214995 + }, + { + "acc": 0.73931332, + "epoch": 0.06719258081920121, + "grad_norm": 10.0625, + "learning_rate": 9.999661458605339e-06, + "loss": 0.95245857, + "memory(GiB)": 114.09, + "step": 2880, + "train_speed(iter/s)": 0.215355 + }, + { + "acc": 0.76272769, + "epoch": 0.06742588839149011, + "grad_norm": 4.28125, + "learning_rate": 9.999639118796384e-06, + "loss": 0.86453457, + "memory(GiB)": 114.09, + "step": 2890, + "train_speed(iter/s)": 0.215716 + }, + { + "acc": 0.77728963, + "epoch": 0.06765919596377899, + "grad_norm": 7.96875, + "learning_rate": 9.999616065290396e-06, + "loss": 0.8032423, + "memory(GiB)": 114.09, + "step": 2900, + "train_speed(iter/s)": 0.216072 + }, + { + "acc": 0.75634222, + "epoch": 0.06789250353606789, + "grad_norm": 6.15625, + "learning_rate": 9.999592298090669e-06, + "loss": 0.86408386, + "memory(GiB)": 114.09, + "step": 2910, + "train_speed(iter/s)": 0.216406 + }, + { + "acc": 0.77233763, + "epoch": 0.06812581110835679, + "grad_norm": 5.5625, + "learning_rate": 9.999567817200592e-06, + "loss": 0.83319054, + "memory(GiB)": 114.09, + "step": 2920, + "train_speed(iter/s)": 0.21677 + }, + { + "acc": 0.73433094, + "epoch": 0.06835911868064568, + "grad_norm": 4.84375, + "learning_rate": 9.999542622623661e-06, + "loss": 0.98757401, + "memory(GiB)": 114.09, + "step": 2930, + "train_speed(iter/s)": 0.217116 + }, + { + "acc": 0.76126733, + "epoch": 0.06859242625293457, + "grad_norm": 7.125, + "learning_rate": 9.999516714363475e-06, + "loss": 0.89113979, + "memory(GiB)": 114.09, + "step": 2940, + "train_speed(iter/s)": 0.217466 + }, + { + "acc": 0.74799905, + "epoch": 0.06882573382522346, + "grad_norm": 4.71875, + "learning_rate": 9.99949009242373e-06, + "loss": 0.92414608, + "memory(GiB)": 114.09, + "step": 2950, + "train_speed(iter/s)": 0.217813 + }, + { + "acc": 0.75590515, + "epoch": 0.06905904139751236, + "grad_norm": 5.625, + "learning_rate": 9.999462756808227e-06, + "loss": 0.88417921, + "memory(GiB)": 114.09, + "step": 2960, + "train_speed(iter/s)": 0.218171 + }, + { + "acc": 0.73305931, + "epoch": 0.06929234896980126, + "grad_norm": 10.0625, + "learning_rate": 9.999434707520867e-06, + "loss": 0.96791725, + "memory(GiB)": 114.09, + "step": 2970, + "train_speed(iter/s)": 0.218519 + }, + { + "acc": 0.74652576, + "epoch": 0.06952565654209014, + "grad_norm": 5.03125, + "learning_rate": 9.999405944565654e-06, + "loss": 0.9400857, + "memory(GiB)": 114.09, + "step": 2980, + "train_speed(iter/s)": 0.218871 + }, + { + "acc": 0.75939126, + "epoch": 0.06975896411437904, + "grad_norm": 5.46875, + "learning_rate": 9.999376467946695e-06, + "loss": 0.88669376, + "memory(GiB)": 114.09, + "step": 2990, + "train_speed(iter/s)": 0.219224 + }, + { + "acc": 0.74682531, + "epoch": 0.06999227168666793, + "grad_norm": 6.0625, + "learning_rate": 9.999346277668198e-06, + "loss": 0.89512215, + "memory(GiB)": 114.09, + "step": 3000, + "train_speed(iter/s)": 0.21959 + }, + { + "epoch": 0.06999227168666793, + "eval_acc": 0.7189282955801313, + "eval_loss": 0.9065292477607727, + "eval_runtime": 1270.6255, + "eval_samples_per_second": 28.325, + "eval_steps_per_second": 14.163, + "step": 3000 + }, + { + "acc": 0.75453682, + "epoch": 0.07022557925895682, + "grad_norm": 5.0, + "learning_rate": 9.999315373734472e-06, + "loss": 0.87960033, + "memory(GiB)": 117.28, + "step": 3010, + "train_speed(iter/s)": 0.200973 + }, + { + "acc": 0.76882467, + "epoch": 0.07045888683124572, + "grad_norm": 10.75, + "learning_rate": 9.999283756149932e-06, + "loss": 0.83173504, + "memory(GiB)": 117.28, + "step": 3020, + "train_speed(iter/s)": 0.201327 + }, + { + "acc": 0.7758441, + "epoch": 0.07069219440353461, + "grad_norm": 5.21875, + "learning_rate": 9.999251424919083e-06, + "loss": 0.8254406, + "memory(GiB)": 117.28, + "step": 3030, + "train_speed(iter/s)": 0.201672 + }, + { + "acc": 0.7445199, + "epoch": 0.07092550197582351, + "grad_norm": 10.1875, + "learning_rate": 9.999218380046548e-06, + "loss": 0.93491154, + "memory(GiB)": 117.28, + "step": 3040, + "train_speed(iter/s)": 0.202018 + }, + { + "acc": 0.77481375, + "epoch": 0.07115880954811239, + "grad_norm": 6.90625, + "learning_rate": 9.99918462153704e-06, + "loss": 0.8122179, + "memory(GiB)": 117.28, + "step": 3050, + "train_speed(iter/s)": 0.202354 + }, + { + "acc": 0.7394393, + "epoch": 0.07139211712040129, + "grad_norm": 6.59375, + "learning_rate": 9.999150149395383e-06, + "loss": 0.97233334, + "memory(GiB)": 117.28, + "step": 3060, + "train_speed(iter/s)": 0.202695 + }, + { + "acc": 0.74153099, + "epoch": 0.07162542469269018, + "grad_norm": 5.53125, + "learning_rate": 9.99911496362649e-06, + "loss": 0.9292963, + "memory(GiB)": 117.28, + "step": 3070, + "train_speed(iter/s)": 0.203018 + }, + { + "acc": 0.74457722, + "epoch": 0.07185873226497907, + "grad_norm": 6.6875, + "learning_rate": 9.99907906423539e-06, + "loss": 0.95251703, + "memory(GiB)": 117.28, + "step": 3080, + "train_speed(iter/s)": 0.203382 + }, + { + "acc": 0.73962574, + "epoch": 0.07209203983726797, + "grad_norm": 6.875, + "learning_rate": 9.999042451227208e-06, + "loss": 0.9660202, + "memory(GiB)": 117.28, + "step": 3090, + "train_speed(iter/s)": 0.203734 + }, + { + "acc": 0.75118494, + "epoch": 0.07232534740955686, + "grad_norm": 7.65625, + "learning_rate": 9.999005124607167e-06, + "loss": 0.89292297, + "memory(GiB)": 117.28, + "step": 3100, + "train_speed(iter/s)": 0.204063 + }, + { + "acc": 0.77734885, + "epoch": 0.07255865498184576, + "grad_norm": 22.0, + "learning_rate": 9.998967084380596e-06, + "loss": 0.81176682, + "memory(GiB)": 117.28, + "step": 3110, + "train_speed(iter/s)": 0.204418 + }, + { + "acc": 0.75207405, + "epoch": 0.07279196255413464, + "grad_norm": 7.125, + "learning_rate": 9.998928330552925e-06, + "loss": 0.92476635, + "memory(GiB)": 117.28, + "step": 3120, + "train_speed(iter/s)": 0.204741 + }, + { + "acc": 0.75438986, + "epoch": 0.07302527012642354, + "grad_norm": 9.1875, + "learning_rate": 9.998888863129688e-06, + "loss": 0.9007534, + "memory(GiB)": 117.28, + "step": 3130, + "train_speed(iter/s)": 0.205082 + }, + { + "acc": 0.75198278, + "epoch": 0.07325857769871244, + "grad_norm": 9.1875, + "learning_rate": 9.998848682116518e-06, + "loss": 0.9158268, + "memory(GiB)": 117.28, + "step": 3140, + "train_speed(iter/s)": 0.205427 + }, + { + "acc": 0.75476503, + "epoch": 0.07349188527100133, + "grad_norm": 7.875, + "learning_rate": 9.998807787519151e-06, + "loss": 0.87708139, + "memory(GiB)": 117.28, + "step": 3150, + "train_speed(iter/s)": 0.205765 + }, + { + "acc": 0.75708885, + "epoch": 0.07372519284329022, + "grad_norm": 4.6875, + "learning_rate": 9.998766179343425e-06, + "loss": 0.87132273, + "memory(GiB)": 117.28, + "step": 3160, + "train_speed(iter/s)": 0.206091 + }, + { + "acc": 0.76350727, + "epoch": 0.07395850041557911, + "grad_norm": 6.375, + "learning_rate": 9.998723857595278e-06, + "loss": 0.8524066, + "memory(GiB)": 117.28, + "step": 3170, + "train_speed(iter/s)": 0.206437 + }, + { + "acc": 0.75130353, + "epoch": 0.07419180798786801, + "grad_norm": 6.46875, + "learning_rate": 9.998680822280752e-06, + "loss": 0.94219074, + "memory(GiB)": 117.28, + "step": 3180, + "train_speed(iter/s)": 0.20678 + }, + { + "acc": 0.74551325, + "epoch": 0.0744251155601569, + "grad_norm": 6.25, + "learning_rate": 9.998637073405992e-06, + "loss": 0.94350052, + "memory(GiB)": 117.28, + "step": 3190, + "train_speed(iter/s)": 0.207101 + }, + { + "acc": 0.74958072, + "epoch": 0.07465842313244579, + "grad_norm": 5.0, + "learning_rate": 9.998592610977241e-06, + "loss": 0.9216177, + "memory(GiB)": 117.28, + "step": 3200, + "train_speed(iter/s)": 0.207424 + }, + { + "acc": 0.75495872, + "epoch": 0.07489173070473469, + "grad_norm": 6.03125, + "learning_rate": 9.998547435000847e-06, + "loss": 0.89467354, + "memory(GiB)": 117.28, + "step": 3210, + "train_speed(iter/s)": 0.207777 + }, + { + "acc": 0.74197507, + "epoch": 0.07512503827702358, + "grad_norm": 6.5, + "learning_rate": 9.998501545483259e-06, + "loss": 0.94241686, + "memory(GiB)": 117.28, + "step": 3220, + "train_speed(iter/s)": 0.208109 + }, + { + "acc": 0.75202932, + "epoch": 0.07535834584931247, + "grad_norm": 5.375, + "learning_rate": 9.998454942431029e-06, + "loss": 0.91658039, + "memory(GiB)": 117.28, + "step": 3230, + "train_speed(iter/s)": 0.208416 + }, + { + "acc": 0.7436245, + "epoch": 0.07559165342160136, + "grad_norm": 9.3125, + "learning_rate": 9.998407625850806e-06, + "loss": 0.93169861, + "memory(GiB)": 117.28, + "step": 3240, + "train_speed(iter/s)": 0.208753 + }, + { + "acc": 0.76363297, + "epoch": 0.07582496099389026, + "grad_norm": 6.5625, + "learning_rate": 9.998359595749346e-06, + "loss": 0.88251057, + "memory(GiB)": 117.28, + "step": 3250, + "train_speed(iter/s)": 0.209087 + }, + { + "acc": 0.76651325, + "epoch": 0.07605826856617916, + "grad_norm": 6.53125, + "learning_rate": 9.998310852133506e-06, + "loss": 0.88197155, + "memory(GiB)": 117.28, + "step": 3260, + "train_speed(iter/s)": 0.209407 + }, + { + "acc": 0.73732243, + "epoch": 0.07629157613846804, + "grad_norm": 6.4375, + "learning_rate": 9.998261395010246e-06, + "loss": 0.97510824, + "memory(GiB)": 117.28, + "step": 3270, + "train_speed(iter/s)": 0.209726 + }, + { + "acc": 0.76042881, + "epoch": 0.07652488371075694, + "grad_norm": 5.625, + "learning_rate": 9.998211224386623e-06, + "loss": 0.88643074, + "memory(GiB)": 117.28, + "step": 3280, + "train_speed(iter/s)": 0.210037 + }, + { + "acc": 0.73288879, + "epoch": 0.07675819128304583, + "grad_norm": 8.1875, + "learning_rate": 9.998160340269799e-06, + "loss": 0.98039322, + "memory(GiB)": 117.28, + "step": 3290, + "train_speed(iter/s)": 0.210365 + }, + { + "acc": 0.74865913, + "epoch": 0.07699149885533473, + "grad_norm": 32.5, + "learning_rate": 9.998108742667038e-06, + "loss": 0.93164606, + "memory(GiB)": 117.28, + "step": 3300, + "train_speed(iter/s)": 0.210674 + }, + { + "acc": 0.73399429, + "epoch": 0.07722480642762361, + "grad_norm": 5.3125, + "learning_rate": 9.998056431585707e-06, + "loss": 0.97385902, + "memory(GiB)": 117.28, + "step": 3310, + "train_speed(iter/s)": 0.210992 + }, + { + "acc": 0.76379633, + "epoch": 0.07745811399991251, + "grad_norm": 4.96875, + "learning_rate": 9.998003407033271e-06, + "loss": 0.84101114, + "memory(GiB)": 117.28, + "step": 3320, + "train_speed(iter/s)": 0.211281 + }, + { + "acc": 0.76484241, + "epoch": 0.07769142157220141, + "grad_norm": 5.71875, + "learning_rate": 9.997949669017302e-06, + "loss": 0.8537735, + "memory(GiB)": 117.28, + "step": 3330, + "train_speed(iter/s)": 0.211582 + }, + { + "acc": 0.74536896, + "epoch": 0.07792472914449029, + "grad_norm": 9.0, + "learning_rate": 9.997895217545468e-06, + "loss": 0.99053888, + "memory(GiB)": 117.28, + "step": 3340, + "train_speed(iter/s)": 0.211872 + }, + { + "acc": 0.76393571, + "epoch": 0.07815803671677919, + "grad_norm": 6.25, + "learning_rate": 9.997840052625546e-06, + "loss": 0.8449995, + "memory(GiB)": 117.28, + "step": 3350, + "train_speed(iter/s)": 0.212198 + }, + { + "acc": 0.75357141, + "epoch": 0.07839134428906808, + "grad_norm": 7.96875, + "learning_rate": 9.997784174265407e-06, + "loss": 0.89924717, + "memory(GiB)": 117.28, + "step": 3360, + "train_speed(iter/s)": 0.212515 + }, + { + "acc": 0.74694405, + "epoch": 0.07862465186135698, + "grad_norm": 5.375, + "learning_rate": 9.99772758247303e-06, + "loss": 0.916609, + "memory(GiB)": 117.28, + "step": 3370, + "train_speed(iter/s)": 0.212802 + }, + { + "acc": 0.75625582, + "epoch": 0.07885795943364587, + "grad_norm": 9.5, + "learning_rate": 9.99767027725649e-06, + "loss": 0.89150448, + "memory(GiB)": 117.28, + "step": 3380, + "train_speed(iter/s)": 0.213124 + }, + { + "acc": 0.77579999, + "epoch": 0.07909126700593476, + "grad_norm": 5.0625, + "learning_rate": 9.997612258623972e-06, + "loss": 0.80863886, + "memory(GiB)": 117.28, + "step": 3390, + "train_speed(iter/s)": 0.213441 + }, + { + "acc": 0.72985349, + "epoch": 0.07932457457822366, + "grad_norm": 7.90625, + "learning_rate": 9.997553526583755e-06, + "loss": 1.00999374, + "memory(GiB)": 117.28, + "step": 3400, + "train_speed(iter/s)": 0.21375 + }, + { + "acc": 0.75671997, + "epoch": 0.07955788215051254, + "grad_norm": 13.6875, + "learning_rate": 9.997494081144224e-06, + "loss": 0.89063911, + "memory(GiB)": 117.28, + "step": 3410, + "train_speed(iter/s)": 0.214062 + }, + { + "acc": 0.78263044, + "epoch": 0.07979118972280144, + "grad_norm": 8.75, + "learning_rate": 9.997433922313863e-06, + "loss": 0.78714075, + "memory(GiB)": 117.28, + "step": 3420, + "train_speed(iter/s)": 0.21435 + }, + { + "acc": 0.76892452, + "epoch": 0.08002449729509033, + "grad_norm": 5.21875, + "learning_rate": 9.997373050101265e-06, + "loss": 0.84681721, + "memory(GiB)": 117.28, + "step": 3430, + "train_speed(iter/s)": 0.21465 + }, + { + "acc": 0.74426117, + "epoch": 0.08025780486737923, + "grad_norm": 7.25, + "learning_rate": 9.997311464515113e-06, + "loss": 0.94508867, + "memory(GiB)": 117.28, + "step": 3440, + "train_speed(iter/s)": 0.214929 + }, + { + "acc": 0.7614994, + "epoch": 0.08049111243966812, + "grad_norm": 5.625, + "learning_rate": 9.997249165564203e-06, + "loss": 0.83586025, + "memory(GiB)": 117.28, + "step": 3450, + "train_speed(iter/s)": 0.215237 + }, + { + "acc": 0.75516629, + "epoch": 0.08072442001195701, + "grad_norm": 5.40625, + "learning_rate": 9.997186153257425e-06, + "loss": 0.89334755, + "memory(GiB)": 117.28, + "step": 3460, + "train_speed(iter/s)": 0.215538 + }, + { + "acc": 0.7614068, + "epoch": 0.08095772758424591, + "grad_norm": 4.96875, + "learning_rate": 9.997122427603777e-06, + "loss": 0.87028713, + "memory(GiB)": 117.28, + "step": 3470, + "train_speed(iter/s)": 0.215836 + }, + { + "acc": 0.73593702, + "epoch": 0.0811910351565348, + "grad_norm": 5.96875, + "learning_rate": 9.997057988612351e-06, + "loss": 0.96313515, + "memory(GiB)": 117.28, + "step": 3480, + "train_speed(iter/s)": 0.21614 + }, + { + "acc": 0.74797134, + "epoch": 0.08142434272882369, + "grad_norm": 6.3125, + "learning_rate": 9.996992836292352e-06, + "loss": 0.92687082, + "memory(GiB)": 117.28, + "step": 3490, + "train_speed(iter/s)": 0.216442 + }, + { + "acc": 0.77501402, + "epoch": 0.08165765030111259, + "grad_norm": 15.3125, + "learning_rate": 9.996926970653076e-06, + "loss": 0.80666208, + "memory(GiB)": 117.28, + "step": 3500, + "train_speed(iter/s)": 0.216742 + }, + { + "epoch": 0.08165765030111259, + "eval_acc": 0.7207238327126944, + "eval_loss": 0.8971706628799438, + "eval_runtime": 1269.6947, + "eval_samples_per_second": 28.346, + "eval_steps_per_second": 14.173, + "step": 3500 + }, + { + "acc": 0.75318351, + "epoch": 0.08189095787340148, + "grad_norm": 4.59375, + "learning_rate": 9.996860391703925e-06, + "loss": 0.9090744, + "memory(GiB)": 117.28, + "step": 3510, + "train_speed(iter/s)": 0.201043 + }, + { + "acc": 0.76253424, + "epoch": 0.08212426544569038, + "grad_norm": 6.125, + "learning_rate": 9.996793099454407e-06, + "loss": 0.87270107, + "memory(GiB)": 117.28, + "step": 3520, + "train_speed(iter/s)": 0.20133 + }, + { + "acc": 0.76890745, + "epoch": 0.08235757301797926, + "grad_norm": 6.375, + "learning_rate": 9.996725093914125e-06, + "loss": 0.846912, + "memory(GiB)": 117.28, + "step": 3530, + "train_speed(iter/s)": 0.20162 + }, + { + "acc": 0.75093694, + "epoch": 0.08259088059026816, + "grad_norm": 6.40625, + "learning_rate": 9.996656375092786e-06, + "loss": 0.91460123, + "memory(GiB)": 117.28, + "step": 3540, + "train_speed(iter/s)": 0.201922 + }, + { + "acc": 0.74694242, + "epoch": 0.08282418816255704, + "grad_norm": 8.25, + "learning_rate": 9.996586943000203e-06, + "loss": 0.94491138, + "memory(GiB)": 117.28, + "step": 3550, + "train_speed(iter/s)": 0.202225 + }, + { + "acc": 0.7550127, + "epoch": 0.08305749573484594, + "grad_norm": 6.8125, + "learning_rate": 9.996516797646285e-06, + "loss": 0.89832973, + "memory(GiB)": 117.28, + "step": 3560, + "train_speed(iter/s)": 0.202507 + }, + { + "acc": 0.76115112, + "epoch": 0.08329080330713484, + "grad_norm": 7.375, + "learning_rate": 9.996445939041043e-06, + "loss": 0.8638464, + "memory(GiB)": 117.28, + "step": 3570, + "train_speed(iter/s)": 0.20277 + }, + { + "acc": 0.7447835, + "epoch": 0.08352411087942373, + "grad_norm": 8.6875, + "learning_rate": 9.996374367194599e-06, + "loss": 0.94378824, + "memory(GiB)": 117.28, + "step": 3580, + "train_speed(iter/s)": 0.203058 + }, + { + "acc": 0.72563133, + "epoch": 0.08375741845171263, + "grad_norm": 38.5, + "learning_rate": 9.996302082117162e-06, + "loss": 1.1059824, + "memory(GiB)": 117.28, + "step": 3590, + "train_speed(iter/s)": 0.203348 + }, + { + "acc": 0.77572851, + "epoch": 0.08399072602400151, + "grad_norm": 19.75, + "learning_rate": 9.996229083819055e-06, + "loss": 0.79340849, + "memory(GiB)": 117.28, + "step": 3600, + "train_speed(iter/s)": 0.203658 + }, + { + "acc": 0.74407644, + "epoch": 0.08422403359629041, + "grad_norm": 4.90625, + "learning_rate": 9.996155372310699e-06, + "loss": 0.92524662, + "memory(GiB)": 117.28, + "step": 3610, + "train_speed(iter/s)": 0.203955 + }, + { + "acc": 0.74106493, + "epoch": 0.08445734116857931, + "grad_norm": 5.84375, + "learning_rate": 9.996080947602615e-06, + "loss": 0.96179218, + "memory(GiB)": 117.28, + "step": 3620, + "train_speed(iter/s)": 0.204248 + }, + { + "acc": 0.73815827, + "epoch": 0.0846906487408682, + "grad_norm": 6.125, + "learning_rate": 9.996005809705428e-06, + "loss": 0.95186195, + "memory(GiB)": 117.28, + "step": 3630, + "train_speed(iter/s)": 0.204526 + }, + { + "acc": 0.75756698, + "epoch": 0.08492395631315709, + "grad_norm": 8.125, + "learning_rate": 9.99592995862986e-06, + "loss": 0.8868536, + "memory(GiB)": 117.28, + "step": 3640, + "train_speed(iter/s)": 0.204816 + }, + { + "acc": 0.7773509, + "epoch": 0.08515726388544598, + "grad_norm": 5.6875, + "learning_rate": 9.995853394386743e-06, + "loss": 0.80111885, + "memory(GiB)": 117.28, + "step": 3650, + "train_speed(iter/s)": 0.205093 + }, + { + "acc": 0.75153332, + "epoch": 0.08539057145773488, + "grad_norm": 7.28125, + "learning_rate": 9.995776116987006e-06, + "loss": 0.92716045, + "memory(GiB)": 117.28, + "step": 3660, + "train_speed(iter/s)": 0.205357 + }, + { + "acc": 0.76465082, + "epoch": 0.08562387903002378, + "grad_norm": 5.53125, + "learning_rate": 9.995698126441678e-06, + "loss": 0.84658279, + "memory(GiB)": 117.28, + "step": 3670, + "train_speed(iter/s)": 0.205626 + }, + { + "acc": 0.75522108, + "epoch": 0.08585718660231266, + "grad_norm": 7.6875, + "learning_rate": 9.995619422761896e-06, + "loss": 0.89340916, + "memory(GiB)": 117.28, + "step": 3680, + "train_speed(iter/s)": 0.205898 + }, + { + "acc": 0.76193209, + "epoch": 0.08609049417460156, + "grad_norm": 5.34375, + "learning_rate": 9.995540005958891e-06, + "loss": 0.85231638, + "memory(GiB)": 117.28, + "step": 3690, + "train_speed(iter/s)": 0.206188 + }, + { + "acc": 0.74403105, + "epoch": 0.08632380174689044, + "grad_norm": 5.1875, + "learning_rate": 9.995459876044e-06, + "loss": 0.96531124, + "memory(GiB)": 117.28, + "step": 3700, + "train_speed(iter/s)": 0.206473 + }, + { + "acc": 0.76611438, + "epoch": 0.08655710931917934, + "grad_norm": 4.9375, + "learning_rate": 9.995379033028666e-06, + "loss": 0.85632906, + "memory(GiB)": 117.28, + "step": 3710, + "train_speed(iter/s)": 0.206772 + }, + { + "acc": 0.74768972, + "epoch": 0.08679041689146823, + "grad_norm": 4.46875, + "learning_rate": 9.995297476924424e-06, + "loss": 0.94205217, + "memory(GiB)": 117.28, + "step": 3720, + "train_speed(iter/s)": 0.207058 + }, + { + "acc": 0.75818162, + "epoch": 0.08702372446375713, + "grad_norm": 4.78125, + "learning_rate": 9.99521520774292e-06, + "loss": 0.88370638, + "memory(GiB)": 117.28, + "step": 3730, + "train_speed(iter/s)": 0.207322 + }, + { + "acc": 0.76598167, + "epoch": 0.08725703203604603, + "grad_norm": 4.625, + "learning_rate": 9.995132225495896e-06, + "loss": 0.8380312, + "memory(GiB)": 117.28, + "step": 3740, + "train_speed(iter/s)": 0.207593 + }, + { + "acc": 0.74331932, + "epoch": 0.08749033960833491, + "grad_norm": 4.71875, + "learning_rate": 9.995048530195198e-06, + "loss": 0.94072809, + "memory(GiB)": 117.28, + "step": 3750, + "train_speed(iter/s)": 0.207867 + }, + { + "acc": 0.74893398, + "epoch": 0.08772364718062381, + "grad_norm": 8.9375, + "learning_rate": 9.99496412185277e-06, + "loss": 0.93704576, + "memory(GiB)": 117.28, + "step": 3760, + "train_speed(iter/s)": 0.208126 + }, + { + "acc": 0.74125042, + "epoch": 0.0879569547529127, + "grad_norm": 8.25, + "learning_rate": 9.994879000480668e-06, + "loss": 0.96825314, + "memory(GiB)": 117.28, + "step": 3770, + "train_speed(iter/s)": 0.208406 + }, + { + "acc": 0.77773728, + "epoch": 0.0881902623252016, + "grad_norm": 6.5625, + "learning_rate": 9.994793166091039e-06, + "loss": 0.81420488, + "memory(GiB)": 117.28, + "step": 3780, + "train_speed(iter/s)": 0.208696 + }, + { + "acc": 0.75972219, + "epoch": 0.08842356989749049, + "grad_norm": 6.59375, + "learning_rate": 9.994706618696137e-06, + "loss": 0.89639206, + "memory(GiB)": 117.28, + "step": 3790, + "train_speed(iter/s)": 0.208963 + }, + { + "acc": 0.76652493, + "epoch": 0.08865687746977938, + "grad_norm": 7.15625, + "learning_rate": 9.994619358308316e-06, + "loss": 0.83598127, + "memory(GiB)": 117.28, + "step": 3800, + "train_speed(iter/s)": 0.209241 + }, + { + "acc": 0.76296563, + "epoch": 0.08889018504206828, + "grad_norm": 10.4375, + "learning_rate": 9.994531384940032e-06, + "loss": 0.85312195, + "memory(GiB)": 117.28, + "step": 3810, + "train_speed(iter/s)": 0.209509 + }, + { + "acc": 0.76371479, + "epoch": 0.08912349261435716, + "grad_norm": 7.96875, + "learning_rate": 9.994442698603844e-06, + "loss": 0.8648243, + "memory(GiB)": 117.28, + "step": 3820, + "train_speed(iter/s)": 0.209773 + }, + { + "acc": 0.76453638, + "epoch": 0.08935680018664606, + "grad_norm": 7.59375, + "learning_rate": 9.99435329931241e-06, + "loss": 0.85945387, + "memory(GiB)": 117.28, + "step": 3830, + "train_speed(iter/s)": 0.210032 + }, + { + "acc": 0.76211448, + "epoch": 0.08959010775893494, + "grad_norm": 7.1875, + "learning_rate": 9.994263187078496e-06, + "loss": 0.86692085, + "memory(GiB)": 117.28, + "step": 3840, + "train_speed(iter/s)": 0.210304 + }, + { + "acc": 0.76937609, + "epoch": 0.08982341533122384, + "grad_norm": 4.90625, + "learning_rate": 9.994172361914962e-06, + "loss": 0.84302549, + "memory(GiB)": 117.28, + "step": 3850, + "train_speed(iter/s)": 0.210561 + }, + { + "acc": 0.76893077, + "epoch": 0.09005672290351274, + "grad_norm": 5.21875, + "learning_rate": 9.994080823834775e-06, + "loss": 0.84133387, + "memory(GiB)": 117.54, + "step": 3860, + "train_speed(iter/s)": 0.21084 + }, + { + "acc": 0.75491571, + "epoch": 0.09029003047580163, + "grad_norm": 5.53125, + "learning_rate": 9.993988572851e-06, + "loss": 0.90440521, + "memory(GiB)": 117.54, + "step": 3870, + "train_speed(iter/s)": 0.2111 + }, + { + "acc": 0.74396949, + "epoch": 0.09052333804809053, + "grad_norm": 8.0625, + "learning_rate": 9.993895608976806e-06, + "loss": 0.95679359, + "memory(GiB)": 117.54, + "step": 3880, + "train_speed(iter/s)": 0.211355 + }, + { + "acc": 0.75566082, + "epoch": 0.09075664562037941, + "grad_norm": 8.9375, + "learning_rate": 9.993801932225466e-06, + "loss": 0.89417982, + "memory(GiB)": 117.54, + "step": 3890, + "train_speed(iter/s)": 0.211633 + }, + { + "acc": 0.76257367, + "epoch": 0.09098995319266831, + "grad_norm": 5.625, + "learning_rate": 9.993707542610351e-06, + "loss": 0.87649937, + "memory(GiB)": 117.54, + "step": 3900, + "train_speed(iter/s)": 0.211899 + }, + { + "acc": 0.73606167, + "epoch": 0.09122326076495721, + "grad_norm": 7.34375, + "learning_rate": 9.993612440144935e-06, + "loss": 0.9821991, + "memory(GiB)": 117.54, + "step": 3910, + "train_speed(iter/s)": 0.212158 + }, + { + "acc": 0.75946436, + "epoch": 0.0914565683372461, + "grad_norm": 6.03125, + "learning_rate": 9.993516624842792e-06, + "loss": 0.8709816, + "memory(GiB)": 117.54, + "step": 3920, + "train_speed(iter/s)": 0.212416 + }, + { + "acc": 0.74540901, + "epoch": 0.09168987590953499, + "grad_norm": 8.5, + "learning_rate": 9.993420096717603e-06, + "loss": 0.94000492, + "memory(GiB)": 117.54, + "step": 3930, + "train_speed(iter/s)": 0.212684 + }, + { + "acc": 0.77204304, + "epoch": 0.09192318348182388, + "grad_norm": 8.0, + "learning_rate": 9.993322855783146e-06, + "loss": 0.81134806, + "memory(GiB)": 117.54, + "step": 3940, + "train_speed(iter/s)": 0.212916 + }, + { + "acc": 0.75332932, + "epoch": 0.09215649105411278, + "grad_norm": 7.5, + "learning_rate": 9.993224902053302e-06, + "loss": 0.88411512, + "memory(GiB)": 117.54, + "step": 3950, + "train_speed(iter/s)": 0.213189 + }, + { + "acc": 0.73475533, + "epoch": 0.09238979862640166, + "grad_norm": 4.6875, + "learning_rate": 9.993126235542053e-06, + "loss": 0.98314781, + "memory(GiB)": 117.54, + "step": 3960, + "train_speed(iter/s)": 0.213459 + }, + { + "acc": 0.76145267, + "epoch": 0.09262310619869056, + "grad_norm": 8.375, + "learning_rate": 9.993026856263486e-06, + "loss": 0.86928034, + "memory(GiB)": 117.54, + "step": 3970, + "train_speed(iter/s)": 0.213725 + }, + { + "acc": 0.76495628, + "epoch": 0.09285641377097946, + "grad_norm": 5.5625, + "learning_rate": 9.992926764231784e-06, + "loss": 0.83155947, + "memory(GiB)": 117.54, + "step": 3980, + "train_speed(iter/s)": 0.213988 + }, + { + "acc": 0.75666361, + "epoch": 0.09308972134326834, + "grad_norm": 6.84375, + "learning_rate": 9.992825959461237e-06, + "loss": 0.88326521, + "memory(GiB)": 117.54, + "step": 3990, + "train_speed(iter/s)": 0.214237 + }, + { + "acc": 0.774965, + "epoch": 0.09332302891555724, + "grad_norm": 5.125, + "learning_rate": 9.992724441966234e-06, + "loss": 0.8087532, + "memory(GiB)": 117.54, + "step": 4000, + "train_speed(iter/s)": 0.214481 + }, + { + "epoch": 0.09332302891555724, + "eval_acc": 0.7228697654255328, + "eval_loss": 0.8877829313278198, + "eval_runtime": 1271.4977, + "eval_samples_per_second": 28.306, + "eval_steps_per_second": 14.153, + "step": 4000 + }, + { + "acc": 0.76409369, + "epoch": 0.09355633648784613, + "grad_norm": 4.625, + "learning_rate": 9.99262221176127e-06, + "loss": 0.85309639, + "memory(GiB)": 117.54, + "step": 4010, + "train_speed(iter/s)": 0.200878 + }, + { + "acc": 0.76466002, + "epoch": 0.09378964406013503, + "grad_norm": 5.84375, + "learning_rate": 9.992519268860934e-06, + "loss": 0.87510233, + "memory(GiB)": 117.54, + "step": 4020, + "train_speed(iter/s)": 0.201146 + }, + { + "acc": 0.74888363, + "epoch": 0.09402295163242393, + "grad_norm": 4.75, + "learning_rate": 9.992415613279922e-06, + "loss": 0.91498604, + "memory(GiB)": 117.54, + "step": 4030, + "train_speed(iter/s)": 0.201399 + }, + { + "acc": 0.7692903, + "epoch": 0.09425625920471281, + "grad_norm": 4.34375, + "learning_rate": 9.992311245033033e-06, + "loss": 0.83095226, + "memory(GiB)": 117.54, + "step": 4040, + "train_speed(iter/s)": 0.201636 + }, + { + "acc": 0.7509232, + "epoch": 0.09448956677700171, + "grad_norm": 5.0, + "learning_rate": 9.992206164135163e-06, + "loss": 0.92139778, + "memory(GiB)": 117.54, + "step": 4050, + "train_speed(iter/s)": 0.201873 + }, + { + "acc": 0.77602358, + "epoch": 0.0947228743492906, + "grad_norm": 8.125, + "learning_rate": 9.992100370601313e-06, + "loss": 0.79812717, + "memory(GiB)": 117.54, + "step": 4060, + "train_speed(iter/s)": 0.202133 + }, + { + "acc": 0.7557579, + "epoch": 0.0949561819215795, + "grad_norm": 6.875, + "learning_rate": 9.991993864446585e-06, + "loss": 0.89294071, + "memory(GiB)": 117.54, + "step": 4070, + "train_speed(iter/s)": 0.202411 + }, + { + "acc": 0.73773766, + "epoch": 0.09518948949386838, + "grad_norm": 5.46875, + "learning_rate": 9.991886645686184e-06, + "loss": 0.95771408, + "memory(GiB)": 117.54, + "step": 4080, + "train_speed(iter/s)": 0.20265 + }, + { + "acc": 0.75475817, + "epoch": 0.09542279706615728, + "grad_norm": 8.3125, + "learning_rate": 9.991778714335415e-06, + "loss": 0.90779686, + "memory(GiB)": 117.54, + "step": 4090, + "train_speed(iter/s)": 0.202915 + }, + { + "acc": 0.73981633, + "epoch": 0.09565610463844618, + "grad_norm": 6.90625, + "learning_rate": 9.991670070409684e-06, + "loss": 0.96604061, + "memory(GiB)": 117.54, + "step": 4100, + "train_speed(iter/s)": 0.203177 + }, + { + "acc": 0.76607499, + "epoch": 0.09588941221073506, + "grad_norm": 5.78125, + "learning_rate": 9.991560713924501e-06, + "loss": 0.85471153, + "memory(GiB)": 117.54, + "step": 4110, + "train_speed(iter/s)": 0.20342 + }, + { + "acc": 0.77161045, + "epoch": 0.09612271978302396, + "grad_norm": 8.125, + "learning_rate": 9.991450644895476e-06, + "loss": 0.84620152, + "memory(GiB)": 117.54, + "step": 4120, + "train_speed(iter/s)": 0.203677 + }, + { + "acc": 0.77144375, + "epoch": 0.09635602735531285, + "grad_norm": 4.625, + "learning_rate": 9.99133986333832e-06, + "loss": 0.83797445, + "memory(GiB)": 117.54, + "step": 4130, + "train_speed(iter/s)": 0.20393 + }, + { + "acc": 0.73537869, + "epoch": 0.09658933492760174, + "grad_norm": 6.3125, + "learning_rate": 9.99122836926885e-06, + "loss": 0.96916094, + "memory(GiB)": 117.54, + "step": 4140, + "train_speed(iter/s)": 0.204181 + }, + { + "acc": 0.75800476, + "epoch": 0.09682264249989064, + "grad_norm": 5.625, + "learning_rate": 9.991116162702981e-06, + "loss": 0.86221294, + "memory(GiB)": 117.54, + "step": 4150, + "train_speed(iter/s)": 0.204423 + }, + { + "acc": 0.75571017, + "epoch": 0.09705595007217953, + "grad_norm": 5.5, + "learning_rate": 9.991003243656728e-06, + "loss": 0.88189182, + "memory(GiB)": 117.54, + "step": 4160, + "train_speed(iter/s)": 0.204672 + }, + { + "acc": 0.75988812, + "epoch": 0.09728925764446843, + "grad_norm": 8.5, + "learning_rate": 9.990889612146213e-06, + "loss": 0.88356085, + "memory(GiB)": 117.54, + "step": 4170, + "train_speed(iter/s)": 0.204882 + }, + { + "acc": 0.74842806, + "epoch": 0.09752256521675731, + "grad_norm": 4.96875, + "learning_rate": 9.990775268187654e-06, + "loss": 0.93167324, + "memory(GiB)": 117.54, + "step": 4180, + "train_speed(iter/s)": 0.20513 + }, + { + "acc": 0.76515474, + "epoch": 0.09775587278904621, + "grad_norm": 35.75, + "learning_rate": 9.990660211797378e-06, + "loss": 0.92051849, + "memory(GiB)": 117.54, + "step": 4190, + "train_speed(iter/s)": 0.205376 + }, + { + "acc": 0.7553896, + "epoch": 0.0979891803613351, + "grad_norm": 5.375, + "learning_rate": 9.990544442991805e-06, + "loss": 0.89758406, + "memory(GiB)": 117.54, + "step": 4200, + "train_speed(iter/s)": 0.20564 + }, + { + "acc": 0.75030479, + "epoch": 0.098222487933624, + "grad_norm": 5.3125, + "learning_rate": 9.99042796178746e-06, + "loss": 0.91539536, + "memory(GiB)": 117.54, + "step": 4210, + "train_speed(iter/s)": 0.205895 + }, + { + "acc": 0.74988375, + "epoch": 0.0984557955059129, + "grad_norm": 4.25, + "learning_rate": 9.990310768200977e-06, + "loss": 0.9145546, + "memory(GiB)": 117.54, + "step": 4220, + "train_speed(iter/s)": 0.206142 + }, + { + "acc": 0.73188667, + "epoch": 0.09868910307820178, + "grad_norm": 5.65625, + "learning_rate": 9.99019286224908e-06, + "loss": 0.98185387, + "memory(GiB)": 117.54, + "step": 4230, + "train_speed(iter/s)": 0.20638 + }, + { + "acc": 0.75859137, + "epoch": 0.09892241065049068, + "grad_norm": 8.125, + "learning_rate": 9.990074243948602e-06, + "loss": 0.90473709, + "memory(GiB)": 117.54, + "step": 4240, + "train_speed(iter/s)": 0.206628 + }, + { + "acc": 0.76608834, + "epoch": 0.09915571822277956, + "grad_norm": 6.34375, + "learning_rate": 9.989954913316476e-06, + "loss": 0.85702991, + "memory(GiB)": 117.54, + "step": 4250, + "train_speed(iter/s)": 0.20688 + }, + { + "acc": 0.76376286, + "epoch": 0.09938902579506846, + "grad_norm": 9.625, + "learning_rate": 9.989834870369735e-06, + "loss": 0.84923592, + "memory(GiB)": 117.54, + "step": 4260, + "train_speed(iter/s)": 0.207135 + }, + { + "acc": 0.77528548, + "epoch": 0.09962233336735736, + "grad_norm": 5.1875, + "learning_rate": 9.989714115125515e-06, + "loss": 0.79958353, + "memory(GiB)": 117.54, + "step": 4270, + "train_speed(iter/s)": 0.207364 + }, + { + "acc": 0.75993328, + "epoch": 0.09985564093964625, + "grad_norm": 10.625, + "learning_rate": 9.989592647601056e-06, + "loss": 0.86611481, + "memory(GiB)": 117.54, + "step": 4280, + "train_speed(iter/s)": 0.207609 + }, + { + "acc": 0.75393734, + "epoch": 0.10008894851193514, + "grad_norm": 6.5625, + "learning_rate": 9.989470467813696e-06, + "loss": 0.91193056, + "memory(GiB)": 117.54, + "step": 4290, + "train_speed(iter/s)": 0.207846 + }, + { + "acc": 0.75926132, + "epoch": 0.10032225608422403, + "grad_norm": 8.0, + "learning_rate": 9.989347575780874e-06, + "loss": 0.88821259, + "memory(GiB)": 117.54, + "step": 4300, + "train_speed(iter/s)": 0.208096 + }, + { + "acc": 0.75386171, + "epoch": 0.10055556365651293, + "grad_norm": 7.65625, + "learning_rate": 9.989223971520136e-06, + "loss": 0.89129429, + "memory(GiB)": 117.54, + "step": 4310, + "train_speed(iter/s)": 0.208334 + }, + { + "acc": 0.75681725, + "epoch": 0.10078887122880181, + "grad_norm": 6.65625, + "learning_rate": 9.989099655049128e-06, + "loss": 0.87771931, + "memory(GiB)": 117.54, + "step": 4320, + "train_speed(iter/s)": 0.20856 + }, + { + "acc": 0.7391355, + "epoch": 0.10102217880109071, + "grad_norm": 10.0, + "learning_rate": 9.98897462638559e-06, + "loss": 0.96863251, + "memory(GiB)": 117.54, + "step": 4330, + "train_speed(iter/s)": 0.20879 + }, + { + "acc": 0.76924124, + "epoch": 0.10125548637337961, + "grad_norm": 5.78125, + "learning_rate": 9.988848885547376e-06, + "loss": 0.8352129, + "memory(GiB)": 117.54, + "step": 4340, + "train_speed(iter/s)": 0.209021 + }, + { + "acc": 0.77654471, + "epoch": 0.1014887939456685, + "grad_norm": 7.375, + "learning_rate": 9.988722432552431e-06, + "loss": 0.79848795, + "memory(GiB)": 117.54, + "step": 4350, + "train_speed(iter/s)": 0.209261 + }, + { + "acc": 0.74507241, + "epoch": 0.1017221015179574, + "grad_norm": 5.625, + "learning_rate": 9.988595267418809e-06, + "loss": 0.92341309, + "memory(GiB)": 117.54, + "step": 4360, + "train_speed(iter/s)": 0.209503 + }, + { + "acc": 0.76773772, + "epoch": 0.10195540909024628, + "grad_norm": 5.4375, + "learning_rate": 9.988467390164662e-06, + "loss": 0.84407911, + "memory(GiB)": 117.54, + "step": 4370, + "train_speed(iter/s)": 0.209715 + }, + { + "acc": 0.74648399, + "epoch": 0.10218871666253518, + "grad_norm": 5.0625, + "learning_rate": 9.988338800808245e-06, + "loss": 0.93966646, + "memory(GiB)": 117.54, + "step": 4380, + "train_speed(iter/s)": 0.209944 + }, + { + "acc": 0.76602821, + "epoch": 0.10242202423482408, + "grad_norm": 6.6875, + "learning_rate": 9.988209499367911e-06, + "loss": 0.8338089, + "memory(GiB)": 117.54, + "step": 4390, + "train_speed(iter/s)": 0.210194 + }, + { + "acc": 0.76203928, + "epoch": 0.10265533180711296, + "grad_norm": 4.34375, + "learning_rate": 9.988079485862121e-06, + "loss": 0.87073536, + "memory(GiB)": 117.54, + "step": 4400, + "train_speed(iter/s)": 0.210432 + }, + { + "acc": 0.74238539, + "epoch": 0.10288863937940186, + "grad_norm": 12.375, + "learning_rate": 9.987948760309434e-06, + "loss": 0.95618248, + "memory(GiB)": 117.54, + "step": 4410, + "train_speed(iter/s)": 0.210669 + }, + { + "acc": 0.7657311, + "epoch": 0.10312194695169075, + "grad_norm": 5.4375, + "learning_rate": 9.987817322728509e-06, + "loss": 0.85881863, + "memory(GiB)": 117.54, + "step": 4420, + "train_speed(iter/s)": 0.210901 + }, + { + "acc": 0.76233692, + "epoch": 0.10335525452397964, + "grad_norm": 4.4375, + "learning_rate": 9.98768517313811e-06, + "loss": 0.8707386, + "memory(GiB)": 117.54, + "step": 4430, + "train_speed(iter/s)": 0.211135 + }, + { + "acc": 0.75146971, + "epoch": 0.10358856209626854, + "grad_norm": 6.9375, + "learning_rate": 9.987552311557103e-06, + "loss": 0.90720158, + "memory(GiB)": 117.54, + "step": 4440, + "train_speed(iter/s)": 0.211368 + }, + { + "acc": 0.77758303, + "epoch": 0.10382186966855743, + "grad_norm": 18.25, + "learning_rate": 9.987418738004453e-06, + "loss": 0.81924438, + "memory(GiB)": 117.54, + "step": 4450, + "train_speed(iter/s)": 0.211579 + }, + { + "acc": 0.73662348, + "epoch": 0.10405517724084633, + "grad_norm": 5.59375, + "learning_rate": 9.987284452499227e-06, + "loss": 0.9785923, + "memory(GiB)": 117.54, + "step": 4460, + "train_speed(iter/s)": 0.211822 + }, + { + "acc": 0.74656315, + "epoch": 0.10428848481313521, + "grad_norm": 17.875, + "learning_rate": 9.987149455060592e-06, + "loss": 0.9363677, + "memory(GiB)": 117.54, + "step": 4470, + "train_speed(iter/s)": 0.212055 + }, + { + "acc": 0.75212345, + "epoch": 0.10452179238542411, + "grad_norm": 4.71875, + "learning_rate": 9.987013745707824e-06, + "loss": 0.93190804, + "memory(GiB)": 117.54, + "step": 4480, + "train_speed(iter/s)": 0.212285 + }, + { + "acc": 0.7772028, + "epoch": 0.104755099957713, + "grad_norm": 4.46875, + "learning_rate": 9.986877324460288e-06, + "loss": 0.79044576, + "memory(GiB)": 117.54, + "step": 4490, + "train_speed(iter/s)": 0.212502 + }, + { + "acc": 0.76467466, + "epoch": 0.1049884075300019, + "grad_norm": 4.65625, + "learning_rate": 9.986740191337467e-06, + "loss": 0.86383896, + "memory(GiB)": 117.54, + "step": 4500, + "train_speed(iter/s)": 0.212726 + }, + { + "epoch": 0.1049884075300019, + "eval_acc": 0.7243065794777408, + "eval_loss": 0.8816430568695068, + "eval_runtime": 1268.0964, + "eval_samples_per_second": 28.382, + "eval_steps_per_second": 14.191, + "step": 4500 + }, + { + "acc": 0.74907675, + "epoch": 0.1052217151022908, + "grad_norm": 4.90625, + "learning_rate": 9.986602346358932e-06, + "loss": 0.92465839, + "memory(GiB)": 117.54, + "step": 4510, + "train_speed(iter/s)": 0.200795 + }, + { + "acc": 0.77655468, + "epoch": 0.10545502267457968, + "grad_norm": 5.6875, + "learning_rate": 9.986463789544359e-06, + "loss": 0.79614592, + "memory(GiB)": 117.54, + "step": 4520, + "train_speed(iter/s)": 0.20104 + }, + { + "acc": 0.74242306, + "epoch": 0.10568833024686858, + "grad_norm": 5.625, + "learning_rate": 9.986324520913528e-06, + "loss": 0.9382905, + "memory(GiB)": 117.54, + "step": 4530, + "train_speed(iter/s)": 0.201278 + }, + { + "acc": 0.7789465, + "epoch": 0.10592163781915746, + "grad_norm": 5.1875, + "learning_rate": 9.986184540486322e-06, + "loss": 0.80827017, + "memory(GiB)": 117.54, + "step": 4540, + "train_speed(iter/s)": 0.201497 + }, + { + "acc": 0.78276606, + "epoch": 0.10615494539144636, + "grad_norm": 5.9375, + "learning_rate": 9.98604384828272e-06, + "loss": 0.78076072, + "memory(GiB)": 117.54, + "step": 4550, + "train_speed(iter/s)": 0.201723 + }, + { + "acc": 0.75945024, + "epoch": 0.10638825296373526, + "grad_norm": 5.375, + "learning_rate": 9.985902444322809e-06, + "loss": 0.86157227, + "memory(GiB)": 117.54, + "step": 4560, + "train_speed(iter/s)": 0.201954 + }, + { + "acc": 0.75581503, + "epoch": 0.10662156053602415, + "grad_norm": 5.90625, + "learning_rate": 9.98576032862677e-06, + "loss": 0.89451599, + "memory(GiB)": 117.54, + "step": 4570, + "train_speed(iter/s)": 0.202173 + }, + { + "acc": 0.7623549, + "epoch": 0.10685486810831304, + "grad_norm": 5.3125, + "learning_rate": 9.985617501214895e-06, + "loss": 0.86804514, + "memory(GiB)": 117.54, + "step": 4580, + "train_speed(iter/s)": 0.202413 + }, + { + "acc": 0.75258703, + "epoch": 0.10708817568060193, + "grad_norm": 4.78125, + "learning_rate": 9.985473962107568e-06, + "loss": 0.916399, + "memory(GiB)": 117.54, + "step": 4590, + "train_speed(iter/s)": 0.202637 + }, + { + "acc": 0.75255208, + "epoch": 0.10732148325289083, + "grad_norm": 6.71875, + "learning_rate": 9.985329711325282e-06, + "loss": 0.90113773, + "memory(GiB)": 117.54, + "step": 4600, + "train_speed(iter/s)": 0.20288 + }, + { + "acc": 0.75763159, + "epoch": 0.10755479082517971, + "grad_norm": 6.875, + "learning_rate": 9.985184748888627e-06, + "loss": 0.91873503, + "memory(GiB)": 117.54, + "step": 4610, + "train_speed(iter/s)": 0.203115 + }, + { + "acc": 0.77346997, + "epoch": 0.10778809839746861, + "grad_norm": 5.4375, + "learning_rate": 9.985039074818298e-06, + "loss": 0.8059083, + "memory(GiB)": 117.54, + "step": 4620, + "train_speed(iter/s)": 0.203336 + }, + { + "acc": 0.75904865, + "epoch": 0.10802140596975751, + "grad_norm": 5.59375, + "learning_rate": 9.98489268913509e-06, + "loss": 0.87097445, + "memory(GiB)": 117.54, + "step": 4630, + "train_speed(iter/s)": 0.203564 + }, + { + "acc": 0.75039043, + "epoch": 0.1082547135420464, + "grad_norm": 6.65625, + "learning_rate": 9.984745591859899e-06, + "loss": 0.90553379, + "memory(GiB)": 117.54, + "step": 4640, + "train_speed(iter/s)": 0.203807 + }, + { + "acc": 0.76141634, + "epoch": 0.1084880211143353, + "grad_norm": 4.21875, + "learning_rate": 9.98459778301372e-06, + "loss": 0.8886508, + "memory(GiB)": 117.54, + "step": 4650, + "train_speed(iter/s)": 0.20402 + }, + { + "acc": 0.76583786, + "epoch": 0.10872132868662418, + "grad_norm": 9.625, + "learning_rate": 9.984449262617659e-06, + "loss": 0.86086102, + "memory(GiB)": 117.54, + "step": 4660, + "train_speed(iter/s)": 0.20427 + }, + { + "acc": 0.76089182, + "epoch": 0.10895463625891308, + "grad_norm": 6.09375, + "learning_rate": 9.984300030692913e-06, + "loss": 0.87257662, + "memory(GiB)": 117.54, + "step": 4670, + "train_speed(iter/s)": 0.204504 + }, + { + "acc": 0.75297747, + "epoch": 0.10918794383120198, + "grad_norm": 9.5, + "learning_rate": 9.984150087260784e-06, + "loss": 0.90016375, + "memory(GiB)": 117.54, + "step": 4680, + "train_speed(iter/s)": 0.204719 + }, + { + "acc": 0.74139366, + "epoch": 0.10942125140349086, + "grad_norm": 6.46875, + "learning_rate": 9.983999432342679e-06, + "loss": 0.9342598, + "memory(GiB)": 117.54, + "step": 4690, + "train_speed(iter/s)": 0.204946 + }, + { + "acc": 0.75930262, + "epoch": 0.10965455897577976, + "grad_norm": 5.75, + "learning_rate": 9.983848065960103e-06, + "loss": 0.8737071, + "memory(GiB)": 117.54, + "step": 4700, + "train_speed(iter/s)": 0.205175 + }, + { + "acc": 0.77567587, + "epoch": 0.10988786654806865, + "grad_norm": 8.125, + "learning_rate": 9.983695988134662e-06, + "loss": 0.79876671, + "memory(GiB)": 117.54, + "step": 4710, + "train_speed(iter/s)": 0.205389 + }, + { + "acc": 0.75503325, + "epoch": 0.11012117412035755, + "grad_norm": 6.03125, + "learning_rate": 9.983543198888069e-06, + "loss": 0.90450735, + "memory(GiB)": 117.54, + "step": 4720, + "train_speed(iter/s)": 0.205611 + }, + { + "acc": 0.75266876, + "epoch": 0.11035448169264643, + "grad_norm": 5.0625, + "learning_rate": 9.98338969824213e-06, + "loss": 0.9201149, + "memory(GiB)": 117.54, + "step": 4730, + "train_speed(iter/s)": 0.205829 + }, + { + "acc": 0.76564531, + "epoch": 0.11058778926493533, + "grad_norm": 6.625, + "learning_rate": 9.98323548621876e-06, + "loss": 0.85361395, + "memory(GiB)": 117.54, + "step": 4740, + "train_speed(iter/s)": 0.206042 + }, + { + "acc": 0.74129553, + "epoch": 0.11082109683722423, + "grad_norm": 9.4375, + "learning_rate": 9.983080562839971e-06, + "loss": 0.96250782, + "memory(GiB)": 117.54, + "step": 4750, + "train_speed(iter/s)": 0.206248 + }, + { + "acc": 0.76101308, + "epoch": 0.11105440440951311, + "grad_norm": 5.71875, + "learning_rate": 9.982924928127881e-06, + "loss": 0.85979137, + "memory(GiB)": 117.54, + "step": 4760, + "train_speed(iter/s)": 0.206465 + }, + { + "acc": 0.78050613, + "epoch": 0.11128771198180201, + "grad_norm": 5.28125, + "learning_rate": 9.982768582104705e-06, + "loss": 0.80721207, + "memory(GiB)": 117.54, + "step": 4770, + "train_speed(iter/s)": 0.206675 + }, + { + "acc": 0.74485474, + "epoch": 0.1115210195540909, + "grad_norm": 4.4375, + "learning_rate": 9.98261152479276e-06, + "loss": 0.95470104, + "memory(GiB)": 117.54, + "step": 4780, + "train_speed(iter/s)": 0.20689 + }, + { + "acc": 0.76254826, + "epoch": 0.1117543271263798, + "grad_norm": 5.9375, + "learning_rate": 9.982453756214467e-06, + "loss": 0.84586992, + "memory(GiB)": 117.54, + "step": 4790, + "train_speed(iter/s)": 0.207109 + }, + { + "acc": 0.76125956, + "epoch": 0.1119876346986687, + "grad_norm": 7.71875, + "learning_rate": 9.982295276392349e-06, + "loss": 0.88607883, + "memory(GiB)": 117.54, + "step": 4800, + "train_speed(iter/s)": 0.207333 + }, + { + "acc": 0.73547726, + "epoch": 0.11222094227095758, + "grad_norm": 5.8125, + "learning_rate": 9.982136085349028e-06, + "loss": 0.96996174, + "memory(GiB)": 117.54, + "step": 4810, + "train_speed(iter/s)": 0.207542 + }, + { + "acc": 0.77212734, + "epoch": 0.11245424984324648, + "grad_norm": 7.40625, + "learning_rate": 9.981976183107227e-06, + "loss": 0.83480778, + "memory(GiB)": 117.54, + "step": 4820, + "train_speed(iter/s)": 0.20776 + }, + { + "acc": 0.76515455, + "epoch": 0.11268755741553536, + "grad_norm": 7.1875, + "learning_rate": 9.981815569689774e-06, + "loss": 0.88458767, + "memory(GiB)": 117.54, + "step": 4830, + "train_speed(iter/s)": 0.207986 + }, + { + "acc": 0.73565607, + "epoch": 0.11292086498782426, + "grad_norm": 6.09375, + "learning_rate": 9.981654245119594e-06, + "loss": 0.98005428, + "memory(GiB)": 117.54, + "step": 4840, + "train_speed(iter/s)": 0.208198 + }, + { + "acc": 0.74796729, + "epoch": 0.11315417256011315, + "grad_norm": 7.5625, + "learning_rate": 9.98149220941972e-06, + "loss": 0.90888004, + "memory(GiB)": 117.54, + "step": 4850, + "train_speed(iter/s)": 0.208403 + }, + { + "acc": 0.76575413, + "epoch": 0.11338748013240205, + "grad_norm": 6.4375, + "learning_rate": 9.981329462613278e-06, + "loss": 0.8409193, + "memory(GiB)": 117.54, + "step": 4860, + "train_speed(iter/s)": 0.208616 + }, + { + "acc": 0.76135168, + "epoch": 0.11362078770469095, + "grad_norm": 4.96875, + "learning_rate": 9.981166004723504e-06, + "loss": 0.8730361, + "memory(GiB)": 117.54, + "step": 4870, + "train_speed(iter/s)": 0.208832 + }, + { + "acc": 0.75576491, + "epoch": 0.11385409527697983, + "grad_norm": 5.96875, + "learning_rate": 9.981001835773729e-06, + "loss": 0.86638641, + "memory(GiB)": 117.54, + "step": 4880, + "train_speed(iter/s)": 0.20906 + }, + { + "acc": 0.78793383, + "epoch": 0.11408740284926873, + "grad_norm": 6.34375, + "learning_rate": 9.98083695578739e-06, + "loss": 0.78035274, + "memory(GiB)": 117.54, + "step": 4890, + "train_speed(iter/s)": 0.209275 + }, + { + "acc": 0.76127825, + "epoch": 0.11432071042155761, + "grad_norm": 4.59375, + "learning_rate": 9.980671364788022e-06, + "loss": 0.91963701, + "memory(GiB)": 117.54, + "step": 4900, + "train_speed(iter/s)": 0.20949 + }, + { + "acc": 0.73467045, + "epoch": 0.11455401799384651, + "grad_norm": 5.53125, + "learning_rate": 9.980505062799262e-06, + "loss": 0.9670414, + "memory(GiB)": 117.54, + "step": 4910, + "train_speed(iter/s)": 0.209715 + }, + { + "acc": 0.78355579, + "epoch": 0.11478732556613541, + "grad_norm": 6.0, + "learning_rate": 9.980338049844854e-06, + "loss": 0.78485413, + "memory(GiB)": 117.54, + "step": 4920, + "train_speed(iter/s)": 0.209923 + }, + { + "acc": 0.74745569, + "epoch": 0.1150206331384243, + "grad_norm": 5.40625, + "learning_rate": 9.980170325948633e-06, + "loss": 0.92230692, + "memory(GiB)": 117.54, + "step": 4930, + "train_speed(iter/s)": 0.210131 + }, + { + "acc": 0.75931578, + "epoch": 0.1152539407107132, + "grad_norm": 6.53125, + "learning_rate": 9.980001891134548e-06, + "loss": 0.87786045, + "memory(GiB)": 117.54, + "step": 4940, + "train_speed(iter/s)": 0.210361 + }, + { + "acc": 0.7422245, + "epoch": 0.11548724828300208, + "grad_norm": 4.875, + "learning_rate": 9.979832745426637e-06, + "loss": 0.94051199, + "memory(GiB)": 117.54, + "step": 4950, + "train_speed(iter/s)": 0.21057 + }, + { + "acc": 0.75386519, + "epoch": 0.11572055585529098, + "grad_norm": 8.1875, + "learning_rate": 9.97966288884905e-06, + "loss": 0.89630547, + "memory(GiB)": 117.54, + "step": 4960, + "train_speed(iter/s)": 0.210783 + }, + { + "acc": 0.74389353, + "epoch": 0.11595386342757986, + "grad_norm": 6.375, + "learning_rate": 9.979492321426032e-06, + "loss": 0.93810577, + "memory(GiB)": 117.54, + "step": 4970, + "train_speed(iter/s)": 0.210987 + }, + { + "acc": 0.76075001, + "epoch": 0.11618717099986876, + "grad_norm": 5.4375, + "learning_rate": 9.97932104318193e-06, + "loss": 0.84013653, + "memory(GiB)": 117.54, + "step": 4980, + "train_speed(iter/s)": 0.211194 + }, + { + "acc": 0.77435451, + "epoch": 0.11642047857215766, + "grad_norm": 4.84375, + "learning_rate": 9.979149054141197e-06, + "loss": 0.80839243, + "memory(GiB)": 117.54, + "step": 4990, + "train_speed(iter/s)": 0.211393 + }, + { + "acc": 0.7572638, + "epoch": 0.11665378614444655, + "grad_norm": 7.3125, + "learning_rate": 9.978976354328383e-06, + "loss": 0.94224863, + "memory(GiB)": 117.54, + "step": 5000, + "train_speed(iter/s)": 0.21161 + }, + { + "epoch": 0.11665378614444655, + "eval_acc": 0.7257033574718734, + "eval_loss": 0.876083493232727, + "eval_runtime": 1268.8673, + "eval_samples_per_second": 28.365, + "eval_steps_per_second": 14.183, + "step": 5000 + }, + { + "acc": 0.74820719, + "epoch": 0.11688709371673545, + "grad_norm": 4.8125, + "learning_rate": 9.97880294376814e-06, + "loss": 0.91807137, + "memory(GiB)": 117.54, + "step": 5010, + "train_speed(iter/s)": 0.200898 + }, + { + "acc": 0.75378785, + "epoch": 0.11712040128902433, + "grad_norm": 5.59375, + "learning_rate": 9.978628822485224e-06, + "loss": 0.92413845, + "memory(GiB)": 117.54, + "step": 5020, + "train_speed(iter/s)": 0.201111 + }, + { + "acc": 0.73901176, + "epoch": 0.11735370886131323, + "grad_norm": 7.34375, + "learning_rate": 9.978453990504488e-06, + "loss": 0.95172367, + "memory(GiB)": 117.54, + "step": 5030, + "train_speed(iter/s)": 0.201322 + }, + { + "acc": 0.75668182, + "epoch": 0.11758701643360213, + "grad_norm": 5.25, + "learning_rate": 9.978278447850894e-06, + "loss": 0.89718676, + "memory(GiB)": 117.54, + "step": 5040, + "train_speed(iter/s)": 0.201526 + }, + { + "acc": 0.75949087, + "epoch": 0.11782032400589101, + "grad_norm": 5.59375, + "learning_rate": 9.978102194549498e-06, + "loss": 0.89434929, + "memory(GiB)": 117.54, + "step": 5050, + "train_speed(iter/s)": 0.201738 + }, + { + "acc": 0.75701513, + "epoch": 0.11805363157817991, + "grad_norm": 6.4375, + "learning_rate": 9.977925230625455e-06, + "loss": 0.86358986, + "memory(GiB)": 117.54, + "step": 5060, + "train_speed(iter/s)": 0.201951 + }, + { + "acc": 0.76386681, + "epoch": 0.1182869391504688, + "grad_norm": 7.3125, + "learning_rate": 9.977747556104036e-06, + "loss": 0.87934084, + "memory(GiB)": 117.54, + "step": 5070, + "train_speed(iter/s)": 0.202151 + }, + { + "acc": 0.75649219, + "epoch": 0.1185202467227577, + "grad_norm": 5.15625, + "learning_rate": 9.9775691710106e-06, + "loss": 0.88812695, + "memory(GiB)": 117.54, + "step": 5080, + "train_speed(iter/s)": 0.202352 + }, + { + "acc": 0.75279508, + "epoch": 0.11875355429504658, + "grad_norm": 4.875, + "learning_rate": 9.977390075370607e-06, + "loss": 0.90212612, + "memory(GiB)": 117.54, + "step": 5090, + "train_speed(iter/s)": 0.202571 + }, + { + "acc": 0.75354643, + "epoch": 0.11898686186733548, + "grad_norm": 6.875, + "learning_rate": 9.97721026920963e-06, + "loss": 0.89249134, + "memory(GiB)": 117.54, + "step": 5100, + "train_speed(iter/s)": 0.202776 + }, + { + "acc": 0.74565663, + "epoch": 0.11922016943962438, + "grad_norm": 5.53125, + "learning_rate": 9.977029752553331e-06, + "loss": 0.93843594, + "memory(GiB)": 117.54, + "step": 5110, + "train_speed(iter/s)": 0.202986 + }, + { + "acc": 0.75341768, + "epoch": 0.11945347701191326, + "grad_norm": 4.5, + "learning_rate": 9.97684852542748e-06, + "loss": 0.88506985, + "memory(GiB)": 117.54, + "step": 5120, + "train_speed(iter/s)": 0.203191 + }, + { + "acc": 0.74062805, + "epoch": 0.11968678458420216, + "grad_norm": 5.65625, + "learning_rate": 9.976666587857951e-06, + "loss": 0.94102879, + "memory(GiB)": 117.54, + "step": 5130, + "train_speed(iter/s)": 0.203403 + }, + { + "acc": 0.76677217, + "epoch": 0.11992009215649105, + "grad_norm": 4.96875, + "learning_rate": 9.97648393987071e-06, + "loss": 0.85841618, + "memory(GiB)": 117.54, + "step": 5140, + "train_speed(iter/s)": 0.203589 + }, + { + "acc": 0.7633976, + "epoch": 0.12015339972877995, + "grad_norm": 8.0625, + "learning_rate": 9.976300581491833e-06, + "loss": 0.83148098, + "memory(GiB)": 117.54, + "step": 5150, + "train_speed(iter/s)": 0.203784 + }, + { + "acc": 0.7448679, + "epoch": 0.12038670730106885, + "grad_norm": 5.65625, + "learning_rate": 9.976116512747493e-06, + "loss": 0.94005127, + "memory(GiB)": 117.54, + "step": 5160, + "train_speed(iter/s)": 0.203989 + }, + { + "acc": 0.76915073, + "epoch": 0.12062001487335773, + "grad_norm": 6.8125, + "learning_rate": 9.975931733663966e-06, + "loss": 0.83365669, + "memory(GiB)": 117.54, + "step": 5170, + "train_speed(iter/s)": 0.204202 + }, + { + "acc": 0.7760251, + "epoch": 0.12085332244564663, + "grad_norm": 4.40625, + "learning_rate": 9.97574624426763e-06, + "loss": 0.810886, + "memory(GiB)": 117.54, + "step": 5180, + "train_speed(iter/s)": 0.204402 + }, + { + "acc": 0.74410958, + "epoch": 0.12108663001793551, + "grad_norm": 5.9375, + "learning_rate": 9.975560044584964e-06, + "loss": 0.92947273, + "memory(GiB)": 117.54, + "step": 5190, + "train_speed(iter/s)": 0.204599 + }, + { + "acc": 0.74937978, + "epoch": 0.12131993759022441, + "grad_norm": 8.0, + "learning_rate": 9.975373134642545e-06, + "loss": 0.92720146, + "memory(GiB)": 117.54, + "step": 5200, + "train_speed(iter/s)": 0.2048 + }, + { + "acc": 0.77070284, + "epoch": 0.12155324516251331, + "grad_norm": 5.90625, + "learning_rate": 9.975185514467058e-06, + "loss": 0.82222948, + "memory(GiB)": 117.54, + "step": 5210, + "train_speed(iter/s)": 0.204998 + }, + { + "acc": 0.7634182, + "epoch": 0.1217865527348022, + "grad_norm": 6.3125, + "learning_rate": 9.974997184085285e-06, + "loss": 0.86025238, + "memory(GiB)": 117.54, + "step": 5220, + "train_speed(iter/s)": 0.205205 + }, + { + "acc": 0.75734425, + "epoch": 0.1220198603070911, + "grad_norm": 5.59375, + "learning_rate": 9.974808143524107e-06, + "loss": 0.87401457, + "memory(GiB)": 117.54, + "step": 5230, + "train_speed(iter/s)": 0.20541 + }, + { + "acc": 0.75092173, + "epoch": 0.12225316787937998, + "grad_norm": 4.1875, + "learning_rate": 9.974618392810513e-06, + "loss": 0.93851089, + "memory(GiB)": 117.54, + "step": 5240, + "train_speed(iter/s)": 0.205619 + }, + { + "acc": 0.74711943, + "epoch": 0.12248647545166888, + "grad_norm": 6.625, + "learning_rate": 9.974427931971588e-06, + "loss": 0.96647253, + "memory(GiB)": 117.54, + "step": 5250, + "train_speed(iter/s)": 0.205826 + }, + { + "acc": 0.74908724, + "epoch": 0.12271978302395777, + "grad_norm": 6.34375, + "learning_rate": 9.97423676103452e-06, + "loss": 0.93568954, + "memory(GiB)": 117.54, + "step": 5260, + "train_speed(iter/s)": 0.206042 + }, + { + "acc": 0.74937153, + "epoch": 0.12295309059624666, + "grad_norm": 8.125, + "learning_rate": 9.974044880026602e-06, + "loss": 0.94437447, + "memory(GiB)": 117.54, + "step": 5270, + "train_speed(iter/s)": 0.206236 + }, + { + "acc": 0.76040206, + "epoch": 0.12318639816853556, + "grad_norm": 7.59375, + "learning_rate": 9.97385228897522e-06, + "loss": 0.88464508, + "memory(GiB)": 117.54, + "step": 5280, + "train_speed(iter/s)": 0.206441 + }, + { + "acc": 0.77492819, + "epoch": 0.12341970574082445, + "grad_norm": 12.25, + "learning_rate": 9.97365898790787e-06, + "loss": 0.80230036, + "memory(GiB)": 117.54, + "step": 5290, + "train_speed(iter/s)": 0.206638 + }, + { + "acc": 0.75563636, + "epoch": 0.12365301331311335, + "grad_norm": 4.5625, + "learning_rate": 9.973464976852144e-06, + "loss": 0.8981823, + "memory(GiB)": 117.54, + "step": 5300, + "train_speed(iter/s)": 0.206834 + }, + { + "acc": 0.75627747, + "epoch": 0.12388632088540223, + "grad_norm": 6.90625, + "learning_rate": 9.973270255835737e-06, + "loss": 0.8913538, + "memory(GiB)": 117.54, + "step": 5310, + "train_speed(iter/s)": 0.207026 + }, + { + "acc": 0.75131683, + "epoch": 0.12411962845769113, + "grad_norm": 5.96875, + "learning_rate": 9.973074824886446e-06, + "loss": 0.93807335, + "memory(GiB)": 117.54, + "step": 5320, + "train_speed(iter/s)": 0.207209 + }, + { + "acc": 0.74059019, + "epoch": 0.12435293602998003, + "grad_norm": 5.53125, + "learning_rate": 9.972878684032169e-06, + "loss": 0.96218138, + "memory(GiB)": 117.54, + "step": 5330, + "train_speed(iter/s)": 0.207402 + }, + { + "acc": 0.76596565, + "epoch": 0.12458624360226891, + "grad_norm": 6.875, + "learning_rate": 9.972681833300903e-06, + "loss": 0.86264296, + "memory(GiB)": 117.54, + "step": 5340, + "train_speed(iter/s)": 0.207611 + }, + { + "acc": 0.74638081, + "epoch": 0.12481955117455781, + "grad_norm": 4.78125, + "learning_rate": 9.972484272720751e-06, + "loss": 0.91274433, + "memory(GiB)": 117.54, + "step": 5350, + "train_speed(iter/s)": 0.207813 + }, + { + "acc": 0.75317116, + "epoch": 0.1250528587468467, + "grad_norm": 4.78125, + "learning_rate": 9.972286002319913e-06, + "loss": 0.91048222, + "memory(GiB)": 117.54, + "step": 5360, + "train_speed(iter/s)": 0.208016 + }, + { + "acc": 0.77595243, + "epoch": 0.12528616631913558, + "grad_norm": 6.90625, + "learning_rate": 9.972087022126693e-06, + "loss": 0.81915474, + "memory(GiB)": 117.54, + "step": 5370, + "train_speed(iter/s)": 0.208199 + }, + { + "acc": 0.76468878, + "epoch": 0.12551947389142448, + "grad_norm": 6.75, + "learning_rate": 9.971887332169494e-06, + "loss": 0.84793015, + "memory(GiB)": 117.54, + "step": 5380, + "train_speed(iter/s)": 0.208388 + }, + { + "acc": 0.75001726, + "epoch": 0.12575278146371338, + "grad_norm": 7.375, + "learning_rate": 9.971686932476825e-06, + "loss": 0.90510178, + "memory(GiB)": 117.54, + "step": 5390, + "train_speed(iter/s)": 0.208579 + }, + { + "acc": 0.7452601, + "epoch": 0.12598608903600228, + "grad_norm": 4.71875, + "learning_rate": 9.971485823077288e-06, + "loss": 0.92312012, + "memory(GiB)": 117.54, + "step": 5400, + "train_speed(iter/s)": 0.20877 + }, + { + "acc": 0.75537529, + "epoch": 0.12621939660829118, + "grad_norm": 5.46875, + "learning_rate": 9.971284003999595e-06, + "loss": 0.89949427, + "memory(GiB)": 117.54, + "step": 5410, + "train_speed(iter/s)": 0.208957 + }, + { + "acc": 0.74212222, + "epoch": 0.12645270418058005, + "grad_norm": 6.15625, + "learning_rate": 9.971081475272555e-06, + "loss": 0.93930664, + "memory(GiB)": 117.54, + "step": 5420, + "train_speed(iter/s)": 0.209154 + }, + { + "acc": 0.76207142, + "epoch": 0.12668601175286895, + "grad_norm": 5.4375, + "learning_rate": 9.97087823692508e-06, + "loss": 0.86257935, + "memory(GiB)": 117.54, + "step": 5430, + "train_speed(iter/s)": 0.20934 + }, + { + "acc": 0.76142607, + "epoch": 0.12691931932515785, + "grad_norm": 5.78125, + "learning_rate": 9.970674288986178e-06, + "loss": 0.86087112, + "memory(GiB)": 117.54, + "step": 5440, + "train_speed(iter/s)": 0.209527 + }, + { + "acc": 0.77505727, + "epoch": 0.12715262689744675, + "grad_norm": 7.0625, + "learning_rate": 9.970469631484967e-06, + "loss": 0.81593513, + "memory(GiB)": 117.54, + "step": 5450, + "train_speed(iter/s)": 0.209699 + }, + { + "acc": 0.75893412, + "epoch": 0.12738593446973565, + "grad_norm": 6.03125, + "learning_rate": 9.970264264450659e-06, + "loss": 0.85409393, + "memory(GiB)": 117.54, + "step": 5460, + "train_speed(iter/s)": 0.209887 + }, + { + "acc": 0.76101146, + "epoch": 0.12761924204202452, + "grad_norm": 4.5625, + "learning_rate": 9.970058187912572e-06, + "loss": 0.89274902, + "memory(GiB)": 117.54, + "step": 5470, + "train_speed(iter/s)": 0.210074 + }, + { + "acc": 0.76868429, + "epoch": 0.12785254961431342, + "grad_norm": 5.6875, + "learning_rate": 9.969851401900122e-06, + "loss": 0.83084993, + "memory(GiB)": 117.54, + "step": 5480, + "train_speed(iter/s)": 0.210247 + }, + { + "acc": 0.75309086, + "epoch": 0.12808585718660231, + "grad_norm": 5.4375, + "learning_rate": 9.969643906442828e-06, + "loss": 0.90782976, + "memory(GiB)": 117.54, + "step": 5490, + "train_speed(iter/s)": 0.210435 + }, + { + "acc": 0.76415911, + "epoch": 0.1283191647588912, + "grad_norm": 5.46875, + "learning_rate": 9.96943570157031e-06, + "loss": 0.85922518, + "memory(GiB)": 117.54, + "step": 5500, + "train_speed(iter/s)": 0.210616 + }, + { + "epoch": 0.1283191647588912, + "eval_acc": 0.7270218249364108, + "eval_loss": 0.8716417551040649, + "eval_runtime": 1270.1331, + "eval_samples_per_second": 28.336, + "eval_steps_per_second": 14.169, + "step": 5500 + }, + { + "acc": 0.73600473, + "epoch": 0.1285524723311801, + "grad_norm": 7.03125, + "learning_rate": 9.969226787312288e-06, + "loss": 0.96929722, + "memory(GiB)": 117.54, + "step": 5510, + "train_speed(iter/s)": 0.200915 + }, + { + "acc": 0.76587524, + "epoch": 0.12878577990346898, + "grad_norm": 8.375, + "learning_rate": 9.969017163698587e-06, + "loss": 0.848559, + "memory(GiB)": 117.54, + "step": 5520, + "train_speed(iter/s)": 0.201104 + }, + { + "acc": 0.75857878, + "epoch": 0.12901908747575788, + "grad_norm": 5.0625, + "learning_rate": 9.96880683075913e-06, + "loss": 0.88542099, + "memory(GiB)": 117.54, + "step": 5530, + "train_speed(iter/s)": 0.201282 + }, + { + "acc": 0.77082253, + "epoch": 0.12925239504804678, + "grad_norm": 6.09375, + "learning_rate": 9.96859578852394e-06, + "loss": 0.81673002, + "memory(GiB)": 117.54, + "step": 5540, + "train_speed(iter/s)": 0.201462 + }, + { + "acc": 0.76157207, + "epoch": 0.12948570262033568, + "grad_norm": 5.625, + "learning_rate": 9.968384037023147e-06, + "loss": 0.87119312, + "memory(GiB)": 117.54, + "step": 5550, + "train_speed(iter/s)": 0.201654 + }, + { + "acc": 0.75955338, + "epoch": 0.12971901019262455, + "grad_norm": 5.21875, + "learning_rate": 9.968171576286973e-06, + "loss": 0.86215534, + "memory(GiB)": 117.54, + "step": 5560, + "train_speed(iter/s)": 0.20183 + }, + { + "acc": 0.77258162, + "epoch": 0.12995231776491345, + "grad_norm": 4.5, + "learning_rate": 9.96795840634575e-06, + "loss": 0.81898003, + "memory(GiB)": 117.54, + "step": 5570, + "train_speed(iter/s)": 0.202015 + }, + { + "acc": 0.75478101, + "epoch": 0.13018562533720235, + "grad_norm": 4.40625, + "learning_rate": 9.96774452722991e-06, + "loss": 0.91053543, + "memory(GiB)": 117.54, + "step": 5580, + "train_speed(iter/s)": 0.202209 + }, + { + "acc": 0.77054396, + "epoch": 0.13041893290949125, + "grad_norm": 6.78125, + "learning_rate": 9.967529938969981e-06, + "loss": 0.81340065, + "memory(GiB)": 117.54, + "step": 5590, + "train_speed(iter/s)": 0.202403 + }, + { + "acc": 0.75519128, + "epoch": 0.13065224048178015, + "grad_norm": 4.28125, + "learning_rate": 9.967314641596595e-06, + "loss": 0.87434855, + "memory(GiB)": 117.54, + "step": 5600, + "train_speed(iter/s)": 0.202598 + }, + { + "acc": 0.74656882, + "epoch": 0.13088554805406902, + "grad_norm": 8.875, + "learning_rate": 9.967098635140489e-06, + "loss": 0.92529068, + "memory(GiB)": 117.54, + "step": 5610, + "train_speed(iter/s)": 0.202788 + }, + { + "acc": 0.76542044, + "epoch": 0.13111885562635792, + "grad_norm": 7.9375, + "learning_rate": 9.966881919632494e-06, + "loss": 0.86160889, + "memory(GiB)": 117.54, + "step": 5620, + "train_speed(iter/s)": 0.202986 + }, + { + "acc": 0.76484203, + "epoch": 0.13135216319864682, + "grad_norm": 5.0625, + "learning_rate": 9.966664495103548e-06, + "loss": 0.85322952, + "memory(GiB)": 117.54, + "step": 5630, + "train_speed(iter/s)": 0.203181 + }, + { + "acc": 0.75730438, + "epoch": 0.13158547077093571, + "grad_norm": 5.25, + "learning_rate": 9.96644636158469e-06, + "loss": 0.85542212, + "memory(GiB)": 117.54, + "step": 5640, + "train_speed(iter/s)": 0.203364 + }, + { + "acc": 0.75253458, + "epoch": 0.1318187783432246, + "grad_norm": 19.875, + "learning_rate": 9.966227519107054e-06, + "loss": 0.96850071, + "memory(GiB)": 117.54, + "step": 5650, + "train_speed(iter/s)": 0.203551 + }, + { + "acc": 0.76095395, + "epoch": 0.13205208591551348, + "grad_norm": 4.25, + "learning_rate": 9.966007967701884e-06, + "loss": 0.88187704, + "memory(GiB)": 117.54, + "step": 5660, + "train_speed(iter/s)": 0.203728 + }, + { + "acc": 0.76344061, + "epoch": 0.13228539348780238, + "grad_norm": 7.3125, + "learning_rate": 9.965787707400521e-06, + "loss": 0.87797689, + "memory(GiB)": 117.54, + "step": 5670, + "train_speed(iter/s)": 0.203918 + }, + { + "acc": 0.7655921, + "epoch": 0.13251870106009128, + "grad_norm": 4.15625, + "learning_rate": 9.965566738234403e-06, + "loss": 0.83435678, + "memory(GiB)": 117.54, + "step": 5680, + "train_speed(iter/s)": 0.204106 + }, + { + "acc": 0.76264372, + "epoch": 0.13275200863238018, + "grad_norm": 7.1875, + "learning_rate": 9.965345060235075e-06, + "loss": 0.86193419, + "memory(GiB)": 117.54, + "step": 5690, + "train_speed(iter/s)": 0.204292 + }, + { + "acc": 0.75327988, + "epoch": 0.13298531620466908, + "grad_norm": 6.875, + "learning_rate": 9.965122673434182e-06, + "loss": 0.91502628, + "memory(GiB)": 117.54, + "step": 5700, + "train_speed(iter/s)": 0.204459 + }, + { + "acc": 0.75798521, + "epoch": 0.13321862377695795, + "grad_norm": 7.15625, + "learning_rate": 9.964899577863472e-06, + "loss": 0.86956244, + "memory(GiB)": 117.54, + "step": 5710, + "train_speed(iter/s)": 0.204649 + }, + { + "acc": 0.75251207, + "epoch": 0.13345193134924685, + "grad_norm": 6.03125, + "learning_rate": 9.964675773554789e-06, + "loss": 0.907057, + "memory(GiB)": 117.54, + "step": 5720, + "train_speed(iter/s)": 0.204833 + }, + { + "acc": 0.75791512, + "epoch": 0.13368523892153575, + "grad_norm": 6.03125, + "learning_rate": 9.96445126054008e-06, + "loss": 0.89783096, + "memory(GiB)": 117.54, + "step": 5730, + "train_speed(iter/s)": 0.205017 + }, + { + "acc": 0.76094255, + "epoch": 0.13391854649382465, + "grad_norm": 6.4375, + "learning_rate": 9.964226038851397e-06, + "loss": 0.88064919, + "memory(GiB)": 117.54, + "step": 5740, + "train_speed(iter/s)": 0.205195 + }, + { + "acc": 0.76472521, + "epoch": 0.13415185406611355, + "grad_norm": 5.375, + "learning_rate": 9.964000108520889e-06, + "loss": 0.84428978, + "memory(GiB)": 117.54, + "step": 5750, + "train_speed(iter/s)": 0.205379 + }, + { + "acc": 0.76074214, + "epoch": 0.13438516163840242, + "grad_norm": 5.40625, + "learning_rate": 9.963773469580806e-06, + "loss": 0.85086288, + "memory(GiB)": 117.54, + "step": 5760, + "train_speed(iter/s)": 0.205562 + }, + { + "acc": 0.76351795, + "epoch": 0.13461846921069132, + "grad_norm": 4.40625, + "learning_rate": 9.963546122063504e-06, + "loss": 0.86668959, + "memory(GiB)": 117.54, + "step": 5770, + "train_speed(iter/s)": 0.205749 + }, + { + "acc": 0.76371803, + "epoch": 0.13485177678298021, + "grad_norm": 8.5625, + "learning_rate": 9.963318066001433e-06, + "loss": 0.86080093, + "memory(GiB)": 117.54, + "step": 5780, + "train_speed(iter/s)": 0.205929 + }, + { + "acc": 0.7618516, + "epoch": 0.1350850843552691, + "grad_norm": 7.1875, + "learning_rate": 9.963089301427152e-06, + "loss": 0.84876232, + "memory(GiB)": 117.54, + "step": 5790, + "train_speed(iter/s)": 0.206111 + }, + { + "acc": 0.77087803, + "epoch": 0.13531839192755798, + "grad_norm": 5.0, + "learning_rate": 9.962859828373315e-06, + "loss": 0.83783426, + "memory(GiB)": 117.54, + "step": 5800, + "train_speed(iter/s)": 0.206293 + }, + { + "acc": 0.77675738, + "epoch": 0.13555169949984688, + "grad_norm": 7.90625, + "learning_rate": 9.96262964687268e-06, + "loss": 0.83521681, + "memory(GiB)": 117.54, + "step": 5810, + "train_speed(iter/s)": 0.206466 + }, + { + "acc": 0.76050811, + "epoch": 0.13578500707213578, + "grad_norm": 6.1875, + "learning_rate": 9.9623987569581e-06, + "loss": 0.88420362, + "memory(GiB)": 117.54, + "step": 5820, + "train_speed(iter/s)": 0.206646 + }, + { + "acc": 0.73890877, + "epoch": 0.13601831464442468, + "grad_norm": 5.90625, + "learning_rate": 9.962167158662543e-06, + "loss": 0.95491695, + "memory(GiB)": 117.54, + "step": 5830, + "train_speed(iter/s)": 0.206826 + }, + { + "acc": 0.74483318, + "epoch": 0.13625162221671358, + "grad_norm": 4.15625, + "learning_rate": 9.961934852019066e-06, + "loss": 0.92905416, + "memory(GiB)": 117.54, + "step": 5840, + "train_speed(iter/s)": 0.207007 + }, + { + "acc": 0.74667826, + "epoch": 0.13648492978900245, + "grad_norm": 4.6875, + "learning_rate": 9.96170183706083e-06, + "loss": 0.92713289, + "memory(GiB)": 117.54, + "step": 5850, + "train_speed(iter/s)": 0.207186 + }, + { + "acc": 0.74078379, + "epoch": 0.13671823736129135, + "grad_norm": 7.71875, + "learning_rate": 9.961468113821096e-06, + "loss": 0.94992075, + "memory(GiB)": 117.54, + "step": 5860, + "train_speed(iter/s)": 0.207359 + }, + { + "acc": 0.77699747, + "epoch": 0.13695154493358025, + "grad_norm": 5.84375, + "learning_rate": 9.96123368233323e-06, + "loss": 0.78689365, + "memory(GiB)": 117.54, + "step": 5870, + "train_speed(iter/s)": 0.207542 + }, + { + "acc": 0.74916096, + "epoch": 0.13718485250586915, + "grad_norm": 7.625, + "learning_rate": 9.9609985426307e-06, + "loss": 0.93192005, + "memory(GiB)": 117.54, + "step": 5880, + "train_speed(iter/s)": 0.207728 + }, + { + "acc": 0.7669405, + "epoch": 0.13741816007815805, + "grad_norm": 6.25, + "learning_rate": 9.960762694747068e-06, + "loss": 0.87499352, + "memory(GiB)": 117.54, + "step": 5890, + "train_speed(iter/s)": 0.207914 + }, + { + "acc": 0.77017179, + "epoch": 0.13765146765044692, + "grad_norm": 6.4375, + "learning_rate": 9.960526138716e-06, + "loss": 0.83586044, + "memory(GiB)": 117.54, + "step": 5900, + "train_speed(iter/s)": 0.208098 + }, + { + "acc": 0.75412836, + "epoch": 0.13788477522273582, + "grad_norm": 7.78125, + "learning_rate": 9.960288874571271e-06, + "loss": 0.90887356, + "memory(GiB)": 117.54, + "step": 5910, + "train_speed(iter/s)": 0.208286 + }, + { + "acc": 0.76210413, + "epoch": 0.13811808279502472, + "grad_norm": 6.0, + "learning_rate": 9.960050902346743e-06, + "loss": 0.85120649, + "memory(GiB)": 117.54, + "step": 5920, + "train_speed(iter/s)": 0.208456 + }, + { + "acc": 0.75921898, + "epoch": 0.13835139036731361, + "grad_norm": 8.875, + "learning_rate": 9.959812222076391e-06, + "loss": 0.87873182, + "memory(GiB)": 117.54, + "step": 5930, + "train_speed(iter/s)": 0.208634 + }, + { + "acc": 0.75706267, + "epoch": 0.1385846979396025, + "grad_norm": 7.96875, + "learning_rate": 9.959572833794283e-06, + "loss": 0.86111183, + "memory(GiB)": 117.54, + "step": 5940, + "train_speed(iter/s)": 0.208814 + }, + { + "acc": 0.74689651, + "epoch": 0.13881800551189138, + "grad_norm": 6.34375, + "learning_rate": 9.959332737534597e-06, + "loss": 0.92088099, + "memory(GiB)": 117.54, + "step": 5950, + "train_speed(iter/s)": 0.208985 + }, + { + "acc": 0.7608963, + "epoch": 0.13905131308418028, + "grad_norm": 17.625, + "learning_rate": 9.959091933331601e-06, + "loss": 0.85624313, + "memory(GiB)": 117.54, + "step": 5960, + "train_speed(iter/s)": 0.209159 + }, + { + "acc": 0.77537665, + "epoch": 0.13928462065646918, + "grad_norm": 4.96875, + "learning_rate": 9.958850421219675e-06, + "loss": 0.79838071, + "memory(GiB)": 117.54, + "step": 5970, + "train_speed(iter/s)": 0.209339 + }, + { + "acc": 0.75666475, + "epoch": 0.13951792822875808, + "grad_norm": 7.78125, + "learning_rate": 9.958608201233288e-06, + "loss": 0.89333801, + "memory(GiB)": 117.54, + "step": 5980, + "train_speed(iter/s)": 0.209526 + }, + { + "acc": 0.75140362, + "epoch": 0.13975123580104698, + "grad_norm": 4.75, + "learning_rate": 9.958365273407023e-06, + "loss": 0.9052248, + "memory(GiB)": 117.54, + "step": 5990, + "train_speed(iter/s)": 0.209691 + }, + { + "acc": 0.73782969, + "epoch": 0.13998454337333585, + "grad_norm": 13.3125, + "learning_rate": 9.958121637775554e-06, + "loss": 0.96962986, + "memory(GiB)": 117.54, + "step": 6000, + "train_speed(iter/s)": 0.209875 + }, + { + "epoch": 0.13998454337333585, + "eval_acc": 0.727715409606508, + "eval_loss": 0.8690560460090637, + "eval_runtime": 1270.3769, + "eval_samples_per_second": 28.331, + "eval_steps_per_second": 14.166, + "step": 6000 + }, + { + "acc": 0.75419903, + "epoch": 0.14021785094562475, + "grad_norm": 6.4375, + "learning_rate": 9.957877294373665e-06, + "loss": 0.91555004, + "memory(GiB)": 117.54, + "step": 6010, + "train_speed(iter/s)": 0.200992 + }, + { + "acc": 0.74702139, + "epoch": 0.14045115851791365, + "grad_norm": 7.0, + "learning_rate": 9.957632243236231e-06, + "loss": 0.92673454, + "memory(GiB)": 117.54, + "step": 6020, + "train_speed(iter/s)": 0.201172 + }, + { + "acc": 0.75800505, + "epoch": 0.14068446609020255, + "grad_norm": 6.53125, + "learning_rate": 9.957386484398233e-06, + "loss": 0.87921581, + "memory(GiB)": 117.54, + "step": 6030, + "train_speed(iter/s)": 0.201343 + }, + { + "acc": 0.74582348, + "epoch": 0.14091777366249145, + "grad_norm": 5.21875, + "learning_rate": 9.957140017894754e-06, + "loss": 0.91472492, + "memory(GiB)": 117.54, + "step": 6040, + "train_speed(iter/s)": 0.201522 + }, + { + "acc": 0.78675618, + "epoch": 0.14115108123478032, + "grad_norm": 6.03125, + "learning_rate": 9.956892843760979e-06, + "loss": 0.75914721, + "memory(GiB)": 117.54, + "step": 6050, + "train_speed(iter/s)": 0.201696 + }, + { + "acc": 0.74411964, + "epoch": 0.14138438880706922, + "grad_norm": 12.0625, + "learning_rate": 9.956644962032192e-06, + "loss": 0.93103161, + "memory(GiB)": 117.54, + "step": 6060, + "train_speed(iter/s)": 0.201878 + }, + { + "acc": 0.7726965, + "epoch": 0.14161769637935812, + "grad_norm": 7.375, + "learning_rate": 9.956396372743775e-06, + "loss": 0.82945976, + "memory(GiB)": 117.54, + "step": 6070, + "train_speed(iter/s)": 0.202042 + }, + { + "acc": 0.78586645, + "epoch": 0.14185100395164701, + "grad_norm": 6.875, + "learning_rate": 9.956147075931215e-06, + "loss": 0.76539755, + "memory(GiB)": 117.54, + "step": 6080, + "train_speed(iter/s)": 0.202205 + }, + { + "acc": 0.75904989, + "epoch": 0.14208431152393589, + "grad_norm": 8.75, + "learning_rate": 9.955897071630101e-06, + "loss": 0.87475338, + "memory(GiB)": 117.54, + "step": 6090, + "train_speed(iter/s)": 0.202374 + }, + { + "acc": 0.76456523, + "epoch": 0.14231761909622478, + "grad_norm": 6.03125, + "learning_rate": 9.955646359876118e-06, + "loss": 0.86053829, + "memory(GiB)": 117.54, + "step": 6100, + "train_speed(iter/s)": 0.202548 + }, + { + "acc": 0.76305366, + "epoch": 0.14255092666851368, + "grad_norm": 6.84375, + "learning_rate": 9.955394940705057e-06, + "loss": 0.85630341, + "memory(GiB)": 117.54, + "step": 6110, + "train_speed(iter/s)": 0.202712 + }, + { + "acc": 0.7719203, + "epoch": 0.14278423424080258, + "grad_norm": 8.1875, + "learning_rate": 9.95514281415281e-06, + "loss": 0.82893257, + "memory(GiB)": 117.54, + "step": 6120, + "train_speed(iter/s)": 0.20288 + }, + { + "acc": 0.76380386, + "epoch": 0.14301754181309148, + "grad_norm": 7.1875, + "learning_rate": 9.954889980255363e-06, + "loss": 0.84828529, + "memory(GiB)": 117.54, + "step": 6130, + "train_speed(iter/s)": 0.203048 + }, + { + "acc": 0.77324157, + "epoch": 0.14325084938538035, + "grad_norm": 4.34375, + "learning_rate": 9.954636439048813e-06, + "loss": 0.82084036, + "memory(GiB)": 117.54, + "step": 6140, + "train_speed(iter/s)": 0.203199 + }, + { + "acc": 0.76487913, + "epoch": 0.14348415695766925, + "grad_norm": 6.03125, + "learning_rate": 9.95438219056935e-06, + "loss": 0.85029163, + "memory(GiB)": 117.54, + "step": 6150, + "train_speed(iter/s)": 0.203348 + }, + { + "acc": 0.75904469, + "epoch": 0.14371746452995815, + "grad_norm": 7.875, + "learning_rate": 9.954127234853267e-06, + "loss": 0.89040432, + "memory(GiB)": 117.54, + "step": 6160, + "train_speed(iter/s)": 0.203524 + }, + { + "acc": 0.76309099, + "epoch": 0.14395077210224705, + "grad_norm": 4.75, + "learning_rate": 9.953871571936962e-06, + "loss": 0.85744724, + "memory(GiB)": 117.54, + "step": 6170, + "train_speed(iter/s)": 0.203691 + }, + { + "acc": 0.77011318, + "epoch": 0.14418407967453595, + "grad_norm": 6.625, + "learning_rate": 9.953615201856928e-06, + "loss": 0.8331212, + "memory(GiB)": 117.54, + "step": 6180, + "train_speed(iter/s)": 0.203861 + }, + { + "acc": 0.77067161, + "epoch": 0.14441738724682482, + "grad_norm": 5.53125, + "learning_rate": 9.953358124649764e-06, + "loss": 0.82904701, + "memory(GiB)": 117.54, + "step": 6190, + "train_speed(iter/s)": 0.20402 + }, + { + "acc": 0.75703244, + "epoch": 0.14465069481911372, + "grad_norm": 6.59375, + "learning_rate": 9.953100340352166e-06, + "loss": 0.87819614, + "memory(GiB)": 117.54, + "step": 6200, + "train_speed(iter/s)": 0.20419 + }, + { + "acc": 0.76388779, + "epoch": 0.14488400239140262, + "grad_norm": 6.78125, + "learning_rate": 9.952841849000935e-06, + "loss": 0.84278336, + "memory(GiB)": 117.54, + "step": 6210, + "train_speed(iter/s)": 0.204357 + }, + { + "acc": 0.78060641, + "epoch": 0.14511730996369152, + "grad_norm": 6.90625, + "learning_rate": 9.952582650632967e-06, + "loss": 0.80852413, + "memory(GiB)": 117.54, + "step": 6220, + "train_speed(iter/s)": 0.204527 + }, + { + "acc": 0.76564684, + "epoch": 0.14535061753598041, + "grad_norm": 5.875, + "learning_rate": 9.952322745285266e-06, + "loss": 0.83691425, + "memory(GiB)": 117.54, + "step": 6230, + "train_speed(iter/s)": 0.20469 + }, + { + "acc": 0.77099514, + "epoch": 0.14558392510826929, + "grad_norm": 9.75, + "learning_rate": 9.95206213299493e-06, + "loss": 0.83298168, + "memory(GiB)": 117.54, + "step": 6240, + "train_speed(iter/s)": 0.204862 + }, + { + "acc": 0.76582918, + "epoch": 0.14581723268055818, + "grad_norm": 4.75, + "learning_rate": 9.951800813799164e-06, + "loss": 0.85061855, + "memory(GiB)": 117.54, + "step": 6250, + "train_speed(iter/s)": 0.205033 + }, + { + "acc": 0.77523489, + "epoch": 0.14605054025284708, + "grad_norm": 5.90625, + "learning_rate": 9.95153878773527e-06, + "loss": 0.8064352, + "memory(GiB)": 117.54, + "step": 6260, + "train_speed(iter/s)": 0.205197 + }, + { + "acc": 0.77384329, + "epoch": 0.14628384782513598, + "grad_norm": 7.375, + "learning_rate": 9.951276054840654e-06, + "loss": 0.80671873, + "memory(GiB)": 117.54, + "step": 6270, + "train_speed(iter/s)": 0.205362 + }, + { + "acc": 0.75420942, + "epoch": 0.14651715539742488, + "grad_norm": 6.5625, + "learning_rate": 9.951012615152816e-06, + "loss": 0.92444849, + "memory(GiB)": 117.54, + "step": 6280, + "train_speed(iter/s)": 0.205538 + }, + { + "acc": 0.78133068, + "epoch": 0.14675046296971375, + "grad_norm": 9.5, + "learning_rate": 9.950748468709368e-06, + "loss": 0.78833847, + "memory(GiB)": 117.54, + "step": 6290, + "train_speed(iter/s)": 0.205701 + }, + { + "acc": 0.74810176, + "epoch": 0.14698377054200265, + "grad_norm": 5.78125, + "learning_rate": 9.950483615548014e-06, + "loss": 0.92472363, + "memory(GiB)": 117.54, + "step": 6300, + "train_speed(iter/s)": 0.205871 + }, + { + "acc": 0.7479497, + "epoch": 0.14721707811429155, + "grad_norm": 5.9375, + "learning_rate": 9.950218055706563e-06, + "loss": 0.92439194, + "memory(GiB)": 117.54, + "step": 6310, + "train_speed(iter/s)": 0.206038 + }, + { + "acc": 0.77187309, + "epoch": 0.14745038568658045, + "grad_norm": 5.40625, + "learning_rate": 9.94995178922292e-06, + "loss": 0.83006401, + "memory(GiB)": 117.54, + "step": 6320, + "train_speed(iter/s)": 0.206198 + }, + { + "acc": 0.75937672, + "epoch": 0.14768369325886932, + "grad_norm": 7.03125, + "learning_rate": 9.949684816135098e-06, + "loss": 0.88418226, + "memory(GiB)": 117.54, + "step": 6330, + "train_speed(iter/s)": 0.206369 + }, + { + "acc": 0.7517971, + "epoch": 0.14791700083115822, + "grad_norm": 4.96875, + "learning_rate": 9.949417136481207e-06, + "loss": 0.92034903, + "memory(GiB)": 117.54, + "step": 6340, + "train_speed(iter/s)": 0.206546 + }, + { + "acc": 0.75734596, + "epoch": 0.14815030840344712, + "grad_norm": 5.1875, + "learning_rate": 9.94914875029946e-06, + "loss": 0.90017557, + "memory(GiB)": 117.54, + "step": 6350, + "train_speed(iter/s)": 0.206707 + }, + { + "acc": 0.75256109, + "epoch": 0.14838361597573602, + "grad_norm": 7.0625, + "learning_rate": 9.948879657628164e-06, + "loss": 0.9080761, + "memory(GiB)": 117.54, + "step": 6360, + "train_speed(iter/s)": 0.206879 + }, + { + "acc": 0.76637239, + "epoch": 0.14861692354802492, + "grad_norm": 13.8125, + "learning_rate": 9.948609858505734e-06, + "loss": 0.84904175, + "memory(GiB)": 117.54, + "step": 6370, + "train_speed(iter/s)": 0.207045 + }, + { + "acc": 0.73998928, + "epoch": 0.1488502311203138, + "grad_norm": 5.25, + "learning_rate": 9.948339352970683e-06, + "loss": 0.97184429, + "memory(GiB)": 117.54, + "step": 6380, + "train_speed(iter/s)": 0.207209 + }, + { + "acc": 0.75804882, + "epoch": 0.14908353869260269, + "grad_norm": 8.0, + "learning_rate": 9.948068141061631e-06, + "loss": 0.85682335, + "memory(GiB)": 117.54, + "step": 6390, + "train_speed(iter/s)": 0.207362 + }, + { + "acc": 0.72696562, + "epoch": 0.14931684626489158, + "grad_norm": 6.6875, + "learning_rate": 9.947796222817286e-06, + "loss": 1.01356945, + "memory(GiB)": 117.54, + "step": 6400, + "train_speed(iter/s)": 0.207524 + }, + { + "acc": 0.76806068, + "epoch": 0.14955015383718048, + "grad_norm": 5.125, + "learning_rate": 9.94752359827647e-06, + "loss": 0.85701227, + "memory(GiB)": 117.54, + "step": 6410, + "train_speed(iter/s)": 0.207681 + }, + { + "acc": 0.76032314, + "epoch": 0.14978346140946938, + "grad_norm": 7.25, + "learning_rate": 9.947250267478094e-06, + "loss": 0.87021475, + "memory(GiB)": 117.54, + "step": 6420, + "train_speed(iter/s)": 0.207838 + }, + { + "acc": 0.75938177, + "epoch": 0.15001676898175825, + "grad_norm": 5.03125, + "learning_rate": 9.946976230461183e-06, + "loss": 0.88507252, + "memory(GiB)": 117.54, + "step": 6430, + "train_speed(iter/s)": 0.208015 + }, + { + "acc": 0.74555626, + "epoch": 0.15025007655404715, + "grad_norm": 5.40625, + "learning_rate": 9.946701487264851e-06, + "loss": 0.9236105, + "memory(GiB)": 117.54, + "step": 6440, + "train_speed(iter/s)": 0.208181 + }, + { + "acc": 0.761584, + "epoch": 0.15048338412633605, + "grad_norm": 4.71875, + "learning_rate": 9.946426037928319e-06, + "loss": 0.86102915, + "memory(GiB)": 117.54, + "step": 6450, + "train_speed(iter/s)": 0.208345 + }, + { + "acc": 0.7360589, + "epoch": 0.15071669169862495, + "grad_norm": 8.5, + "learning_rate": 9.946149882490907e-06, + "loss": 0.94918003, + "memory(GiB)": 117.54, + "step": 6460, + "train_speed(iter/s)": 0.208502 + }, + { + "acc": 0.76649723, + "epoch": 0.15094999927091385, + "grad_norm": 4.78125, + "learning_rate": 9.945873020992036e-06, + "loss": 0.86450176, + "memory(GiB)": 117.54, + "step": 6470, + "train_speed(iter/s)": 0.208672 + }, + { + "acc": 0.77010989, + "epoch": 0.15118330684320272, + "grad_norm": 5.53125, + "learning_rate": 9.945595453471228e-06, + "loss": 0.82765493, + "memory(GiB)": 117.54, + "step": 6480, + "train_speed(iter/s)": 0.20882 + }, + { + "acc": 0.76764402, + "epoch": 0.15141661441549162, + "grad_norm": 13.0625, + "learning_rate": 9.945317179968105e-06, + "loss": 0.84424448, + "memory(GiB)": 117.54, + "step": 6490, + "train_speed(iter/s)": 0.208984 + }, + { + "acc": 0.75681572, + "epoch": 0.15164992198778052, + "grad_norm": 4.4375, + "learning_rate": 9.945038200522392e-06, + "loss": 0.9044385, + "memory(GiB)": 117.54, + "step": 6500, + "train_speed(iter/s)": 0.209132 + }, + { + "epoch": 0.15164992198778052, + "eval_acc": 0.7281223361007858, + "eval_loss": 0.8657384514808655, + "eval_runtime": 1269.6135, + "eval_samples_per_second": 28.348, + "eval_steps_per_second": 14.174, + "step": 6500 + }, + { + "acc": 0.77224231, + "epoch": 0.15188322956006942, + "grad_norm": 6.875, + "learning_rate": 9.944758515173912e-06, + "loss": 0.82228127, + "memory(GiB)": 117.54, + "step": 6510, + "train_speed(iter/s)": 0.200985 + }, + { + "acc": 0.76387882, + "epoch": 0.15211653713235831, + "grad_norm": 5.21875, + "learning_rate": 9.944478123962592e-06, + "loss": 0.85496302, + "memory(GiB)": 117.54, + "step": 6520, + "train_speed(iter/s)": 0.20115 + }, + { + "acc": 0.75840979, + "epoch": 0.15234984470464719, + "grad_norm": 4.8125, + "learning_rate": 9.944197026928454e-06, + "loss": 0.88294678, + "memory(GiB)": 117.54, + "step": 6530, + "train_speed(iter/s)": 0.201303 + }, + { + "acc": 0.76922741, + "epoch": 0.15258315227693608, + "grad_norm": 4.84375, + "learning_rate": 9.943915224111627e-06, + "loss": 0.87327852, + "memory(GiB)": 117.54, + "step": 6540, + "train_speed(iter/s)": 0.201457 + }, + { + "acc": 0.75248985, + "epoch": 0.15281645984922498, + "grad_norm": 8.4375, + "learning_rate": 9.943632715552338e-06, + "loss": 0.88687239, + "memory(GiB)": 117.54, + "step": 6550, + "train_speed(iter/s)": 0.201614 + }, + { + "acc": 0.75970192, + "epoch": 0.15304976742151388, + "grad_norm": 14.4375, + "learning_rate": 9.943349501290916e-06, + "loss": 0.85225811, + "memory(GiB)": 117.54, + "step": 6560, + "train_speed(iter/s)": 0.201782 + }, + { + "acc": 0.77734346, + "epoch": 0.15328307499380278, + "grad_norm": 5.3125, + "learning_rate": 9.943065581367788e-06, + "loss": 0.81777477, + "memory(GiB)": 117.54, + "step": 6570, + "train_speed(iter/s)": 0.201947 + }, + { + "acc": 0.75825195, + "epoch": 0.15351638256609165, + "grad_norm": 7.625, + "learning_rate": 9.942780955823485e-06, + "loss": 0.86885967, + "memory(GiB)": 117.54, + "step": 6580, + "train_speed(iter/s)": 0.202112 + }, + { + "acc": 0.76747169, + "epoch": 0.15374969013838055, + "grad_norm": 4.4375, + "learning_rate": 9.942495624698636e-06, + "loss": 0.84972601, + "memory(GiB)": 117.54, + "step": 6590, + "train_speed(iter/s)": 0.202268 + }, + { + "acc": 0.75881453, + "epoch": 0.15398299771066945, + "grad_norm": 5.46875, + "learning_rate": 9.942209588033973e-06, + "loss": 0.87035789, + "memory(GiB)": 117.54, + "step": 6600, + "train_speed(iter/s)": 0.202422 + }, + { + "acc": 0.74656525, + "epoch": 0.15421630528295835, + "grad_norm": 5.03125, + "learning_rate": 9.941922845870326e-06, + "loss": 0.92789898, + "memory(GiB)": 117.54, + "step": 6610, + "train_speed(iter/s)": 0.202575 + }, + { + "acc": 0.7642231, + "epoch": 0.15444961285524722, + "grad_norm": 5.4375, + "learning_rate": 9.941635398248628e-06, + "loss": 0.83677711, + "memory(GiB)": 117.54, + "step": 6620, + "train_speed(iter/s)": 0.202738 + }, + { + "acc": 0.77601938, + "epoch": 0.15468292042753612, + "grad_norm": 6.65625, + "learning_rate": 9.941347245209914e-06, + "loss": 0.78568192, + "memory(GiB)": 117.54, + "step": 6630, + "train_speed(iter/s)": 0.202885 + }, + { + "acc": 0.75074358, + "epoch": 0.15491622799982502, + "grad_norm": 6.375, + "learning_rate": 9.941058386795314e-06, + "loss": 0.90226822, + "memory(GiB)": 117.54, + "step": 6640, + "train_speed(iter/s)": 0.203031 + }, + { + "acc": 0.77472057, + "epoch": 0.15514953557211392, + "grad_norm": 4.53125, + "learning_rate": 9.940768823046067e-06, + "loss": 0.83820248, + "memory(GiB)": 117.54, + "step": 6650, + "train_speed(iter/s)": 0.203189 + }, + { + "acc": 0.77502432, + "epoch": 0.15538284314440282, + "grad_norm": 5.78125, + "learning_rate": 9.940478554003506e-06, + "loss": 0.79055486, + "memory(GiB)": 117.54, + "step": 6660, + "train_speed(iter/s)": 0.203346 + }, + { + "acc": 0.76303005, + "epoch": 0.1556161507166917, + "grad_norm": 6.53125, + "learning_rate": 9.940187579709064e-06, + "loss": 0.86618824, + "memory(GiB)": 117.54, + "step": 6670, + "train_speed(iter/s)": 0.203512 + }, + { + "acc": 0.74345989, + "epoch": 0.15584945828898059, + "grad_norm": 4.96875, + "learning_rate": 9.939895900204281e-06, + "loss": 0.91676807, + "memory(GiB)": 117.54, + "step": 6680, + "train_speed(iter/s)": 0.203669 + }, + { + "acc": 0.76821184, + "epoch": 0.15608276586126948, + "grad_norm": 5.46875, + "learning_rate": 9.939603515530796e-06, + "loss": 0.8443615, + "memory(GiB)": 117.54, + "step": 6690, + "train_speed(iter/s)": 0.203826 + }, + { + "acc": 0.75775089, + "epoch": 0.15631607343355838, + "grad_norm": 6.21875, + "learning_rate": 9.939310425730342e-06, + "loss": 0.83715219, + "memory(GiB)": 117.54, + "step": 6700, + "train_speed(iter/s)": 0.203982 + }, + { + "acc": 0.75873032, + "epoch": 0.15654938100584728, + "grad_norm": 4.875, + "learning_rate": 9.939016630844758e-06, + "loss": 0.87146358, + "memory(GiB)": 117.54, + "step": 6710, + "train_speed(iter/s)": 0.204137 + }, + { + "acc": 0.76880903, + "epoch": 0.15678268857813615, + "grad_norm": 5.53125, + "learning_rate": 9.938722130915988e-06, + "loss": 0.81287689, + "memory(GiB)": 117.54, + "step": 6720, + "train_speed(iter/s)": 0.20429 + }, + { + "acc": 0.76790185, + "epoch": 0.15701599615042505, + "grad_norm": 4.84375, + "learning_rate": 9.938426925986066e-06, + "loss": 0.85982666, + "memory(GiB)": 117.54, + "step": 6730, + "train_speed(iter/s)": 0.204444 + }, + { + "acc": 0.75649314, + "epoch": 0.15724930372271395, + "grad_norm": 5.15625, + "learning_rate": 9.938131016097137e-06, + "loss": 0.89570599, + "memory(GiB)": 117.54, + "step": 6740, + "train_speed(iter/s)": 0.204591 + }, + { + "acc": 0.75742426, + "epoch": 0.15748261129500285, + "grad_norm": 6.375, + "learning_rate": 9.937834401291437e-06, + "loss": 0.88885574, + "memory(GiB)": 117.54, + "step": 6750, + "train_speed(iter/s)": 0.204744 + }, + { + "acc": 0.75880747, + "epoch": 0.15771591886729175, + "grad_norm": 6.5, + "learning_rate": 9.937537081611313e-06, + "loss": 0.89389744, + "memory(GiB)": 117.54, + "step": 6760, + "train_speed(iter/s)": 0.2049 + }, + { + "acc": 0.76473279, + "epoch": 0.15794922643958062, + "grad_norm": 5.03125, + "learning_rate": 9.937239057099205e-06, + "loss": 0.84799232, + "memory(GiB)": 117.54, + "step": 6770, + "train_speed(iter/s)": 0.205043 + }, + { + "acc": 0.76936255, + "epoch": 0.15818253401186952, + "grad_norm": 6.625, + "learning_rate": 9.936940327797655e-06, + "loss": 0.8422473, + "memory(GiB)": 117.54, + "step": 6780, + "train_speed(iter/s)": 0.205188 + }, + { + "acc": 0.75960865, + "epoch": 0.15841584158415842, + "grad_norm": 6.15625, + "learning_rate": 9.936640893749308e-06, + "loss": 0.86526461, + "memory(GiB)": 117.54, + "step": 6790, + "train_speed(iter/s)": 0.205344 + }, + { + "acc": 0.75233178, + "epoch": 0.15864914915644732, + "grad_norm": 5.59375, + "learning_rate": 9.936340754996906e-06, + "loss": 0.89683399, + "memory(GiB)": 117.54, + "step": 6800, + "train_speed(iter/s)": 0.205496 + }, + { + "acc": 0.75799913, + "epoch": 0.15888245672873622, + "grad_norm": 6.40625, + "learning_rate": 9.936039911583298e-06, + "loss": 0.88384476, + "memory(GiB)": 117.54, + "step": 6810, + "train_speed(iter/s)": 0.205656 + }, + { + "acc": 0.76789608, + "epoch": 0.1591157643010251, + "grad_norm": 6.0, + "learning_rate": 9.935738363551424e-06, + "loss": 0.84808712, + "memory(GiB)": 117.54, + "step": 6820, + "train_speed(iter/s)": 0.205793 + }, + { + "acc": 0.73517952, + "epoch": 0.15934907187331399, + "grad_norm": 5.53125, + "learning_rate": 9.935436110944335e-06, + "loss": 0.95522289, + "memory(GiB)": 117.54, + "step": 6830, + "train_speed(iter/s)": 0.205947 + }, + { + "acc": 0.75344524, + "epoch": 0.15958237944560288, + "grad_norm": 6.4375, + "learning_rate": 9.935133153805172e-06, + "loss": 0.91215868, + "memory(GiB)": 117.54, + "step": 6840, + "train_speed(iter/s)": 0.206099 + }, + { + "acc": 0.73823147, + "epoch": 0.15981568701789178, + "grad_norm": 6.46875, + "learning_rate": 9.934829492177187e-06, + "loss": 0.9458313, + "memory(GiB)": 117.54, + "step": 6850, + "train_speed(iter/s)": 0.206255 + }, + { + "acc": 0.74221478, + "epoch": 0.16004899459018065, + "grad_norm": 5.40625, + "learning_rate": 9.934525126103725e-06, + "loss": 0.94343185, + "memory(GiB)": 117.54, + "step": 6860, + "train_speed(iter/s)": 0.206398 + }, + { + "acc": 0.76774483, + "epoch": 0.16028230216246955, + "grad_norm": 7.4375, + "learning_rate": 9.934220055628233e-06, + "loss": 0.85056896, + "memory(GiB)": 117.54, + "step": 6870, + "train_speed(iter/s)": 0.206556 + }, + { + "acc": 0.75645094, + "epoch": 0.16051560973475845, + "grad_norm": 5.375, + "learning_rate": 9.933914280794266e-06, + "loss": 0.87650585, + "memory(GiB)": 117.54, + "step": 6880, + "train_speed(iter/s)": 0.206713 + }, + { + "acc": 0.76912999, + "epoch": 0.16074891730704735, + "grad_norm": 5.75, + "learning_rate": 9.933607801645464e-06, + "loss": 0.85444221, + "memory(GiB)": 117.54, + "step": 6890, + "train_speed(iter/s)": 0.206863 + }, + { + "acc": 0.76916509, + "epoch": 0.16098222487933625, + "grad_norm": 5.84375, + "learning_rate": 9.933300618225584e-06, + "loss": 0.83676643, + "memory(GiB)": 117.54, + "step": 6900, + "train_speed(iter/s)": 0.207006 + }, + { + "acc": 0.7689889, + "epoch": 0.16121553245162512, + "grad_norm": 12.5625, + "learning_rate": 9.932992730578473e-06, + "loss": 0.85195637, + "memory(GiB)": 117.54, + "step": 6910, + "train_speed(iter/s)": 0.207151 + }, + { + "acc": 0.7639226, + "epoch": 0.16144884002391402, + "grad_norm": 4.8125, + "learning_rate": 9.932684138748083e-06, + "loss": 0.8682415, + "memory(GiB)": 117.54, + "step": 6920, + "train_speed(iter/s)": 0.207303 + }, + { + "acc": 0.74718456, + "epoch": 0.16168214759620292, + "grad_norm": 5.28125, + "learning_rate": 9.932374842778466e-06, + "loss": 0.95030527, + "memory(GiB)": 117.54, + "step": 6930, + "train_speed(iter/s)": 0.207455 + }, + { + "acc": 0.76108999, + "epoch": 0.16191545516849182, + "grad_norm": 8.9375, + "learning_rate": 9.932064842713773e-06, + "loss": 0.88821373, + "memory(GiB)": 117.54, + "step": 6940, + "train_speed(iter/s)": 0.207608 + }, + { + "acc": 0.76204557, + "epoch": 0.16214876274078072, + "grad_norm": 5.3125, + "learning_rate": 9.931754138598256e-06, + "loss": 0.83484173, + "memory(GiB)": 117.54, + "step": 6950, + "train_speed(iter/s)": 0.207758 + }, + { + "acc": 0.77546968, + "epoch": 0.1623820703130696, + "grad_norm": 6.3125, + "learning_rate": 9.931442730476266e-06, + "loss": 0.8154768, + "memory(GiB)": 117.54, + "step": 6960, + "train_speed(iter/s)": 0.207903 + }, + { + "acc": 0.74842148, + "epoch": 0.1626153778853585, + "grad_norm": 7.5625, + "learning_rate": 9.931130618392262e-06, + "loss": 0.91668129, + "memory(GiB)": 117.54, + "step": 6970, + "train_speed(iter/s)": 0.208058 + }, + { + "acc": 0.75227356, + "epoch": 0.16284868545764739, + "grad_norm": 5.59375, + "learning_rate": 9.930817802390794e-06, + "loss": 0.90838165, + "memory(GiB)": 117.54, + "step": 6980, + "train_speed(iter/s)": 0.208207 + }, + { + "acc": 0.77258139, + "epoch": 0.16308199302993628, + "grad_norm": 5.5625, + "learning_rate": 9.930504282516517e-06, + "loss": 0.81022301, + "memory(GiB)": 117.54, + "step": 6990, + "train_speed(iter/s)": 0.208358 + }, + { + "acc": 0.76104183, + "epoch": 0.16331530060222518, + "grad_norm": 4.125, + "learning_rate": 9.930190058814185e-06, + "loss": 0.87542782, + "memory(GiB)": 117.54, + "step": 7000, + "train_speed(iter/s)": 0.208517 + }, + { + "epoch": 0.16331530060222518, + "eval_acc": 0.7292125980342936, + "eval_loss": 0.8633277416229248, + "eval_runtime": 1270.1232, + "eval_samples_per_second": 28.337, + "eval_steps_per_second": 14.169, + "step": 7000 + }, + { + "acc": 0.76679716, + "epoch": 0.16354860817451405, + "grad_norm": 4.59375, + "learning_rate": 9.929875131328655e-06, + "loss": 0.84782066, + "memory(GiB)": 117.54, + "step": 7010, + "train_speed(iter/s)": 0.200947 + }, + { + "acc": 0.74878941, + "epoch": 0.16378191574680295, + "grad_norm": 4.15625, + "learning_rate": 9.929559500104883e-06, + "loss": 0.91403189, + "memory(GiB)": 117.54, + "step": 7020, + "train_speed(iter/s)": 0.201094 + }, + { + "acc": 0.76715469, + "epoch": 0.16401522331909185, + "grad_norm": 6.4375, + "learning_rate": 9.929243165187922e-06, + "loss": 0.84933138, + "memory(GiB)": 117.54, + "step": 7030, + "train_speed(iter/s)": 0.201244 + }, + { + "acc": 0.76669478, + "epoch": 0.16424853089138075, + "grad_norm": 3.796875, + "learning_rate": 9.928926126622933e-06, + "loss": 0.84990997, + "memory(GiB)": 117.54, + "step": 7040, + "train_speed(iter/s)": 0.201391 + }, + { + "acc": 0.76388216, + "epoch": 0.16448183846366965, + "grad_norm": 6.25, + "learning_rate": 9.928608384455172e-06, + "loss": 0.87723694, + "memory(GiB)": 117.54, + "step": 7050, + "train_speed(iter/s)": 0.201534 + }, + { + "acc": 0.74291534, + "epoch": 0.16471514603595852, + "grad_norm": 32.5, + "learning_rate": 9.928289938729996e-06, + "loss": 0.95859528, + "memory(GiB)": 117.54, + "step": 7060, + "train_speed(iter/s)": 0.201681 + }, + { + "acc": 0.75394406, + "epoch": 0.16494845360824742, + "grad_norm": 5.34375, + "learning_rate": 9.92797078949286e-06, + "loss": 0.90898361, + "memory(GiB)": 117.54, + "step": 7070, + "train_speed(iter/s)": 0.201821 + }, + { + "acc": 0.74871893, + "epoch": 0.16518176118053632, + "grad_norm": 5.125, + "learning_rate": 9.927650936789329e-06, + "loss": 0.90822897, + "memory(GiB)": 117.54, + "step": 7080, + "train_speed(iter/s)": 0.201961 + }, + { + "acc": 0.77017407, + "epoch": 0.16541506875282522, + "grad_norm": 7.90625, + "learning_rate": 9.927330380665056e-06, + "loss": 0.84466286, + "memory(GiB)": 117.54, + "step": 7090, + "train_speed(iter/s)": 0.202117 + }, + { + "acc": 0.75458536, + "epoch": 0.1656483763251141, + "grad_norm": 4.9375, + "learning_rate": 9.927009121165803e-06, + "loss": 0.90666466, + "memory(GiB)": 117.54, + "step": 7100, + "train_speed(iter/s)": 0.202266 + }, + { + "acc": 0.75543022, + "epoch": 0.165881683897403, + "grad_norm": 5.46875, + "learning_rate": 9.92668715833743e-06, + "loss": 0.8722743, + "memory(GiB)": 117.54, + "step": 7110, + "train_speed(iter/s)": 0.202404 + }, + { + "acc": 0.77939939, + "epoch": 0.16611499146969189, + "grad_norm": 10.75, + "learning_rate": 9.926364492225894e-06, + "loss": 0.78749428, + "memory(GiB)": 117.54, + "step": 7120, + "train_speed(iter/s)": 0.202551 + }, + { + "acc": 0.79106226, + "epoch": 0.16634829904198078, + "grad_norm": 7.3125, + "learning_rate": 9.92604112287726e-06, + "loss": 0.74738359, + "memory(GiB)": 117.54, + "step": 7130, + "train_speed(iter/s)": 0.202705 + }, + { + "acc": 0.76939073, + "epoch": 0.16658160661426968, + "grad_norm": 4.8125, + "learning_rate": 9.925717050337686e-06, + "loss": 0.8409626, + "memory(GiB)": 117.54, + "step": 7140, + "train_speed(iter/s)": 0.202843 + }, + { + "acc": 0.76826558, + "epoch": 0.16681491418655855, + "grad_norm": 5.65625, + "learning_rate": 9.925392274653435e-06, + "loss": 0.82716637, + "memory(GiB)": 117.54, + "step": 7150, + "train_speed(iter/s)": 0.202993 + }, + { + "acc": 0.76554303, + "epoch": 0.16704822175884745, + "grad_norm": 4.21875, + "learning_rate": 9.925066795870868e-06, + "loss": 0.849158, + "memory(GiB)": 117.54, + "step": 7160, + "train_speed(iter/s)": 0.203138 + }, + { + "acc": 0.78131075, + "epoch": 0.16728152933113635, + "grad_norm": 5.65625, + "learning_rate": 9.924740614036445e-06, + "loss": 0.77836246, + "memory(GiB)": 117.54, + "step": 7170, + "train_speed(iter/s)": 0.203275 + }, + { + "acc": 0.77169647, + "epoch": 0.16751483690342525, + "grad_norm": 6.46875, + "learning_rate": 9.92441372919673e-06, + "loss": 0.83209658, + "memory(GiB)": 117.54, + "step": 7180, + "train_speed(iter/s)": 0.203419 + }, + { + "acc": 0.75183039, + "epoch": 0.16774814447571415, + "grad_norm": 4.875, + "learning_rate": 9.924086141398385e-06, + "loss": 0.92607002, + "memory(GiB)": 117.54, + "step": 7190, + "train_speed(iter/s)": 0.203569 + }, + { + "acc": 0.77899566, + "epoch": 0.16798145204800302, + "grad_norm": 9.9375, + "learning_rate": 9.923757850688176e-06, + "loss": 0.78890448, + "memory(GiB)": 117.54, + "step": 7200, + "train_speed(iter/s)": 0.203724 + }, + { + "acc": 0.7672864, + "epoch": 0.16821475962029192, + "grad_norm": 5.375, + "learning_rate": 9.923428857112963e-06, + "loss": 0.82765884, + "memory(GiB)": 117.54, + "step": 7210, + "train_speed(iter/s)": 0.203869 + }, + { + "acc": 0.76710939, + "epoch": 0.16844806719258082, + "grad_norm": 7.78125, + "learning_rate": 9.923099160719711e-06, + "loss": 0.83284779, + "memory(GiB)": 117.54, + "step": 7220, + "train_speed(iter/s)": 0.204016 + }, + { + "acc": 0.76684504, + "epoch": 0.16868137476486972, + "grad_norm": 7.5, + "learning_rate": 9.922768761555485e-06, + "loss": 0.84021397, + "memory(GiB)": 117.54, + "step": 7230, + "train_speed(iter/s)": 0.20416 + }, + { + "acc": 0.75771241, + "epoch": 0.16891468233715862, + "grad_norm": 6.625, + "learning_rate": 9.922437659667448e-06, + "loss": 0.8986228, + "memory(GiB)": 117.54, + "step": 7240, + "train_speed(iter/s)": 0.204305 + }, + { + "acc": 0.77050877, + "epoch": 0.1691479899094475, + "grad_norm": 5.0, + "learning_rate": 9.922105855102864e-06, + "loss": 0.83454065, + "memory(GiB)": 117.54, + "step": 7250, + "train_speed(iter/s)": 0.204448 + }, + { + "acc": 0.77722654, + "epoch": 0.1693812974817364, + "grad_norm": 6.21875, + "learning_rate": 9.921773347909098e-06, + "loss": 0.79222279, + "memory(GiB)": 117.54, + "step": 7260, + "train_speed(iter/s)": 0.204598 + }, + { + "acc": 0.76436782, + "epoch": 0.16961460505402529, + "grad_norm": 6.75, + "learning_rate": 9.921440138133619e-06, + "loss": 0.843188, + "memory(GiB)": 117.54, + "step": 7270, + "train_speed(iter/s)": 0.20475 + }, + { + "acc": 0.75543094, + "epoch": 0.16984791262631418, + "grad_norm": 6.15625, + "learning_rate": 9.921106225823988e-06, + "loss": 0.91073751, + "memory(GiB)": 117.54, + "step": 7280, + "train_speed(iter/s)": 0.204899 + }, + { + "acc": 0.76456766, + "epoch": 0.17008122019860308, + "grad_norm": 5.625, + "learning_rate": 9.920771611027875e-06, + "loss": 0.83511429, + "memory(GiB)": 117.54, + "step": 7290, + "train_speed(iter/s)": 0.205052 + }, + { + "acc": 0.73825827, + "epoch": 0.17031452777089195, + "grad_norm": 5.3125, + "learning_rate": 9.920436293793043e-06, + "loss": 0.97240715, + "memory(GiB)": 117.54, + "step": 7300, + "train_speed(iter/s)": 0.205195 + }, + { + "acc": 0.74200888, + "epoch": 0.17054783534318085, + "grad_norm": 6.59375, + "learning_rate": 9.920100274167359e-06, + "loss": 0.9331625, + "memory(GiB)": 117.54, + "step": 7310, + "train_speed(iter/s)": 0.205347 + }, + { + "acc": 0.76395473, + "epoch": 0.17078114291546975, + "grad_norm": 7.46875, + "learning_rate": 9.91976355219879e-06, + "loss": 0.85346889, + "memory(GiB)": 117.54, + "step": 7320, + "train_speed(iter/s)": 0.205459 + }, + { + "acc": 0.75636325, + "epoch": 0.17101445048775865, + "grad_norm": 5.78125, + "learning_rate": 9.919426127935404e-06, + "loss": 0.89509583, + "memory(GiB)": 117.54, + "step": 7330, + "train_speed(iter/s)": 0.205592 + }, + { + "acc": 0.76570711, + "epoch": 0.17124775806004755, + "grad_norm": 5.53125, + "learning_rate": 9.919088001425367e-06, + "loss": 0.85378265, + "memory(GiB)": 117.54, + "step": 7340, + "train_speed(iter/s)": 0.205738 + }, + { + "acc": 0.75373197, + "epoch": 0.17148106563233642, + "grad_norm": 8.8125, + "learning_rate": 9.918749172716946e-06, + "loss": 0.88216324, + "memory(GiB)": 117.54, + "step": 7350, + "train_speed(iter/s)": 0.205869 + }, + { + "acc": 0.75796227, + "epoch": 0.17171437320462532, + "grad_norm": 7.0, + "learning_rate": 9.91840964185851e-06, + "loss": 0.8882925, + "memory(GiB)": 117.54, + "step": 7360, + "train_speed(iter/s)": 0.206007 + }, + { + "acc": 0.76364655, + "epoch": 0.17194768077691422, + "grad_norm": 5.53125, + "learning_rate": 9.918069408898527e-06, + "loss": 0.85295391, + "memory(GiB)": 117.54, + "step": 7370, + "train_speed(iter/s)": 0.206146 + }, + { + "acc": 0.77610121, + "epoch": 0.17218098834920312, + "grad_norm": 6.40625, + "learning_rate": 9.917728473885564e-06, + "loss": 0.78088756, + "memory(GiB)": 117.54, + "step": 7380, + "train_speed(iter/s)": 0.206278 + }, + { + "acc": 0.7751833, + "epoch": 0.172414295921492, + "grad_norm": 8.0, + "learning_rate": 9.91738683686829e-06, + "loss": 0.82343903, + "memory(GiB)": 117.54, + "step": 7390, + "train_speed(iter/s)": 0.206425 + }, + { + "acc": 0.75816336, + "epoch": 0.1726476034937809, + "grad_norm": 6.40625, + "learning_rate": 9.917044497895474e-06, + "loss": 0.86748638, + "memory(GiB)": 117.54, + "step": 7400, + "train_speed(iter/s)": 0.206522 + }, + { + "acc": 0.76400776, + "epoch": 0.1728809110660698, + "grad_norm": 5.25, + "learning_rate": 9.916701457015983e-06, + "loss": 0.86720943, + "memory(GiB)": 117.54, + "step": 7410, + "train_speed(iter/s)": 0.206658 + }, + { + "acc": 0.75771341, + "epoch": 0.17311421863835869, + "grad_norm": 5.15625, + "learning_rate": 9.91635771427879e-06, + "loss": 0.88320751, + "memory(GiB)": 117.54, + "step": 7420, + "train_speed(iter/s)": 0.206797 + }, + { + "acc": 0.7662806, + "epoch": 0.17334752621064758, + "grad_norm": 6.0625, + "learning_rate": 9.91601326973296e-06, + "loss": 0.835116, + "memory(GiB)": 117.54, + "step": 7430, + "train_speed(iter/s)": 0.206941 + }, + { + "acc": 0.77026176, + "epoch": 0.17358083378293646, + "grad_norm": 7.625, + "learning_rate": 9.915668123427662e-06, + "loss": 0.84814157, + "memory(GiB)": 117.54, + "step": 7440, + "train_speed(iter/s)": 0.207086 + }, + { + "acc": 0.76674738, + "epoch": 0.17381414135522535, + "grad_norm": 4.6875, + "learning_rate": 9.91532227541217e-06, + "loss": 0.85983877, + "memory(GiB)": 117.54, + "step": 7450, + "train_speed(iter/s)": 0.207226 + }, + { + "acc": 0.75598326, + "epoch": 0.17404744892751425, + "grad_norm": 6.03125, + "learning_rate": 9.91497572573585e-06, + "loss": 0.87792206, + "memory(GiB)": 117.54, + "step": 7460, + "train_speed(iter/s)": 0.207368 + }, + { + "acc": 0.77461433, + "epoch": 0.17428075649980315, + "grad_norm": 4.8125, + "learning_rate": 9.914628474448173e-06, + "loss": 0.81929722, + "memory(GiB)": 117.54, + "step": 7470, + "train_speed(iter/s)": 0.207504 + }, + { + "acc": 0.75475292, + "epoch": 0.17451406407209205, + "grad_norm": 5.53125, + "learning_rate": 9.91428052159871e-06, + "loss": 0.88305016, + "memory(GiB)": 117.54, + "step": 7480, + "train_speed(iter/s)": 0.207644 + }, + { + "acc": 0.77698159, + "epoch": 0.17474737164438092, + "grad_norm": 5.40625, + "learning_rate": 9.913931867237129e-06, + "loss": 0.81536446, + "memory(GiB)": 117.54, + "step": 7490, + "train_speed(iter/s)": 0.207778 + }, + { + "acc": 0.75837955, + "epoch": 0.17498067921666982, + "grad_norm": 4.71875, + "learning_rate": 9.913582511413201e-06, + "loss": 0.87997665, + "memory(GiB)": 117.54, + "step": 7500, + "train_speed(iter/s)": 0.207923 + }, + { + "epoch": 0.17498067921666982, + "eval_acc": 0.7296547562596777, + "eval_loss": 0.8604273200035095, + "eval_runtime": 1269.7388, + "eval_samples_per_second": 28.345, + "eval_steps_per_second": 14.173, + "step": 7500 + }, + { + "acc": 0.75066805, + "epoch": 0.17521398678895872, + "grad_norm": 5.96875, + "learning_rate": 9.913232454176797e-06, + "loss": 0.88283882, + "memory(GiB)": 117.54, + "step": 7510, + "train_speed(iter/s)": 0.2009 + }, + { + "acc": 0.73445344, + "epoch": 0.17544729436124762, + "grad_norm": 6.25, + "learning_rate": 9.912881695577889e-06, + "loss": 0.97776871, + "memory(GiB)": 117.54, + "step": 7520, + "train_speed(iter/s)": 0.201035 + }, + { + "acc": 0.77131672, + "epoch": 0.17568060193353652, + "grad_norm": 6.21875, + "learning_rate": 9.912530235666546e-06, + "loss": 0.82929153, + "memory(GiB)": 117.54, + "step": 7530, + "train_speed(iter/s)": 0.201167 + }, + { + "acc": 0.77159753, + "epoch": 0.1759139095058254, + "grad_norm": 8.5625, + "learning_rate": 9.912178074492937e-06, + "loss": 0.81753139, + "memory(GiB)": 117.54, + "step": 7540, + "train_speed(iter/s)": 0.201308 + }, + { + "acc": 0.75888309, + "epoch": 0.1761472170781143, + "grad_norm": 5.9375, + "learning_rate": 9.911825212107337e-06, + "loss": 0.85590801, + "memory(GiB)": 117.54, + "step": 7550, + "train_speed(iter/s)": 0.20145 + }, + { + "acc": 0.77193937, + "epoch": 0.1763805246504032, + "grad_norm": 6.21875, + "learning_rate": 9.911471648560114e-06, + "loss": 0.81675425, + "memory(GiB)": 117.54, + "step": 7560, + "train_speed(iter/s)": 0.201582 + }, + { + "acc": 0.75802794, + "epoch": 0.17661383222269209, + "grad_norm": 5.625, + "learning_rate": 9.91111738390174e-06, + "loss": 0.91160278, + "memory(GiB)": 117.54, + "step": 7570, + "train_speed(iter/s)": 0.201724 + }, + { + "acc": 0.77081351, + "epoch": 0.17684713979498098, + "grad_norm": 4.84375, + "learning_rate": 9.910762418182786e-06, + "loss": 0.81983862, + "memory(GiB)": 117.54, + "step": 7580, + "train_speed(iter/s)": 0.201858 + }, + { + "acc": 0.76340055, + "epoch": 0.17708044736726986, + "grad_norm": 14.375, + "learning_rate": 9.910406751453923e-06, + "loss": 0.85830593, + "memory(GiB)": 117.54, + "step": 7590, + "train_speed(iter/s)": 0.202005 + }, + { + "acc": 0.75996342, + "epoch": 0.17731375493955875, + "grad_norm": 5.0625, + "learning_rate": 9.910050383765924e-06, + "loss": 0.86299267, + "memory(GiB)": 117.54, + "step": 7600, + "train_speed(iter/s)": 0.202147 + }, + { + "acc": 0.73427224, + "epoch": 0.17754706251184765, + "grad_norm": 4.875, + "learning_rate": 9.909693315169657e-06, + "loss": 0.96497478, + "memory(GiB)": 117.54, + "step": 7610, + "train_speed(iter/s)": 0.202278 + }, + { + "acc": 0.76901188, + "epoch": 0.17778037008413655, + "grad_norm": 5.9375, + "learning_rate": 9.909335545716097e-06, + "loss": 0.82169056, + "memory(GiB)": 117.54, + "step": 7620, + "train_speed(iter/s)": 0.202416 + }, + { + "acc": 0.75650215, + "epoch": 0.17801367765642542, + "grad_norm": 6.53125, + "learning_rate": 9.908977075456314e-06, + "loss": 0.8937892, + "memory(GiB)": 117.54, + "step": 7630, + "train_speed(iter/s)": 0.202547 + }, + { + "acc": 0.78488522, + "epoch": 0.17824698522871432, + "grad_norm": 5.53125, + "learning_rate": 9.90861790444148e-06, + "loss": 0.76624699, + "memory(GiB)": 117.54, + "step": 7640, + "train_speed(iter/s)": 0.202691 + }, + { + "acc": 0.77700996, + "epoch": 0.17848029280100322, + "grad_norm": 5.75, + "learning_rate": 9.908258032722865e-06, + "loss": 0.79376793, + "memory(GiB)": 117.54, + "step": 7650, + "train_speed(iter/s)": 0.202822 + }, + { + "acc": 0.78278093, + "epoch": 0.17871360037329212, + "grad_norm": 6.09375, + "learning_rate": 9.907897460351842e-06, + "loss": 0.80827179, + "memory(GiB)": 117.54, + "step": 7660, + "train_speed(iter/s)": 0.202955 + }, + { + "acc": 0.7604249, + "epoch": 0.17894690794558102, + "grad_norm": 7.09375, + "learning_rate": 9.907536187379883e-06, + "loss": 0.88734035, + "memory(GiB)": 117.54, + "step": 7670, + "train_speed(iter/s)": 0.203091 + }, + { + "acc": 0.76182446, + "epoch": 0.1791802155178699, + "grad_norm": 5.90625, + "learning_rate": 9.907174213858556e-06, + "loss": 0.88050871, + "memory(GiB)": 117.54, + "step": 7680, + "train_speed(iter/s)": 0.203227 + }, + { + "acc": 0.76138544, + "epoch": 0.1794135230901588, + "grad_norm": 12.4375, + "learning_rate": 9.906811539839539e-06, + "loss": 0.86582584, + "memory(GiB)": 117.54, + "step": 7690, + "train_speed(iter/s)": 0.203366 + }, + { + "acc": 0.77104373, + "epoch": 0.1796468306624477, + "grad_norm": 5.1875, + "learning_rate": 9.9064481653746e-06, + "loss": 0.83353386, + "memory(GiB)": 117.54, + "step": 7700, + "train_speed(iter/s)": 0.203486 + }, + { + "acc": 0.77400684, + "epoch": 0.1798801382347366, + "grad_norm": 4.46875, + "learning_rate": 9.906084090515609e-06, + "loss": 0.83886662, + "memory(GiB)": 117.54, + "step": 7710, + "train_speed(iter/s)": 0.203625 + }, + { + "acc": 0.75984573, + "epoch": 0.18011344580702549, + "grad_norm": 5.96875, + "learning_rate": 9.90571931531454e-06, + "loss": 0.88316107, + "memory(GiB)": 117.54, + "step": 7720, + "train_speed(iter/s)": 0.203764 + }, + { + "acc": 0.76342249, + "epoch": 0.18034675337931436, + "grad_norm": 4.46875, + "learning_rate": 9.905353839823463e-06, + "loss": 0.85615129, + "memory(GiB)": 117.54, + "step": 7730, + "train_speed(iter/s)": 0.203908 + }, + { + "acc": 0.78326778, + "epoch": 0.18058006095160326, + "grad_norm": 6.09375, + "learning_rate": 9.904987664094553e-06, + "loss": 0.76730537, + "memory(GiB)": 117.54, + "step": 7740, + "train_speed(iter/s)": 0.204033 + }, + { + "acc": 0.75868654, + "epoch": 0.18081336852389215, + "grad_norm": 6.40625, + "learning_rate": 9.904620788180076e-06, + "loss": 0.89008236, + "memory(GiB)": 117.54, + "step": 7750, + "train_speed(iter/s)": 0.204174 + }, + { + "acc": 0.76263795, + "epoch": 0.18104667609618105, + "grad_norm": 5.5, + "learning_rate": 9.904253212132406e-06, + "loss": 0.87411928, + "memory(GiB)": 117.54, + "step": 7760, + "train_speed(iter/s)": 0.204312 + }, + { + "acc": 0.76147628, + "epoch": 0.18127998366846995, + "grad_norm": 6.0, + "learning_rate": 9.903884936004017e-06, + "loss": 0.86704102, + "memory(GiB)": 117.54, + "step": 7770, + "train_speed(iter/s)": 0.204445 + }, + { + "acc": 0.76194277, + "epoch": 0.18151329124075882, + "grad_norm": 7.5, + "learning_rate": 9.903515959847477e-06, + "loss": 0.83300629, + "memory(GiB)": 117.54, + "step": 7780, + "train_speed(iter/s)": 0.204576 + }, + { + "acc": 0.7630332, + "epoch": 0.18174659881304772, + "grad_norm": 8.5, + "learning_rate": 9.903146283715459e-06, + "loss": 0.85758762, + "memory(GiB)": 117.54, + "step": 7790, + "train_speed(iter/s)": 0.204716 + }, + { + "acc": 0.74647932, + "epoch": 0.18197990638533662, + "grad_norm": 5.5625, + "learning_rate": 9.902775907660733e-06, + "loss": 0.91409321, + "memory(GiB)": 117.54, + "step": 7800, + "train_speed(iter/s)": 0.204846 + }, + { + "acc": 0.74644547, + "epoch": 0.18221321395762552, + "grad_norm": 5.1875, + "learning_rate": 9.90240483173617e-06, + "loss": 0.95661182, + "memory(GiB)": 117.54, + "step": 7810, + "train_speed(iter/s)": 0.204982 + }, + { + "acc": 0.75153122, + "epoch": 0.18244652152991442, + "grad_norm": 6.4375, + "learning_rate": 9.902033055994739e-06, + "loss": 0.90269318, + "memory(GiB)": 117.54, + "step": 7820, + "train_speed(iter/s)": 0.205106 + }, + { + "acc": 0.75813732, + "epoch": 0.1826798291022033, + "grad_norm": 83.5, + "learning_rate": 9.901660580489517e-06, + "loss": 0.88405313, + "memory(GiB)": 117.54, + "step": 7830, + "train_speed(iter/s)": 0.20524 + }, + { + "acc": 0.78136988, + "epoch": 0.1829131366744922, + "grad_norm": 9.25, + "learning_rate": 9.90128740527367e-06, + "loss": 0.79004688, + "memory(GiB)": 117.54, + "step": 7840, + "train_speed(iter/s)": 0.205378 + }, + { + "acc": 0.77203064, + "epoch": 0.1831464442467811, + "grad_norm": 4.6875, + "learning_rate": 9.900913530400469e-06, + "loss": 0.80475359, + "memory(GiB)": 117.54, + "step": 7850, + "train_speed(iter/s)": 0.205502 + }, + { + "acc": 0.76842432, + "epoch": 0.18337975181906999, + "grad_norm": 5.03125, + "learning_rate": 9.900538955923287e-06, + "loss": 0.84965286, + "memory(GiB)": 117.54, + "step": 7860, + "train_speed(iter/s)": 0.205631 + }, + { + "acc": 0.7858345, + "epoch": 0.18361305939135886, + "grad_norm": 4.625, + "learning_rate": 9.900163681895591e-06, + "loss": 0.7589952, + "memory(GiB)": 117.54, + "step": 7870, + "train_speed(iter/s)": 0.205758 + }, + { + "acc": 0.75342913, + "epoch": 0.18384636696364776, + "grad_norm": 6.03125, + "learning_rate": 9.899787708370954e-06, + "loss": 0.89493637, + "memory(GiB)": 117.54, + "step": 7880, + "train_speed(iter/s)": 0.20589 + }, + { + "acc": 0.7876399, + "epoch": 0.18407967453593665, + "grad_norm": 7.21875, + "learning_rate": 9.899411035403044e-06, + "loss": 0.74116306, + "memory(GiB)": 117.54, + "step": 7890, + "train_speed(iter/s)": 0.206028 + }, + { + "acc": 0.78092766, + "epoch": 0.18431298210822555, + "grad_norm": 4.71875, + "learning_rate": 9.899033663045632e-06, + "loss": 0.79192057, + "memory(GiB)": 117.54, + "step": 7900, + "train_speed(iter/s)": 0.206168 + }, + { + "acc": 0.75420218, + "epoch": 0.18454628968051445, + "grad_norm": 9.8125, + "learning_rate": 9.898655591352589e-06, + "loss": 0.88830757, + "memory(GiB)": 117.54, + "step": 7910, + "train_speed(iter/s)": 0.206303 + }, + { + "acc": 0.75896831, + "epoch": 0.18477959725280332, + "grad_norm": 6.8125, + "learning_rate": 9.898276820377882e-06, + "loss": 0.8895916, + "memory(GiB)": 117.54, + "step": 7920, + "train_speed(iter/s)": 0.206432 + }, + { + "acc": 0.75719609, + "epoch": 0.18501290482509222, + "grad_norm": 6.875, + "learning_rate": 9.897897350175583e-06, + "loss": 0.89884977, + "memory(GiB)": 117.54, + "step": 7930, + "train_speed(iter/s)": 0.206564 + }, + { + "acc": 0.75096016, + "epoch": 0.18524621239738112, + "grad_norm": 6.875, + "learning_rate": 9.897517180799858e-06, + "loss": 0.90399666, + "memory(GiB)": 117.54, + "step": 7940, + "train_speed(iter/s)": 0.206695 + }, + { + "acc": 0.76634254, + "epoch": 0.18547951996967002, + "grad_norm": 5.5, + "learning_rate": 9.89713631230498e-06, + "loss": 0.84754448, + "memory(GiB)": 117.54, + "step": 7950, + "train_speed(iter/s)": 0.206826 + }, + { + "acc": 0.76263399, + "epoch": 0.18571282754195892, + "grad_norm": 5.1875, + "learning_rate": 9.896754744745315e-06, + "loss": 0.87414799, + "memory(GiB)": 117.54, + "step": 7960, + "train_speed(iter/s)": 0.206959 + }, + { + "acc": 0.76916466, + "epoch": 0.1859461351142478, + "grad_norm": 6.25, + "learning_rate": 9.896372478175336e-06, + "loss": 0.85287285, + "memory(GiB)": 117.54, + "step": 7970, + "train_speed(iter/s)": 0.207091 + }, + { + "acc": 0.76025381, + "epoch": 0.1861794426865367, + "grad_norm": 23.5, + "learning_rate": 9.895989512649605e-06, + "loss": 0.85706806, + "memory(GiB)": 117.54, + "step": 7980, + "train_speed(iter/s)": 0.207216 + }, + { + "acc": 0.78468599, + "epoch": 0.1864127502588256, + "grad_norm": 8.4375, + "learning_rate": 9.895605848222794e-06, + "loss": 0.78214846, + "memory(GiB)": 117.54, + "step": 7990, + "train_speed(iter/s)": 0.207344 + }, + { + "acc": 0.76179285, + "epoch": 0.1866460578311145, + "grad_norm": 5.4375, + "learning_rate": 9.89522148494967e-06, + "loss": 0.87004957, + "memory(GiB)": 117.54, + "step": 8000, + "train_speed(iter/s)": 0.20748 + }, + { + "epoch": 0.1866460578311145, + "eval_acc": 0.7304815809310498, + "eval_loss": 0.8580639958381653, + "eval_runtime": 1269.3016, + "eval_samples_per_second": 28.355, + "eval_steps_per_second": 14.178, + "step": 8000 + }, + { + "acc": 0.75695009, + "epoch": 0.18687936540340339, + "grad_norm": 8.125, + "learning_rate": 9.894836422885101e-06, + "loss": 0.87596626, + "memory(GiB)": 117.54, + "step": 8010, + "train_speed(iter/s)": 0.20091 + }, + { + "acc": 0.76842041, + "epoch": 0.18711267297569226, + "grad_norm": 5.1875, + "learning_rate": 9.894450662084055e-06, + "loss": 0.84125156, + "memory(GiB)": 117.54, + "step": 8020, + "train_speed(iter/s)": 0.20104 + }, + { + "acc": 0.75734787, + "epoch": 0.18734598054798116, + "grad_norm": 5.4375, + "learning_rate": 9.8940642026016e-06, + "loss": 0.88770895, + "memory(GiB)": 117.54, + "step": 8030, + "train_speed(iter/s)": 0.201174 + }, + { + "acc": 0.74493351, + "epoch": 0.18757928812027005, + "grad_norm": 17.875, + "learning_rate": 9.8936770444929e-06, + "loss": 0.93823605, + "memory(GiB)": 117.54, + "step": 8040, + "train_speed(iter/s)": 0.201304 + }, + { + "acc": 0.77957907, + "epoch": 0.18781259569255895, + "grad_norm": 4.5625, + "learning_rate": 9.893289187813224e-06, + "loss": 0.79470148, + "memory(GiB)": 117.54, + "step": 8050, + "train_speed(iter/s)": 0.201441 + }, + { + "acc": 0.76955447, + "epoch": 0.18804590326484785, + "grad_norm": 6.0, + "learning_rate": 9.892900632617939e-06, + "loss": 0.82833557, + "memory(GiB)": 117.54, + "step": 8060, + "train_speed(iter/s)": 0.201567 + }, + { + "acc": 0.77944613, + "epoch": 0.18827921083713672, + "grad_norm": 5.3125, + "learning_rate": 9.892511378962509e-06, + "loss": 0.78586206, + "memory(GiB)": 117.54, + "step": 8070, + "train_speed(iter/s)": 0.201694 + }, + { + "acc": 0.78304958, + "epoch": 0.18851251840942562, + "grad_norm": 6.1875, + "learning_rate": 9.892121426902502e-06, + "loss": 0.7807333, + "memory(GiB)": 117.54, + "step": 8080, + "train_speed(iter/s)": 0.201821 + }, + { + "acc": 0.77091231, + "epoch": 0.18874582598171452, + "grad_norm": 7.75, + "learning_rate": 9.891730776493579e-06, + "loss": 0.82303715, + "memory(GiB)": 117.54, + "step": 8090, + "train_speed(iter/s)": 0.201941 + }, + { + "acc": 0.76864967, + "epoch": 0.18897913355400342, + "grad_norm": 6.03125, + "learning_rate": 9.891339427791513e-06, + "loss": 0.82791605, + "memory(GiB)": 117.54, + "step": 8100, + "train_speed(iter/s)": 0.202068 + }, + { + "acc": 0.78619051, + "epoch": 0.18921244112629232, + "grad_norm": 6.125, + "learning_rate": 9.890947380852163e-06, + "loss": 0.78398085, + "memory(GiB)": 117.54, + "step": 8110, + "train_speed(iter/s)": 0.202193 + }, + { + "acc": 0.763907, + "epoch": 0.1894457486985812, + "grad_norm": 5.3125, + "learning_rate": 9.890554635731496e-06, + "loss": 0.86447706, + "memory(GiB)": 117.54, + "step": 8120, + "train_speed(iter/s)": 0.202319 + }, + { + "acc": 0.75075836, + "epoch": 0.1896790562708701, + "grad_norm": 4.6875, + "learning_rate": 9.890161192485573e-06, + "loss": 0.91182461, + "memory(GiB)": 117.54, + "step": 8130, + "train_speed(iter/s)": 0.202452 + }, + { + "acc": 0.78794985, + "epoch": 0.189912363843159, + "grad_norm": 5.28125, + "learning_rate": 9.889767051170563e-06, + "loss": 0.75603533, + "memory(GiB)": 117.54, + "step": 8140, + "train_speed(iter/s)": 0.202579 + }, + { + "acc": 0.77740474, + "epoch": 0.1901456714154479, + "grad_norm": 6.6875, + "learning_rate": 9.889372211842726e-06, + "loss": 0.80326691, + "memory(GiB)": 117.54, + "step": 8150, + "train_speed(iter/s)": 0.202706 + }, + { + "acc": 0.77399597, + "epoch": 0.19037897898773676, + "grad_norm": 5.03125, + "learning_rate": 9.888976674558426e-06, + "loss": 0.8158947, + "memory(GiB)": 117.54, + "step": 8160, + "train_speed(iter/s)": 0.202837 + }, + { + "acc": 0.74653988, + "epoch": 0.19061228656002566, + "grad_norm": 6.375, + "learning_rate": 9.888580439374126e-06, + "loss": 0.91792946, + "memory(GiB)": 117.54, + "step": 8170, + "train_speed(iter/s)": 0.20296 + }, + { + "acc": 0.74516058, + "epoch": 0.19084559413231456, + "grad_norm": 5.34375, + "learning_rate": 9.888183506346389e-06, + "loss": 0.91901636, + "memory(GiB)": 117.54, + "step": 8180, + "train_speed(iter/s)": 0.203084 + }, + { + "acc": 0.76203327, + "epoch": 0.19107890170460345, + "grad_norm": 6.4375, + "learning_rate": 9.887785875531875e-06, + "loss": 0.87705736, + "memory(GiB)": 117.54, + "step": 8190, + "train_speed(iter/s)": 0.203213 + }, + { + "acc": 0.75964112, + "epoch": 0.19131220927689235, + "grad_norm": 4.25, + "learning_rate": 9.887387546987349e-06, + "loss": 0.87349434, + "memory(GiB)": 117.54, + "step": 8200, + "train_speed(iter/s)": 0.203336 + }, + { + "acc": 0.77561655, + "epoch": 0.19154551684918122, + "grad_norm": 4.21875, + "learning_rate": 9.886988520769669e-06, + "loss": 0.80881062, + "memory(GiB)": 117.54, + "step": 8210, + "train_speed(iter/s)": 0.203451 + }, + { + "acc": 0.76098909, + "epoch": 0.19177882442147012, + "grad_norm": 6.96875, + "learning_rate": 9.886588796935797e-06, + "loss": 0.87041025, + "memory(GiB)": 117.54, + "step": 8220, + "train_speed(iter/s)": 0.203578 + }, + { + "acc": 0.7447237, + "epoch": 0.19201213199375902, + "grad_norm": 5.25, + "learning_rate": 9.886188375542795e-06, + "loss": 0.93633842, + "memory(GiB)": 117.54, + "step": 8230, + "train_speed(iter/s)": 0.20371 + }, + { + "acc": 0.76472883, + "epoch": 0.19224543956604792, + "grad_norm": 5.15625, + "learning_rate": 9.885787256647822e-06, + "loss": 0.85684052, + "memory(GiB)": 117.54, + "step": 8240, + "train_speed(iter/s)": 0.203835 + }, + { + "acc": 0.78805923, + "epoch": 0.19247874713833682, + "grad_norm": 5.84375, + "learning_rate": 9.885385440308137e-06, + "loss": 0.73936048, + "memory(GiB)": 117.54, + "step": 8250, + "train_speed(iter/s)": 0.203964 + }, + { + "acc": 0.75716724, + "epoch": 0.1927120547106257, + "grad_norm": 7.59375, + "learning_rate": 9.8849829265811e-06, + "loss": 0.89653664, + "memory(GiB)": 117.54, + "step": 8260, + "train_speed(iter/s)": 0.204088 + }, + { + "acc": 0.78018217, + "epoch": 0.1929453622829146, + "grad_norm": 5.75, + "learning_rate": 9.884579715524168e-06, + "loss": 0.77996178, + "memory(GiB)": 117.54, + "step": 8270, + "train_speed(iter/s)": 0.204218 + }, + { + "acc": 0.74880614, + "epoch": 0.1931786698552035, + "grad_norm": 6.28125, + "learning_rate": 9.884175807194902e-06, + "loss": 0.90730553, + "memory(GiB)": 117.54, + "step": 8280, + "train_speed(iter/s)": 0.204343 + }, + { + "acc": 0.77563052, + "epoch": 0.1934119774274924, + "grad_norm": 6.25, + "learning_rate": 9.883771201650958e-06, + "loss": 0.80887585, + "memory(GiB)": 117.54, + "step": 8290, + "train_speed(iter/s)": 0.204473 + }, + { + "acc": 0.76742563, + "epoch": 0.1936452849997813, + "grad_norm": 5.71875, + "learning_rate": 9.883365898950094e-06, + "loss": 0.83605604, + "memory(GiB)": 117.54, + "step": 8300, + "train_speed(iter/s)": 0.204603 + }, + { + "acc": 0.77535963, + "epoch": 0.19387859257207016, + "grad_norm": 5.65625, + "learning_rate": 9.882959899150166e-06, + "loss": 0.79957132, + "memory(GiB)": 117.54, + "step": 8310, + "train_speed(iter/s)": 0.204735 + }, + { + "acc": 0.76650438, + "epoch": 0.19411190014435906, + "grad_norm": 8.25, + "learning_rate": 9.882553202309131e-06, + "loss": 0.83275509, + "memory(GiB)": 117.54, + "step": 8320, + "train_speed(iter/s)": 0.204866 + }, + { + "acc": 0.76723166, + "epoch": 0.19434520771664796, + "grad_norm": 6.84375, + "learning_rate": 9.882145808485045e-06, + "loss": 0.83784151, + "memory(GiB)": 117.54, + "step": 8330, + "train_speed(iter/s)": 0.204991 + }, + { + "acc": 0.76202507, + "epoch": 0.19457851528893685, + "grad_norm": 6.6875, + "learning_rate": 9.881737717736063e-06, + "loss": 0.8698616, + "memory(GiB)": 117.54, + "step": 8340, + "train_speed(iter/s)": 0.205119 + }, + { + "acc": 0.76839499, + "epoch": 0.19481182286122575, + "grad_norm": 6.71875, + "learning_rate": 9.88132893012044e-06, + "loss": 0.85822887, + "memory(GiB)": 117.54, + "step": 8350, + "train_speed(iter/s)": 0.205249 + }, + { + "acc": 0.75704322, + "epoch": 0.19504513043351462, + "grad_norm": 4.8125, + "learning_rate": 9.88091944569653e-06, + "loss": 0.88926487, + "memory(GiB)": 117.54, + "step": 8360, + "train_speed(iter/s)": 0.20537 + }, + { + "acc": 0.7531908, + "epoch": 0.19527843800580352, + "grad_norm": 6.21875, + "learning_rate": 9.880509264522788e-06, + "loss": 0.89586658, + "memory(GiB)": 117.54, + "step": 8370, + "train_speed(iter/s)": 0.205492 + }, + { + "acc": 0.76802006, + "epoch": 0.19551174557809242, + "grad_norm": 4.59375, + "learning_rate": 9.880098386657765e-06, + "loss": 0.83442373, + "memory(GiB)": 117.54, + "step": 8380, + "train_speed(iter/s)": 0.205617 + }, + { + "acc": 0.75828171, + "epoch": 0.19574505315038132, + "grad_norm": 12.1875, + "learning_rate": 9.879686812160116e-06, + "loss": 0.86747961, + "memory(GiB)": 117.54, + "step": 8390, + "train_speed(iter/s)": 0.205742 + }, + { + "acc": 0.77685285, + "epoch": 0.1959783607226702, + "grad_norm": 13.125, + "learning_rate": 9.87927454108859e-06, + "loss": 0.81312294, + "memory(GiB)": 117.54, + "step": 8400, + "train_speed(iter/s)": 0.205862 + }, + { + "acc": 0.75890455, + "epoch": 0.1962116682949591, + "grad_norm": 4.625, + "learning_rate": 9.878861573502044e-06, + "loss": 0.90135555, + "memory(GiB)": 117.54, + "step": 8410, + "train_speed(iter/s)": 0.205988 + }, + { + "acc": 0.77306185, + "epoch": 0.196444975867248, + "grad_norm": 5.4375, + "learning_rate": 9.878447909459423e-06, + "loss": 0.84094868, + "memory(GiB)": 117.54, + "step": 8420, + "train_speed(iter/s)": 0.206109 + }, + { + "acc": 0.77186542, + "epoch": 0.1966782834395369, + "grad_norm": 4.53125, + "learning_rate": 9.878033549019781e-06, + "loss": 0.81778107, + "memory(GiB)": 117.54, + "step": 8430, + "train_speed(iter/s)": 0.206226 + }, + { + "acc": 0.7592206, + "epoch": 0.1969115910118258, + "grad_norm": 5.75, + "learning_rate": 9.877618492242267e-06, + "loss": 0.90185413, + "memory(GiB)": 117.54, + "step": 8440, + "train_speed(iter/s)": 0.206345 + }, + { + "acc": 0.77778916, + "epoch": 0.19714489858411466, + "grad_norm": 5.28125, + "learning_rate": 9.877202739186132e-06, + "loss": 0.79591818, + "memory(GiB)": 117.54, + "step": 8450, + "train_speed(iter/s)": 0.20647 + }, + { + "acc": 0.77902613, + "epoch": 0.19737820615640356, + "grad_norm": 9.125, + "learning_rate": 9.876786289910721e-06, + "loss": 0.79726973, + "memory(GiB)": 117.54, + "step": 8460, + "train_speed(iter/s)": 0.206596 + }, + { + "acc": 0.75946159, + "epoch": 0.19761151372869246, + "grad_norm": 8.6875, + "learning_rate": 9.876369144475484e-06, + "loss": 0.8990036, + "memory(GiB)": 117.54, + "step": 8470, + "train_speed(iter/s)": 0.206715 + }, + { + "acc": 0.74261923, + "epoch": 0.19784482130098135, + "grad_norm": 6.125, + "learning_rate": 9.875951302939967e-06, + "loss": 0.91385193, + "memory(GiB)": 117.54, + "step": 8480, + "train_speed(iter/s)": 0.206839 + }, + { + "acc": 0.78411551, + "epoch": 0.19807812887327025, + "grad_norm": 4.53125, + "learning_rate": 9.87553276536382e-06, + "loss": 0.79219027, + "memory(GiB)": 117.54, + "step": 8490, + "train_speed(iter/s)": 0.206957 + }, + { + "acc": 0.7668088, + "epoch": 0.19831143644555912, + "grad_norm": 9.125, + "learning_rate": 9.875113531806785e-06, + "loss": 0.83160515, + "memory(GiB)": 117.54, + "step": 8500, + "train_speed(iter/s)": 0.207077 + }, + { + "epoch": 0.19831143644555912, + "eval_acc": 0.731228333486271, + "eval_loss": 0.8559815883636475, + "eval_runtime": 1270.6586, + "eval_samples_per_second": 28.325, + "eval_steps_per_second": 14.163, + "step": 8500 + }, + { + "acc": 0.75493984, + "epoch": 0.19854474401784802, + "grad_norm": 6.3125, + "learning_rate": 9.874693602328711e-06, + "loss": 0.896556, + "memory(GiB)": 117.54, + "step": 8510, + "train_speed(iter/s)": 0.200895 + }, + { + "acc": 0.78136353, + "epoch": 0.19877805159013692, + "grad_norm": 5.03125, + "learning_rate": 9.874272976989541e-06, + "loss": 0.78143473, + "memory(GiB)": 117.54, + "step": 8520, + "train_speed(iter/s)": 0.201016 + }, + { + "acc": 0.76888914, + "epoch": 0.19901135916242582, + "grad_norm": 6.0625, + "learning_rate": 9.87385165584932e-06, + "loss": 0.82431383, + "memory(GiB)": 117.54, + "step": 8530, + "train_speed(iter/s)": 0.201146 + }, + { + "acc": 0.75388536, + "epoch": 0.19924466673471472, + "grad_norm": 5.84375, + "learning_rate": 9.873429638968191e-06, + "loss": 0.89886112, + "memory(GiB)": 117.54, + "step": 8540, + "train_speed(iter/s)": 0.20127 + }, + { + "acc": 0.74465961, + "epoch": 0.1994779743070036, + "grad_norm": 5.5, + "learning_rate": 9.873006926406397e-06, + "loss": 0.93650312, + "memory(GiB)": 117.54, + "step": 8550, + "train_speed(iter/s)": 0.201395 + }, + { + "acc": 0.76810255, + "epoch": 0.1997112818792925, + "grad_norm": 11.0625, + "learning_rate": 9.872583518224279e-06, + "loss": 0.83152609, + "memory(GiB)": 117.54, + "step": 8560, + "train_speed(iter/s)": 0.201522 + }, + { + "acc": 0.75034895, + "epoch": 0.1999445894515814, + "grad_norm": 4.375, + "learning_rate": 9.872159414482279e-06, + "loss": 0.90507641, + "memory(GiB)": 117.54, + "step": 8570, + "train_speed(iter/s)": 0.201644 + }, + { + "acc": 0.76051202, + "epoch": 0.2001778970238703, + "grad_norm": 5.40625, + "learning_rate": 9.871734615240938e-06, + "loss": 0.88187637, + "memory(GiB)": 117.54, + "step": 8580, + "train_speed(iter/s)": 0.201772 + }, + { + "acc": 0.75299907, + "epoch": 0.2004112045961592, + "grad_norm": 6.875, + "learning_rate": 9.871309120560897e-06, + "loss": 0.88183479, + "memory(GiB)": 117.54, + "step": 8590, + "train_speed(iter/s)": 0.201899 + }, + { + "acc": 0.77017865, + "epoch": 0.20064451216844806, + "grad_norm": 7.375, + "learning_rate": 9.870882930502894e-06, + "loss": 0.84091225, + "memory(GiB)": 117.54, + "step": 8600, + "train_speed(iter/s)": 0.202013 + }, + { + "acc": 0.76628962, + "epoch": 0.20087781974073696, + "grad_norm": 6.84375, + "learning_rate": 9.870456045127767e-06, + "loss": 0.87175293, + "memory(GiB)": 117.54, + "step": 8610, + "train_speed(iter/s)": 0.202136 + }, + { + "acc": 0.77316351, + "epoch": 0.20111112731302586, + "grad_norm": 5.59375, + "learning_rate": 9.870028464496455e-06, + "loss": 0.83083458, + "memory(GiB)": 117.54, + "step": 8620, + "train_speed(iter/s)": 0.202255 + }, + { + "acc": 0.74215651, + "epoch": 0.20134443488531475, + "grad_norm": 7.46875, + "learning_rate": 9.869600188669995e-06, + "loss": 0.96017342, + "memory(GiB)": 117.54, + "step": 8630, + "train_speed(iter/s)": 0.20237 + }, + { + "acc": 0.76919737, + "epoch": 0.20157774245760363, + "grad_norm": 4.625, + "learning_rate": 9.869171217709522e-06, + "loss": 0.82330322, + "memory(GiB)": 117.54, + "step": 8640, + "train_speed(iter/s)": 0.202492 + }, + { + "acc": 0.76708584, + "epoch": 0.20181105002989252, + "grad_norm": 9.375, + "learning_rate": 9.86874155167627e-06, + "loss": 0.82590866, + "memory(GiB)": 117.54, + "step": 8650, + "train_speed(iter/s)": 0.202616 + }, + { + "acc": 0.7566463, + "epoch": 0.20204435760218142, + "grad_norm": 5.96875, + "learning_rate": 9.868311190631578e-06, + "loss": 0.8966464, + "memory(GiB)": 117.54, + "step": 8660, + "train_speed(iter/s)": 0.202744 + }, + { + "acc": 0.75115132, + "epoch": 0.20227766517447032, + "grad_norm": 4.4375, + "learning_rate": 9.867880134636877e-06, + "loss": 0.91379795, + "memory(GiB)": 117.54, + "step": 8670, + "train_speed(iter/s)": 0.202873 + }, + { + "acc": 0.78020339, + "epoch": 0.20251097274675922, + "grad_norm": 5.78125, + "learning_rate": 9.867448383753702e-06, + "loss": 0.78962541, + "memory(GiB)": 117.54, + "step": 8680, + "train_speed(iter/s)": 0.202989 + }, + { + "acc": 0.77505159, + "epoch": 0.2027442803190481, + "grad_norm": 10.125, + "learning_rate": 9.867015938043685e-06, + "loss": 0.82020912, + "memory(GiB)": 117.54, + "step": 8690, + "train_speed(iter/s)": 0.203111 + }, + { + "acc": 0.79113216, + "epoch": 0.202977587891337, + "grad_norm": 6.03125, + "learning_rate": 9.866582797568556e-06, + "loss": 0.7530036, + "memory(GiB)": 117.54, + "step": 8700, + "train_speed(iter/s)": 0.203235 + }, + { + "acc": 0.75312705, + "epoch": 0.2032108954636259, + "grad_norm": 7.125, + "learning_rate": 9.866148962390146e-06, + "loss": 0.90598717, + "memory(GiB)": 117.54, + "step": 8710, + "train_speed(iter/s)": 0.203359 + }, + { + "acc": 0.765062, + "epoch": 0.2034442030359148, + "grad_norm": 4.28125, + "learning_rate": 9.865714432570384e-06, + "loss": 0.86992626, + "memory(GiB)": 117.54, + "step": 8720, + "train_speed(iter/s)": 0.203478 + }, + { + "acc": 0.76311955, + "epoch": 0.2036775106082037, + "grad_norm": 6.25, + "learning_rate": 9.8652792081713e-06, + "loss": 0.85822725, + "memory(GiB)": 117.54, + "step": 8730, + "train_speed(iter/s)": 0.203588 + }, + { + "acc": 0.74687796, + "epoch": 0.20391081818049256, + "grad_norm": 6.71875, + "learning_rate": 9.864843289255026e-06, + "loss": 0.91143522, + "memory(GiB)": 117.54, + "step": 8740, + "train_speed(iter/s)": 0.203712 + }, + { + "acc": 0.75389738, + "epoch": 0.20414412575278146, + "grad_norm": 6.9375, + "learning_rate": 9.864406675883784e-06, + "loss": 0.90044785, + "memory(GiB)": 117.54, + "step": 8750, + "train_speed(iter/s)": 0.203828 + }, + { + "acc": 0.75677834, + "epoch": 0.20437743332507036, + "grad_norm": 7.6875, + "learning_rate": 9.863969368119902e-06, + "loss": 0.88022346, + "memory(GiB)": 117.54, + "step": 8760, + "train_speed(iter/s)": 0.203942 + }, + { + "acc": 0.76615176, + "epoch": 0.20461074089735926, + "grad_norm": 8.4375, + "learning_rate": 9.863531366025804e-06, + "loss": 0.8678463, + "memory(GiB)": 117.54, + "step": 8770, + "train_speed(iter/s)": 0.204065 + }, + { + "acc": 0.76859779, + "epoch": 0.20484404846964815, + "grad_norm": 6.6875, + "learning_rate": 9.863092669664018e-06, + "loss": 0.8375844, + "memory(GiB)": 117.54, + "step": 8780, + "train_speed(iter/s)": 0.204184 + }, + { + "acc": 0.76002398, + "epoch": 0.20507735604193703, + "grad_norm": 6.4375, + "learning_rate": 9.862653279097166e-06, + "loss": 0.86004581, + "memory(GiB)": 117.54, + "step": 8790, + "train_speed(iter/s)": 0.204307 + }, + { + "acc": 0.75623741, + "epoch": 0.20531066361422592, + "grad_norm": 10.0625, + "learning_rate": 9.86221319438797e-06, + "loss": 0.87964134, + "memory(GiB)": 117.54, + "step": 8800, + "train_speed(iter/s)": 0.204424 + }, + { + "acc": 0.74737983, + "epoch": 0.20554397118651482, + "grad_norm": 5.0625, + "learning_rate": 9.861772415599256e-06, + "loss": 0.93885403, + "memory(GiB)": 117.54, + "step": 8810, + "train_speed(iter/s)": 0.204549 + }, + { + "acc": 0.74907146, + "epoch": 0.20577727875880372, + "grad_norm": 6.125, + "learning_rate": 9.861330942793939e-06, + "loss": 0.90780516, + "memory(GiB)": 117.54, + "step": 8820, + "train_speed(iter/s)": 0.204666 + }, + { + "acc": 0.7763402, + "epoch": 0.20601058633109262, + "grad_norm": 4.96875, + "learning_rate": 9.860888776035043e-06, + "loss": 0.80836182, + "memory(GiB)": 117.54, + "step": 8830, + "train_speed(iter/s)": 0.204762 + }, + { + "acc": 0.74543486, + "epoch": 0.2062438939033815, + "grad_norm": 5.40625, + "learning_rate": 9.860445915385687e-06, + "loss": 0.91186695, + "memory(GiB)": 117.54, + "step": 8840, + "train_speed(iter/s)": 0.204882 + }, + { + "acc": 0.76045542, + "epoch": 0.2064772014756704, + "grad_norm": 5.3125, + "learning_rate": 9.860002360909086e-06, + "loss": 0.84907341, + "memory(GiB)": 117.54, + "step": 8850, + "train_speed(iter/s)": 0.20499 + }, + { + "acc": 0.74592304, + "epoch": 0.2067105090479593, + "grad_norm": 4.03125, + "learning_rate": 9.859558112668563e-06, + "loss": 0.89834366, + "memory(GiB)": 117.54, + "step": 8860, + "train_speed(iter/s)": 0.20511 + }, + { + "acc": 0.78678131, + "epoch": 0.2069438166202482, + "grad_norm": 6.28125, + "learning_rate": 9.85911317072753e-06, + "loss": 0.76607027, + "memory(GiB)": 117.54, + "step": 8870, + "train_speed(iter/s)": 0.205225 + }, + { + "acc": 0.76762028, + "epoch": 0.2071771241925371, + "grad_norm": 8.375, + "learning_rate": 9.858667535149503e-06, + "loss": 0.83826656, + "memory(GiB)": 117.54, + "step": 8880, + "train_speed(iter/s)": 0.205346 + }, + { + "acc": 0.78018045, + "epoch": 0.20741043176482596, + "grad_norm": 8.5625, + "learning_rate": 9.858221205998097e-06, + "loss": 0.79698944, + "memory(GiB)": 117.54, + "step": 8890, + "train_speed(iter/s)": 0.205468 + }, + { + "acc": 0.76134815, + "epoch": 0.20764373933711486, + "grad_norm": 7.9375, + "learning_rate": 9.857774183337025e-06, + "loss": 0.86685352, + "memory(GiB)": 117.54, + "step": 8900, + "train_speed(iter/s)": 0.205593 + }, + { + "acc": 0.77512984, + "epoch": 0.20787704690940376, + "grad_norm": 6.84375, + "learning_rate": 9.8573264672301e-06, + "loss": 0.8138916, + "memory(GiB)": 117.54, + "step": 8910, + "train_speed(iter/s)": 0.205715 + }, + { + "acc": 0.74891701, + "epoch": 0.20811035448169266, + "grad_norm": 5.4375, + "learning_rate": 9.856878057741233e-06, + "loss": 0.89883404, + "memory(GiB)": 117.54, + "step": 8920, + "train_speed(iter/s)": 0.20584 + }, + { + "acc": 0.76096349, + "epoch": 0.20834366205398153, + "grad_norm": 6.0, + "learning_rate": 9.856428954934434e-06, + "loss": 0.85659542, + "memory(GiB)": 117.54, + "step": 8930, + "train_speed(iter/s)": 0.205957 + }, + { + "acc": 0.76727762, + "epoch": 0.20857696962627043, + "grad_norm": 6.125, + "learning_rate": 9.855979158873812e-06, + "loss": 0.82931509, + "memory(GiB)": 117.54, + "step": 8940, + "train_speed(iter/s)": 0.206077 + }, + { + "acc": 0.76898785, + "epoch": 0.20881027719855932, + "grad_norm": 7.09375, + "learning_rate": 9.855528669623576e-06, + "loss": 0.81118364, + "memory(GiB)": 117.54, + "step": 8950, + "train_speed(iter/s)": 0.206187 + }, + { + "acc": 0.744419, + "epoch": 0.20904358477084822, + "grad_norm": 7.5, + "learning_rate": 9.855077487248034e-06, + "loss": 0.93352222, + "memory(GiB)": 117.54, + "step": 8960, + "train_speed(iter/s)": 0.206303 + }, + { + "acc": 0.7688417, + "epoch": 0.20927689234313712, + "grad_norm": 4.75, + "learning_rate": 9.85462561181159e-06, + "loss": 0.8121109, + "memory(GiB)": 117.54, + "step": 8970, + "train_speed(iter/s)": 0.206412 + }, + { + "acc": 0.7531055, + "epoch": 0.209510199915426, + "grad_norm": 5.96875, + "learning_rate": 9.85417304337875e-06, + "loss": 0.89911957, + "memory(GiB)": 117.54, + "step": 8980, + "train_speed(iter/s)": 0.206523 + }, + { + "acc": 0.74656286, + "epoch": 0.2097435074877149, + "grad_norm": 6.625, + "learning_rate": 9.85371978201412e-06, + "loss": 0.93082085, + "memory(GiB)": 117.54, + "step": 8990, + "train_speed(iter/s)": 0.206644 + }, + { + "acc": 0.7624054, + "epoch": 0.2099768150600038, + "grad_norm": 5.90625, + "learning_rate": 9.8532658277824e-06, + "loss": 0.86307392, + "memory(GiB)": 117.54, + "step": 9000, + "train_speed(iter/s)": 0.206763 + }, + { + "epoch": 0.2099768150600038, + "eval_acc": 0.7314818418060042, + "eval_loss": 0.8535876274108887, + "eval_runtime": 1269.9932, + "eval_samples_per_second": 28.34, + "eval_steps_per_second": 14.17, + "step": 9000 + }, + { + "acc": 0.76916304, + "epoch": 0.2102101226322927, + "grad_norm": 5.125, + "learning_rate": 9.852811180748391e-06, + "loss": 0.83703365, + "memory(GiB)": 117.54, + "step": 9010, + "train_speed(iter/s)": 0.200937 + }, + { + "acc": 0.77324028, + "epoch": 0.2104434302045816, + "grad_norm": 8.75, + "learning_rate": 9.852355840976996e-06, + "loss": 0.8149888, + "memory(GiB)": 117.54, + "step": 9020, + "train_speed(iter/s)": 0.201051 + }, + { + "acc": 0.76231546, + "epoch": 0.21067673777687046, + "grad_norm": 7.28125, + "learning_rate": 9.851899808533218e-06, + "loss": 0.85619822, + "memory(GiB)": 117.54, + "step": 9030, + "train_speed(iter/s)": 0.201166 + }, + { + "acc": 0.76985378, + "epoch": 0.21091004534915936, + "grad_norm": 7.0, + "learning_rate": 9.851443083482149e-06, + "loss": 0.83200512, + "memory(GiB)": 117.54, + "step": 9040, + "train_speed(iter/s)": 0.20128 + }, + { + "acc": 0.7554831, + "epoch": 0.21114335292144826, + "grad_norm": 7.3125, + "learning_rate": 9.850985665888988e-06, + "loss": 0.89478712, + "memory(GiB)": 117.54, + "step": 9050, + "train_speed(iter/s)": 0.201393 + }, + { + "acc": 0.76672964, + "epoch": 0.21137666049373716, + "grad_norm": 5.65625, + "learning_rate": 9.850527555819036e-06, + "loss": 0.85093822, + "memory(GiB)": 117.54, + "step": 9060, + "train_speed(iter/s)": 0.201505 + }, + { + "acc": 0.76372004, + "epoch": 0.21160996806602606, + "grad_norm": 5.0625, + "learning_rate": 9.850068753337683e-06, + "loss": 0.86411572, + "memory(GiB)": 117.54, + "step": 9070, + "train_speed(iter/s)": 0.201607 + }, + { + "acc": 0.76573353, + "epoch": 0.21184327563831493, + "grad_norm": 5.25, + "learning_rate": 9.849609258510423e-06, + "loss": 0.83601494, + "memory(GiB)": 117.54, + "step": 9080, + "train_speed(iter/s)": 0.201721 + }, + { + "acc": 0.74441233, + "epoch": 0.21207658321060383, + "grad_norm": 5.4375, + "learning_rate": 9.84914907140285e-06, + "loss": 0.92128773, + "memory(GiB)": 117.54, + "step": 9090, + "train_speed(iter/s)": 0.201835 + }, + { + "acc": 0.76748171, + "epoch": 0.21230989078289272, + "grad_norm": 8.125, + "learning_rate": 9.848688192080657e-06, + "loss": 0.83393631, + "memory(GiB)": 117.54, + "step": 9100, + "train_speed(iter/s)": 0.201944 + }, + { + "acc": 0.7691443, + "epoch": 0.21254319835518162, + "grad_norm": 5.625, + "learning_rate": 9.848226620609634e-06, + "loss": 0.85348082, + "memory(GiB)": 117.54, + "step": 9110, + "train_speed(iter/s)": 0.202057 + }, + { + "acc": 0.75306101, + "epoch": 0.21277650592747052, + "grad_norm": 4.78125, + "learning_rate": 9.847764357055669e-06, + "loss": 0.90074921, + "memory(GiB)": 117.54, + "step": 9120, + "train_speed(iter/s)": 0.202178 + }, + { + "acc": 0.76210451, + "epoch": 0.2130098134997594, + "grad_norm": 5.71875, + "learning_rate": 9.84730140148475e-06, + "loss": 0.870508, + "memory(GiB)": 117.54, + "step": 9130, + "train_speed(iter/s)": 0.202294 + }, + { + "acc": 0.77442284, + "epoch": 0.2132431210720483, + "grad_norm": 6.0625, + "learning_rate": 9.846837753962964e-06, + "loss": 0.81348133, + "memory(GiB)": 117.54, + "step": 9140, + "train_speed(iter/s)": 0.202414 + }, + { + "acc": 0.75884104, + "epoch": 0.2134764286443372, + "grad_norm": 5.1875, + "learning_rate": 9.846373414556495e-06, + "loss": 0.85764046, + "memory(GiB)": 117.54, + "step": 9150, + "train_speed(iter/s)": 0.202533 + }, + { + "acc": 0.76104126, + "epoch": 0.2137097362166261, + "grad_norm": 20.75, + "learning_rate": 9.84590838333163e-06, + "loss": 0.86669235, + "memory(GiB)": 117.54, + "step": 9160, + "train_speed(iter/s)": 0.202649 + }, + { + "acc": 0.78756847, + "epoch": 0.21394304378891496, + "grad_norm": 4.8125, + "learning_rate": 9.845442660354752e-06, + "loss": 0.76561289, + "memory(GiB)": 117.54, + "step": 9170, + "train_speed(iter/s)": 0.202771 + }, + { + "acc": 0.76651459, + "epoch": 0.21417635136120386, + "grad_norm": 4.375, + "learning_rate": 9.844976245692341e-06, + "loss": 0.84633484, + "memory(GiB)": 117.54, + "step": 9180, + "train_speed(iter/s)": 0.202893 + }, + { + "acc": 0.75503793, + "epoch": 0.21440965893349276, + "grad_norm": 7.46875, + "learning_rate": 9.84450913941098e-06, + "loss": 0.88526068, + "memory(GiB)": 117.54, + "step": 9190, + "train_speed(iter/s)": 0.202997 + }, + { + "acc": 0.76714697, + "epoch": 0.21464296650578166, + "grad_norm": 6.0625, + "learning_rate": 9.844041341577344e-06, + "loss": 0.82248907, + "memory(GiB)": 117.54, + "step": 9200, + "train_speed(iter/s)": 0.203109 + }, + { + "acc": 0.74727135, + "epoch": 0.21487627407807056, + "grad_norm": 4.125, + "learning_rate": 9.843572852258216e-06, + "loss": 0.91963558, + "memory(GiB)": 117.54, + "step": 9210, + "train_speed(iter/s)": 0.203216 + }, + { + "acc": 0.7496871, + "epoch": 0.21510958165035943, + "grad_norm": 4.78125, + "learning_rate": 9.843103671520469e-06, + "loss": 0.92044659, + "memory(GiB)": 117.54, + "step": 9220, + "train_speed(iter/s)": 0.203325 + }, + { + "acc": 0.76813116, + "epoch": 0.21534288922264833, + "grad_norm": 7.75, + "learning_rate": 9.842633799431081e-06, + "loss": 0.83491936, + "memory(GiB)": 117.54, + "step": 9230, + "train_speed(iter/s)": 0.203434 + }, + { + "acc": 0.7559175, + "epoch": 0.21557619679493722, + "grad_norm": 7.75, + "learning_rate": 9.842163236057123e-06, + "loss": 0.86974545, + "memory(GiB)": 117.54, + "step": 9240, + "train_speed(iter/s)": 0.203544 + }, + { + "acc": 0.75992718, + "epoch": 0.21580950436722612, + "grad_norm": 4.46875, + "learning_rate": 9.841691981465771e-06, + "loss": 0.87016802, + "memory(GiB)": 117.54, + "step": 9250, + "train_speed(iter/s)": 0.203653 + }, + { + "acc": 0.74953775, + "epoch": 0.21604281193951502, + "grad_norm": 7.09375, + "learning_rate": 9.841220035724295e-06, + "loss": 0.92368717, + "memory(GiB)": 117.54, + "step": 9260, + "train_speed(iter/s)": 0.203766 + }, + { + "acc": 0.77228193, + "epoch": 0.2162761195118039, + "grad_norm": 4.53125, + "learning_rate": 9.840747398900066e-06, + "loss": 0.80582151, + "memory(GiB)": 117.54, + "step": 9270, + "train_speed(iter/s)": 0.203884 + }, + { + "acc": 0.74684772, + "epoch": 0.2165094270840928, + "grad_norm": 4.84375, + "learning_rate": 9.840274071060552e-06, + "loss": 0.92780819, + "memory(GiB)": 117.54, + "step": 9280, + "train_speed(iter/s)": 0.203989 + }, + { + "acc": 0.77230215, + "epoch": 0.2167427346563817, + "grad_norm": 7.28125, + "learning_rate": 9.839800052273319e-06, + "loss": 0.83060665, + "memory(GiB)": 117.54, + "step": 9290, + "train_speed(iter/s)": 0.204102 + }, + { + "acc": 0.7695189, + "epoch": 0.2169760422286706, + "grad_norm": 5.6875, + "learning_rate": 9.839325342606034e-06, + "loss": 0.83614769, + "memory(GiB)": 117.54, + "step": 9300, + "train_speed(iter/s)": 0.204205 + }, + { + "acc": 0.75516381, + "epoch": 0.2172093498009595, + "grad_norm": 4.90625, + "learning_rate": 9.838849942126465e-06, + "loss": 0.88942032, + "memory(GiB)": 117.54, + "step": 9310, + "train_speed(iter/s)": 0.204315 + }, + { + "acc": 0.75599709, + "epoch": 0.21744265737324836, + "grad_norm": 6.5625, + "learning_rate": 9.83837385090247e-06, + "loss": 0.90213356, + "memory(GiB)": 117.54, + "step": 9320, + "train_speed(iter/s)": 0.204429 + }, + { + "acc": 0.7904994, + "epoch": 0.21767596494553726, + "grad_norm": 5.40625, + "learning_rate": 9.837897069002014e-06, + "loss": 0.7411685, + "memory(GiB)": 117.54, + "step": 9330, + "train_speed(iter/s)": 0.204543 + }, + { + "acc": 0.76367722, + "epoch": 0.21790927251782616, + "grad_norm": 5.78125, + "learning_rate": 9.837419596493158e-06, + "loss": 0.86719389, + "memory(GiB)": 117.54, + "step": 9340, + "train_speed(iter/s)": 0.204654 + }, + { + "acc": 0.77815828, + "epoch": 0.21814258009011506, + "grad_norm": 4.59375, + "learning_rate": 9.836941433444058e-06, + "loss": 0.78728352, + "memory(GiB)": 117.54, + "step": 9350, + "train_speed(iter/s)": 0.204767 + }, + { + "acc": 0.75084362, + "epoch": 0.21837588766240396, + "grad_norm": 5.125, + "learning_rate": 9.836462579922977e-06, + "loss": 0.90360699, + "memory(GiB)": 117.54, + "step": 9360, + "train_speed(iter/s)": 0.204881 + }, + { + "acc": 0.76905317, + "epoch": 0.21860919523469283, + "grad_norm": 6.0, + "learning_rate": 9.835983035998264e-06, + "loss": 0.85155258, + "memory(GiB)": 117.54, + "step": 9370, + "train_speed(iter/s)": 0.204998 + }, + { + "acc": 0.79401808, + "epoch": 0.21884250280698173, + "grad_norm": 4.15625, + "learning_rate": 9.835502801738379e-06, + "loss": 0.7368948, + "memory(GiB)": 117.54, + "step": 9380, + "train_speed(iter/s)": 0.205109 + }, + { + "acc": 0.76240873, + "epoch": 0.21907581037927062, + "grad_norm": 12.5625, + "learning_rate": 9.835021877211873e-06, + "loss": 0.86140232, + "memory(GiB)": 117.54, + "step": 9390, + "train_speed(iter/s)": 0.205222 + }, + { + "acc": 0.77225218, + "epoch": 0.21930911795155952, + "grad_norm": 7.59375, + "learning_rate": 9.834540262487399e-06, + "loss": 0.79418178, + "memory(GiB)": 117.54, + "step": 9400, + "train_speed(iter/s)": 0.205338 + }, + { + "acc": 0.76001101, + "epoch": 0.2195424255238484, + "grad_norm": 5.53125, + "learning_rate": 9.834057957633707e-06, + "loss": 0.85294189, + "memory(GiB)": 117.54, + "step": 9410, + "train_speed(iter/s)": 0.205444 + }, + { + "acc": 0.75794129, + "epoch": 0.2197757330961373, + "grad_norm": 4.625, + "learning_rate": 9.833574962719646e-06, + "loss": 0.8874979, + "memory(GiB)": 117.54, + "step": 9420, + "train_speed(iter/s)": 0.205545 + }, + { + "acc": 0.75950623, + "epoch": 0.2200090406684262, + "grad_norm": 5.4375, + "learning_rate": 9.833091277814163e-06, + "loss": 0.86289253, + "memory(GiB)": 117.54, + "step": 9430, + "train_speed(iter/s)": 0.205662 + }, + { + "acc": 0.75737329, + "epoch": 0.2202423482407151, + "grad_norm": 5.4375, + "learning_rate": 9.832606902986305e-06, + "loss": 0.94861965, + "memory(GiB)": 117.54, + "step": 9440, + "train_speed(iter/s)": 0.20577 + }, + { + "acc": 0.78068657, + "epoch": 0.220475655813004, + "grad_norm": 8.25, + "learning_rate": 9.832121838305214e-06, + "loss": 0.7791955, + "memory(GiB)": 117.54, + "step": 9450, + "train_speed(iter/s)": 0.205886 + }, + { + "acc": 0.76481562, + "epoch": 0.22070896338529286, + "grad_norm": 5.5, + "learning_rate": 9.831636083840135e-06, + "loss": 0.84020872, + "memory(GiB)": 117.54, + "step": 9460, + "train_speed(iter/s)": 0.205992 + }, + { + "acc": 0.78759332, + "epoch": 0.22094227095758176, + "grad_norm": 5.8125, + "learning_rate": 9.831149639660409e-06, + "loss": 0.78061409, + "memory(GiB)": 117.54, + "step": 9470, + "train_speed(iter/s)": 0.206099 + }, + { + "acc": 0.76587334, + "epoch": 0.22117557852987066, + "grad_norm": 10.6875, + "learning_rate": 9.830662505835476e-06, + "loss": 0.84685173, + "memory(GiB)": 117.54, + "step": 9480, + "train_speed(iter/s)": 0.206213 + }, + { + "acc": 0.75680537, + "epoch": 0.22140888610215956, + "grad_norm": 6.6875, + "learning_rate": 9.830174682434872e-06, + "loss": 0.88343801, + "memory(GiB)": 117.54, + "step": 9490, + "train_speed(iter/s)": 0.206329 + }, + { + "acc": 0.76228046, + "epoch": 0.22164219367444846, + "grad_norm": 3.953125, + "learning_rate": 9.829686169528237e-06, + "loss": 0.87286196, + "memory(GiB)": 117.54, + "step": 9500, + "train_speed(iter/s)": 0.206441 + }, + { + "epoch": 0.22164219367444846, + "eval_acc": 0.7321477215239133, + "eval_loss": 0.8513048887252808, + "eval_runtime": 1271.0526, + "eval_samples_per_second": 28.316, + "eval_steps_per_second": 14.158, + "step": 9500 + }, + { + "acc": 0.7736063, + "epoch": 0.22187550124673733, + "grad_norm": 4.78125, + "learning_rate": 9.829196967185302e-06, + "loss": 0.82306576, + "memory(GiB)": 117.54, + "step": 9510, + "train_speed(iter/s)": 0.20093 + }, + { + "acc": 0.77956977, + "epoch": 0.22210880881902623, + "grad_norm": 4.65625, + "learning_rate": 9.828707075475905e-06, + "loss": 0.78804388, + "memory(GiB)": 117.54, + "step": 9520, + "train_speed(iter/s)": 0.201042 + }, + { + "acc": 0.76058235, + "epoch": 0.22234211639131513, + "grad_norm": 8.6875, + "learning_rate": 9.828216494469975e-06, + "loss": 0.87693319, + "memory(GiB)": 117.54, + "step": 9530, + "train_speed(iter/s)": 0.201149 + }, + { + "acc": 0.77363548, + "epoch": 0.22257542396360402, + "grad_norm": 4.53125, + "learning_rate": 9.827725224237542e-06, + "loss": 0.83595715, + "memory(GiB)": 117.54, + "step": 9540, + "train_speed(iter/s)": 0.201247 + }, + { + "acc": 0.7730834, + "epoch": 0.22280873153589292, + "grad_norm": 8.8125, + "learning_rate": 9.827233264848737e-06, + "loss": 0.81986485, + "memory(GiB)": 117.54, + "step": 9550, + "train_speed(iter/s)": 0.201349 + }, + { + "acc": 0.74745684, + "epoch": 0.2230420391081818, + "grad_norm": 5.5625, + "learning_rate": 9.826740616373785e-06, + "loss": 0.89649019, + "memory(GiB)": 117.54, + "step": 9560, + "train_speed(iter/s)": 0.201461 + }, + { + "acc": 0.74992027, + "epoch": 0.2232753466804707, + "grad_norm": 8.9375, + "learning_rate": 9.826247278883012e-06, + "loss": 0.92695465, + "memory(GiB)": 117.54, + "step": 9570, + "train_speed(iter/s)": 0.201565 + }, + { + "acc": 0.78267574, + "epoch": 0.2235086542527596, + "grad_norm": 5.875, + "learning_rate": 9.825753252446843e-06, + "loss": 0.79742799, + "memory(GiB)": 117.54, + "step": 9580, + "train_speed(iter/s)": 0.201669 + }, + { + "acc": 0.78160505, + "epoch": 0.2237419618250485, + "grad_norm": 6.4375, + "learning_rate": 9.825258537135798e-06, + "loss": 0.8101265, + "memory(GiB)": 117.54, + "step": 9590, + "train_speed(iter/s)": 0.201776 + }, + { + "acc": 0.77349329, + "epoch": 0.2239752693973374, + "grad_norm": 4.71875, + "learning_rate": 9.8247631330205e-06, + "loss": 0.83796654, + "memory(GiB)": 117.54, + "step": 9600, + "train_speed(iter/s)": 0.201886 + }, + { + "acc": 0.78152819, + "epoch": 0.22420857696962626, + "grad_norm": 6.71875, + "learning_rate": 9.824267040171666e-06, + "loss": 0.80050898, + "memory(GiB)": 117.54, + "step": 9610, + "train_speed(iter/s)": 0.201988 + }, + { + "acc": 0.75929379, + "epoch": 0.22444188454191516, + "grad_norm": 4.90625, + "learning_rate": 9.823770258660113e-06, + "loss": 0.87012081, + "memory(GiB)": 117.54, + "step": 9620, + "train_speed(iter/s)": 0.202098 + }, + { + "acc": 0.78390207, + "epoch": 0.22467519211420406, + "grad_norm": 4.65625, + "learning_rate": 9.823272788556757e-06, + "loss": 0.76243534, + "memory(GiB)": 117.54, + "step": 9630, + "train_speed(iter/s)": 0.202202 + }, + { + "acc": 0.7557579, + "epoch": 0.22490849968649296, + "grad_norm": 7.53125, + "learning_rate": 9.822774629932612e-06, + "loss": 0.90164337, + "memory(GiB)": 117.54, + "step": 9640, + "train_speed(iter/s)": 0.202301 + }, + { + "acc": 0.76308393, + "epoch": 0.22514180725878186, + "grad_norm": 5.75, + "learning_rate": 9.822275782858788e-06, + "loss": 0.85689535, + "memory(GiB)": 117.54, + "step": 9650, + "train_speed(iter/s)": 0.202409 + }, + { + "acc": 0.7550004, + "epoch": 0.22537511483107073, + "grad_norm": 11.5625, + "learning_rate": 9.821776247406498e-06, + "loss": 0.8669714, + "memory(GiB)": 117.54, + "step": 9660, + "train_speed(iter/s)": 0.202514 + }, + { + "acc": 0.75697646, + "epoch": 0.22560842240335963, + "grad_norm": 5.03125, + "learning_rate": 9.821276023647049e-06, + "loss": 0.89136448, + "memory(GiB)": 117.54, + "step": 9670, + "train_speed(iter/s)": 0.202619 + }, + { + "acc": 0.77591777, + "epoch": 0.22584172997564853, + "grad_norm": 5.03125, + "learning_rate": 9.820775111651849e-06, + "loss": 0.80667782, + "memory(GiB)": 117.54, + "step": 9680, + "train_speed(iter/s)": 0.202725 + }, + { + "acc": 0.76816502, + "epoch": 0.22607503754793742, + "grad_norm": 5.96875, + "learning_rate": 9.820273511492401e-06, + "loss": 0.84152288, + "memory(GiB)": 117.54, + "step": 9690, + "train_speed(iter/s)": 0.202834 + }, + { + "acc": 0.76578364, + "epoch": 0.2263083451202263, + "grad_norm": 6.25, + "learning_rate": 9.819771223240312e-06, + "loss": 0.84982014, + "memory(GiB)": 117.54, + "step": 9700, + "train_speed(iter/s)": 0.202942 + }, + { + "acc": 0.76617751, + "epoch": 0.2265416526925152, + "grad_norm": 5.8125, + "learning_rate": 9.819268246967279e-06, + "loss": 0.82987022, + "memory(GiB)": 117.54, + "step": 9710, + "train_speed(iter/s)": 0.203048 + }, + { + "acc": 0.7701932, + "epoch": 0.2267749602648041, + "grad_norm": 7.21875, + "learning_rate": 9.818764582745103e-06, + "loss": 0.84979038, + "memory(GiB)": 117.54, + "step": 9720, + "train_speed(iter/s)": 0.203157 + }, + { + "acc": 0.77504597, + "epoch": 0.227008267837093, + "grad_norm": 5.25, + "learning_rate": 9.818260230645684e-06, + "loss": 0.81210556, + "memory(GiB)": 117.54, + "step": 9730, + "train_speed(iter/s)": 0.203266 + }, + { + "acc": 0.77041559, + "epoch": 0.2272415754093819, + "grad_norm": 5.28125, + "learning_rate": 9.817755190741018e-06, + "loss": 0.81857834, + "memory(GiB)": 117.54, + "step": 9740, + "train_speed(iter/s)": 0.203372 + }, + { + "acc": 0.76968975, + "epoch": 0.22747488298167076, + "grad_norm": 4.375, + "learning_rate": 9.817249463103196e-06, + "loss": 0.82577667, + "memory(GiB)": 117.54, + "step": 9750, + "train_speed(iter/s)": 0.203475 + }, + { + "acc": 0.79651542, + "epoch": 0.22770819055395966, + "grad_norm": 7.4375, + "learning_rate": 9.816743047804413e-06, + "loss": 0.73690844, + "memory(GiB)": 117.54, + "step": 9760, + "train_speed(iter/s)": 0.203583 + }, + { + "acc": 0.76188879, + "epoch": 0.22794149812624856, + "grad_norm": 4.5625, + "learning_rate": 9.816235944916959e-06, + "loss": 0.87093134, + "memory(GiB)": 117.54, + "step": 9770, + "train_speed(iter/s)": 0.203682 + }, + { + "acc": 0.76635122, + "epoch": 0.22817480569853746, + "grad_norm": 6.65625, + "learning_rate": 9.815728154513224e-06, + "loss": 0.84573898, + "memory(GiB)": 117.54, + "step": 9780, + "train_speed(iter/s)": 0.203777 + }, + { + "acc": 0.7656188, + "epoch": 0.22840811327082636, + "grad_norm": 8.5625, + "learning_rate": 9.815219676665694e-06, + "loss": 0.82872429, + "memory(GiB)": 117.54, + "step": 9790, + "train_speed(iter/s)": 0.203881 + }, + { + "acc": 0.75859795, + "epoch": 0.22864142084311523, + "grad_norm": 7.03125, + "learning_rate": 9.814710511446954e-06, + "loss": 0.88318329, + "memory(GiB)": 117.54, + "step": 9800, + "train_speed(iter/s)": 0.203992 + }, + { + "acc": 0.78200207, + "epoch": 0.22887472841540413, + "grad_norm": 5.375, + "learning_rate": 9.814200658929686e-06, + "loss": 0.78139582, + "memory(GiB)": 117.54, + "step": 9810, + "train_speed(iter/s)": 0.204096 + }, + { + "acc": 0.77523031, + "epoch": 0.22910803598769303, + "grad_norm": 6.53125, + "learning_rate": 9.813690119186673e-06, + "loss": 0.83075628, + "memory(GiB)": 117.54, + "step": 9820, + "train_speed(iter/s)": 0.204201 + }, + { + "acc": 0.76592264, + "epoch": 0.22934134355998193, + "grad_norm": 5.625, + "learning_rate": 9.813178892290793e-06, + "loss": 0.84281292, + "memory(GiB)": 117.54, + "step": 9830, + "train_speed(iter/s)": 0.204306 + }, + { + "acc": 0.75805759, + "epoch": 0.22957465113227082, + "grad_norm": 5.1875, + "learning_rate": 9.812666978315026e-06, + "loss": 0.88515167, + "memory(GiB)": 117.54, + "step": 9840, + "train_speed(iter/s)": 0.204417 + }, + { + "acc": 0.75032368, + "epoch": 0.2298079587045597, + "grad_norm": 5.03125, + "learning_rate": 9.812154377332446e-06, + "loss": 0.91492596, + "memory(GiB)": 117.54, + "step": 9850, + "train_speed(iter/s)": 0.204526 + }, + { + "acc": 0.77257566, + "epoch": 0.2300412662768486, + "grad_norm": 8.6875, + "learning_rate": 9.811641089416225e-06, + "loss": 0.81846771, + "memory(GiB)": 117.54, + "step": 9860, + "train_speed(iter/s)": 0.20463 + }, + { + "acc": 0.74543791, + "epoch": 0.2302745738491375, + "grad_norm": 7.96875, + "learning_rate": 9.811127114639637e-06, + "loss": 0.92871838, + "memory(GiB)": 117.54, + "step": 9870, + "train_speed(iter/s)": 0.204741 + }, + { + "acc": 0.79538689, + "epoch": 0.2305078814214264, + "grad_norm": 5.0625, + "learning_rate": 9.810612453076052e-06, + "loss": 0.72743883, + "memory(GiB)": 117.54, + "step": 9880, + "train_speed(iter/s)": 0.204846 + }, + { + "acc": 0.76809082, + "epoch": 0.2307411889937153, + "grad_norm": 5.75, + "learning_rate": 9.810097104798934e-06, + "loss": 0.84220142, + "memory(GiB)": 117.54, + "step": 9890, + "train_speed(iter/s)": 0.204953 + }, + { + "acc": 0.77071457, + "epoch": 0.23097449656600416, + "grad_norm": 7.28125, + "learning_rate": 9.809581069881854e-06, + "loss": 0.81765766, + "memory(GiB)": 117.54, + "step": 9900, + "train_speed(iter/s)": 0.205051 + }, + { + "acc": 0.76900225, + "epoch": 0.23120780413829306, + "grad_norm": 7.96875, + "learning_rate": 9.809064348398474e-06, + "loss": 0.84059877, + "memory(GiB)": 117.54, + "step": 9910, + "train_speed(iter/s)": 0.205156 + }, + { + "acc": 0.7770957, + "epoch": 0.23144111171058196, + "grad_norm": 7.5625, + "learning_rate": 9.808546940422555e-06, + "loss": 0.79744329, + "memory(GiB)": 117.54, + "step": 9920, + "train_speed(iter/s)": 0.205261 + }, + { + "acc": 0.78089809, + "epoch": 0.23167441928287086, + "grad_norm": 7.28125, + "learning_rate": 9.808028846027954e-06, + "loss": 0.79451566, + "memory(GiB)": 117.54, + "step": 9930, + "train_speed(iter/s)": 0.205354 + }, + { + "acc": 0.78215675, + "epoch": 0.23190772685515973, + "grad_norm": 5.09375, + "learning_rate": 9.807510065288635e-06, + "loss": 0.7874649, + "memory(GiB)": 117.54, + "step": 9940, + "train_speed(iter/s)": 0.205452 + }, + { + "acc": 0.7620882, + "epoch": 0.23214103442744863, + "grad_norm": 10.6875, + "learning_rate": 9.806990598278651e-06, + "loss": 0.8667222, + "memory(GiB)": 117.54, + "step": 9950, + "train_speed(iter/s)": 0.205557 + }, + { + "acc": 0.76581755, + "epoch": 0.23237434199973753, + "grad_norm": 6.125, + "learning_rate": 9.806470445072156e-06, + "loss": 0.83361053, + "memory(GiB)": 123.09, + "step": 9960, + "train_speed(iter/s)": 0.205654 + }, + { + "acc": 0.76487746, + "epoch": 0.23260764957202643, + "grad_norm": 5.1875, + "learning_rate": 9.8059496057434e-06, + "loss": 0.82635136, + "memory(GiB)": 123.09, + "step": 9970, + "train_speed(iter/s)": 0.205754 + }, + { + "acc": 0.75906668, + "epoch": 0.23284095714431532, + "grad_norm": 5.5, + "learning_rate": 9.805428080366733e-06, + "loss": 0.8668766, + "memory(GiB)": 123.09, + "step": 9980, + "train_speed(iter/s)": 0.205859 + }, + { + "acc": 0.75760293, + "epoch": 0.2330742647166042, + "grad_norm": 7.0, + "learning_rate": 9.804905869016603e-06, + "loss": 0.8750042, + "memory(GiB)": 123.09, + "step": 9990, + "train_speed(iter/s)": 0.205958 + }, + { + "acc": 0.75372882, + "epoch": 0.2333075722888931, + "grad_norm": 6.5625, + "learning_rate": 9.804382971767559e-06, + "loss": 0.92433701, + "memory(GiB)": 123.09, + "step": 10000, + "train_speed(iter/s)": 0.20606 + }, + { + "epoch": 0.2333075722888931, + "eval_acc": 0.7324511948441245, + "eval_loss": 0.8497293591499329, + "eval_runtime": 1272.2316, + "eval_samples_per_second": 28.29, + "eval_steps_per_second": 14.145, + "step": 10000 + }, + { + "acc": 0.77208815, + "epoch": 0.233540879861182, + "grad_norm": 5.46875, + "learning_rate": 9.803859388694238e-06, + "loss": 0.82612524, + "memory(GiB)": 123.09, + "step": 10010, + "train_speed(iter/s)": 0.200834 + }, + { + "acc": 0.76006479, + "epoch": 0.2337741874334709, + "grad_norm": 7.21875, + "learning_rate": 9.803335119871388e-06, + "loss": 0.87721148, + "memory(GiB)": 123.09, + "step": 10020, + "train_speed(iter/s)": 0.200935 + }, + { + "acc": 0.74797354, + "epoch": 0.2340074950057598, + "grad_norm": 5.125, + "learning_rate": 9.802810165373845e-06, + "loss": 0.93788738, + "memory(GiB)": 123.09, + "step": 10030, + "train_speed(iter/s)": 0.201046 + }, + { + "acc": 0.75885124, + "epoch": 0.23424080257804866, + "grad_norm": 9.375, + "learning_rate": 9.802284525276544e-06, + "loss": 0.86786547, + "memory(GiB)": 123.09, + "step": 10040, + "train_speed(iter/s)": 0.201152 + }, + { + "acc": 0.77307854, + "epoch": 0.23447411015033756, + "grad_norm": 4.6875, + "learning_rate": 9.801758199654522e-06, + "loss": 0.81534319, + "memory(GiB)": 123.09, + "step": 10050, + "train_speed(iter/s)": 0.201251 + }, + { + "acc": 0.75475931, + "epoch": 0.23470741772262646, + "grad_norm": 5.71875, + "learning_rate": 9.801231188582914e-06, + "loss": 0.88743048, + "memory(GiB)": 123.09, + "step": 10060, + "train_speed(iter/s)": 0.201359 + }, + { + "acc": 0.7665451, + "epoch": 0.23494072529491536, + "grad_norm": 6.53125, + "learning_rate": 9.800703492136948e-06, + "loss": 0.84892178, + "memory(GiB)": 123.09, + "step": 10070, + "train_speed(iter/s)": 0.201466 + }, + { + "acc": 0.77399416, + "epoch": 0.23517403286720426, + "grad_norm": 4.53125, + "learning_rate": 9.800175110391952e-06, + "loss": 0.82023621, + "memory(GiB)": 123.09, + "step": 10080, + "train_speed(iter/s)": 0.20157 + }, + { + "acc": 0.75192294, + "epoch": 0.23540734043949313, + "grad_norm": 4.875, + "learning_rate": 9.799646043423353e-06, + "loss": 0.91079044, + "memory(GiB)": 123.09, + "step": 10090, + "train_speed(iter/s)": 0.201671 + }, + { + "acc": 0.76243572, + "epoch": 0.23564064801178203, + "grad_norm": 5.71875, + "learning_rate": 9.799116291306677e-06, + "loss": 0.85686026, + "memory(GiB)": 123.09, + "step": 10100, + "train_speed(iter/s)": 0.201774 + }, + { + "acc": 0.76248827, + "epoch": 0.23587395558407093, + "grad_norm": 5.3125, + "learning_rate": 9.798585854117543e-06, + "loss": 0.856462, + "memory(GiB)": 123.09, + "step": 10110, + "train_speed(iter/s)": 0.201874 + }, + { + "acc": 0.77080183, + "epoch": 0.23610726315635983, + "grad_norm": 4.4375, + "learning_rate": 9.798054731931674e-06, + "loss": 0.8349369, + "memory(GiB)": 123.09, + "step": 10120, + "train_speed(iter/s)": 0.201974 + }, + { + "acc": 0.76452069, + "epoch": 0.23634057072864872, + "grad_norm": 7.125, + "learning_rate": 9.797522924824886e-06, + "loss": 0.85023165, + "memory(GiB)": 123.09, + "step": 10130, + "train_speed(iter/s)": 0.202084 + }, + { + "acc": 0.77273831, + "epoch": 0.2365738783009376, + "grad_norm": 4.84375, + "learning_rate": 9.796990432873093e-06, + "loss": 0.8241353, + "memory(GiB)": 123.09, + "step": 10140, + "train_speed(iter/s)": 0.202192 + }, + { + "acc": 0.77139888, + "epoch": 0.2368071858732265, + "grad_norm": 7.34375, + "learning_rate": 9.79645725615231e-06, + "loss": 0.85567112, + "memory(GiB)": 123.09, + "step": 10150, + "train_speed(iter/s)": 0.202304 + }, + { + "acc": 0.76190987, + "epoch": 0.2370404934455154, + "grad_norm": 4.875, + "learning_rate": 9.795923394738646e-06, + "loss": 0.8605545, + "memory(GiB)": 126.99, + "step": 10160, + "train_speed(iter/s)": 0.202404 + }, + { + "acc": 0.76465502, + "epoch": 0.2372738010178043, + "grad_norm": 9.1875, + "learning_rate": 9.795388848708312e-06, + "loss": 0.86345005, + "memory(GiB)": 126.99, + "step": 10170, + "train_speed(iter/s)": 0.202507 + }, + { + "acc": 0.76446505, + "epoch": 0.23750710859009316, + "grad_norm": 5.5, + "learning_rate": 9.794853618137612e-06, + "loss": 0.86249762, + "memory(GiB)": 126.99, + "step": 10180, + "train_speed(iter/s)": 0.202612 + }, + { + "acc": 0.76790714, + "epoch": 0.23774041616238206, + "grad_norm": 7.15625, + "learning_rate": 9.794317703102951e-06, + "loss": 0.82229271, + "memory(GiB)": 126.99, + "step": 10190, + "train_speed(iter/s)": 0.202713 + }, + { + "acc": 0.76957502, + "epoch": 0.23797372373467096, + "grad_norm": 11.125, + "learning_rate": 9.793781103680833e-06, + "loss": 0.86161594, + "memory(GiB)": 126.99, + "step": 10200, + "train_speed(iter/s)": 0.202818 + }, + { + "acc": 0.7653698, + "epoch": 0.23820703130695986, + "grad_norm": 4.40625, + "learning_rate": 9.793243819947851e-06, + "loss": 0.84829473, + "memory(GiB)": 126.99, + "step": 10210, + "train_speed(iter/s)": 0.202925 + }, + { + "acc": 0.7630024, + "epoch": 0.23844033887924876, + "grad_norm": 6.875, + "learning_rate": 9.79270585198071e-06, + "loss": 0.84405813, + "memory(GiB)": 126.99, + "step": 10220, + "train_speed(iter/s)": 0.203028 + }, + { + "acc": 0.76894646, + "epoch": 0.23867364645153763, + "grad_norm": 6.6875, + "learning_rate": 9.792167199856198e-06, + "loss": 0.83470745, + "memory(GiB)": 126.99, + "step": 10230, + "train_speed(iter/s)": 0.203128 + }, + { + "acc": 0.77872562, + "epoch": 0.23890695402382653, + "grad_norm": 8.5625, + "learning_rate": 9.791627863651212e-06, + "loss": 0.78085318, + "memory(GiB)": 126.99, + "step": 10240, + "train_speed(iter/s)": 0.203233 + }, + { + "acc": 0.76503386, + "epoch": 0.23914026159611543, + "grad_norm": 7.28125, + "learning_rate": 9.791087843442738e-06, + "loss": 0.87600975, + "memory(GiB)": 126.99, + "step": 10250, + "train_speed(iter/s)": 0.203335 + }, + { + "acc": 0.7559289, + "epoch": 0.23937356916840433, + "grad_norm": 8.8125, + "learning_rate": 9.790547139307869e-06, + "loss": 0.90565586, + "memory(GiB)": 126.99, + "step": 10260, + "train_speed(iter/s)": 0.203439 + }, + { + "acc": 0.77096276, + "epoch": 0.23960687674069323, + "grad_norm": 6.53125, + "learning_rate": 9.790005751323787e-06, + "loss": 0.83454618, + "memory(GiB)": 126.99, + "step": 10270, + "train_speed(iter/s)": 0.203535 + }, + { + "acc": 0.77344503, + "epoch": 0.2398401843129821, + "grad_norm": 5.0625, + "learning_rate": 9.789463679567775e-06, + "loss": 0.82620983, + "memory(GiB)": 126.99, + "step": 10280, + "train_speed(iter/s)": 0.203636 + }, + { + "acc": 0.75782394, + "epoch": 0.240073491885271, + "grad_norm": 5.625, + "learning_rate": 9.788920924117213e-06, + "loss": 0.89050446, + "memory(GiB)": 126.99, + "step": 10290, + "train_speed(iter/s)": 0.203738 + }, + { + "acc": 0.76385994, + "epoch": 0.2403067994575599, + "grad_norm": 5.15625, + "learning_rate": 9.788377485049583e-06, + "loss": 0.85634632, + "memory(GiB)": 126.99, + "step": 10300, + "train_speed(iter/s)": 0.203836 + }, + { + "acc": 0.7727993, + "epoch": 0.2405401070298488, + "grad_norm": 4.875, + "learning_rate": 9.787833362442456e-06, + "loss": 0.81806173, + "memory(GiB)": 126.99, + "step": 10310, + "train_speed(iter/s)": 0.203937 + }, + { + "acc": 0.75147309, + "epoch": 0.2407734146021377, + "grad_norm": 6.15625, + "learning_rate": 9.78728855637351e-06, + "loss": 0.8997839, + "memory(GiB)": 126.99, + "step": 10320, + "train_speed(iter/s)": 0.204036 + }, + { + "acc": 0.77584658, + "epoch": 0.24100672217442656, + "grad_norm": 7.5, + "learning_rate": 9.786743066920509e-06, + "loss": 0.79813576, + "memory(GiB)": 126.99, + "step": 10330, + "train_speed(iter/s)": 0.204138 + }, + { + "acc": 0.7439743, + "epoch": 0.24124002974671546, + "grad_norm": 4.90625, + "learning_rate": 9.786196894161329e-06, + "loss": 0.9293148, + "memory(GiB)": 126.99, + "step": 10340, + "train_speed(iter/s)": 0.20424 + }, + { + "acc": 0.77829103, + "epoch": 0.24147333731900436, + "grad_norm": 6.625, + "learning_rate": 9.78565003817393e-06, + "loss": 0.78798056, + "memory(GiB)": 126.99, + "step": 10350, + "train_speed(iter/s)": 0.204339 + }, + { + "acc": 0.75890827, + "epoch": 0.24170664489129326, + "grad_norm": 5.5625, + "learning_rate": 9.78510249903638e-06, + "loss": 0.87458324, + "memory(GiB)": 126.99, + "step": 10360, + "train_speed(iter/s)": 0.204444 + }, + { + "acc": 0.76287951, + "epoch": 0.24193995246358216, + "grad_norm": 4.84375, + "learning_rate": 9.784554276826839e-06, + "loss": 0.84869385, + "memory(GiB)": 126.99, + "step": 10370, + "train_speed(iter/s)": 0.204546 + }, + { + "acc": 0.74085793, + "epoch": 0.24217326003587103, + "grad_norm": 6.625, + "learning_rate": 9.784005371623564e-06, + "loss": 0.93879375, + "memory(GiB)": 126.99, + "step": 10380, + "train_speed(iter/s)": 0.204649 + }, + { + "acc": 0.77869744, + "epoch": 0.24240656760815993, + "grad_norm": 5.15625, + "learning_rate": 9.783455783504911e-06, + "loss": 0.78200302, + "memory(GiB)": 126.99, + "step": 10390, + "train_speed(iter/s)": 0.20475 + }, + { + "acc": 0.7931366, + "epoch": 0.24263987518044883, + "grad_norm": 6.40625, + "learning_rate": 9.782905512549336e-06, + "loss": 0.73280854, + "memory(GiB)": 126.99, + "step": 10400, + "train_speed(iter/s)": 0.204851 + }, + { + "acc": 0.75236487, + "epoch": 0.24287318275273773, + "grad_norm": 4.375, + "learning_rate": 9.78235455883539e-06, + "loss": 0.90658245, + "memory(GiB)": 126.99, + "step": 10410, + "train_speed(iter/s)": 0.204943 + }, + { + "acc": 0.7735096, + "epoch": 0.24310649032502663, + "grad_norm": 8.75, + "learning_rate": 9.781802922441716e-06, + "loss": 0.81277504, + "memory(GiB)": 126.99, + "step": 10420, + "train_speed(iter/s)": 0.205038 + }, + { + "acc": 0.76845932, + "epoch": 0.2433397978973155, + "grad_norm": 5.84375, + "learning_rate": 9.781250603447069e-06, + "loss": 0.85649538, + "memory(GiB)": 126.99, + "step": 10430, + "train_speed(iter/s)": 0.205138 + }, + { + "acc": 0.76778679, + "epoch": 0.2435731054696044, + "grad_norm": 7.40625, + "learning_rate": 9.780697601930282e-06, + "loss": 0.84032097, + "memory(GiB)": 126.99, + "step": 10440, + "train_speed(iter/s)": 0.205233 + }, + { + "acc": 0.77375274, + "epoch": 0.2438064130418933, + "grad_norm": 4.96875, + "learning_rate": 9.780143917970304e-06, + "loss": 0.81406355, + "memory(GiB)": 126.99, + "step": 10450, + "train_speed(iter/s)": 0.205333 + }, + { + "acc": 0.76430569, + "epoch": 0.2440397206141822, + "grad_norm": 6.09375, + "learning_rate": 9.77958955164617e-06, + "loss": 0.84869604, + "memory(GiB)": 126.99, + "step": 10460, + "train_speed(iter/s)": 0.205429 + }, + { + "acc": 0.75566778, + "epoch": 0.24427302818647106, + "grad_norm": 6.5, + "learning_rate": 9.779034503037016e-06, + "loss": 0.88865891, + "memory(GiB)": 126.99, + "step": 10470, + "train_speed(iter/s)": 0.205519 + }, + { + "acc": 0.77346563, + "epoch": 0.24450633575875996, + "grad_norm": 6.09375, + "learning_rate": 9.778478772222075e-06, + "loss": 0.82391491, + "memory(GiB)": 126.99, + "step": 10480, + "train_speed(iter/s)": 0.20562 + }, + { + "acc": 0.7746438, + "epoch": 0.24473964333104886, + "grad_norm": 7.84375, + "learning_rate": 9.777922359280677e-06, + "loss": 0.80834398, + "memory(GiB)": 126.99, + "step": 10490, + "train_speed(iter/s)": 0.205719 + }, + { + "acc": 0.75339031, + "epoch": 0.24497295090333776, + "grad_norm": 6.875, + "learning_rate": 9.777365264292252e-06, + "loss": 0.88665466, + "memory(GiB)": 126.99, + "step": 10500, + "train_speed(iter/s)": 0.205821 + }, + { + "epoch": 0.24497295090333776, + "eval_acc": 0.7332850658617177, + "eval_loss": 0.8476359844207764, + "eval_runtime": 1270.6663, + "eval_samples_per_second": 28.325, + "eval_steps_per_second": 14.163, + "step": 10500 + }, + { + "acc": 0.77165108, + "epoch": 0.24520625847562666, + "grad_norm": 5.03125, + "learning_rate": 9.77680748733632e-06, + "loss": 0.81288023, + "memory(GiB)": 126.99, + "step": 10510, + "train_speed(iter/s)": 0.200861 + }, + { + "acc": 0.75663023, + "epoch": 0.24543956604791553, + "grad_norm": 4.46875, + "learning_rate": 9.77624902849251e-06, + "loss": 0.89590054, + "memory(GiB)": 126.99, + "step": 10520, + "train_speed(iter/s)": 0.200959 + }, + { + "acc": 0.76394377, + "epoch": 0.24567287362020443, + "grad_norm": 5.53125, + "learning_rate": 9.775689887840537e-06, + "loss": 0.86777287, + "memory(GiB)": 126.99, + "step": 10530, + "train_speed(iter/s)": 0.201059 + }, + { + "acc": 0.76071887, + "epoch": 0.24590618119249333, + "grad_norm": 26.5, + "learning_rate": 9.775130065460222e-06, + "loss": 0.88518524, + "memory(GiB)": 126.99, + "step": 10540, + "train_speed(iter/s)": 0.201157 + }, + { + "acc": 0.77137909, + "epoch": 0.24613948876478223, + "grad_norm": 7.40625, + "learning_rate": 9.774569561431474e-06, + "loss": 0.83860283, + "memory(GiB)": 126.99, + "step": 10550, + "train_speed(iter/s)": 0.201256 + }, + { + "acc": 0.7849823, + "epoch": 0.24637279633707113, + "grad_norm": 5.53125, + "learning_rate": 9.77400837583431e-06, + "loss": 0.7651825, + "memory(GiB)": 126.99, + "step": 10560, + "train_speed(iter/s)": 0.201362 + }, + { + "acc": 0.76274223, + "epoch": 0.24660610390936, + "grad_norm": 7.5, + "learning_rate": 9.773446508748836e-06, + "loss": 0.86177063, + "memory(GiB)": 126.99, + "step": 10570, + "train_speed(iter/s)": 0.201458 + }, + { + "acc": 0.75613794, + "epoch": 0.2468394114816489, + "grad_norm": 5.96875, + "learning_rate": 9.772883960255261e-06, + "loss": 0.89295292, + "memory(GiB)": 126.99, + "step": 10580, + "train_speed(iter/s)": 0.201555 + }, + { + "acc": 0.76609902, + "epoch": 0.2470727190539378, + "grad_norm": 6.1875, + "learning_rate": 9.772320730433886e-06, + "loss": 0.84143562, + "memory(GiB)": 126.99, + "step": 10590, + "train_speed(iter/s)": 0.201655 + }, + { + "acc": 0.75564928, + "epoch": 0.2473060266262267, + "grad_norm": 6.3125, + "learning_rate": 9.771756819365114e-06, + "loss": 0.88248844, + "memory(GiB)": 126.99, + "step": 10600, + "train_speed(iter/s)": 0.201757 + }, + { + "acc": 0.7873702, + "epoch": 0.2475393341985156, + "grad_norm": 7.78125, + "learning_rate": 9.771192227129442e-06, + "loss": 0.78887529, + "memory(GiB)": 126.99, + "step": 10610, + "train_speed(iter/s)": 0.201848 + }, + { + "acc": 0.78069005, + "epoch": 0.24777264177080446, + "grad_norm": 4.65625, + "learning_rate": 9.770626953807468e-06, + "loss": 0.78519044, + "memory(GiB)": 126.99, + "step": 10620, + "train_speed(iter/s)": 0.201946 + }, + { + "acc": 0.76918497, + "epoch": 0.24800594934309336, + "grad_norm": 6.0, + "learning_rate": 9.770060999479878e-06, + "loss": 0.83556156, + "memory(GiB)": 126.99, + "step": 10630, + "train_speed(iter/s)": 0.202046 + }, + { + "acc": 0.76681919, + "epoch": 0.24823925691538226, + "grad_norm": 4.875, + "learning_rate": 9.769494364227468e-06, + "loss": 0.83850193, + "memory(GiB)": 126.99, + "step": 10640, + "train_speed(iter/s)": 0.202148 + }, + { + "acc": 0.78110733, + "epoch": 0.24847256448767116, + "grad_norm": 4.84375, + "learning_rate": 9.768927048131122e-06, + "loss": 0.77184925, + "memory(GiB)": 126.99, + "step": 10650, + "train_speed(iter/s)": 0.202252 + }, + { + "acc": 0.76969419, + "epoch": 0.24870587205996006, + "grad_norm": 5.875, + "learning_rate": 9.768359051271827e-06, + "loss": 0.85380192, + "memory(GiB)": 126.99, + "step": 10660, + "train_speed(iter/s)": 0.202351 + }, + { + "acc": 0.7685781, + "epoch": 0.24893917963224893, + "grad_norm": 11.375, + "learning_rate": 9.767790373730663e-06, + "loss": 0.84533386, + "memory(GiB)": 126.99, + "step": 10670, + "train_speed(iter/s)": 0.202452 + }, + { + "acc": 0.77039485, + "epoch": 0.24917248720453783, + "grad_norm": 5.6875, + "learning_rate": 9.767221015588807e-06, + "loss": 0.84062462, + "memory(GiB)": 126.99, + "step": 10680, + "train_speed(iter/s)": 0.202549 + }, + { + "acc": 0.77639532, + "epoch": 0.24940579477682673, + "grad_norm": 5.84375, + "learning_rate": 9.766650976927536e-06, + "loss": 0.81324234, + "memory(GiB)": 126.99, + "step": 10690, + "train_speed(iter/s)": 0.202647 + }, + { + "acc": 0.77875633, + "epoch": 0.24963910234911563, + "grad_norm": 7.96875, + "learning_rate": 9.766080257828223e-06, + "loss": 0.80693998, + "memory(GiB)": 126.99, + "step": 10700, + "train_speed(iter/s)": 0.202741 + }, + { + "acc": 0.7646112, + "epoch": 0.2498724099214045, + "grad_norm": 6.28125, + "learning_rate": 9.765508858372337e-06, + "loss": 0.85661631, + "memory(GiB)": 126.99, + "step": 10710, + "train_speed(iter/s)": 0.202837 + }, + { + "acc": 0.76114054, + "epoch": 0.2501057174936934, + "grad_norm": 6.65625, + "learning_rate": 9.764936778641448e-06, + "loss": 0.88623295, + "memory(GiB)": 126.99, + "step": 10720, + "train_speed(iter/s)": 0.202941 + }, + { + "acc": 0.77698221, + "epoch": 0.2503390250659823, + "grad_norm": 7.75, + "learning_rate": 9.764364018717215e-06, + "loss": 0.79923716, + "memory(GiB)": 126.99, + "step": 10730, + "train_speed(iter/s)": 0.203033 + }, + { + "acc": 0.74807673, + "epoch": 0.25057233263827117, + "grad_norm": 6.4375, + "learning_rate": 9.763790578681404e-06, + "loss": 0.92849827, + "memory(GiB)": 126.99, + "step": 10740, + "train_speed(iter/s)": 0.20313 + }, + { + "acc": 0.76602993, + "epoch": 0.25080564021056007, + "grad_norm": 5.9375, + "learning_rate": 9.763216458615871e-06, + "loss": 0.8343996, + "memory(GiB)": 126.99, + "step": 10750, + "train_speed(iter/s)": 0.203226 + }, + { + "acc": 0.77881093, + "epoch": 0.25103894778284896, + "grad_norm": 5.25, + "learning_rate": 9.762641658602575e-06, + "loss": 0.7768116, + "memory(GiB)": 126.99, + "step": 10760, + "train_speed(iter/s)": 0.203323 + }, + { + "acc": 0.74730372, + "epoch": 0.25127225535513786, + "grad_norm": 5.84375, + "learning_rate": 9.762066178723562e-06, + "loss": 0.90061388, + "memory(GiB)": 126.99, + "step": 10770, + "train_speed(iter/s)": 0.203419 + }, + { + "acc": 0.77457342, + "epoch": 0.25150556292742676, + "grad_norm": 5.34375, + "learning_rate": 9.761490019060988e-06, + "loss": 0.80694494, + "memory(GiB)": 126.99, + "step": 10780, + "train_speed(iter/s)": 0.203511 + }, + { + "acc": 0.76837416, + "epoch": 0.25173887049971566, + "grad_norm": 5.34375, + "learning_rate": 9.760913179697095e-06, + "loss": 0.82134075, + "memory(GiB)": 126.99, + "step": 10790, + "train_speed(iter/s)": 0.203603 + }, + { + "acc": 0.75787249, + "epoch": 0.25197217807200456, + "grad_norm": 6.125, + "learning_rate": 9.76033566071423e-06, + "loss": 0.88264313, + "memory(GiB)": 126.99, + "step": 10800, + "train_speed(iter/s)": 0.203694 + }, + { + "acc": 0.75982161, + "epoch": 0.25220548564429346, + "grad_norm": 4.65625, + "learning_rate": 9.759757462194832e-06, + "loss": 0.8837141, + "memory(GiB)": 126.99, + "step": 10810, + "train_speed(iter/s)": 0.203783 + }, + { + "acc": 0.78241043, + "epoch": 0.25243879321658236, + "grad_norm": 6.09375, + "learning_rate": 9.759178584221439e-06, + "loss": 0.79735069, + "memory(GiB)": 126.99, + "step": 10820, + "train_speed(iter/s)": 0.203875 + }, + { + "acc": 0.76798344, + "epoch": 0.2526721007888712, + "grad_norm": 7.125, + "learning_rate": 9.758599026876685e-06, + "loss": 0.83385935, + "memory(GiB)": 126.99, + "step": 10830, + "train_speed(iter/s)": 0.203966 + }, + { + "acc": 0.77368031, + "epoch": 0.2529054083611601, + "grad_norm": 7.65625, + "learning_rate": 9.758018790243304e-06, + "loss": 0.8143096, + "memory(GiB)": 126.99, + "step": 10840, + "train_speed(iter/s)": 0.204058 + }, + { + "acc": 0.75938997, + "epoch": 0.253138715933449, + "grad_norm": 5.59375, + "learning_rate": 9.757437874404121e-06, + "loss": 0.85309505, + "memory(GiB)": 126.99, + "step": 10850, + "train_speed(iter/s)": 0.204149 + }, + { + "acc": 0.78056459, + "epoch": 0.2533720235057379, + "grad_norm": 5.625, + "learning_rate": 9.756856279442064e-06, + "loss": 0.77668962, + "memory(GiB)": 126.99, + "step": 10860, + "train_speed(iter/s)": 0.204244 + }, + { + "acc": 0.74807405, + "epoch": 0.2536053310780268, + "grad_norm": 6.0625, + "learning_rate": 9.756274005440156e-06, + "loss": 0.92297764, + "memory(GiB)": 126.99, + "step": 10870, + "train_speed(iter/s)": 0.204338 + }, + { + "acc": 0.76769743, + "epoch": 0.2538386386503157, + "grad_norm": 4.625, + "learning_rate": 9.755691052481515e-06, + "loss": 0.81955452, + "memory(GiB)": 126.99, + "step": 10880, + "train_speed(iter/s)": 0.204435 + }, + { + "acc": 0.7821074, + "epoch": 0.2540719462226046, + "grad_norm": 5.78125, + "learning_rate": 9.755107420649357e-06, + "loss": 0.77177048, + "memory(GiB)": 126.99, + "step": 10890, + "train_speed(iter/s)": 0.204532 + }, + { + "acc": 0.76513309, + "epoch": 0.2543052537948935, + "grad_norm": 5.1875, + "learning_rate": 9.754523110026997e-06, + "loss": 0.84079628, + "memory(GiB)": 126.99, + "step": 10900, + "train_speed(iter/s)": 0.204625 + }, + { + "acc": 0.77776675, + "epoch": 0.2545385613671824, + "grad_norm": 7.125, + "learning_rate": 9.753938120697843e-06, + "loss": 0.79896116, + "memory(GiB)": 126.99, + "step": 10910, + "train_speed(iter/s)": 0.204718 + }, + { + "acc": 0.77102752, + "epoch": 0.2547718689394713, + "grad_norm": 5.34375, + "learning_rate": 9.753352452745406e-06, + "loss": 0.8230175, + "memory(GiB)": 126.99, + "step": 10920, + "train_speed(iter/s)": 0.204808 + }, + { + "acc": 0.7826086, + "epoch": 0.25500517651176013, + "grad_norm": 8.3125, + "learning_rate": 9.752766106253285e-06, + "loss": 0.77752132, + "memory(GiB)": 126.99, + "step": 10930, + "train_speed(iter/s)": 0.204896 + }, + { + "acc": 0.76253233, + "epoch": 0.25523848408404903, + "grad_norm": 4.75, + "learning_rate": 9.752179081305184e-06, + "loss": 0.85609999, + "memory(GiB)": 126.99, + "step": 10940, + "train_speed(iter/s)": 0.204985 + }, + { + "acc": 0.78686914, + "epoch": 0.25547179165633793, + "grad_norm": 5.65625, + "learning_rate": 9.751591377984899e-06, + "loss": 0.78415165, + "memory(GiB)": 126.99, + "step": 10950, + "train_speed(iter/s)": 0.205081 + }, + { + "acc": 0.76985493, + "epoch": 0.25570509922862683, + "grad_norm": 13.25, + "learning_rate": 9.751002996376324e-06, + "loss": 0.87296238, + "memory(GiB)": 126.99, + "step": 10960, + "train_speed(iter/s)": 0.205177 + }, + { + "acc": 0.77404528, + "epoch": 0.25593840680091573, + "grad_norm": 4.375, + "learning_rate": 9.750413936563454e-06, + "loss": 0.80378752, + "memory(GiB)": 126.99, + "step": 10970, + "train_speed(iter/s)": 0.20527 + }, + { + "acc": 0.76339779, + "epoch": 0.25617171437320463, + "grad_norm": 6.125, + "learning_rate": 9.749824198630371e-06, + "loss": 0.87236614, + "memory(GiB)": 126.99, + "step": 10980, + "train_speed(iter/s)": 0.205367 + }, + { + "acc": 0.76326017, + "epoch": 0.2564050219454935, + "grad_norm": 4.78125, + "learning_rate": 9.749233782661267e-06, + "loss": 0.84402294, + "memory(GiB)": 126.99, + "step": 10990, + "train_speed(iter/s)": 0.20546 + }, + { + "acc": 0.78211966, + "epoch": 0.2566383295177824, + "grad_norm": 4.4375, + "learning_rate": 9.74864268874042e-06, + "loss": 0.79133849, + "memory(GiB)": 126.99, + "step": 11000, + "train_speed(iter/s)": 0.20555 + }, + { + "epoch": 0.2566383295177824, + "eval_acc": 0.73350094028686, + "eval_loss": 0.8474711775779724, + "eval_runtime": 1269.9892, + "eval_samples_per_second": 28.34, + "eval_steps_per_second": 14.17, + "step": 11000 + }, + { + "acc": 0.76290751, + "epoch": 0.2568716370900713, + "grad_norm": 4.28125, + "learning_rate": 9.748050916952206e-06, + "loss": 0.85341339, + "memory(GiB)": 126.99, + "step": 11010, + "train_speed(iter/s)": 0.200816 + }, + { + "acc": 0.76456928, + "epoch": 0.2571049446623602, + "grad_norm": 4.84375, + "learning_rate": 9.747458467381104e-06, + "loss": 0.83613319, + "memory(GiB)": 126.99, + "step": 11020, + "train_speed(iter/s)": 0.200915 + }, + { + "acc": 0.77785635, + "epoch": 0.25733825223464907, + "grad_norm": 6.875, + "learning_rate": 9.746865340111686e-06, + "loss": 0.78845282, + "memory(GiB)": 126.99, + "step": 11030, + "train_speed(iter/s)": 0.201015 + }, + { + "acc": 0.77325573, + "epoch": 0.25757155980693797, + "grad_norm": 4.6875, + "learning_rate": 9.74627153522862e-06, + "loss": 0.82584515, + "memory(GiB)": 126.99, + "step": 11040, + "train_speed(iter/s)": 0.201115 + }, + { + "acc": 0.75915756, + "epoch": 0.25780486737922687, + "grad_norm": 7.75, + "learning_rate": 9.74567705281667e-06, + "loss": 0.86543064, + "memory(GiB)": 126.99, + "step": 11050, + "train_speed(iter/s)": 0.201208 + }, + { + "acc": 0.75994439, + "epoch": 0.25803817495151576, + "grad_norm": 5.46875, + "learning_rate": 9.745081892960699e-06, + "loss": 0.87644157, + "memory(GiB)": 126.99, + "step": 11060, + "train_speed(iter/s)": 0.201305 + }, + { + "acc": 0.75638137, + "epoch": 0.25827148252380466, + "grad_norm": 6.21875, + "learning_rate": 9.744486055745667e-06, + "loss": 0.90656376, + "memory(GiB)": 126.99, + "step": 11070, + "train_speed(iter/s)": 0.201399 + }, + { + "acc": 0.76791043, + "epoch": 0.25850479009609356, + "grad_norm": 4.8125, + "learning_rate": 9.743889541256628e-06, + "loss": 0.85354557, + "memory(GiB)": 126.99, + "step": 11080, + "train_speed(iter/s)": 0.201493 + }, + { + "acc": 0.78517094, + "epoch": 0.25873809766838246, + "grad_norm": 10.5, + "learning_rate": 9.743292349578737e-06, + "loss": 0.79755116, + "memory(GiB)": 126.99, + "step": 11090, + "train_speed(iter/s)": 0.201588 + }, + { + "acc": 0.77367783, + "epoch": 0.25897140524067136, + "grad_norm": 6.375, + "learning_rate": 9.742694480797239e-06, + "loss": 0.8688302, + "memory(GiB)": 126.99, + "step": 11100, + "train_speed(iter/s)": 0.201679 + }, + { + "acc": 0.76761403, + "epoch": 0.25920471281296026, + "grad_norm": 5.71875, + "learning_rate": 9.742095934997482e-06, + "loss": 0.84129333, + "memory(GiB)": 126.99, + "step": 11110, + "train_speed(iter/s)": 0.201772 + }, + { + "acc": 0.7736413, + "epoch": 0.2594380203852491, + "grad_norm": 5.28125, + "learning_rate": 9.741496712264908e-06, + "loss": 0.82993221, + "memory(GiB)": 126.99, + "step": 11120, + "train_speed(iter/s)": 0.20186 + }, + { + "acc": 0.80244122, + "epoch": 0.259671327957538, + "grad_norm": 9.8125, + "learning_rate": 9.740896812685057e-06, + "loss": 0.70631914, + "memory(GiB)": 126.99, + "step": 11130, + "train_speed(iter/s)": 0.201957 + }, + { + "acc": 0.78163109, + "epoch": 0.2599046355298269, + "grad_norm": 5.21875, + "learning_rate": 9.740296236343561e-06, + "loss": 0.7652422, + "memory(GiB)": 126.99, + "step": 11140, + "train_speed(iter/s)": 0.202052 + }, + { + "acc": 0.79088631, + "epoch": 0.2601379431021158, + "grad_norm": 6.8125, + "learning_rate": 9.739694983326155e-06, + "loss": 0.74674163, + "memory(GiB)": 126.99, + "step": 11150, + "train_speed(iter/s)": 0.20215 + }, + { + "acc": 0.75785141, + "epoch": 0.2603712506744047, + "grad_norm": 10.1875, + "learning_rate": 9.739093053718669e-06, + "loss": 0.86176929, + "memory(GiB)": 126.99, + "step": 11160, + "train_speed(iter/s)": 0.20224 + }, + { + "acc": 0.77965183, + "epoch": 0.2606045582466936, + "grad_norm": 5.71875, + "learning_rate": 9.738490447607025e-06, + "loss": 0.78320961, + "memory(GiB)": 126.99, + "step": 11170, + "train_speed(iter/s)": 0.202336 + }, + { + "acc": 0.77159529, + "epoch": 0.2608378658189825, + "grad_norm": 10.3125, + "learning_rate": 9.737887165077246e-06, + "loss": 0.82931585, + "memory(GiB)": 126.99, + "step": 11180, + "train_speed(iter/s)": 0.202431 + }, + { + "acc": 0.76718621, + "epoch": 0.2610711733912714, + "grad_norm": 5.09375, + "learning_rate": 9.73728320621545e-06, + "loss": 0.8547121, + "memory(GiB)": 126.99, + "step": 11190, + "train_speed(iter/s)": 0.202523 + }, + { + "acc": 0.76389389, + "epoch": 0.2613044809635603, + "grad_norm": 5.09375, + "learning_rate": 9.736678571107854e-06, + "loss": 0.86680279, + "memory(GiB)": 126.99, + "step": 11200, + "train_speed(iter/s)": 0.202609 + }, + { + "acc": 0.77421455, + "epoch": 0.2615377885358492, + "grad_norm": 5.28125, + "learning_rate": 9.736073259840766e-06, + "loss": 0.80106392, + "memory(GiB)": 126.99, + "step": 11210, + "train_speed(iter/s)": 0.202707 + }, + { + "acc": 0.74991431, + "epoch": 0.26177109610813803, + "grad_norm": 6.28125, + "learning_rate": 9.735467272500597e-06, + "loss": 0.92689228, + "memory(GiB)": 126.99, + "step": 11220, + "train_speed(iter/s)": 0.202804 + }, + { + "acc": 0.78649111, + "epoch": 0.26200440368042693, + "grad_norm": 10.5, + "learning_rate": 9.73486060917385e-06, + "loss": 0.78013344, + "memory(GiB)": 126.99, + "step": 11230, + "train_speed(iter/s)": 0.202899 + }, + { + "acc": 0.76939349, + "epoch": 0.26223771125271583, + "grad_norm": 9.375, + "learning_rate": 9.734253269947128e-06, + "loss": 0.84294491, + "memory(GiB)": 126.99, + "step": 11240, + "train_speed(iter/s)": 0.20299 + }, + { + "acc": 0.74981251, + "epoch": 0.26247101882500473, + "grad_norm": 4.0625, + "learning_rate": 9.733645254907126e-06, + "loss": 0.91040516, + "memory(GiB)": 126.99, + "step": 11250, + "train_speed(iter/s)": 0.203082 + }, + { + "acc": 0.77205968, + "epoch": 0.26270432639729363, + "grad_norm": 5.96875, + "learning_rate": 9.73303656414064e-06, + "loss": 0.80362463, + "memory(GiB)": 126.99, + "step": 11260, + "train_speed(iter/s)": 0.203173 + }, + { + "acc": 0.78065724, + "epoch": 0.26293763396958253, + "grad_norm": 10.25, + "learning_rate": 9.732427197734557e-06, + "loss": 0.77598629, + "memory(GiB)": 126.99, + "step": 11270, + "train_speed(iter/s)": 0.203266 + }, + { + "acc": 0.77076845, + "epoch": 0.26317094154187143, + "grad_norm": 5.84375, + "learning_rate": 9.73181715577587e-06, + "loss": 0.82357779, + "memory(GiB)": 126.99, + "step": 11280, + "train_speed(iter/s)": 0.203349 + }, + { + "acc": 0.78687391, + "epoch": 0.2634042491141603, + "grad_norm": 5.0625, + "learning_rate": 9.731206438351655e-06, + "loss": 0.77638292, + "memory(GiB)": 126.99, + "step": 11290, + "train_speed(iter/s)": 0.203439 + }, + { + "acc": 0.77242026, + "epoch": 0.2636375566864492, + "grad_norm": 6.4375, + "learning_rate": 9.730595045549096e-06, + "loss": 0.80217266, + "memory(GiB)": 126.99, + "step": 11300, + "train_speed(iter/s)": 0.203536 + }, + { + "acc": 0.76687241, + "epoch": 0.2638708642587381, + "grad_norm": 4.375, + "learning_rate": 9.72998297745547e-06, + "loss": 0.84325809, + "memory(GiB)": 126.99, + "step": 11310, + "train_speed(iter/s)": 0.203631 + }, + { + "acc": 0.74472752, + "epoch": 0.26410417183102697, + "grad_norm": 5.0625, + "learning_rate": 9.729370234158147e-06, + "loss": 0.92086382, + "memory(GiB)": 126.99, + "step": 11320, + "train_speed(iter/s)": 0.203723 + }, + { + "acc": 0.7770112, + "epoch": 0.26433747940331587, + "grad_norm": 6.25, + "learning_rate": 9.728756815744598e-06, + "loss": 0.86181021, + "memory(GiB)": 126.99, + "step": 11330, + "train_speed(iter/s)": 0.203818 + }, + { + "acc": 0.75057278, + "epoch": 0.26457078697560477, + "grad_norm": 6.1875, + "learning_rate": 9.728142722302385e-06, + "loss": 0.91030331, + "memory(GiB)": 126.99, + "step": 11340, + "train_speed(iter/s)": 0.20391 + }, + { + "acc": 0.76302509, + "epoch": 0.26480409454789366, + "grad_norm": 11.0625, + "learning_rate": 9.727527953919174e-06, + "loss": 0.87596025, + "memory(GiB)": 126.99, + "step": 11350, + "train_speed(iter/s)": 0.203997 + }, + { + "acc": 0.75909495, + "epoch": 0.26503740212018256, + "grad_norm": 5.53125, + "learning_rate": 9.72691251068272e-06, + "loss": 0.86929102, + "memory(GiB)": 126.99, + "step": 11360, + "train_speed(iter/s)": 0.204091 + }, + { + "acc": 0.76749086, + "epoch": 0.26527070969247146, + "grad_norm": 5.53125, + "learning_rate": 9.726296392680879e-06, + "loss": 0.83195705, + "memory(GiB)": 126.99, + "step": 11370, + "train_speed(iter/s)": 0.204179 + }, + { + "acc": 0.76265335, + "epoch": 0.26550401726476036, + "grad_norm": 20.5, + "learning_rate": 9.7256796000016e-06, + "loss": 0.84113808, + "memory(GiB)": 126.99, + "step": 11380, + "train_speed(iter/s)": 0.204266 + }, + { + "acc": 0.7723104, + "epoch": 0.26573732483704926, + "grad_norm": 6.53125, + "learning_rate": 9.725062132732931e-06, + "loss": 0.82607574, + "memory(GiB)": 126.99, + "step": 11390, + "train_speed(iter/s)": 0.204356 + }, + { + "acc": 0.76926665, + "epoch": 0.26597063240933816, + "grad_norm": 9.25, + "learning_rate": 9.724443990963017e-06, + "loss": 0.84509716, + "memory(GiB)": 126.99, + "step": 11400, + "train_speed(iter/s)": 0.204446 + }, + { + "acc": 0.77471218, + "epoch": 0.266203939981627, + "grad_norm": 5.96875, + "learning_rate": 9.723825174780095e-06, + "loss": 0.79775171, + "memory(GiB)": 126.99, + "step": 11410, + "train_speed(iter/s)": 0.204537 + }, + { + "acc": 0.76716242, + "epoch": 0.2664372475539159, + "grad_norm": 6.125, + "learning_rate": 9.723205684272501e-06, + "loss": 0.82582216, + "memory(GiB)": 126.99, + "step": 11420, + "train_speed(iter/s)": 0.20463 + }, + { + "acc": 0.77979913, + "epoch": 0.2666705551262048, + "grad_norm": 4.59375, + "learning_rate": 9.722585519528666e-06, + "loss": 0.79641523, + "memory(GiB)": 126.99, + "step": 11430, + "train_speed(iter/s)": 0.204722 + }, + { + "acc": 0.77594395, + "epoch": 0.2669038626984937, + "grad_norm": 7.5625, + "learning_rate": 9.721964680637124e-06, + "loss": 0.81631298, + "memory(GiB)": 126.99, + "step": 11440, + "train_speed(iter/s)": 0.204809 + }, + { + "acc": 0.77567191, + "epoch": 0.2671371702707826, + "grad_norm": 6.46875, + "learning_rate": 9.721343167686491e-06, + "loss": 0.80270195, + "memory(GiB)": 126.99, + "step": 11450, + "train_speed(iter/s)": 0.2049 + }, + { + "acc": 0.76392403, + "epoch": 0.2673704778430715, + "grad_norm": 9.3125, + "learning_rate": 9.720720980765495e-06, + "loss": 0.85869751, + "memory(GiB)": 126.99, + "step": 11460, + "train_speed(iter/s)": 0.204987 + }, + { + "acc": 0.76475925, + "epoch": 0.2676037854153604, + "grad_norm": 4.75, + "learning_rate": 9.72009811996295e-06, + "loss": 0.85545101, + "memory(GiB)": 126.99, + "step": 11470, + "train_speed(iter/s)": 0.205075 + }, + { + "acc": 0.77414856, + "epoch": 0.2678370929876493, + "grad_norm": 6.125, + "learning_rate": 9.719474585367771e-06, + "loss": 0.80691338, + "memory(GiB)": 126.99, + "step": 11480, + "train_speed(iter/s)": 0.205167 + }, + { + "acc": 0.7491497, + "epoch": 0.2680704005599382, + "grad_norm": 8.875, + "learning_rate": 9.718850377068964e-06, + "loss": 0.95009527, + "memory(GiB)": 126.99, + "step": 11490, + "train_speed(iter/s)": 0.205256 + }, + { + "acc": 0.77273641, + "epoch": 0.2683037081322271, + "grad_norm": 6.34375, + "learning_rate": 9.718225495155638e-06, + "loss": 0.82936974, + "memory(GiB)": 126.99, + "step": 11500, + "train_speed(iter/s)": 0.205341 + }, + { + "epoch": 0.2683037081322271, + "eval_acc": 0.7336386643266392, + "eval_loss": 0.8452193737030029, + "eval_runtime": 1270.1304, + "eval_samples_per_second": 28.336, + "eval_steps_per_second": 14.169, + "step": 11500 + }, + { + "acc": 0.7597836, + "epoch": 0.26853701570451594, + "grad_norm": 6.1875, + "learning_rate": 9.717599939716992e-06, + "loss": 0.84666138, + "memory(GiB)": 126.99, + "step": 11510, + "train_speed(iter/s)": 0.200821 + }, + { + "acc": 0.75912447, + "epoch": 0.26877032327680483, + "grad_norm": 5.90625, + "learning_rate": 9.716973710842326e-06, + "loss": 0.86441269, + "memory(GiB)": 126.99, + "step": 11520, + "train_speed(iter/s)": 0.20091 + }, + { + "acc": 0.77214475, + "epoch": 0.26900363084909373, + "grad_norm": 6.0, + "learning_rate": 9.716346808621031e-06, + "loss": 0.83422747, + "memory(GiB)": 126.99, + "step": 11530, + "train_speed(iter/s)": 0.201 + }, + { + "acc": 0.78334231, + "epoch": 0.26923693842138263, + "grad_norm": 5.6875, + "learning_rate": 9.715719233142601e-06, + "loss": 0.77969465, + "memory(GiB)": 126.99, + "step": 11540, + "train_speed(iter/s)": 0.201089 + }, + { + "acc": 0.77558217, + "epoch": 0.26947024599367153, + "grad_norm": 5.46875, + "learning_rate": 9.71509098449662e-06, + "loss": 0.82058563, + "memory(GiB)": 126.99, + "step": 11550, + "train_speed(iter/s)": 0.201179 + }, + { + "acc": 0.75732489, + "epoch": 0.26970355356596043, + "grad_norm": 4.5625, + "learning_rate": 9.71446206277277e-06, + "loss": 0.88411083, + "memory(GiB)": 126.99, + "step": 11560, + "train_speed(iter/s)": 0.201266 + }, + { + "acc": 0.75806885, + "epoch": 0.26993686113824933, + "grad_norm": 5.09375, + "learning_rate": 9.713832468060831e-06, + "loss": 0.86597509, + "memory(GiB)": 126.99, + "step": 11570, + "train_speed(iter/s)": 0.20135 + }, + { + "acc": 0.74850397, + "epoch": 0.2701701687105382, + "grad_norm": 5.5, + "learning_rate": 9.713202200450678e-06, + "loss": 0.9307641, + "memory(GiB)": 126.99, + "step": 11580, + "train_speed(iter/s)": 0.201441 + }, + { + "acc": 0.76493893, + "epoch": 0.2704034762828271, + "grad_norm": 5.75, + "learning_rate": 9.712571260032277e-06, + "loss": 0.8510767, + "memory(GiB)": 126.99, + "step": 11590, + "train_speed(iter/s)": 0.201524 + }, + { + "acc": 0.76068068, + "epoch": 0.27063678385511597, + "grad_norm": 5.9375, + "learning_rate": 9.7119396468957e-06, + "loss": 0.87124729, + "memory(GiB)": 126.99, + "step": 11600, + "train_speed(iter/s)": 0.201618 + }, + { + "acc": 0.76412511, + "epoch": 0.27087009142740487, + "grad_norm": 4.59375, + "learning_rate": 9.711307361131107e-06, + "loss": 0.84404926, + "memory(GiB)": 126.99, + "step": 11610, + "train_speed(iter/s)": 0.2017 + }, + { + "acc": 0.76919479, + "epoch": 0.27110339899969377, + "grad_norm": 5.375, + "learning_rate": 9.710674402828755e-06, + "loss": 0.83093939, + "memory(GiB)": 126.99, + "step": 11620, + "train_speed(iter/s)": 0.201785 + }, + { + "acc": 0.7725348, + "epoch": 0.27133670657198267, + "grad_norm": 4.28125, + "learning_rate": 9.710040772079001e-06, + "loss": 0.81993046, + "memory(GiB)": 126.99, + "step": 11630, + "train_speed(iter/s)": 0.201875 + }, + { + "acc": 0.75861168, + "epoch": 0.27157001414427157, + "grad_norm": 6.375, + "learning_rate": 9.709406468972295e-06, + "loss": 0.87555952, + "memory(GiB)": 126.99, + "step": 11640, + "train_speed(iter/s)": 0.201964 + }, + { + "acc": 0.76188645, + "epoch": 0.27180332171656046, + "grad_norm": 6.53125, + "learning_rate": 9.708771493599185e-06, + "loss": 0.83453884, + "memory(GiB)": 126.99, + "step": 11650, + "train_speed(iter/s)": 0.202059 + }, + { + "acc": 0.76258478, + "epoch": 0.27203662928884936, + "grad_norm": 4.78125, + "learning_rate": 9.708135846050313e-06, + "loss": 0.85640106, + "memory(GiB)": 126.99, + "step": 11660, + "train_speed(iter/s)": 0.20215 + }, + { + "acc": 0.78319798, + "epoch": 0.27226993686113826, + "grad_norm": 4.40625, + "learning_rate": 9.707499526416415e-06, + "loss": 0.78641605, + "memory(GiB)": 126.99, + "step": 11670, + "train_speed(iter/s)": 0.202232 + }, + { + "acc": 0.75823736, + "epoch": 0.27250324443342716, + "grad_norm": 5.9375, + "learning_rate": 9.706862534788327e-06, + "loss": 0.86921701, + "memory(GiB)": 126.99, + "step": 11680, + "train_speed(iter/s)": 0.202322 + }, + { + "acc": 0.77783613, + "epoch": 0.27273655200571606, + "grad_norm": 8.8125, + "learning_rate": 9.70622487125698e-06, + "loss": 0.78761058, + "memory(GiB)": 126.99, + "step": 11690, + "train_speed(iter/s)": 0.202412 + }, + { + "acc": 0.77462397, + "epoch": 0.2729698595780049, + "grad_norm": 6.625, + "learning_rate": 9.7055865359134e-06, + "loss": 0.81614437, + "memory(GiB)": 126.99, + "step": 11700, + "train_speed(iter/s)": 0.202495 + }, + { + "acc": 0.78386021, + "epoch": 0.2732031671502938, + "grad_norm": 7.15625, + "learning_rate": 9.704947528848706e-06, + "loss": 0.7758873, + "memory(GiB)": 126.99, + "step": 11710, + "train_speed(iter/s)": 0.202588 + }, + { + "acc": 0.77271953, + "epoch": 0.2734364747225827, + "grad_norm": 4.46875, + "learning_rate": 9.704307850154125e-06, + "loss": 0.80577879, + "memory(GiB)": 126.99, + "step": 11720, + "train_speed(iter/s)": 0.202677 + }, + { + "acc": 0.76300001, + "epoch": 0.2736697822948716, + "grad_norm": 5.0, + "learning_rate": 9.70366749992096e-06, + "loss": 0.85106087, + "memory(GiB)": 126.99, + "step": 11730, + "train_speed(iter/s)": 0.202765 + }, + { + "acc": 0.76563978, + "epoch": 0.2739030898671605, + "grad_norm": 4.0625, + "learning_rate": 9.703026478240627e-06, + "loss": 0.84070683, + "memory(GiB)": 126.99, + "step": 11740, + "train_speed(iter/s)": 0.202854 + }, + { + "acc": 0.79070711, + "epoch": 0.2741363974394494, + "grad_norm": 5.09375, + "learning_rate": 9.702384785204631e-06, + "loss": 0.75290918, + "memory(GiB)": 126.99, + "step": 11750, + "train_speed(iter/s)": 0.202945 + }, + { + "acc": 0.75680985, + "epoch": 0.2743697050117383, + "grad_norm": 5.125, + "learning_rate": 9.701742420904574e-06, + "loss": 0.89517021, + "memory(GiB)": 126.99, + "step": 11760, + "train_speed(iter/s)": 0.203037 + }, + { + "acc": 0.74839983, + "epoch": 0.2746030125840272, + "grad_norm": 6.59375, + "learning_rate": 9.701099385432151e-06, + "loss": 0.92097282, + "memory(GiB)": 126.99, + "step": 11770, + "train_speed(iter/s)": 0.203121 + }, + { + "acc": 0.75796165, + "epoch": 0.2748363201563161, + "grad_norm": 4.0, + "learning_rate": 9.700455678879157e-06, + "loss": 0.8720623, + "memory(GiB)": 126.99, + "step": 11780, + "train_speed(iter/s)": 0.203206 + }, + { + "acc": 0.76428533, + "epoch": 0.275069627728605, + "grad_norm": 11.4375, + "learning_rate": 9.69981130133748e-06, + "loss": 0.8485014, + "memory(GiB)": 126.99, + "step": 11790, + "train_speed(iter/s)": 0.203289 + }, + { + "acc": 0.75732021, + "epoch": 0.27530293530089384, + "grad_norm": 4.84375, + "learning_rate": 9.699166252899104e-06, + "loss": 0.88501377, + "memory(GiB)": 126.99, + "step": 11800, + "train_speed(iter/s)": 0.203367 + }, + { + "acc": 0.781742, + "epoch": 0.27553624287318274, + "grad_norm": 4.34375, + "learning_rate": 9.698520533656112e-06, + "loss": 0.78006496, + "memory(GiB)": 126.99, + "step": 11810, + "train_speed(iter/s)": 0.203457 + }, + { + "acc": 0.76830721, + "epoch": 0.27576955044547163, + "grad_norm": 7.125, + "learning_rate": 9.697874143700679e-06, + "loss": 0.81617126, + "memory(GiB)": 126.99, + "step": 11820, + "train_speed(iter/s)": 0.203552 + }, + { + "acc": 0.77733955, + "epoch": 0.27600285801776053, + "grad_norm": 7.65625, + "learning_rate": 9.697227083125076e-06, + "loss": 0.80572834, + "memory(GiB)": 126.99, + "step": 11830, + "train_speed(iter/s)": 0.203642 + }, + { + "acc": 0.77268896, + "epoch": 0.27623616559004943, + "grad_norm": 7.03125, + "learning_rate": 9.69657935202167e-06, + "loss": 0.80644588, + "memory(GiB)": 126.99, + "step": 11840, + "train_speed(iter/s)": 0.203734 + }, + { + "acc": 0.7876543, + "epoch": 0.27646947316233833, + "grad_norm": 6.1875, + "learning_rate": 9.695930950482928e-06, + "loss": 0.747785, + "memory(GiB)": 126.99, + "step": 11850, + "train_speed(iter/s)": 0.203818 + }, + { + "acc": 0.74815092, + "epoch": 0.27670278073462723, + "grad_norm": 5.53125, + "learning_rate": 9.695281878601406e-06, + "loss": 0.90503931, + "memory(GiB)": 126.99, + "step": 11860, + "train_speed(iter/s)": 0.203908 + }, + { + "acc": 0.7729579, + "epoch": 0.27693608830691613, + "grad_norm": 5.0625, + "learning_rate": 9.69463213646976e-06, + "loss": 0.82826509, + "memory(GiB)": 126.99, + "step": 11870, + "train_speed(iter/s)": 0.203998 + }, + { + "acc": 0.75472612, + "epoch": 0.277169395879205, + "grad_norm": 5.5, + "learning_rate": 9.69398172418074e-06, + "loss": 0.87955236, + "memory(GiB)": 126.99, + "step": 11880, + "train_speed(iter/s)": 0.204085 + }, + { + "acc": 0.78086786, + "epoch": 0.27740270345149387, + "grad_norm": 3.859375, + "learning_rate": 9.693330641827194e-06, + "loss": 0.75951362, + "memory(GiB)": 126.99, + "step": 11890, + "train_speed(iter/s)": 0.204175 + }, + { + "acc": 0.78073997, + "epoch": 0.27763601102378277, + "grad_norm": 4.46875, + "learning_rate": 9.69267888950206e-06, + "loss": 0.78068328, + "memory(GiB)": 126.99, + "step": 11900, + "train_speed(iter/s)": 0.204262 + }, + { + "acc": 0.77878237, + "epoch": 0.27786931859607167, + "grad_norm": 4.375, + "learning_rate": 9.69202646729838e-06, + "loss": 0.79412766, + "memory(GiB)": 126.99, + "step": 11910, + "train_speed(iter/s)": 0.204346 + }, + { + "acc": 0.78074179, + "epoch": 0.27810262616836057, + "grad_norm": 5.75, + "learning_rate": 9.691373375309284e-06, + "loss": 0.84565754, + "memory(GiB)": 126.99, + "step": 11920, + "train_speed(iter/s)": 0.20443 + }, + { + "acc": 0.75893049, + "epoch": 0.27833593374064947, + "grad_norm": 7.375, + "learning_rate": 9.690719613628001e-06, + "loss": 0.88701725, + "memory(GiB)": 126.99, + "step": 11930, + "train_speed(iter/s)": 0.204519 + }, + { + "acc": 0.77881913, + "epoch": 0.27856924131293836, + "grad_norm": 5.46875, + "learning_rate": 9.690065182347857e-06, + "loss": 0.8020647, + "memory(GiB)": 126.99, + "step": 11940, + "train_speed(iter/s)": 0.204602 + }, + { + "acc": 0.75828795, + "epoch": 0.27880254888522726, + "grad_norm": 6.03125, + "learning_rate": 9.68941008156227e-06, + "loss": 0.88880653, + "memory(GiB)": 126.99, + "step": 11950, + "train_speed(iter/s)": 0.204691 + }, + { + "acc": 0.7708621, + "epoch": 0.27903585645751616, + "grad_norm": 6.0, + "learning_rate": 9.688754311364755e-06, + "loss": 0.81502056, + "memory(GiB)": 126.99, + "step": 11960, + "train_speed(iter/s)": 0.204778 + }, + { + "acc": 0.74046493, + "epoch": 0.27926916402980506, + "grad_norm": 6.28125, + "learning_rate": 9.688097871848925e-06, + "loss": 0.93770523, + "memory(GiB)": 126.99, + "step": 11970, + "train_speed(iter/s)": 0.204869 + }, + { + "acc": 0.78146906, + "epoch": 0.27950247160209396, + "grad_norm": 6.375, + "learning_rate": 9.687440763108487e-06, + "loss": 0.79011135, + "memory(GiB)": 126.99, + "step": 11980, + "train_speed(iter/s)": 0.204953 + }, + { + "acc": 0.76589327, + "epoch": 0.2797357791743828, + "grad_norm": 5.875, + "learning_rate": 9.68678298523724e-06, + "loss": 0.86380882, + "memory(GiB)": 126.99, + "step": 11990, + "train_speed(iter/s)": 0.205044 + }, + { + "acc": 0.7602313, + "epoch": 0.2799690867466717, + "grad_norm": 4.84375, + "learning_rate": 9.686124538329083e-06, + "loss": 0.86302662, + "memory(GiB)": 126.99, + "step": 12000, + "train_speed(iter/s)": 0.20513 + }, + { + "epoch": 0.2799690867466717, + "eval_acc": 0.7339767288010274, + "eval_loss": 0.8442361354827881, + "eval_runtime": 1270.1423, + "eval_samples_per_second": 28.336, + "eval_steps_per_second": 14.168, + "step": 12000 + }, + { + "acc": 0.7855062, + "epoch": 0.2802023943189606, + "grad_norm": 4.5625, + "learning_rate": 9.685465422478011e-06, + "loss": 0.78390985, + "memory(GiB)": 126.99, + "step": 12010, + "train_speed(iter/s)": 0.200803 + }, + { + "acc": 0.77192798, + "epoch": 0.2804357018912495, + "grad_norm": 4.28125, + "learning_rate": 9.684805637778109e-06, + "loss": 0.83765612, + "memory(GiB)": 126.99, + "step": 12020, + "train_speed(iter/s)": 0.200888 + }, + { + "acc": 0.77343655, + "epoch": 0.2806690094635384, + "grad_norm": 6.65625, + "learning_rate": 9.684145184323565e-06, + "loss": 0.81042671, + "memory(GiB)": 126.99, + "step": 12030, + "train_speed(iter/s)": 0.200973 + }, + { + "acc": 0.76841726, + "epoch": 0.2809023170358273, + "grad_norm": 4.3125, + "learning_rate": 9.683484062208657e-06, + "loss": 0.84212866, + "memory(GiB)": 126.99, + "step": 12040, + "train_speed(iter/s)": 0.201061 + }, + { + "acc": 0.76003461, + "epoch": 0.2811356246081162, + "grad_norm": 7.59375, + "learning_rate": 9.682822271527758e-06, + "loss": 0.86936035, + "memory(GiB)": 126.99, + "step": 12050, + "train_speed(iter/s)": 0.20115 + }, + { + "acc": 0.76045341, + "epoch": 0.2813689321804051, + "grad_norm": 7.15625, + "learning_rate": 9.682159812375342e-06, + "loss": 0.87109947, + "memory(GiB)": 126.99, + "step": 12060, + "train_speed(iter/s)": 0.201238 + }, + { + "acc": 0.78477945, + "epoch": 0.281602239752694, + "grad_norm": 5.25, + "learning_rate": 9.681496684845973e-06, + "loss": 0.75963497, + "memory(GiB)": 126.99, + "step": 12070, + "train_speed(iter/s)": 0.201321 + }, + { + "acc": 0.76544571, + "epoch": 0.2818355473249829, + "grad_norm": 5.09375, + "learning_rate": 9.68083288903431e-06, + "loss": 0.84756174, + "memory(GiB)": 126.99, + "step": 12080, + "train_speed(iter/s)": 0.201406 + }, + { + "acc": 0.77797155, + "epoch": 0.28206885489727174, + "grad_norm": 6.65625, + "learning_rate": 9.680168425035114e-06, + "loss": 0.8038868, + "memory(GiB)": 126.99, + "step": 12090, + "train_speed(iter/s)": 0.201491 + }, + { + "acc": 0.75691757, + "epoch": 0.28230216246956064, + "grad_norm": 5.75, + "learning_rate": 9.679503292943234e-06, + "loss": 0.87715702, + "memory(GiB)": 126.99, + "step": 12100, + "train_speed(iter/s)": 0.201577 + }, + { + "acc": 0.76970291, + "epoch": 0.28253547004184953, + "grad_norm": 5.03125, + "learning_rate": 9.678837492853619e-06, + "loss": 0.83921423, + "memory(GiB)": 126.99, + "step": 12110, + "train_speed(iter/s)": 0.201649 + }, + { + "acc": 0.77343578, + "epoch": 0.28276877761413843, + "grad_norm": 6.71875, + "learning_rate": 9.67817102486131e-06, + "loss": 0.79686985, + "memory(GiB)": 126.99, + "step": 12120, + "train_speed(iter/s)": 0.201735 + }, + { + "acc": 0.76627212, + "epoch": 0.28300208518642733, + "grad_norm": 5.96875, + "learning_rate": 9.677503889061446e-06, + "loss": 0.8474947, + "memory(GiB)": 126.99, + "step": 12130, + "train_speed(iter/s)": 0.201818 + }, + { + "acc": 0.76688299, + "epoch": 0.28323539275871623, + "grad_norm": 5.875, + "learning_rate": 9.676836085549263e-06, + "loss": 0.8329731, + "memory(GiB)": 126.99, + "step": 12140, + "train_speed(iter/s)": 0.201905 + }, + { + "acc": 0.74896669, + "epoch": 0.28346870033100513, + "grad_norm": 5.25, + "learning_rate": 9.676167614420085e-06, + "loss": 0.91642704, + "memory(GiB)": 126.99, + "step": 12150, + "train_speed(iter/s)": 0.201985 + }, + { + "acc": 0.76068959, + "epoch": 0.28370200790329403, + "grad_norm": 7.15625, + "learning_rate": 9.67549847576934e-06, + "loss": 0.8687542, + "memory(GiB)": 126.99, + "step": 12160, + "train_speed(iter/s)": 0.202069 + }, + { + "acc": 0.76523075, + "epoch": 0.28393531547558293, + "grad_norm": 20.0, + "learning_rate": 9.674828669692545e-06, + "loss": 0.8203887, + "memory(GiB)": 126.99, + "step": 12170, + "train_speed(iter/s)": 0.202155 + }, + { + "acc": 0.77215323, + "epoch": 0.28416862304787177, + "grad_norm": 6.5625, + "learning_rate": 9.674158196285316e-06, + "loss": 0.80515928, + "memory(GiB)": 126.99, + "step": 12180, + "train_speed(iter/s)": 0.202235 + }, + { + "acc": 0.77063136, + "epoch": 0.28440193062016067, + "grad_norm": 7.75, + "learning_rate": 9.673487055643362e-06, + "loss": 0.82243662, + "memory(GiB)": 126.99, + "step": 12190, + "train_speed(iter/s)": 0.202321 + }, + { + "acc": 0.76656513, + "epoch": 0.28463523819244957, + "grad_norm": 5.25, + "learning_rate": 9.672815247862489e-06, + "loss": 0.82771702, + "memory(GiB)": 126.99, + "step": 12200, + "train_speed(iter/s)": 0.202403 + }, + { + "acc": 0.78110971, + "epoch": 0.28486854576473847, + "grad_norm": 6.15625, + "learning_rate": 9.672142773038595e-06, + "loss": 0.80094032, + "memory(GiB)": 126.99, + "step": 12210, + "train_speed(iter/s)": 0.20249 + }, + { + "acc": 0.75440207, + "epoch": 0.28510185333702737, + "grad_norm": 5.0, + "learning_rate": 9.671469631267678e-06, + "loss": 0.89850616, + "memory(GiB)": 126.99, + "step": 12220, + "train_speed(iter/s)": 0.20258 + }, + { + "acc": 0.753054, + "epoch": 0.28533516090931627, + "grad_norm": 7.15625, + "learning_rate": 9.67079582264583e-06, + "loss": 0.90991497, + "memory(GiB)": 126.99, + "step": 12230, + "train_speed(iter/s)": 0.202672 + }, + { + "acc": 0.77435451, + "epoch": 0.28556846848160516, + "grad_norm": 5.90625, + "learning_rate": 9.670121347269234e-06, + "loss": 0.82249737, + "memory(GiB)": 126.99, + "step": 12240, + "train_speed(iter/s)": 0.202753 + }, + { + "acc": 0.75970974, + "epoch": 0.28580177605389406, + "grad_norm": 5.40625, + "learning_rate": 9.669446205234172e-06, + "loss": 0.85512848, + "memory(GiB)": 126.99, + "step": 12250, + "train_speed(iter/s)": 0.202839 + }, + { + "acc": 0.78008022, + "epoch": 0.28603508362618296, + "grad_norm": 5.15625, + "learning_rate": 9.668770396637022e-06, + "loss": 0.79443493, + "memory(GiB)": 126.99, + "step": 12260, + "train_speed(iter/s)": 0.20292 + }, + { + "acc": 0.76683731, + "epoch": 0.28626839119847186, + "grad_norm": 5.15625, + "learning_rate": 9.668093921574253e-06, + "loss": 0.85410089, + "memory(GiB)": 126.99, + "step": 12270, + "train_speed(iter/s)": 0.203005 + }, + { + "acc": 0.75589981, + "epoch": 0.2865016987707607, + "grad_norm": 6.71875, + "learning_rate": 9.667416780142434e-06, + "loss": 0.88183594, + "memory(GiB)": 126.99, + "step": 12280, + "train_speed(iter/s)": 0.20309 + }, + { + "acc": 0.7808938, + "epoch": 0.2867350063430496, + "grad_norm": 5.34375, + "learning_rate": 9.666738972438224e-06, + "loss": 0.78594069, + "memory(GiB)": 126.99, + "step": 12290, + "train_speed(iter/s)": 0.203169 + }, + { + "acc": 0.77634277, + "epoch": 0.2869683139153385, + "grad_norm": 5.15625, + "learning_rate": 9.666060498558381e-06, + "loss": 0.80375671, + "memory(GiB)": 126.99, + "step": 12300, + "train_speed(iter/s)": 0.203253 + }, + { + "acc": 0.76860728, + "epoch": 0.2872016214876274, + "grad_norm": 4.9375, + "learning_rate": 9.665381358599759e-06, + "loss": 0.86330442, + "memory(GiB)": 126.99, + "step": 12310, + "train_speed(iter/s)": 0.203335 + }, + { + "acc": 0.7679863, + "epoch": 0.2874349290599163, + "grad_norm": 5.375, + "learning_rate": 9.664701552659303e-06, + "loss": 0.838801, + "memory(GiB)": 126.99, + "step": 12320, + "train_speed(iter/s)": 0.20342 + }, + { + "acc": 0.74797144, + "epoch": 0.2876682366322052, + "grad_norm": 8.5, + "learning_rate": 9.664021080834053e-06, + "loss": 0.93438997, + "memory(GiB)": 126.99, + "step": 12330, + "train_speed(iter/s)": 0.203504 + }, + { + "acc": 0.75219126, + "epoch": 0.2879015442044941, + "grad_norm": 5.59375, + "learning_rate": 9.663339943221153e-06, + "loss": 0.906147, + "memory(GiB)": 126.99, + "step": 12340, + "train_speed(iter/s)": 0.203584 + }, + { + "acc": 0.76432757, + "epoch": 0.288134851776783, + "grad_norm": 5.28125, + "learning_rate": 9.662658139917827e-06, + "loss": 0.84146051, + "memory(GiB)": 126.99, + "step": 12350, + "train_speed(iter/s)": 0.203666 + }, + { + "acc": 0.76514382, + "epoch": 0.2883681593490719, + "grad_norm": 5.0625, + "learning_rate": 9.661975671021408e-06, + "loss": 0.85332146, + "memory(GiB)": 126.99, + "step": 12360, + "train_speed(iter/s)": 0.203749 + }, + { + "acc": 0.75728321, + "epoch": 0.2886014669213608, + "grad_norm": 4.84375, + "learning_rate": 9.661292536629316e-06, + "loss": 0.85679588, + "memory(GiB)": 126.99, + "step": 12370, + "train_speed(iter/s)": 0.203833 + }, + { + "acc": 0.77875571, + "epoch": 0.28883477449364964, + "grad_norm": 7.0, + "learning_rate": 9.660608736839067e-06, + "loss": 0.80268106, + "memory(GiB)": 126.99, + "step": 12380, + "train_speed(iter/s)": 0.203916 + }, + { + "acc": 0.75960855, + "epoch": 0.28906808206593854, + "grad_norm": 5.125, + "learning_rate": 9.659924271748277e-06, + "loss": 0.86685429, + "memory(GiB)": 126.99, + "step": 12390, + "train_speed(iter/s)": 0.204004 + }, + { + "acc": 0.76572495, + "epoch": 0.28930138963822744, + "grad_norm": 4.09375, + "learning_rate": 9.65923914145465e-06, + "loss": 0.8074934, + "memory(GiB)": 126.99, + "step": 12400, + "train_speed(iter/s)": 0.204085 + }, + { + "acc": 0.76597309, + "epoch": 0.28953469721051633, + "grad_norm": 5.59375, + "learning_rate": 9.65855334605599e-06, + "loss": 0.82842016, + "memory(GiB)": 126.99, + "step": 12410, + "train_speed(iter/s)": 0.204169 + }, + { + "acc": 0.75547256, + "epoch": 0.28976800478280523, + "grad_norm": 6.5, + "learning_rate": 9.65786688565019e-06, + "loss": 0.88945179, + "memory(GiB)": 126.99, + "step": 12420, + "train_speed(iter/s)": 0.204248 + }, + { + "acc": 0.77330575, + "epoch": 0.29000131235509413, + "grad_norm": 5.34375, + "learning_rate": 9.65717976033525e-06, + "loss": 0.80908098, + "memory(GiB)": 126.99, + "step": 12430, + "train_speed(iter/s)": 0.204329 + }, + { + "acc": 0.76689053, + "epoch": 0.29023461992738303, + "grad_norm": 4.6875, + "learning_rate": 9.656491970209248e-06, + "loss": 0.84457645, + "memory(GiB)": 126.99, + "step": 12440, + "train_speed(iter/s)": 0.204414 + }, + { + "acc": 0.76981139, + "epoch": 0.29046792749967193, + "grad_norm": 6.71875, + "learning_rate": 9.655803515370373e-06, + "loss": 0.8389142, + "memory(GiB)": 126.99, + "step": 12450, + "train_speed(iter/s)": 0.204496 + }, + { + "acc": 0.77279081, + "epoch": 0.29070123507196083, + "grad_norm": 4.46875, + "learning_rate": 9.655114395916896e-06, + "loss": 0.8170186, + "memory(GiB)": 126.99, + "step": 12460, + "train_speed(iter/s)": 0.20458 + }, + { + "acc": 0.7852149, + "epoch": 0.29093454264424967, + "grad_norm": 6.90625, + "learning_rate": 9.654424611947194e-06, + "loss": 0.80533628, + "memory(GiB)": 126.99, + "step": 12470, + "train_speed(iter/s)": 0.204664 + }, + { + "acc": 0.76608725, + "epoch": 0.29116785021653857, + "grad_norm": 5.75, + "learning_rate": 9.65373416355973e-06, + "loss": 0.84452038, + "memory(GiB)": 126.99, + "step": 12480, + "train_speed(iter/s)": 0.204746 + }, + { + "acc": 0.76543536, + "epoch": 0.29140115778882747, + "grad_norm": 5.59375, + "learning_rate": 9.653043050853065e-06, + "loss": 0.85324554, + "memory(GiB)": 126.99, + "step": 12490, + "train_speed(iter/s)": 0.204826 + }, + { + "acc": 0.76770477, + "epoch": 0.29163446536111637, + "grad_norm": 6.5625, + "learning_rate": 9.652351273925854e-06, + "loss": 0.84301796, + "memory(GiB)": 126.99, + "step": 12500, + "train_speed(iter/s)": 0.204912 + }, + { + "epoch": 0.29163446536111637, + "eval_acc": 0.7345405966429606, + "eval_loss": 0.8422214984893799, + "eval_runtime": 1269.9542, + "eval_samples_per_second": 28.34, + "eval_steps_per_second": 14.171, + "step": 12500 + }, + { + "acc": 0.7895587, + "epoch": 0.29186777293340527, + "grad_norm": 5.8125, + "learning_rate": 9.651658832876853e-06, + "loss": 0.75596642, + "memory(GiB)": 126.99, + "step": 12510, + "train_speed(iter/s)": 0.200765 + }, + { + "acc": 0.75599642, + "epoch": 0.29210108050569417, + "grad_norm": 5.40625, + "learning_rate": 9.650965727804907e-06, + "loss": 0.88882275, + "memory(GiB)": 126.99, + "step": 12520, + "train_speed(iter/s)": 0.200848 + }, + { + "acc": 0.77908826, + "epoch": 0.29233438807798307, + "grad_norm": 6.25, + "learning_rate": 9.65027195880895e-06, + "loss": 0.79518585, + "memory(GiB)": 126.99, + "step": 12530, + "train_speed(iter/s)": 0.200931 + }, + { + "acc": 0.7476614, + "epoch": 0.29256769565027196, + "grad_norm": 10.25, + "learning_rate": 9.649577525988025e-06, + "loss": 0.93213406, + "memory(GiB)": 126.99, + "step": 12540, + "train_speed(iter/s)": 0.201017 + }, + { + "acc": 0.7507143, + "epoch": 0.29280100322256086, + "grad_norm": 5.78125, + "learning_rate": 9.648882429441258e-06, + "loss": 0.92582474, + "memory(GiB)": 126.99, + "step": 12550, + "train_speed(iter/s)": 0.201102 + }, + { + "acc": 0.76029882, + "epoch": 0.29303431079484976, + "grad_norm": 10.0625, + "learning_rate": 9.648186669267874e-06, + "loss": 0.86718149, + "memory(GiB)": 126.99, + "step": 12560, + "train_speed(iter/s)": 0.201185 + }, + { + "acc": 0.75470057, + "epoch": 0.2932676183671386, + "grad_norm": 6.6875, + "learning_rate": 9.647490245567194e-06, + "loss": 0.90233717, + "memory(GiB)": 126.99, + "step": 12570, + "train_speed(iter/s)": 0.201266 + }, + { + "acc": 0.7683279, + "epoch": 0.2935009259394275, + "grad_norm": 7.0625, + "learning_rate": 9.646793158438632e-06, + "loss": 0.82610855, + "memory(GiB)": 126.99, + "step": 12580, + "train_speed(iter/s)": 0.201344 + }, + { + "acc": 0.77332907, + "epoch": 0.2937342335117164, + "grad_norm": 9.125, + "learning_rate": 9.646095407981695e-06, + "loss": 0.80713387, + "memory(GiB)": 126.99, + "step": 12590, + "train_speed(iter/s)": 0.201425 + }, + { + "acc": 0.74880519, + "epoch": 0.2939675410840053, + "grad_norm": 8.75, + "learning_rate": 9.64539699429599e-06, + "loss": 0.91812592, + "memory(GiB)": 126.99, + "step": 12600, + "train_speed(iter/s)": 0.201507 + }, + { + "acc": 0.76272821, + "epoch": 0.2942008486562942, + "grad_norm": 7.25, + "learning_rate": 9.644697917481212e-06, + "loss": 0.84828262, + "memory(GiB)": 126.99, + "step": 12610, + "train_speed(iter/s)": 0.201593 + }, + { + "acc": 0.76495266, + "epoch": 0.2944341562285831, + "grad_norm": 5.46875, + "learning_rate": 9.643998177637157e-06, + "loss": 0.8520998, + "memory(GiB)": 126.99, + "step": 12620, + "train_speed(iter/s)": 0.201679 + }, + { + "acc": 0.75189352, + "epoch": 0.294667463800872, + "grad_norm": 5.40625, + "learning_rate": 9.643297774863709e-06, + "loss": 0.88819065, + "memory(GiB)": 126.99, + "step": 12630, + "train_speed(iter/s)": 0.201765 + }, + { + "acc": 0.74685659, + "epoch": 0.2949007713731609, + "grad_norm": 6.0625, + "learning_rate": 9.642596709260854e-06, + "loss": 0.93766823, + "memory(GiB)": 126.99, + "step": 12640, + "train_speed(iter/s)": 0.201846 + }, + { + "acc": 0.77418585, + "epoch": 0.2951340789454498, + "grad_norm": 7.71875, + "learning_rate": 9.641894980928668e-06, + "loss": 0.82185469, + "memory(GiB)": 126.99, + "step": 12650, + "train_speed(iter/s)": 0.201928 + }, + { + "acc": 0.77108278, + "epoch": 0.29536738651773864, + "grad_norm": 7.53125, + "learning_rate": 9.641192589967321e-06, + "loss": 0.84046984, + "memory(GiB)": 126.99, + "step": 12660, + "train_speed(iter/s)": 0.202013 + }, + { + "acc": 0.7660718, + "epoch": 0.29560069409002754, + "grad_norm": 6.28125, + "learning_rate": 9.64048953647708e-06, + "loss": 0.83025084, + "memory(GiB)": 126.99, + "step": 12670, + "train_speed(iter/s)": 0.202095 + }, + { + "acc": 0.76199236, + "epoch": 0.29583400166231644, + "grad_norm": 3.8125, + "learning_rate": 9.639785820558307e-06, + "loss": 0.88814316, + "memory(GiB)": 126.99, + "step": 12680, + "train_speed(iter/s)": 0.202176 + }, + { + "acc": 0.75356421, + "epoch": 0.29606730923460534, + "grad_norm": 4.96875, + "learning_rate": 9.639081442311456e-06, + "loss": 0.88174667, + "memory(GiB)": 126.99, + "step": 12690, + "train_speed(iter/s)": 0.202261 + }, + { + "acc": 0.76016741, + "epoch": 0.29630061680689423, + "grad_norm": 6.59375, + "learning_rate": 9.638376401837075e-06, + "loss": 0.87111378, + "memory(GiB)": 126.99, + "step": 12700, + "train_speed(iter/s)": 0.202342 + }, + { + "acc": 0.78399467, + "epoch": 0.29653392437918313, + "grad_norm": 4.625, + "learning_rate": 9.63767069923581e-06, + "loss": 0.77465773, + "memory(GiB)": 126.99, + "step": 12710, + "train_speed(iter/s)": 0.202423 + }, + { + "acc": 0.76579094, + "epoch": 0.29676723195147203, + "grad_norm": 5.28125, + "learning_rate": 9.636964334608402e-06, + "loss": 0.85976906, + "memory(GiB)": 126.99, + "step": 12720, + "train_speed(iter/s)": 0.2025 + }, + { + "acc": 0.76346893, + "epoch": 0.29700053952376093, + "grad_norm": 5.5625, + "learning_rate": 9.636257308055682e-06, + "loss": 0.85128269, + "memory(GiB)": 126.99, + "step": 12730, + "train_speed(iter/s)": 0.202583 + }, + { + "acc": 0.76815395, + "epoch": 0.29723384709604983, + "grad_norm": 4.65625, + "learning_rate": 9.635549619678578e-06, + "loss": 0.84149694, + "memory(GiB)": 126.99, + "step": 12740, + "train_speed(iter/s)": 0.202667 + }, + { + "acc": 0.76429968, + "epoch": 0.29746715466833873, + "grad_norm": 5.15625, + "learning_rate": 9.63484126957811e-06, + "loss": 0.8531889, + "memory(GiB)": 126.99, + "step": 12750, + "train_speed(iter/s)": 0.20275 + }, + { + "acc": 0.75266466, + "epoch": 0.2977004622406276, + "grad_norm": 7.625, + "learning_rate": 9.6341322578554e-06, + "loss": 0.87924614, + "memory(GiB)": 126.99, + "step": 12760, + "train_speed(iter/s)": 0.202829 + }, + { + "acc": 0.77012124, + "epoch": 0.29793376981291647, + "grad_norm": 4.4375, + "learning_rate": 9.633422584611654e-06, + "loss": 0.81980047, + "memory(GiB)": 126.99, + "step": 12770, + "train_speed(iter/s)": 0.202914 + }, + { + "acc": 0.78082376, + "epoch": 0.29816707738520537, + "grad_norm": 4.90625, + "learning_rate": 9.632712249948182e-06, + "loss": 0.80720272, + "memory(GiB)": 126.99, + "step": 12780, + "train_speed(iter/s)": 0.202998 + }, + { + "acc": 0.7774765, + "epoch": 0.29840038495749427, + "grad_norm": 4.71875, + "learning_rate": 9.632001253966381e-06, + "loss": 0.79807181, + "memory(GiB)": 126.99, + "step": 12790, + "train_speed(iter/s)": 0.203084 + }, + { + "acc": 0.77757535, + "epoch": 0.29863369252978317, + "grad_norm": 5.625, + "learning_rate": 9.631289596767748e-06, + "loss": 0.78618836, + "memory(GiB)": 126.99, + "step": 12800, + "train_speed(iter/s)": 0.203164 + }, + { + "acc": 0.77521505, + "epoch": 0.29886700010207207, + "grad_norm": 8.1875, + "learning_rate": 9.63057727845387e-06, + "loss": 0.80945616, + "memory(GiB)": 126.99, + "step": 12810, + "train_speed(iter/s)": 0.203252 + }, + { + "acc": 0.77795644, + "epoch": 0.29910030767436097, + "grad_norm": 8.875, + "learning_rate": 9.62986429912643e-06, + "loss": 0.78125734, + "memory(GiB)": 126.99, + "step": 12820, + "train_speed(iter/s)": 0.203339 + }, + { + "acc": 0.77475395, + "epoch": 0.29933361524664986, + "grad_norm": 5.8125, + "learning_rate": 9.629150658887206e-06, + "loss": 0.81328382, + "memory(GiB)": 126.99, + "step": 12830, + "train_speed(iter/s)": 0.203414 + }, + { + "acc": 0.77334385, + "epoch": 0.29956692281893876, + "grad_norm": 5.1875, + "learning_rate": 9.628436357838072e-06, + "loss": 0.81213036, + "memory(GiB)": 126.99, + "step": 12840, + "train_speed(iter/s)": 0.203488 + }, + { + "acc": 0.79729042, + "epoch": 0.29980023039122766, + "grad_norm": 4.84375, + "learning_rate": 9.627721396080992e-06, + "loss": 0.73419204, + "memory(GiB)": 126.99, + "step": 12850, + "train_speed(iter/s)": 0.203572 + }, + { + "acc": 0.76485343, + "epoch": 0.3000335379635165, + "grad_norm": 6.78125, + "learning_rate": 9.627005773718026e-06, + "loss": 0.85712852, + "memory(GiB)": 126.99, + "step": 12860, + "train_speed(iter/s)": 0.203649 + }, + { + "acc": 0.77371435, + "epoch": 0.3002668455358054, + "grad_norm": 6.25, + "learning_rate": 9.626289490851329e-06, + "loss": 0.81916208, + "memory(GiB)": 126.99, + "step": 12870, + "train_speed(iter/s)": 0.203723 + }, + { + "acc": 0.78421712, + "epoch": 0.3005001531080943, + "grad_norm": 3.6875, + "learning_rate": 9.625572547583153e-06, + "loss": 0.77294006, + "memory(GiB)": 126.99, + "step": 12880, + "train_speed(iter/s)": 0.203807 + }, + { + "acc": 0.77695723, + "epoch": 0.3007334606803832, + "grad_norm": 5.75, + "learning_rate": 9.624854944015839e-06, + "loss": 0.80077038, + "memory(GiB)": 126.99, + "step": 12890, + "train_speed(iter/s)": 0.203892 + }, + { + "acc": 0.77849216, + "epoch": 0.3009667682526721, + "grad_norm": 4.53125, + "learning_rate": 9.624136680251826e-06, + "loss": 0.78988638, + "memory(GiB)": 126.99, + "step": 12900, + "train_speed(iter/s)": 0.203969 + }, + { + "acc": 0.78356485, + "epoch": 0.301200075824961, + "grad_norm": 6.28125, + "learning_rate": 9.623417756393644e-06, + "loss": 0.78568068, + "memory(GiB)": 126.99, + "step": 12910, + "train_speed(iter/s)": 0.204048 + }, + { + "acc": 0.75422754, + "epoch": 0.3014333833972499, + "grad_norm": 43.25, + "learning_rate": 9.622698172543921e-06, + "loss": 0.88511696, + "memory(GiB)": 126.99, + "step": 12920, + "train_speed(iter/s)": 0.20413 + }, + { + "acc": 0.76167445, + "epoch": 0.3016666909695388, + "grad_norm": 5.375, + "learning_rate": 9.621977928805377e-06, + "loss": 0.85034475, + "memory(GiB)": 126.99, + "step": 12930, + "train_speed(iter/s)": 0.204214 + }, + { + "acc": 0.77598619, + "epoch": 0.3018999985418277, + "grad_norm": 6.9375, + "learning_rate": 9.621257025280826e-06, + "loss": 0.79932165, + "memory(GiB)": 126.99, + "step": 12940, + "train_speed(iter/s)": 0.204302 + }, + { + "acc": 0.76910601, + "epoch": 0.30213330611411654, + "grad_norm": 15.6875, + "learning_rate": 9.620535462073177e-06, + "loss": 0.85517778, + "memory(GiB)": 126.99, + "step": 12950, + "train_speed(iter/s)": 0.204382 + }, + { + "acc": 0.74408941, + "epoch": 0.30236661368640544, + "grad_norm": 6.09375, + "learning_rate": 9.619813239285433e-06, + "loss": 0.95150757, + "memory(GiB)": 126.99, + "step": 12960, + "train_speed(iter/s)": 0.204462 + }, + { + "acc": 0.77192564, + "epoch": 0.30259992125869434, + "grad_norm": 5.65625, + "learning_rate": 9.619090357020691e-06, + "loss": 0.81647863, + "memory(GiB)": 126.99, + "step": 12970, + "train_speed(iter/s)": 0.204548 + }, + { + "acc": 0.78476696, + "epoch": 0.30283322883098324, + "grad_norm": 6.21875, + "learning_rate": 9.618366815382143e-06, + "loss": 0.78855925, + "memory(GiB)": 126.99, + "step": 12980, + "train_speed(iter/s)": 0.204626 + }, + { + "acc": 0.77091103, + "epoch": 0.30306653640327214, + "grad_norm": 5.6875, + "learning_rate": 9.617642614473073e-06, + "loss": 0.88907585, + "memory(GiB)": 126.99, + "step": 12990, + "train_speed(iter/s)": 0.204708 + }, + { + "acc": 0.767976, + "epoch": 0.30329984397556103, + "grad_norm": 9.375, + "learning_rate": 9.616917754396861e-06, + "loss": 0.82190113, + "memory(GiB)": 126.99, + "step": 13000, + "train_speed(iter/s)": 0.204791 + }, + { + "epoch": 0.30329984397556103, + "eval_acc": 0.7348525576074836, + "eval_loss": 0.8418717384338379, + "eval_runtime": 1269.0169, + "eval_samples_per_second": 28.361, + "eval_steps_per_second": 14.181, + "step": 13000 + }, + { + "acc": 0.77310801, + "epoch": 0.30353315154784993, + "grad_norm": 5.375, + "learning_rate": 9.616192235256983e-06, + "loss": 0.80410042, + "memory(GiB)": 126.99, + "step": 13010, + "train_speed(iter/s)": 0.200802 + }, + { + "acc": 0.75549717, + "epoch": 0.30376645912013883, + "grad_norm": 6.28125, + "learning_rate": 9.615466057157002e-06, + "loss": 0.87396736, + "memory(GiB)": 126.99, + "step": 13020, + "train_speed(iter/s)": 0.200883 + }, + { + "acc": 0.78147354, + "epoch": 0.30399976669242773, + "grad_norm": 6.96875, + "learning_rate": 9.614739220200583e-06, + "loss": 0.79309797, + "memory(GiB)": 126.99, + "step": 13030, + "train_speed(iter/s)": 0.20096 + }, + { + "acc": 0.76732526, + "epoch": 0.30423307426471663, + "grad_norm": 6.28125, + "learning_rate": 9.614011724491481e-06, + "loss": 0.8398242, + "memory(GiB)": 126.99, + "step": 13040, + "train_speed(iter/s)": 0.201041 + }, + { + "acc": 0.75758252, + "epoch": 0.3044663818370055, + "grad_norm": 8.5, + "learning_rate": 9.613283570133547e-06, + "loss": 0.87248812, + "memory(GiB)": 126.99, + "step": 13050, + "train_speed(iter/s)": 0.201125 + }, + { + "acc": 0.76003232, + "epoch": 0.30469968940929437, + "grad_norm": 6.84375, + "learning_rate": 9.612554757230722e-06, + "loss": 0.87683392, + "memory(GiB)": 126.99, + "step": 13060, + "train_speed(iter/s)": 0.201212 + }, + { + "acc": 0.77400045, + "epoch": 0.30493299698158327, + "grad_norm": 5.78125, + "learning_rate": 9.611825285887045e-06, + "loss": 0.81213093, + "memory(GiB)": 126.99, + "step": 13070, + "train_speed(iter/s)": 0.201287 + }, + { + "acc": 0.76753764, + "epoch": 0.30516630455387217, + "grad_norm": 6.21875, + "learning_rate": 9.61109515620665e-06, + "loss": 0.84945755, + "memory(GiB)": 126.99, + "step": 13080, + "train_speed(iter/s)": 0.201367 + }, + { + "acc": 0.79222326, + "epoch": 0.30539961212616107, + "grad_norm": 5.5, + "learning_rate": 9.61036436829376e-06, + "loss": 0.73132648, + "memory(GiB)": 126.99, + "step": 13090, + "train_speed(iter/s)": 0.201446 + }, + { + "acc": 0.75894957, + "epoch": 0.30563291969844997, + "grad_norm": 5.5625, + "learning_rate": 9.609632922252695e-06, + "loss": 0.89140854, + "memory(GiB)": 126.99, + "step": 13100, + "train_speed(iter/s)": 0.201525 + }, + { + "acc": 0.77855654, + "epoch": 0.30586622727073887, + "grad_norm": 5.0625, + "learning_rate": 9.60890081818787e-06, + "loss": 0.81173906, + "memory(GiB)": 126.99, + "step": 13110, + "train_speed(iter/s)": 0.201604 + }, + { + "acc": 0.77230225, + "epoch": 0.30609953484302777, + "grad_norm": 5.40625, + "learning_rate": 9.608168056203792e-06, + "loss": 0.80290689, + "memory(GiB)": 126.99, + "step": 13120, + "train_speed(iter/s)": 0.201684 + }, + { + "acc": 0.76812458, + "epoch": 0.30633284241531666, + "grad_norm": 7.71875, + "learning_rate": 9.607434636405063e-06, + "loss": 0.86059694, + "memory(GiB)": 126.99, + "step": 13130, + "train_speed(iter/s)": 0.201761 + }, + { + "acc": 0.76915483, + "epoch": 0.30656614998760556, + "grad_norm": 4.90625, + "learning_rate": 9.606700558896376e-06, + "loss": 0.8283617, + "memory(GiB)": 126.99, + "step": 13140, + "train_speed(iter/s)": 0.201843 + }, + { + "acc": 0.75852375, + "epoch": 0.3067994575598944, + "grad_norm": 5.59375, + "learning_rate": 9.605965823782525e-06, + "loss": 0.88993883, + "memory(GiB)": 126.99, + "step": 13150, + "train_speed(iter/s)": 0.201916 + }, + { + "acc": 0.77059002, + "epoch": 0.3070327651321833, + "grad_norm": 6.875, + "learning_rate": 9.605230431168391e-06, + "loss": 0.83691521, + "memory(GiB)": 126.99, + "step": 13160, + "train_speed(iter/s)": 0.201993 + }, + { + "acc": 0.77577543, + "epoch": 0.3072660727044722, + "grad_norm": 4.3125, + "learning_rate": 9.604494381158949e-06, + "loss": 0.81176348, + "memory(GiB)": 126.99, + "step": 13170, + "train_speed(iter/s)": 0.202066 + }, + { + "acc": 0.78056211, + "epoch": 0.3074993802767611, + "grad_norm": 5.53125, + "learning_rate": 9.603757673859274e-06, + "loss": 0.80487776, + "memory(GiB)": 126.99, + "step": 13180, + "train_speed(iter/s)": 0.202144 + }, + { + "acc": 0.76723247, + "epoch": 0.30773268784905, + "grad_norm": 11.375, + "learning_rate": 9.603020309374526e-06, + "loss": 0.87342901, + "memory(GiB)": 126.99, + "step": 13190, + "train_speed(iter/s)": 0.202221 + }, + { + "acc": 0.77724471, + "epoch": 0.3079659954213389, + "grad_norm": 8.75, + "learning_rate": 9.602282287809966e-06, + "loss": 0.8030695, + "memory(GiB)": 126.99, + "step": 13200, + "train_speed(iter/s)": 0.202298 + }, + { + "acc": 0.77045097, + "epoch": 0.3081993029936278, + "grad_norm": 4.125, + "learning_rate": 9.601543609270947e-06, + "loss": 0.83347521, + "memory(GiB)": 126.99, + "step": 13210, + "train_speed(iter/s)": 0.202369 + }, + { + "acc": 0.76114216, + "epoch": 0.3084326105659167, + "grad_norm": 7.5625, + "learning_rate": 9.600804273862917e-06, + "loss": 0.86944656, + "memory(GiB)": 126.99, + "step": 13220, + "train_speed(iter/s)": 0.202449 + }, + { + "acc": 0.76104956, + "epoch": 0.3086659181382056, + "grad_norm": 10.1875, + "learning_rate": 9.60006428169141e-06, + "loss": 0.87441788, + "memory(GiB)": 126.99, + "step": 13230, + "train_speed(iter/s)": 0.202524 + }, + { + "acc": 0.78825359, + "epoch": 0.30889922571049444, + "grad_norm": 4.5625, + "learning_rate": 9.599323632862063e-06, + "loss": 0.7530653, + "memory(GiB)": 126.99, + "step": 13240, + "train_speed(iter/s)": 0.202602 + }, + { + "acc": 0.77793827, + "epoch": 0.30913253328278334, + "grad_norm": 8.1875, + "learning_rate": 9.598582327480605e-06, + "loss": 0.78715887, + "memory(GiB)": 126.99, + "step": 13250, + "train_speed(iter/s)": 0.202675 + }, + { + "acc": 0.76915102, + "epoch": 0.30936584085507224, + "grad_norm": 5.6875, + "learning_rate": 9.597840365652857e-06, + "loss": 0.82546225, + "memory(GiB)": 126.99, + "step": 13260, + "train_speed(iter/s)": 0.202753 + }, + { + "acc": 0.7758049, + "epoch": 0.30959914842736114, + "grad_norm": 5.5625, + "learning_rate": 9.597097747484731e-06, + "loss": 0.8197854, + "memory(GiB)": 126.99, + "step": 13270, + "train_speed(iter/s)": 0.202827 + }, + { + "acc": 0.77688622, + "epoch": 0.30983245599965004, + "grad_norm": 5.0625, + "learning_rate": 9.596354473082237e-06, + "loss": 0.80246162, + "memory(GiB)": 126.99, + "step": 13280, + "train_speed(iter/s)": 0.202904 + }, + { + "acc": 0.77225947, + "epoch": 0.31006576357193893, + "grad_norm": 6.46875, + "learning_rate": 9.595610542551476e-06, + "loss": 0.82849407, + "memory(GiB)": 126.99, + "step": 13290, + "train_speed(iter/s)": 0.202984 + }, + { + "acc": 0.76478229, + "epoch": 0.31029907114422783, + "grad_norm": 6.09375, + "learning_rate": 9.594865955998648e-06, + "loss": 0.85376844, + "memory(GiB)": 126.99, + "step": 13300, + "train_speed(iter/s)": 0.203061 + }, + { + "acc": 0.77614241, + "epoch": 0.31053237871651673, + "grad_norm": 4.1875, + "learning_rate": 9.594120713530038e-06, + "loss": 0.82821312, + "memory(GiB)": 126.99, + "step": 13310, + "train_speed(iter/s)": 0.203142 + }, + { + "acc": 0.76965942, + "epoch": 0.31076568628880563, + "grad_norm": 6.125, + "learning_rate": 9.59337481525203e-06, + "loss": 0.83385906, + "memory(GiB)": 126.99, + "step": 13320, + "train_speed(iter/s)": 0.20322 + }, + { + "acc": 0.7630805, + "epoch": 0.31099899386109453, + "grad_norm": 6.96875, + "learning_rate": 9.592628261271102e-06, + "loss": 0.8761137, + "memory(GiB)": 126.99, + "step": 13330, + "train_speed(iter/s)": 0.203301 + }, + { + "acc": 0.76680398, + "epoch": 0.3112323014333834, + "grad_norm": 5.28125, + "learning_rate": 9.591881051693826e-06, + "loss": 0.85415878, + "memory(GiB)": 126.99, + "step": 13340, + "train_speed(iter/s)": 0.20338 + }, + { + "acc": 0.77896576, + "epoch": 0.3114656090056723, + "grad_norm": 6.53125, + "learning_rate": 9.591133186626861e-06, + "loss": 0.80970163, + "memory(GiB)": 126.99, + "step": 13350, + "train_speed(iter/s)": 0.203457 + }, + { + "acc": 0.75927386, + "epoch": 0.31169891657796117, + "grad_norm": 4.84375, + "learning_rate": 9.590384666176968e-06, + "loss": 0.86237316, + "memory(GiB)": 126.99, + "step": 13360, + "train_speed(iter/s)": 0.203535 + }, + { + "acc": 0.77169557, + "epoch": 0.31193222415025007, + "grad_norm": 4.15625, + "learning_rate": 9.589635490450999e-06, + "loss": 0.80186424, + "memory(GiB)": 126.99, + "step": 13370, + "train_speed(iter/s)": 0.203611 + }, + { + "acc": 0.78818069, + "epoch": 0.31216553172253897, + "grad_norm": 5.0, + "learning_rate": 9.588885659555895e-06, + "loss": 0.76719275, + "memory(GiB)": 126.99, + "step": 13380, + "train_speed(iter/s)": 0.20369 + }, + { + "acc": 0.74860768, + "epoch": 0.31239883929482787, + "grad_norm": 7.09375, + "learning_rate": 9.588135173598696e-06, + "loss": 0.91373577, + "memory(GiB)": 126.99, + "step": 13390, + "train_speed(iter/s)": 0.203769 + }, + { + "acc": 0.77999992, + "epoch": 0.31263214686711677, + "grad_norm": 9.8125, + "learning_rate": 9.587384032686536e-06, + "loss": 0.78641224, + "memory(GiB)": 126.99, + "step": 13400, + "train_speed(iter/s)": 0.203846 + }, + { + "acc": 0.78604879, + "epoch": 0.31286545443940567, + "grad_norm": 5.6875, + "learning_rate": 9.586632236926637e-06, + "loss": 0.76967993, + "memory(GiB)": 126.99, + "step": 13410, + "train_speed(iter/s)": 0.203919 + }, + { + "acc": 0.76556487, + "epoch": 0.31309876201169456, + "grad_norm": 5.1875, + "learning_rate": 9.585879786426317e-06, + "loss": 0.83489494, + "memory(GiB)": 126.99, + "step": 13420, + "train_speed(iter/s)": 0.203997 + }, + { + "acc": 0.77812591, + "epoch": 0.3133320695839834, + "grad_norm": 4.3125, + "learning_rate": 9.585126681292991e-06, + "loss": 0.7988245, + "memory(GiB)": 126.99, + "step": 13430, + "train_speed(iter/s)": 0.204078 + }, + { + "acc": 0.77678261, + "epoch": 0.3135653771562723, + "grad_norm": 6.71875, + "learning_rate": 9.584372921634164e-06, + "loss": 0.82126627, + "memory(GiB)": 126.99, + "step": 13440, + "train_speed(iter/s)": 0.204156 + }, + { + "acc": 0.77149763, + "epoch": 0.3137986847285612, + "grad_norm": 6.34375, + "learning_rate": 9.583618507557433e-06, + "loss": 0.82525063, + "memory(GiB)": 126.99, + "step": 13450, + "train_speed(iter/s)": 0.204235 + }, + { + "acc": 0.76586752, + "epoch": 0.3140319923008501, + "grad_norm": 4.6875, + "learning_rate": 9.582863439170493e-06, + "loss": 0.83788195, + "memory(GiB)": 126.99, + "step": 13460, + "train_speed(iter/s)": 0.204314 + }, + { + "acc": 0.77512035, + "epoch": 0.314265299873139, + "grad_norm": 4.8125, + "learning_rate": 9.582107716581125e-06, + "loss": 0.82719707, + "memory(GiB)": 126.99, + "step": 13470, + "train_speed(iter/s)": 0.204394 + }, + { + "acc": 0.79411778, + "epoch": 0.3144986074454279, + "grad_norm": 12.1875, + "learning_rate": 9.581351339897215e-06, + "loss": 0.73287711, + "memory(GiB)": 126.99, + "step": 13480, + "train_speed(iter/s)": 0.204473 + }, + { + "acc": 0.78208966, + "epoch": 0.3147319150177168, + "grad_norm": 5.03125, + "learning_rate": 9.580594309226731e-06, + "loss": 0.75941992, + "memory(GiB)": 126.99, + "step": 13490, + "train_speed(iter/s)": 0.204553 + }, + { + "acc": 0.75446653, + "epoch": 0.3149652225900057, + "grad_norm": 6.65625, + "learning_rate": 9.579836624677742e-06, + "loss": 0.89500237, + "memory(GiB)": 126.99, + "step": 13500, + "train_speed(iter/s)": 0.204631 + }, + { + "epoch": 0.3149652225900057, + "eval_acc": 0.7354596643921382, + "eval_loss": 0.8395183086395264, + "eval_runtime": 1270.619, + "eval_samples_per_second": 28.326, + "eval_steps_per_second": 14.163, + "step": 13500 + }, + { + "acc": 0.7628726, + "epoch": 0.3151985301622946, + "grad_norm": 4.3125, + "learning_rate": 9.579078286358403e-06, + "loss": 0.86440277, + "memory(GiB)": 126.99, + "step": 13510, + "train_speed(iter/s)": 0.200788 + }, + { + "acc": 0.76125512, + "epoch": 0.3154318377345835, + "grad_norm": 5.4375, + "learning_rate": 9.578319294376968e-06, + "loss": 0.86319227, + "memory(GiB)": 126.99, + "step": 13520, + "train_speed(iter/s)": 0.200865 + }, + { + "acc": 0.76013141, + "epoch": 0.31566514530687234, + "grad_norm": 5.59375, + "learning_rate": 9.577559648841785e-06, + "loss": 0.85533056, + "memory(GiB)": 126.99, + "step": 13530, + "train_speed(iter/s)": 0.200944 + }, + { + "acc": 0.76009645, + "epoch": 0.31589845287916124, + "grad_norm": 6.59375, + "learning_rate": 9.576799349861292e-06, + "loss": 0.86631908, + "memory(GiB)": 126.99, + "step": 13540, + "train_speed(iter/s)": 0.201021 + }, + { + "acc": 0.76818829, + "epoch": 0.31613176045145014, + "grad_norm": 5.6875, + "learning_rate": 9.576038397544021e-06, + "loss": 0.83353624, + "memory(GiB)": 126.99, + "step": 13550, + "train_speed(iter/s)": 0.201096 + }, + { + "acc": 0.76592422, + "epoch": 0.31636506802373904, + "grad_norm": 4.8125, + "learning_rate": 9.5752767919986e-06, + "loss": 0.8563282, + "memory(GiB)": 126.99, + "step": 13560, + "train_speed(iter/s)": 0.201172 + }, + { + "acc": 0.76527843, + "epoch": 0.31659837559602794, + "grad_norm": 8.125, + "learning_rate": 9.574514533333744e-06, + "loss": 0.86033154, + "memory(GiB)": 126.99, + "step": 13570, + "train_speed(iter/s)": 0.201252 + }, + { + "acc": 0.77543097, + "epoch": 0.31683168316831684, + "grad_norm": 17.25, + "learning_rate": 9.573751621658267e-06, + "loss": 0.79135704, + "memory(GiB)": 126.99, + "step": 13580, + "train_speed(iter/s)": 0.201323 + }, + { + "acc": 0.76395407, + "epoch": 0.31706499074060573, + "grad_norm": 6.03125, + "learning_rate": 9.572988057081076e-06, + "loss": 0.85698547, + "memory(GiB)": 126.99, + "step": 13590, + "train_speed(iter/s)": 0.2014 + }, + { + "acc": 0.74935603, + "epoch": 0.31729829831289463, + "grad_norm": 5.8125, + "learning_rate": 9.572223839711168e-06, + "loss": 0.89535446, + "memory(GiB)": 126.99, + "step": 13600, + "train_speed(iter/s)": 0.20148 + }, + { + "acc": 0.76039143, + "epoch": 0.31753160588518353, + "grad_norm": 4.4375, + "learning_rate": 9.571458969657634e-06, + "loss": 0.85859356, + "memory(GiB)": 126.99, + "step": 13610, + "train_speed(iter/s)": 0.201561 + }, + { + "acc": 0.79370561, + "epoch": 0.31776491345747243, + "grad_norm": 6.375, + "learning_rate": 9.570693447029662e-06, + "loss": 0.73161249, + "memory(GiB)": 126.99, + "step": 13620, + "train_speed(iter/s)": 0.20163 + }, + { + "acc": 0.77307386, + "epoch": 0.3179982210297613, + "grad_norm": 4.8125, + "learning_rate": 9.569927271936528e-06, + "loss": 0.81770248, + "memory(GiB)": 126.99, + "step": 13630, + "train_speed(iter/s)": 0.201711 + }, + { + "acc": 0.77128267, + "epoch": 0.3182315286020502, + "grad_norm": 6.75, + "learning_rate": 9.569160444487602e-06, + "loss": 0.82179031, + "memory(GiB)": 126.99, + "step": 13640, + "train_speed(iter/s)": 0.20179 + }, + { + "acc": 0.78777056, + "epoch": 0.31846483617433907, + "grad_norm": 6.375, + "learning_rate": 9.56839296479235e-06, + "loss": 0.74274769, + "memory(GiB)": 126.99, + "step": 13650, + "train_speed(iter/s)": 0.201867 + }, + { + "acc": 0.79498763, + "epoch": 0.31869814374662797, + "grad_norm": 6.5, + "learning_rate": 9.56762483296033e-06, + "loss": 0.72544613, + "memory(GiB)": 126.99, + "step": 13660, + "train_speed(iter/s)": 0.201949 + }, + { + "acc": 0.76066217, + "epoch": 0.31893145131891687, + "grad_norm": 5.625, + "learning_rate": 9.566856049101192e-06, + "loss": 0.86741734, + "memory(GiB)": 126.99, + "step": 13670, + "train_speed(iter/s)": 0.20202 + }, + { + "acc": 0.77316117, + "epoch": 0.31916475889120577, + "grad_norm": 5.09375, + "learning_rate": 9.56608661332468e-06, + "loss": 0.80373688, + "memory(GiB)": 126.99, + "step": 13680, + "train_speed(iter/s)": 0.202091 + }, + { + "acc": 0.76230974, + "epoch": 0.31939806646349467, + "grad_norm": 3.96875, + "learning_rate": 9.56531652574063e-06, + "loss": 0.8677124, + "memory(GiB)": 126.99, + "step": 13690, + "train_speed(iter/s)": 0.20217 + }, + { + "acc": 0.77418733, + "epoch": 0.31963137403578357, + "grad_norm": 10.375, + "learning_rate": 9.564545786458971e-06, + "loss": 0.81346025, + "memory(GiB)": 126.99, + "step": 13700, + "train_speed(iter/s)": 0.202244 + }, + { + "acc": 0.77347922, + "epoch": 0.31986468160807247, + "grad_norm": 3.984375, + "learning_rate": 9.563774395589728e-06, + "loss": 0.79495201, + "memory(GiB)": 126.99, + "step": 13710, + "train_speed(iter/s)": 0.202319 + }, + { + "acc": 0.76973238, + "epoch": 0.3200979891803613, + "grad_norm": 5.28125, + "learning_rate": 9.563002353243019e-06, + "loss": 0.84068336, + "memory(GiB)": 126.99, + "step": 13720, + "train_speed(iter/s)": 0.202395 + }, + { + "acc": 0.74852266, + "epoch": 0.3203312967526502, + "grad_norm": 9.8125, + "learning_rate": 9.562229659529046e-06, + "loss": 0.91825085, + "memory(GiB)": 126.99, + "step": 13730, + "train_speed(iter/s)": 0.202472 + }, + { + "acc": 0.77733259, + "epoch": 0.3205646043249391, + "grad_norm": 4.71875, + "learning_rate": 9.561456314558116e-06, + "loss": 0.79782739, + "memory(GiB)": 126.99, + "step": 13740, + "train_speed(iter/s)": 0.202545 + }, + { + "acc": 0.75323248, + "epoch": 0.320797911897228, + "grad_norm": 8.3125, + "learning_rate": 9.560682318440619e-06, + "loss": 0.88182631, + "memory(GiB)": 126.99, + "step": 13750, + "train_speed(iter/s)": 0.202621 + }, + { + "acc": 0.76970348, + "epoch": 0.3210312194695169, + "grad_norm": 6.5625, + "learning_rate": 9.55990767128705e-06, + "loss": 0.84244089, + "memory(GiB)": 126.99, + "step": 13760, + "train_speed(iter/s)": 0.202694 + }, + { + "acc": 0.7580492, + "epoch": 0.3212645270418058, + "grad_norm": 5.46875, + "learning_rate": 9.559132373207984e-06, + "loss": 0.87328644, + "memory(GiB)": 126.99, + "step": 13770, + "train_speed(iter/s)": 0.202774 + }, + { + "acc": 0.74232302, + "epoch": 0.3214978346140947, + "grad_norm": 5.09375, + "learning_rate": 9.558356424314095e-06, + "loss": 0.94130011, + "memory(GiB)": 126.99, + "step": 13780, + "train_speed(iter/s)": 0.202853 + }, + { + "acc": 0.77524061, + "epoch": 0.3217311421863836, + "grad_norm": 5.375, + "learning_rate": 9.557579824716152e-06, + "loss": 0.81230278, + "memory(GiB)": 126.99, + "step": 13790, + "train_speed(iter/s)": 0.20293 + }, + { + "acc": 0.76253085, + "epoch": 0.3219644497586725, + "grad_norm": 6.25, + "learning_rate": 9.556802574525013e-06, + "loss": 0.86086216, + "memory(GiB)": 126.99, + "step": 13800, + "train_speed(iter/s)": 0.203005 + }, + { + "acc": 0.7655097, + "epoch": 0.3221977573309614, + "grad_norm": 5.125, + "learning_rate": 9.556024673851629e-06, + "loss": 0.86178303, + "memory(GiB)": 126.99, + "step": 13810, + "train_speed(iter/s)": 0.203083 + }, + { + "acc": 0.74736996, + "epoch": 0.32243106490325024, + "grad_norm": 6.4375, + "learning_rate": 9.555246122807047e-06, + "loss": 0.92181644, + "memory(GiB)": 126.99, + "step": 13820, + "train_speed(iter/s)": 0.20316 + }, + { + "acc": 0.77784414, + "epoch": 0.32266437247553914, + "grad_norm": 3.796875, + "learning_rate": 9.554466921502405e-06, + "loss": 0.80609112, + "memory(GiB)": 126.99, + "step": 13830, + "train_speed(iter/s)": 0.203234 + }, + { + "acc": 0.76699972, + "epoch": 0.32289768004782804, + "grad_norm": 6.21875, + "learning_rate": 9.553687070048934e-06, + "loss": 0.8321928, + "memory(GiB)": 126.99, + "step": 13840, + "train_speed(iter/s)": 0.203305 + }, + { + "acc": 0.76653528, + "epoch": 0.32313098762011694, + "grad_norm": 9.6875, + "learning_rate": 9.552906568557953e-06, + "loss": 0.85399723, + "memory(GiB)": 126.99, + "step": 13850, + "train_speed(iter/s)": 0.203377 + }, + { + "acc": 0.78444996, + "epoch": 0.32336429519240584, + "grad_norm": 4.78125, + "learning_rate": 9.552125417140885e-06, + "loss": 0.75022516, + "memory(GiB)": 126.99, + "step": 13860, + "train_speed(iter/s)": 0.203452 + }, + { + "acc": 0.75712585, + "epoch": 0.32359760276469474, + "grad_norm": 5.1875, + "learning_rate": 9.551343615909236e-06, + "loss": 0.87373152, + "memory(GiB)": 126.99, + "step": 13870, + "train_speed(iter/s)": 0.203521 + }, + { + "acc": 0.77796164, + "epoch": 0.32383091033698364, + "grad_norm": 5.125, + "learning_rate": 9.550561164974606e-06, + "loss": 0.79385543, + "memory(GiB)": 126.99, + "step": 13880, + "train_speed(iter/s)": 0.203594 + }, + { + "acc": 0.77821608, + "epoch": 0.32406421790927253, + "grad_norm": 7.8125, + "learning_rate": 9.549778064448693e-06, + "loss": 0.79453735, + "memory(GiB)": 126.99, + "step": 13890, + "train_speed(iter/s)": 0.203671 + }, + { + "acc": 0.77085085, + "epoch": 0.32429752548156143, + "grad_norm": 6.875, + "learning_rate": 9.548994314443284e-06, + "loss": 0.83549728, + "memory(GiB)": 126.99, + "step": 13900, + "train_speed(iter/s)": 0.203749 + }, + { + "acc": 0.7559999, + "epoch": 0.32453083305385033, + "grad_norm": 5.65625, + "learning_rate": 9.548209915070256e-06, + "loss": 0.88760204, + "memory(GiB)": 126.99, + "step": 13910, + "train_speed(iter/s)": 0.203819 + }, + { + "acc": 0.7560101, + "epoch": 0.3247641406261392, + "grad_norm": 5.65625, + "learning_rate": 9.547424866441586e-06, + "loss": 0.8699544, + "memory(GiB)": 126.99, + "step": 13920, + "train_speed(iter/s)": 0.203893 + }, + { + "acc": 0.77840662, + "epoch": 0.3249974481984281, + "grad_norm": 6.53125, + "learning_rate": 9.546639168669336e-06, + "loss": 0.80326424, + "memory(GiB)": 126.99, + "step": 13930, + "train_speed(iter/s)": 0.203969 + }, + { + "acc": 0.77628956, + "epoch": 0.325230755770717, + "grad_norm": 5.75, + "learning_rate": 9.545852821865667e-06, + "loss": 0.81750832, + "memory(GiB)": 126.99, + "step": 13940, + "train_speed(iter/s)": 0.204043 + }, + { + "acc": 0.76031036, + "epoch": 0.32546406334300587, + "grad_norm": 8.125, + "learning_rate": 9.545065826142825e-06, + "loss": 0.87485161, + "memory(GiB)": 126.99, + "step": 13950, + "train_speed(iter/s)": 0.204122 + }, + { + "acc": 0.76914015, + "epoch": 0.32569737091529477, + "grad_norm": 4.15625, + "learning_rate": 9.544278181613158e-06, + "loss": 0.82654829, + "memory(GiB)": 126.99, + "step": 13960, + "train_speed(iter/s)": 0.204195 + }, + { + "acc": 0.77492094, + "epoch": 0.32593067848758367, + "grad_norm": 5.1875, + "learning_rate": 9.543489888389103e-06, + "loss": 0.83645306, + "memory(GiB)": 126.99, + "step": 13970, + "train_speed(iter/s)": 0.20427 + }, + { + "acc": 0.77900114, + "epoch": 0.32616398605987257, + "grad_norm": 5.28125, + "learning_rate": 9.542700946583184e-06, + "loss": 0.79197259, + "memory(GiB)": 126.99, + "step": 13980, + "train_speed(iter/s)": 0.204347 + }, + { + "acc": 0.77285557, + "epoch": 0.32639729363216147, + "grad_norm": 5.5, + "learning_rate": 9.541911356308025e-06, + "loss": 0.83121672, + "memory(GiB)": 126.99, + "step": 13990, + "train_speed(iter/s)": 0.204424 + }, + { + "acc": 0.76080709, + "epoch": 0.32663060120445037, + "grad_norm": 4.71875, + "learning_rate": 9.541121117676339e-06, + "loss": 0.86064758, + "memory(GiB)": 126.99, + "step": 14000, + "train_speed(iter/s)": 0.204499 + }, + { + "epoch": 0.32663060120445037, + "eval_acc": 0.7354247529494964, + "eval_loss": 0.8388969898223877, + "eval_runtime": 1270.5843, + "eval_samples_per_second": 28.326, + "eval_steps_per_second": 14.164, + "step": 14000 + }, + { + "acc": 0.7774581, + "epoch": 0.3268639087767392, + "grad_norm": 4.84375, + "learning_rate": 9.540330230800935e-06, + "loss": 0.79988861, + "memory(GiB)": 126.99, + "step": 14010, + "train_speed(iter/s)": 0.200798 + }, + { + "acc": 0.78027916, + "epoch": 0.3270972163490281, + "grad_norm": 5.875, + "learning_rate": 9.539538695794708e-06, + "loss": 0.76487494, + "memory(GiB)": 126.99, + "step": 14020, + "train_speed(iter/s)": 0.200878 + }, + { + "acc": 0.77316322, + "epoch": 0.327330523921317, + "grad_norm": 5.46875, + "learning_rate": 9.53874651277065e-06, + "loss": 0.80076408, + "memory(GiB)": 126.99, + "step": 14030, + "train_speed(iter/s)": 0.200953 + }, + { + "acc": 0.79859195, + "epoch": 0.3275638314936059, + "grad_norm": 4.53125, + "learning_rate": 9.537953681841847e-06, + "loss": 0.72243781, + "memory(GiB)": 126.99, + "step": 14040, + "train_speed(iter/s)": 0.201027 + }, + { + "acc": 0.77445354, + "epoch": 0.3277971390658948, + "grad_norm": 5.5, + "learning_rate": 9.537160203121474e-06, + "loss": 0.80300465, + "memory(GiB)": 126.99, + "step": 14050, + "train_speed(iter/s)": 0.201102 + }, + { + "acc": 0.78483415, + "epoch": 0.3280304466381837, + "grad_norm": 4.71875, + "learning_rate": 9.536366076722799e-06, + "loss": 0.77799454, + "memory(GiB)": 126.99, + "step": 14060, + "train_speed(iter/s)": 0.201175 + }, + { + "acc": 0.76576405, + "epoch": 0.3282637542104726, + "grad_norm": 6.125, + "learning_rate": 9.535571302759184e-06, + "loss": 0.84655552, + "memory(GiB)": 126.99, + "step": 14070, + "train_speed(iter/s)": 0.201249 + }, + { + "acc": 0.76197848, + "epoch": 0.3284970617827615, + "grad_norm": 8.1875, + "learning_rate": 9.534775881344086e-06, + "loss": 0.8686224, + "memory(GiB)": 126.99, + "step": 14080, + "train_speed(iter/s)": 0.201318 + }, + { + "acc": 0.7604651, + "epoch": 0.3287303693550504, + "grad_norm": 6.40625, + "learning_rate": 9.533979812591046e-06, + "loss": 0.83883457, + "memory(GiB)": 126.99, + "step": 14090, + "train_speed(iter/s)": 0.201389 + }, + { + "acc": 0.77590809, + "epoch": 0.3289636769273393, + "grad_norm": 4.96875, + "learning_rate": 9.533183096613705e-06, + "loss": 0.80737171, + "memory(GiB)": 126.99, + "step": 14100, + "train_speed(iter/s)": 0.201463 + }, + { + "acc": 0.7858882, + "epoch": 0.32919698449962814, + "grad_norm": 5.0, + "learning_rate": 9.532385733525793e-06, + "loss": 0.7538044, + "memory(GiB)": 126.99, + "step": 14110, + "train_speed(iter/s)": 0.201532 + }, + { + "acc": 0.78518639, + "epoch": 0.32943029207191704, + "grad_norm": 5.84375, + "learning_rate": 9.531587723441136e-06, + "loss": 0.77603545, + "memory(GiB)": 126.99, + "step": 14120, + "train_speed(iter/s)": 0.201604 + }, + { + "acc": 0.7789793, + "epoch": 0.32966359964420594, + "grad_norm": 4.5625, + "learning_rate": 9.530789066473648e-06, + "loss": 0.78307705, + "memory(GiB)": 126.99, + "step": 14130, + "train_speed(iter/s)": 0.201676 + }, + { + "acc": 0.75844407, + "epoch": 0.32989690721649484, + "grad_norm": 5.75, + "learning_rate": 9.529989762737336e-06, + "loss": 0.85525703, + "memory(GiB)": 126.99, + "step": 14140, + "train_speed(iter/s)": 0.201751 + }, + { + "acc": 0.76479483, + "epoch": 0.33013021478878374, + "grad_norm": 6.8125, + "learning_rate": 9.529189812346303e-06, + "loss": 0.85921402, + "memory(GiB)": 126.99, + "step": 14150, + "train_speed(iter/s)": 0.201826 + }, + { + "acc": 0.77646847, + "epoch": 0.33036352236107264, + "grad_norm": 6.0, + "learning_rate": 9.528389215414737e-06, + "loss": 0.81476002, + "memory(GiB)": 126.99, + "step": 14160, + "train_speed(iter/s)": 0.201901 + }, + { + "acc": 0.77587495, + "epoch": 0.33059682993336154, + "grad_norm": 6.09375, + "learning_rate": 9.527587972056929e-06, + "loss": 0.79444175, + "memory(GiB)": 126.99, + "step": 14170, + "train_speed(iter/s)": 0.201978 + }, + { + "acc": 0.787714, + "epoch": 0.33083013750565043, + "grad_norm": 5.28125, + "learning_rate": 9.526786082387251e-06, + "loss": 0.73652754, + "memory(GiB)": 126.99, + "step": 14180, + "train_speed(iter/s)": 0.20205 + }, + { + "acc": 0.77357411, + "epoch": 0.33106344507793933, + "grad_norm": 4.4375, + "learning_rate": 9.525983546520176e-06, + "loss": 0.83334599, + "memory(GiB)": 126.99, + "step": 14190, + "train_speed(iter/s)": 0.202123 + }, + { + "acc": 0.78118382, + "epoch": 0.3312967526502282, + "grad_norm": 7.0, + "learning_rate": 9.525180364570265e-06, + "loss": 0.80493126, + "memory(GiB)": 126.99, + "step": 14200, + "train_speed(iter/s)": 0.202197 + }, + { + "acc": 0.75183415, + "epoch": 0.3315300602225171, + "grad_norm": 4.5625, + "learning_rate": 9.52437653665217e-06, + "loss": 0.91054926, + "memory(GiB)": 126.99, + "step": 14210, + "train_speed(iter/s)": 0.202276 + }, + { + "acc": 0.78139944, + "epoch": 0.331763367794806, + "grad_norm": 9.0, + "learning_rate": 9.52357206288064e-06, + "loss": 0.79247675, + "memory(GiB)": 126.99, + "step": 14220, + "train_speed(iter/s)": 0.20235 + }, + { + "acc": 0.75987701, + "epoch": 0.3319966753670949, + "grad_norm": 6.0625, + "learning_rate": 9.522766943370512e-06, + "loss": 0.87182941, + "memory(GiB)": 126.99, + "step": 14230, + "train_speed(iter/s)": 0.202422 + }, + { + "acc": 0.77394571, + "epoch": 0.33222998293938377, + "grad_norm": 7.53125, + "learning_rate": 9.521961178236716e-06, + "loss": 0.81149826, + "memory(GiB)": 126.99, + "step": 14240, + "train_speed(iter/s)": 0.202493 + }, + { + "acc": 0.77338943, + "epoch": 0.33246329051167267, + "grad_norm": 4.3125, + "learning_rate": 9.521154767594276e-06, + "loss": 0.81733627, + "memory(GiB)": 126.99, + "step": 14250, + "train_speed(iter/s)": 0.202568 + }, + { + "acc": 0.75781255, + "epoch": 0.33269659808396157, + "grad_norm": 6.9375, + "learning_rate": 9.520347711558306e-06, + "loss": 0.88546219, + "memory(GiB)": 126.99, + "step": 14260, + "train_speed(iter/s)": 0.202638 + }, + { + "acc": 0.76396008, + "epoch": 0.33292990565625047, + "grad_norm": 5.53125, + "learning_rate": 9.519540010244013e-06, + "loss": 0.86021147, + "memory(GiB)": 126.99, + "step": 14270, + "train_speed(iter/s)": 0.202717 + }, + { + "acc": 0.76865139, + "epoch": 0.33316321322853937, + "grad_norm": 4.78125, + "learning_rate": 9.518731663766697e-06, + "loss": 0.8223979, + "memory(GiB)": 126.99, + "step": 14280, + "train_speed(iter/s)": 0.202789 + }, + { + "acc": 0.78064613, + "epoch": 0.33339652080082827, + "grad_norm": 5.71875, + "learning_rate": 9.517922672241748e-06, + "loss": 0.78749752, + "memory(GiB)": 126.99, + "step": 14290, + "train_speed(iter/s)": 0.202864 + }, + { + "acc": 0.79933729, + "epoch": 0.3336298283731171, + "grad_norm": 7.34375, + "learning_rate": 9.517113035784651e-06, + "loss": 0.7244029, + "memory(GiB)": 126.99, + "step": 14300, + "train_speed(iter/s)": 0.20294 + }, + { + "acc": 0.78056884, + "epoch": 0.333863135945406, + "grad_norm": 7.25, + "learning_rate": 9.51630275451098e-06, + "loss": 0.79498949, + "memory(GiB)": 126.99, + "step": 14310, + "train_speed(iter/s)": 0.203011 + }, + { + "acc": 0.7816524, + "epoch": 0.3340964435176949, + "grad_norm": 6.0625, + "learning_rate": 9.515491828536403e-06, + "loss": 0.77081575, + "memory(GiB)": 126.99, + "step": 14320, + "train_speed(iter/s)": 0.203084 + }, + { + "acc": 0.76975269, + "epoch": 0.3343297510899838, + "grad_norm": 5.0625, + "learning_rate": 9.51468025797668e-06, + "loss": 0.8435153, + "memory(GiB)": 126.99, + "step": 14330, + "train_speed(iter/s)": 0.203157 + }, + { + "acc": 0.79847164, + "epoch": 0.3345630586622727, + "grad_norm": 6.90625, + "learning_rate": 9.51386804294766e-06, + "loss": 0.7151453, + "memory(GiB)": 126.99, + "step": 14340, + "train_speed(iter/s)": 0.20323 + }, + { + "acc": 0.78779755, + "epoch": 0.3347963662345616, + "grad_norm": 5.21875, + "learning_rate": 9.51305518356529e-06, + "loss": 0.75147953, + "memory(GiB)": 126.99, + "step": 14350, + "train_speed(iter/s)": 0.2033 + }, + { + "acc": 0.7548315, + "epoch": 0.3350296738068505, + "grad_norm": 7.09375, + "learning_rate": 9.512241679945602e-06, + "loss": 0.89756851, + "memory(GiB)": 126.99, + "step": 14360, + "train_speed(iter/s)": 0.203372 + }, + { + "acc": 0.7766715, + "epoch": 0.3352629813791394, + "grad_norm": 6.3125, + "learning_rate": 9.511427532204725e-06, + "loss": 0.79562111, + "memory(GiB)": 126.99, + "step": 14370, + "train_speed(iter/s)": 0.20344 + }, + { + "acc": 0.77851067, + "epoch": 0.3354962889514283, + "grad_norm": 6.46875, + "learning_rate": 9.51061274045888e-06, + "loss": 0.7912365, + "memory(GiB)": 126.99, + "step": 14380, + "train_speed(iter/s)": 0.203516 + }, + { + "acc": 0.76406937, + "epoch": 0.3357295965237172, + "grad_norm": 5.46875, + "learning_rate": 9.509797304824376e-06, + "loss": 0.87257404, + "memory(GiB)": 126.99, + "step": 14390, + "train_speed(iter/s)": 0.203587 + }, + { + "acc": 0.77162414, + "epoch": 0.33596290409600604, + "grad_norm": 6.21875, + "learning_rate": 9.508981225417615e-06, + "loss": 0.81675949, + "memory(GiB)": 126.99, + "step": 14400, + "train_speed(iter/s)": 0.203654 + }, + { + "acc": 0.76914759, + "epoch": 0.33619621166829494, + "grad_norm": 5.53125, + "learning_rate": 9.508164502355095e-06, + "loss": 0.83779526, + "memory(GiB)": 126.99, + "step": 14410, + "train_speed(iter/s)": 0.203724 + }, + { + "acc": 0.77464485, + "epoch": 0.33642951924058384, + "grad_norm": 4.53125, + "learning_rate": 9.507347135753403e-06, + "loss": 0.79855938, + "memory(GiB)": 126.99, + "step": 14420, + "train_speed(iter/s)": 0.203795 + }, + { + "acc": 0.77766848, + "epoch": 0.33666282681287274, + "grad_norm": 7.875, + "learning_rate": 9.506529125729216e-06, + "loss": 0.80243073, + "memory(GiB)": 126.99, + "step": 14430, + "train_speed(iter/s)": 0.203867 + }, + { + "acc": 0.77303267, + "epoch": 0.33689613438516164, + "grad_norm": 5.90625, + "learning_rate": 9.505710472399306e-06, + "loss": 0.79739962, + "memory(GiB)": 126.99, + "step": 14440, + "train_speed(iter/s)": 0.203937 + }, + { + "acc": 0.76884651, + "epoch": 0.33712944195745054, + "grad_norm": 4.78125, + "learning_rate": 9.504891175880533e-06, + "loss": 0.82474422, + "memory(GiB)": 126.99, + "step": 14450, + "train_speed(iter/s)": 0.204006 + }, + { + "acc": 0.76067557, + "epoch": 0.33736274952973944, + "grad_norm": 6.1875, + "learning_rate": 9.504071236289856e-06, + "loss": 0.87364044, + "memory(GiB)": 126.99, + "step": 14460, + "train_speed(iter/s)": 0.204078 + }, + { + "acc": 0.77003412, + "epoch": 0.33759605710202834, + "grad_norm": 5.3125, + "learning_rate": 9.503250653744316e-06, + "loss": 0.81374006, + "memory(GiB)": 126.99, + "step": 14470, + "train_speed(iter/s)": 0.204147 + }, + { + "acc": 0.77818384, + "epoch": 0.33782936467431723, + "grad_norm": 5.4375, + "learning_rate": 9.502429428361055e-06, + "loss": 0.81029873, + "memory(GiB)": 126.99, + "step": 14480, + "train_speed(iter/s)": 0.204217 + }, + { + "acc": 0.74643278, + "epoch": 0.3380626722466061, + "grad_norm": 5.4375, + "learning_rate": 9.5016075602573e-06, + "loss": 0.92115335, + "memory(GiB)": 126.99, + "step": 14490, + "train_speed(iter/s)": 0.20429 + }, + { + "acc": 0.76585412, + "epoch": 0.338295979818895, + "grad_norm": 9.25, + "learning_rate": 9.500785049550373e-06, + "loss": 0.83918991, + "memory(GiB)": 126.99, + "step": 14500, + "train_speed(iter/s)": 0.204369 + }, + { + "epoch": 0.338295979818895, + "eval_acc": 0.7356140434320767, + "eval_loss": 0.8385112881660461, + "eval_runtime": 1269.8549, + "eval_samples_per_second": 28.343, + "eval_steps_per_second": 14.172, + "step": 14500 + }, + { + "acc": 0.77105675, + "epoch": 0.3385292873911839, + "grad_norm": 4.25, + "learning_rate": 9.49996189635769e-06, + "loss": 0.81464977, + "memory(GiB)": 126.99, + "step": 14510, + "train_speed(iter/s)": 0.200798 + }, + { + "acc": 0.76173067, + "epoch": 0.3387625949634728, + "grad_norm": 4.75, + "learning_rate": 9.499138100796752e-06, + "loss": 0.85862141, + "memory(GiB)": 126.99, + "step": 14520, + "train_speed(iter/s)": 0.200874 + }, + { + "acc": 0.76991301, + "epoch": 0.3389959025357617, + "grad_norm": 4.5, + "learning_rate": 9.498313662985159e-06, + "loss": 0.83255396, + "memory(GiB)": 126.99, + "step": 14530, + "train_speed(iter/s)": 0.200942 + }, + { + "acc": 0.77943153, + "epoch": 0.33922921010805057, + "grad_norm": 5.21875, + "learning_rate": 9.497488583040595e-06, + "loss": 0.79724607, + "memory(GiB)": 126.99, + "step": 14540, + "train_speed(iter/s)": 0.201014 + }, + { + "acc": 0.76977105, + "epoch": 0.33946251768033947, + "grad_norm": 6.40625, + "learning_rate": 9.496662861080842e-06, + "loss": 0.83135042, + "memory(GiB)": 126.99, + "step": 14550, + "train_speed(iter/s)": 0.201086 + }, + { + "acc": 0.78565388, + "epoch": 0.33969582525262837, + "grad_norm": 7.28125, + "learning_rate": 9.495836497223775e-06, + "loss": 0.74578938, + "memory(GiB)": 126.99, + "step": 14560, + "train_speed(iter/s)": 0.201157 + }, + { + "acc": 0.75958099, + "epoch": 0.33992913282491727, + "grad_norm": 5.78125, + "learning_rate": 9.49500949158735e-06, + "loss": 0.88202744, + "memory(GiB)": 126.99, + "step": 14570, + "train_speed(iter/s)": 0.201229 + }, + { + "acc": 0.78799295, + "epoch": 0.34016244039720617, + "grad_norm": 5.90625, + "learning_rate": 9.494181844289629e-06, + "loss": 0.75782499, + "memory(GiB)": 126.99, + "step": 14580, + "train_speed(iter/s)": 0.201295 + }, + { + "acc": 0.78360558, + "epoch": 0.340395747969495, + "grad_norm": 4.78125, + "learning_rate": 9.493353555448754e-06, + "loss": 0.75555897, + "memory(GiB)": 135.77, + "step": 14590, + "train_speed(iter/s)": 0.201362 + }, + { + "acc": 0.78057513, + "epoch": 0.3406290555417839, + "grad_norm": 6.8125, + "learning_rate": 9.492524625182965e-06, + "loss": 0.76722956, + "memory(GiB)": 135.77, + "step": 14600, + "train_speed(iter/s)": 0.201432 + }, + { + "acc": 0.77534466, + "epoch": 0.3408623631140728, + "grad_norm": 4.8125, + "learning_rate": 9.49169505361059e-06, + "loss": 0.8156085, + "memory(GiB)": 135.77, + "step": 14610, + "train_speed(iter/s)": 0.201504 + }, + { + "acc": 0.7402894, + "epoch": 0.3410956706863617, + "grad_norm": 10.5, + "learning_rate": 9.490864840850051e-06, + "loss": 0.96359291, + "memory(GiB)": 135.77, + "step": 14620, + "train_speed(iter/s)": 0.201581 + }, + { + "acc": 0.7706079, + "epoch": 0.3413289782586506, + "grad_norm": 7.40625, + "learning_rate": 9.490033987019862e-06, + "loss": 0.83333149, + "memory(GiB)": 135.77, + "step": 14630, + "train_speed(iter/s)": 0.201651 + }, + { + "acc": 0.76909857, + "epoch": 0.3415622858309395, + "grad_norm": 4.6875, + "learning_rate": 9.489202492238624e-06, + "loss": 0.82511196, + "memory(GiB)": 135.77, + "step": 14640, + "train_speed(iter/s)": 0.201726 + }, + { + "acc": 0.75274253, + "epoch": 0.3417955934032284, + "grad_norm": 5.3125, + "learning_rate": 9.488370356625035e-06, + "loss": 0.8952776, + "memory(GiB)": 135.77, + "step": 14650, + "train_speed(iter/s)": 0.2018 + }, + { + "acc": 0.78073397, + "epoch": 0.3420289009755173, + "grad_norm": 12.375, + "learning_rate": 9.487537580297881e-06, + "loss": 0.7810813, + "memory(GiB)": 135.77, + "step": 14660, + "train_speed(iter/s)": 0.201871 + }, + { + "acc": 0.75856628, + "epoch": 0.3422622085478062, + "grad_norm": 4.5625, + "learning_rate": 9.486704163376041e-06, + "loss": 0.89207325, + "memory(GiB)": 135.77, + "step": 14670, + "train_speed(iter/s)": 0.201942 + }, + { + "acc": 0.77476802, + "epoch": 0.3424955161200951, + "grad_norm": 6.28125, + "learning_rate": 9.485870105978487e-06, + "loss": 0.78234882, + "memory(GiB)": 135.77, + "step": 14680, + "train_speed(iter/s)": 0.202016 + }, + { + "acc": 0.76214299, + "epoch": 0.34272882369238394, + "grad_norm": 4.78125, + "learning_rate": 9.485035408224277e-06, + "loss": 0.84753475, + "memory(GiB)": 135.77, + "step": 14690, + "train_speed(iter/s)": 0.202093 + }, + { + "acc": 0.77669897, + "epoch": 0.34296213126467284, + "grad_norm": 7.3125, + "learning_rate": 9.484200070232565e-06, + "loss": 0.82079849, + "memory(GiB)": 135.77, + "step": 14700, + "train_speed(iter/s)": 0.202169 + }, + { + "acc": 0.76222053, + "epoch": 0.34319543883696174, + "grad_norm": 5.125, + "learning_rate": 9.483364092122595e-06, + "loss": 0.86403046, + "memory(GiB)": 135.77, + "step": 14710, + "train_speed(iter/s)": 0.20224 + }, + { + "acc": 0.77041807, + "epoch": 0.34342874640925064, + "grad_norm": 6.03125, + "learning_rate": 9.482527474013705e-06, + "loss": 0.84138517, + "memory(GiB)": 135.77, + "step": 14720, + "train_speed(iter/s)": 0.202309 + }, + { + "acc": 0.769876, + "epoch": 0.34366205398153954, + "grad_norm": 8.125, + "learning_rate": 9.481690216025321e-06, + "loss": 0.83939104, + "memory(GiB)": 135.77, + "step": 14730, + "train_speed(iter/s)": 0.202379 + }, + { + "acc": 0.76854458, + "epoch": 0.34389536155382844, + "grad_norm": 7.75, + "learning_rate": 9.480852318276958e-06, + "loss": 0.8580862, + "memory(GiB)": 135.77, + "step": 14740, + "train_speed(iter/s)": 0.202448 + }, + { + "acc": 0.79126053, + "epoch": 0.34412866912611734, + "grad_norm": 7.03125, + "learning_rate": 9.48001378088823e-06, + "loss": 0.74607038, + "memory(GiB)": 135.77, + "step": 14750, + "train_speed(iter/s)": 0.202516 + }, + { + "acc": 0.77793612, + "epoch": 0.34436197669840624, + "grad_norm": 5.53125, + "learning_rate": 9.479174603978836e-06, + "loss": 0.7889802, + "memory(GiB)": 135.77, + "step": 14760, + "train_speed(iter/s)": 0.202585 + }, + { + "acc": 0.78102379, + "epoch": 0.34459528427069513, + "grad_norm": 5.34375, + "learning_rate": 9.478334787668569e-06, + "loss": 0.79738607, + "memory(GiB)": 135.77, + "step": 14770, + "train_speed(iter/s)": 0.202653 + }, + { + "acc": 0.77394485, + "epoch": 0.344828591842984, + "grad_norm": 8.1875, + "learning_rate": 9.477494332077311e-06, + "loss": 0.81590576, + "memory(GiB)": 135.77, + "step": 14780, + "train_speed(iter/s)": 0.202721 + }, + { + "acc": 0.76899199, + "epoch": 0.3450618994152729, + "grad_norm": 4.78125, + "learning_rate": 9.476653237325037e-06, + "loss": 0.84678192, + "memory(GiB)": 135.77, + "step": 14790, + "train_speed(iter/s)": 0.202792 + }, + { + "acc": 0.77013807, + "epoch": 0.3452952069875618, + "grad_norm": 5.21875, + "learning_rate": 9.475811503531815e-06, + "loss": 0.82642403, + "memory(GiB)": 135.77, + "step": 14800, + "train_speed(iter/s)": 0.202859 + }, + { + "acc": 0.7706193, + "epoch": 0.3455285145598507, + "grad_norm": 6.15625, + "learning_rate": 9.474969130817801e-06, + "loss": 0.82289953, + "memory(GiB)": 135.77, + "step": 14810, + "train_speed(iter/s)": 0.20293 + }, + { + "acc": 0.76709123, + "epoch": 0.3457618221321396, + "grad_norm": 5.34375, + "learning_rate": 9.474126119303245e-06, + "loss": 0.85336533, + "memory(GiB)": 135.77, + "step": 14820, + "train_speed(iter/s)": 0.202989 + }, + { + "acc": 0.77197552, + "epoch": 0.3459951297044285, + "grad_norm": 7.625, + "learning_rate": 9.473282469108483e-06, + "loss": 0.81618519, + "memory(GiB)": 135.77, + "step": 14830, + "train_speed(iter/s)": 0.203057 + }, + { + "acc": 0.76123796, + "epoch": 0.34622843727671737, + "grad_norm": 4.625, + "learning_rate": 9.472438180353948e-06, + "loss": 0.87118511, + "memory(GiB)": 135.77, + "step": 14840, + "train_speed(iter/s)": 0.203124 + }, + { + "acc": 0.79290915, + "epoch": 0.34646174484900627, + "grad_norm": 5.125, + "learning_rate": 9.471593253160162e-06, + "loss": 0.73074284, + "memory(GiB)": 135.77, + "step": 14850, + "train_speed(iter/s)": 0.203196 + }, + { + "acc": 0.76475649, + "epoch": 0.34669505242129517, + "grad_norm": 5.15625, + "learning_rate": 9.470747687647741e-06, + "loss": 0.85227308, + "memory(GiB)": 135.77, + "step": 14860, + "train_speed(iter/s)": 0.203268 + }, + { + "acc": 0.7734736, + "epoch": 0.34692835999358407, + "grad_norm": 7.03125, + "learning_rate": 9.469901483937384e-06, + "loss": 0.79763713, + "memory(GiB)": 135.77, + "step": 14870, + "train_speed(iter/s)": 0.203343 + }, + { + "acc": 0.79150887, + "epoch": 0.3471616675658729, + "grad_norm": 4.8125, + "learning_rate": 9.469054642149889e-06, + "loss": 0.73478098, + "memory(GiB)": 135.77, + "step": 14880, + "train_speed(iter/s)": 0.203411 + }, + { + "acc": 0.77741909, + "epoch": 0.3473949751381618, + "grad_norm": 6.25, + "learning_rate": 9.468207162406143e-06, + "loss": 0.81160421, + "memory(GiB)": 135.77, + "step": 14890, + "train_speed(iter/s)": 0.203483 + }, + { + "acc": 0.76978407, + "epoch": 0.3476282827104507, + "grad_norm": 5.0, + "learning_rate": 9.46735904482712e-06, + "loss": 0.81876183, + "memory(GiB)": 135.77, + "step": 14900, + "train_speed(iter/s)": 0.203552 + }, + { + "acc": 0.77738295, + "epoch": 0.3478615902827396, + "grad_norm": 6.3125, + "learning_rate": 9.466510289533894e-06, + "loss": 0.79115467, + "memory(GiB)": 135.77, + "step": 14910, + "train_speed(iter/s)": 0.203622 + }, + { + "acc": 0.78486333, + "epoch": 0.3480948978550285, + "grad_norm": 5.96875, + "learning_rate": 9.46566089664762e-06, + "loss": 0.74208999, + "memory(GiB)": 135.77, + "step": 14920, + "train_speed(iter/s)": 0.203692 + }, + { + "acc": 0.78259773, + "epoch": 0.3483282054273174, + "grad_norm": 8.75, + "learning_rate": 9.46481086628955e-06, + "loss": 0.79701777, + "memory(GiB)": 135.77, + "step": 14930, + "train_speed(iter/s)": 0.20376 + }, + { + "acc": 0.77444029, + "epoch": 0.3485615129996063, + "grad_norm": 5.53125, + "learning_rate": 9.463960198581028e-06, + "loss": 0.82572346, + "memory(GiB)": 135.77, + "step": 14940, + "train_speed(iter/s)": 0.203828 + }, + { + "acc": 0.76832108, + "epoch": 0.3487948205718952, + "grad_norm": 7.4375, + "learning_rate": 9.463108893643483e-06, + "loss": 0.81358547, + "memory(GiB)": 135.77, + "step": 14950, + "train_speed(iter/s)": 0.203898 + }, + { + "acc": 0.76551371, + "epoch": 0.3490281281441841, + "grad_norm": 6.84375, + "learning_rate": 9.46225695159844e-06, + "loss": 0.86378431, + "memory(GiB)": 135.77, + "step": 14960, + "train_speed(iter/s)": 0.203957 + }, + { + "acc": 0.79054885, + "epoch": 0.34926143571647295, + "grad_norm": 4.71875, + "learning_rate": 9.461404372567513e-06, + "loss": 0.75698428, + "memory(GiB)": 135.77, + "step": 14970, + "train_speed(iter/s)": 0.204027 + }, + { + "acc": 0.76183825, + "epoch": 0.34949474328876184, + "grad_norm": 4.84375, + "learning_rate": 9.460551156672408e-06, + "loss": 0.87834749, + "memory(GiB)": 135.77, + "step": 14980, + "train_speed(iter/s)": 0.204097 + }, + { + "acc": 0.75772481, + "epoch": 0.34972805086105074, + "grad_norm": 5.0, + "learning_rate": 9.459697304034923e-06, + "loss": 0.87366276, + "memory(GiB)": 135.77, + "step": 14990, + "train_speed(iter/s)": 0.204166 + }, + { + "acc": 0.74876041, + "epoch": 0.34996135843333964, + "grad_norm": 5.53125, + "learning_rate": 9.458842814776941e-06, + "loss": 0.91464996, + "memory(GiB)": 135.77, + "step": 15000, + "train_speed(iter/s)": 0.204229 + }, + { + "epoch": 0.34996135843333964, + "eval_acc": 0.7360832660327197, + "eval_loss": 0.8364920616149902, + "eval_runtime": 1270.2256, + "eval_samples_per_second": 28.334, + "eval_steps_per_second": 14.168, + "step": 15000 + }, + { + "acc": 0.78980742, + "epoch": 0.35019466600562854, + "grad_norm": 5.09375, + "learning_rate": 9.457987689020444e-06, + "loss": 0.74719791, + "memory(GiB)": 135.77, + "step": 15010, + "train_speed(iter/s)": 0.200781 + }, + { + "acc": 0.76756434, + "epoch": 0.35042797357791744, + "grad_norm": 6.4375, + "learning_rate": 9.457131926887498e-06, + "loss": 0.84798584, + "memory(GiB)": 135.77, + "step": 15020, + "train_speed(iter/s)": 0.200849 + }, + { + "acc": 0.7941637, + "epoch": 0.35066128115020634, + "grad_norm": 6.28125, + "learning_rate": 9.456275528500264e-06, + "loss": 0.75351415, + "memory(GiB)": 135.77, + "step": 15030, + "train_speed(iter/s)": 0.200916 + }, + { + "acc": 0.76890516, + "epoch": 0.35089458872249524, + "grad_norm": 5.53125, + "learning_rate": 9.455418493980996e-06, + "loss": 0.84603033, + "memory(GiB)": 135.77, + "step": 15040, + "train_speed(iter/s)": 0.200988 + }, + { + "acc": 0.76987128, + "epoch": 0.35112789629478414, + "grad_norm": 8.5625, + "learning_rate": 9.454560823452031e-06, + "loss": 0.82527428, + "memory(GiB)": 135.77, + "step": 15050, + "train_speed(iter/s)": 0.201059 + }, + { + "acc": 0.76635885, + "epoch": 0.35136120386707304, + "grad_norm": 6.21875, + "learning_rate": 9.4537025170358e-06, + "loss": 0.84537125, + "memory(GiB)": 135.77, + "step": 15060, + "train_speed(iter/s)": 0.201127 + }, + { + "acc": 0.78066635, + "epoch": 0.3515945114393619, + "grad_norm": 4.71875, + "learning_rate": 9.45284357485483e-06, + "loss": 0.79466629, + "memory(GiB)": 135.77, + "step": 15070, + "train_speed(iter/s)": 0.201199 + }, + { + "acc": 0.77449002, + "epoch": 0.3518278190116508, + "grad_norm": 4.65625, + "learning_rate": 9.451983997031736e-06, + "loss": 0.80630569, + "memory(GiB)": 135.77, + "step": 15080, + "train_speed(iter/s)": 0.201266 + }, + { + "acc": 0.77382059, + "epoch": 0.3520611265839397, + "grad_norm": 5.1875, + "learning_rate": 9.451123783689216e-06, + "loss": 0.81937408, + "memory(GiB)": 135.77, + "step": 15090, + "train_speed(iter/s)": 0.201336 + }, + { + "acc": 0.75354958, + "epoch": 0.3522944341562286, + "grad_norm": 6.28125, + "learning_rate": 9.450262934950069e-06, + "loss": 0.88799534, + "memory(GiB)": 135.77, + "step": 15100, + "train_speed(iter/s)": 0.201403 + }, + { + "acc": 0.76095815, + "epoch": 0.3525277417285175, + "grad_norm": 6.25, + "learning_rate": 9.449401450937184e-06, + "loss": 0.88658123, + "memory(GiB)": 135.77, + "step": 15110, + "train_speed(iter/s)": 0.201471 + }, + { + "acc": 0.77167931, + "epoch": 0.3527610493008064, + "grad_norm": 5.53125, + "learning_rate": 9.448539331773532e-06, + "loss": 0.83307276, + "memory(GiB)": 135.77, + "step": 15120, + "train_speed(iter/s)": 0.201538 + }, + { + "acc": 0.77130241, + "epoch": 0.35299435687309527, + "grad_norm": 6.75, + "learning_rate": 9.447676577582184e-06, + "loss": 0.8277441, + "memory(GiB)": 135.77, + "step": 15130, + "train_speed(iter/s)": 0.201605 + }, + { + "acc": 0.76398301, + "epoch": 0.35322766444538417, + "grad_norm": 5.5, + "learning_rate": 9.446813188486294e-06, + "loss": 0.85096407, + "memory(GiB)": 135.77, + "step": 15140, + "train_speed(iter/s)": 0.201673 + }, + { + "acc": 0.76413918, + "epoch": 0.35346097201767307, + "grad_norm": 7.375, + "learning_rate": 9.445949164609116e-06, + "loss": 0.87346344, + "memory(GiB)": 135.77, + "step": 15150, + "train_speed(iter/s)": 0.201742 + }, + { + "acc": 0.78450708, + "epoch": 0.35369427958996197, + "grad_norm": 5.125, + "learning_rate": 9.445084506073985e-06, + "loss": 0.77033629, + "memory(GiB)": 135.77, + "step": 15160, + "train_speed(iter/s)": 0.201812 + }, + { + "acc": 0.76229258, + "epoch": 0.3539275871622508, + "grad_norm": 7.3125, + "learning_rate": 9.444219213004333e-06, + "loss": 0.86626339, + "memory(GiB)": 135.77, + "step": 15170, + "train_speed(iter/s)": 0.201878 + }, + { + "acc": 0.76085272, + "epoch": 0.3541608947345397, + "grad_norm": 4.71875, + "learning_rate": 9.443353285523678e-06, + "loss": 0.86795692, + "memory(GiB)": 135.77, + "step": 15180, + "train_speed(iter/s)": 0.20195 + }, + { + "acc": 0.76711388, + "epoch": 0.3543942023068286, + "grad_norm": 5.5, + "learning_rate": 9.442486723755633e-06, + "loss": 0.82695503, + "memory(GiB)": 135.77, + "step": 15190, + "train_speed(iter/s)": 0.202021 + }, + { + "acc": 0.76617556, + "epoch": 0.3546275098791175, + "grad_norm": 7.375, + "learning_rate": 9.4416195278239e-06, + "loss": 0.82697554, + "memory(GiB)": 135.77, + "step": 15200, + "train_speed(iter/s)": 0.202089 + }, + { + "acc": 0.76866422, + "epoch": 0.3548608174514064, + "grad_norm": 4.28125, + "learning_rate": 9.440751697852268e-06, + "loss": 0.84350128, + "memory(GiB)": 135.77, + "step": 15210, + "train_speed(iter/s)": 0.202153 + }, + { + "acc": 0.78142757, + "epoch": 0.3550941250236953, + "grad_norm": 9.0625, + "learning_rate": 9.439883233964621e-06, + "loss": 0.79565501, + "memory(GiB)": 135.77, + "step": 15220, + "train_speed(iter/s)": 0.202225 + }, + { + "acc": 0.77310419, + "epoch": 0.3553274325959842, + "grad_norm": 4.875, + "learning_rate": 9.439014136284934e-06, + "loss": 0.83620567, + "memory(GiB)": 135.77, + "step": 15230, + "train_speed(iter/s)": 0.202294 + }, + { + "acc": 0.78746805, + "epoch": 0.3555607401682731, + "grad_norm": 4.0625, + "learning_rate": 9.438144404937266e-06, + "loss": 0.75883884, + "memory(GiB)": 135.77, + "step": 15240, + "train_speed(iter/s)": 0.202366 + }, + { + "acc": 0.77289262, + "epoch": 0.355794047740562, + "grad_norm": 4.53125, + "learning_rate": 9.437274040045775e-06, + "loss": 0.85325108, + "memory(GiB)": 135.77, + "step": 15250, + "train_speed(iter/s)": 0.202433 + }, + { + "acc": 0.79338937, + "epoch": 0.35602735531285085, + "grad_norm": 5.53125, + "learning_rate": 9.436403041734704e-06, + "loss": 0.74687109, + "memory(GiB)": 135.77, + "step": 15260, + "train_speed(iter/s)": 0.202505 + }, + { + "acc": 0.77933197, + "epoch": 0.35626066288513975, + "grad_norm": 4.5, + "learning_rate": 9.435531410128387e-06, + "loss": 0.78443747, + "memory(GiB)": 135.77, + "step": 15270, + "train_speed(iter/s)": 0.202572 + }, + { + "acc": 0.76276693, + "epoch": 0.35649397045742864, + "grad_norm": 4.125, + "learning_rate": 9.434659145351251e-06, + "loss": 0.85257626, + "memory(GiB)": 135.77, + "step": 15280, + "train_speed(iter/s)": 0.202643 + }, + { + "acc": 0.77083325, + "epoch": 0.35672727802971754, + "grad_norm": 5.4375, + "learning_rate": 9.433786247527809e-06, + "loss": 0.81012554, + "memory(GiB)": 135.77, + "step": 15290, + "train_speed(iter/s)": 0.202711 + }, + { + "acc": 0.76903772, + "epoch": 0.35696058560200644, + "grad_norm": 4.625, + "learning_rate": 9.432912716782667e-06, + "loss": 0.82582378, + "memory(GiB)": 135.77, + "step": 15300, + "train_speed(iter/s)": 0.202778 + }, + { + "acc": 0.77473183, + "epoch": 0.35719389317429534, + "grad_norm": 5.28125, + "learning_rate": 9.432038553240526e-06, + "loss": 0.82623148, + "memory(GiB)": 135.77, + "step": 15310, + "train_speed(iter/s)": 0.202843 + }, + { + "acc": 0.77468882, + "epoch": 0.35742720074658424, + "grad_norm": 5.625, + "learning_rate": 9.431163757026167e-06, + "loss": 0.79663391, + "memory(GiB)": 135.77, + "step": 15320, + "train_speed(iter/s)": 0.20291 + }, + { + "acc": 0.7720438, + "epoch": 0.35766050831887314, + "grad_norm": 5.9375, + "learning_rate": 9.430288328264467e-06, + "loss": 0.8156147, + "memory(GiB)": 135.77, + "step": 15330, + "train_speed(iter/s)": 0.202979 + }, + { + "acc": 0.77816076, + "epoch": 0.35789381589116204, + "grad_norm": 5.59375, + "learning_rate": 9.429412267080397e-06, + "loss": 0.78154631, + "memory(GiB)": 135.77, + "step": 15340, + "train_speed(iter/s)": 0.203047 + }, + { + "acc": 0.76980267, + "epoch": 0.35812712346345094, + "grad_norm": 5.8125, + "learning_rate": 9.428535573599013e-06, + "loss": 0.83480797, + "memory(GiB)": 135.77, + "step": 15350, + "train_speed(iter/s)": 0.203121 + }, + { + "acc": 0.76492939, + "epoch": 0.3583604310357398, + "grad_norm": 4.65625, + "learning_rate": 9.427658247945463e-06, + "loss": 0.8619091, + "memory(GiB)": 135.77, + "step": 15360, + "train_speed(iter/s)": 0.203181 + }, + { + "acc": 0.75917945, + "epoch": 0.3585937386080287, + "grad_norm": 9.375, + "learning_rate": 9.426780290244983e-06, + "loss": 0.89382544, + "memory(GiB)": 135.77, + "step": 15370, + "train_speed(iter/s)": 0.203252 + }, + { + "acc": 0.76189117, + "epoch": 0.3588270461803176, + "grad_norm": 8.9375, + "learning_rate": 9.425901700622904e-06, + "loss": 0.84197168, + "memory(GiB)": 135.77, + "step": 15380, + "train_speed(iter/s)": 0.20332 + }, + { + "acc": 0.77091656, + "epoch": 0.3590603537526065, + "grad_norm": 7.6875, + "learning_rate": 9.42502247920464e-06, + "loss": 0.8417284, + "memory(GiB)": 135.77, + "step": 15390, + "train_speed(iter/s)": 0.20339 + }, + { + "acc": 0.77439508, + "epoch": 0.3592936613248954, + "grad_norm": 4.78125, + "learning_rate": 9.424142626115706e-06, + "loss": 0.81533623, + "memory(GiB)": 135.77, + "step": 15400, + "train_speed(iter/s)": 0.203459 + }, + { + "acc": 0.75681648, + "epoch": 0.3595269688971843, + "grad_norm": 6.8125, + "learning_rate": 9.423262141481695e-06, + "loss": 0.87958059, + "memory(GiB)": 135.77, + "step": 15410, + "train_speed(iter/s)": 0.203521 + }, + { + "acc": 0.77736855, + "epoch": 0.3597602764694732, + "grad_norm": 6.5, + "learning_rate": 9.4223810254283e-06, + "loss": 0.80077677, + "memory(GiB)": 135.77, + "step": 15420, + "train_speed(iter/s)": 0.203593 + }, + { + "acc": 0.7821209, + "epoch": 0.35999358404176207, + "grad_norm": 6.6875, + "learning_rate": 9.421499278081296e-06, + "loss": 0.80219059, + "memory(GiB)": 135.77, + "step": 15430, + "train_speed(iter/s)": 0.20366 + }, + { + "acc": 0.77446098, + "epoch": 0.36022689161405097, + "grad_norm": 6.5625, + "learning_rate": 9.420616899566557e-06, + "loss": 0.79374628, + "memory(GiB)": 135.77, + "step": 15440, + "train_speed(iter/s)": 0.203725 + }, + { + "acc": 0.76317291, + "epoch": 0.36046019918633987, + "grad_norm": 4.84375, + "learning_rate": 9.41973389001004e-06, + "loss": 0.87783279, + "memory(GiB)": 135.77, + "step": 15450, + "train_speed(iter/s)": 0.203795 + }, + { + "acc": 0.77032871, + "epoch": 0.3606935067586287, + "grad_norm": 5.0, + "learning_rate": 9.418850249537792e-06, + "loss": 0.83090649, + "memory(GiB)": 135.77, + "step": 15460, + "train_speed(iter/s)": 0.203862 + }, + { + "acc": 0.75985117, + "epoch": 0.3609268143309176, + "grad_norm": 7.0625, + "learning_rate": 9.417965978275955e-06, + "loss": 0.87291212, + "memory(GiB)": 135.77, + "step": 15470, + "train_speed(iter/s)": 0.203928 + }, + { + "acc": 0.78009644, + "epoch": 0.3611601219032065, + "grad_norm": 5.90625, + "learning_rate": 9.417081076350758e-06, + "loss": 0.79100814, + "memory(GiB)": 135.77, + "step": 15480, + "train_speed(iter/s)": 0.203997 + }, + { + "acc": 0.78492813, + "epoch": 0.3613934294754954, + "grad_norm": 5.75, + "learning_rate": 9.416195543888522e-06, + "loss": 0.78736544, + "memory(GiB)": 135.77, + "step": 15490, + "train_speed(iter/s)": 0.204062 + }, + { + "acc": 0.75766697, + "epoch": 0.3616267370477843, + "grad_norm": 6.53125, + "learning_rate": 9.415309381015654e-06, + "loss": 0.86035919, + "memory(GiB)": 135.77, + "step": 15500, + "train_speed(iter/s)": 0.204131 + }, + { + "epoch": 0.3616267370477843, + "eval_acc": 0.7363535894968445, + "eval_loss": 0.8363694548606873, + "eval_runtime": 1270.7457, + "eval_samples_per_second": 28.323, + "eval_steps_per_second": 14.162, + "step": 15500 + }, + { + "acc": 0.76968145, + "epoch": 0.3618600446200732, + "grad_norm": 7.1875, + "learning_rate": 9.414422587858654e-06, + "loss": 0.82733736, + "memory(GiB)": 135.77, + "step": 15510, + "train_speed(iter/s)": 0.200797 + }, + { + "acc": 0.78013411, + "epoch": 0.3620933521923621, + "grad_norm": 4.21875, + "learning_rate": 9.413535164544112e-06, + "loss": 0.77737093, + "memory(GiB)": 135.77, + "step": 15520, + "train_speed(iter/s)": 0.200865 + }, + { + "acc": 0.76417913, + "epoch": 0.362326659764651, + "grad_norm": 5.8125, + "learning_rate": 9.412647111198708e-06, + "loss": 0.86939831, + "memory(GiB)": 135.77, + "step": 15530, + "train_speed(iter/s)": 0.200933 + }, + { + "acc": 0.76209974, + "epoch": 0.3625599673369399, + "grad_norm": 3.8125, + "learning_rate": 9.411758427949211e-06, + "loss": 0.85637188, + "memory(GiB)": 135.77, + "step": 15540, + "train_speed(iter/s)": 0.201 + }, + { + "acc": 0.79845304, + "epoch": 0.36279327490922875, + "grad_norm": 4.3125, + "learning_rate": 9.410869114922478e-06, + "loss": 0.72790012, + "memory(GiB)": 135.77, + "step": 15550, + "train_speed(iter/s)": 0.201068 + }, + { + "acc": 0.77766886, + "epoch": 0.36302658248151765, + "grad_norm": 6.5, + "learning_rate": 9.409979172245463e-06, + "loss": 0.79989691, + "memory(GiB)": 135.77, + "step": 15560, + "train_speed(iter/s)": 0.201133 + }, + { + "acc": 0.77453566, + "epoch": 0.36325989005380654, + "grad_norm": 5.625, + "learning_rate": 9.409088600045202e-06, + "loss": 0.82451859, + "memory(GiB)": 135.77, + "step": 15570, + "train_speed(iter/s)": 0.201202 + }, + { + "acc": 0.77265034, + "epoch": 0.36349319762609544, + "grad_norm": 5.78125, + "learning_rate": 9.408197398448822e-06, + "loss": 0.79861336, + "memory(GiB)": 135.77, + "step": 15580, + "train_speed(iter/s)": 0.201269 + }, + { + "acc": 0.77113476, + "epoch": 0.36372650519838434, + "grad_norm": 4.9375, + "learning_rate": 9.407305567583547e-06, + "loss": 0.8350358, + "memory(GiB)": 135.77, + "step": 15590, + "train_speed(iter/s)": 0.201333 + }, + { + "acc": 0.76244006, + "epoch": 0.36395981277067324, + "grad_norm": 5.1875, + "learning_rate": 9.40641310757668e-06, + "loss": 0.86877785, + "memory(GiB)": 135.77, + "step": 15600, + "train_speed(iter/s)": 0.2014 + }, + { + "acc": 0.7791647, + "epoch": 0.36419312034296214, + "grad_norm": 5.75, + "learning_rate": 9.405520018555624e-06, + "loss": 0.79430704, + "memory(GiB)": 135.77, + "step": 15610, + "train_speed(iter/s)": 0.201469 + }, + { + "acc": 0.77130833, + "epoch": 0.36442642791525104, + "grad_norm": 5.46875, + "learning_rate": 9.404626300647864e-06, + "loss": 0.8362133, + "memory(GiB)": 135.77, + "step": 15620, + "train_speed(iter/s)": 0.201536 + }, + { + "acc": 0.78146958, + "epoch": 0.36465973548753994, + "grad_norm": 5.53125, + "learning_rate": 9.403731953980978e-06, + "loss": 0.7925806, + "memory(GiB)": 135.77, + "step": 15630, + "train_speed(iter/s)": 0.201599 + }, + { + "acc": 0.77909827, + "epoch": 0.36489304305982884, + "grad_norm": 4.25, + "learning_rate": 9.402836978682636e-06, + "loss": 0.79750004, + "memory(GiB)": 135.77, + "step": 15640, + "train_speed(iter/s)": 0.201662 + }, + { + "acc": 0.77254696, + "epoch": 0.3651263506321177, + "grad_norm": 7.96875, + "learning_rate": 9.401941374880595e-06, + "loss": 0.80021648, + "memory(GiB)": 135.77, + "step": 15650, + "train_speed(iter/s)": 0.201723 + }, + { + "acc": 0.77365751, + "epoch": 0.3653596582044066, + "grad_norm": 5.09375, + "learning_rate": 9.4010451427027e-06, + "loss": 0.81189899, + "memory(GiB)": 135.77, + "step": 15660, + "train_speed(iter/s)": 0.201787 + }, + { + "acc": 0.77464514, + "epoch": 0.3655929657766955, + "grad_norm": 4.34375, + "learning_rate": 9.40014828227689e-06, + "loss": 0.81965914, + "memory(GiB)": 135.77, + "step": 15670, + "train_speed(iter/s)": 0.201854 + }, + { + "acc": 0.76693192, + "epoch": 0.3658262733489844, + "grad_norm": 12.625, + "learning_rate": 9.399250793731192e-06, + "loss": 0.85733891, + "memory(GiB)": 135.77, + "step": 15680, + "train_speed(iter/s)": 0.20192 + }, + { + "acc": 0.76616225, + "epoch": 0.3660595809212733, + "grad_norm": 4.5, + "learning_rate": 9.398352677193719e-06, + "loss": 0.85392532, + "memory(GiB)": 135.77, + "step": 15690, + "train_speed(iter/s)": 0.201988 + }, + { + "acc": 0.77419605, + "epoch": 0.3662928884935622, + "grad_norm": 5.375, + "learning_rate": 9.397453932792681e-06, + "loss": 0.80535965, + "memory(GiB)": 135.77, + "step": 15700, + "train_speed(iter/s)": 0.202053 + }, + { + "acc": 0.76591721, + "epoch": 0.3665261960658511, + "grad_norm": 32.5, + "learning_rate": 9.396554560656371e-06, + "loss": 0.90535355, + "memory(GiB)": 135.77, + "step": 15710, + "train_speed(iter/s)": 0.202119 + }, + { + "acc": 0.75157509, + "epoch": 0.36675950363813997, + "grad_norm": 5.125, + "learning_rate": 9.395654560913174e-06, + "loss": 0.90469847, + "memory(GiB)": 135.77, + "step": 15720, + "train_speed(iter/s)": 0.202184 + }, + { + "acc": 0.7669796, + "epoch": 0.36699281121042887, + "grad_norm": 4.5625, + "learning_rate": 9.394753933691567e-06, + "loss": 0.86323175, + "memory(GiB)": 135.77, + "step": 15730, + "train_speed(iter/s)": 0.202243 + }, + { + "acc": 0.76320677, + "epoch": 0.3672261187827177, + "grad_norm": 4.65625, + "learning_rate": 9.393852679120113e-06, + "loss": 0.85583038, + "memory(GiB)": 135.77, + "step": 15740, + "train_speed(iter/s)": 0.202312 + }, + { + "acc": 0.78019586, + "epoch": 0.3674594263550066, + "grad_norm": 4.875, + "learning_rate": 9.392950797327463e-06, + "loss": 0.77571545, + "memory(GiB)": 135.77, + "step": 15750, + "train_speed(iter/s)": 0.202374 + }, + { + "acc": 0.76291428, + "epoch": 0.3676927339272955, + "grad_norm": 7.25, + "learning_rate": 9.392048288442363e-06, + "loss": 0.8645957, + "memory(GiB)": 135.77, + "step": 15760, + "train_speed(iter/s)": 0.202441 + }, + { + "acc": 0.75715952, + "epoch": 0.3679260414995844, + "grad_norm": 5.65625, + "learning_rate": 9.391145152593646e-06, + "loss": 0.89210663, + "memory(GiB)": 135.77, + "step": 15770, + "train_speed(iter/s)": 0.202507 + }, + { + "acc": 0.77362776, + "epoch": 0.3681593490718733, + "grad_norm": 8.9375, + "learning_rate": 9.390241389910236e-06, + "loss": 0.84507351, + "memory(GiB)": 135.77, + "step": 15780, + "train_speed(iter/s)": 0.202567 + }, + { + "acc": 0.77469263, + "epoch": 0.3683926566441622, + "grad_norm": 5.65625, + "learning_rate": 9.389337000521142e-06, + "loss": 0.82550621, + "memory(GiB)": 135.77, + "step": 15790, + "train_speed(iter/s)": 0.202635 + }, + { + "acc": 0.76861792, + "epoch": 0.3686259642164511, + "grad_norm": 5.125, + "learning_rate": 9.388431984555466e-06, + "loss": 0.84576836, + "memory(GiB)": 135.77, + "step": 15800, + "train_speed(iter/s)": 0.202702 + }, + { + "acc": 0.79161878, + "epoch": 0.36885927178874, + "grad_norm": 4.8125, + "learning_rate": 9.387526342142398e-06, + "loss": 0.75349326, + "memory(GiB)": 135.77, + "step": 15810, + "train_speed(iter/s)": 0.202771 + }, + { + "acc": 0.7592205, + "epoch": 0.3690925793610289, + "grad_norm": 6.21875, + "learning_rate": 9.386620073411221e-06, + "loss": 0.8692564, + "memory(GiB)": 135.77, + "step": 15820, + "train_speed(iter/s)": 0.202835 + }, + { + "acc": 0.77718821, + "epoch": 0.3693258869333178, + "grad_norm": 7.3125, + "learning_rate": 9.385713178491302e-06, + "loss": 0.81367092, + "memory(GiB)": 135.77, + "step": 15830, + "train_speed(iter/s)": 0.202905 + }, + { + "acc": 0.78268156, + "epoch": 0.36955919450560665, + "grad_norm": 7.625, + "learning_rate": 9.384805657512101e-06, + "loss": 0.79621563, + "memory(GiB)": 135.77, + "step": 15840, + "train_speed(iter/s)": 0.202971 + }, + { + "acc": 0.76887393, + "epoch": 0.36979250207789555, + "grad_norm": 6.75, + "learning_rate": 9.383897510603167e-06, + "loss": 0.84250908, + "memory(GiB)": 135.77, + "step": 15850, + "train_speed(iter/s)": 0.203038 + }, + { + "acc": 0.78669109, + "epoch": 0.37002580965018445, + "grad_norm": 5.90625, + "learning_rate": 9.382988737894136e-06, + "loss": 0.7452014, + "memory(GiB)": 135.77, + "step": 15860, + "train_speed(iter/s)": 0.203102 + }, + { + "acc": 0.76802521, + "epoch": 0.37025911722247334, + "grad_norm": 7.375, + "learning_rate": 9.382079339514736e-06, + "loss": 0.85868645, + "memory(GiB)": 135.77, + "step": 15870, + "train_speed(iter/s)": 0.203168 + }, + { + "acc": 0.76643047, + "epoch": 0.37049242479476224, + "grad_norm": 5.0, + "learning_rate": 9.381169315594782e-06, + "loss": 0.83867474, + "memory(GiB)": 135.77, + "step": 15880, + "train_speed(iter/s)": 0.20323 + }, + { + "acc": 0.77568998, + "epoch": 0.37072573236705114, + "grad_norm": 4.90625, + "learning_rate": 9.380258666264184e-06, + "loss": 0.83556032, + "memory(GiB)": 135.77, + "step": 15890, + "train_speed(iter/s)": 0.203294 + }, + { + "acc": 0.77228613, + "epoch": 0.37095903993934004, + "grad_norm": 9.0625, + "learning_rate": 9.379347391652931e-06, + "loss": 0.81034203, + "memory(GiB)": 135.77, + "step": 15900, + "train_speed(iter/s)": 0.20336 + }, + { + "acc": 0.76030293, + "epoch": 0.37119234751162894, + "grad_norm": 7.4375, + "learning_rate": 9.378435491891112e-06, + "loss": 0.88398857, + "memory(GiB)": 135.77, + "step": 15910, + "train_speed(iter/s)": 0.203423 + }, + { + "acc": 0.7455883, + "epoch": 0.37142565508391784, + "grad_norm": 6.21875, + "learning_rate": 9.377522967108897e-06, + "loss": 0.94365292, + "memory(GiB)": 135.77, + "step": 15920, + "train_speed(iter/s)": 0.203489 + }, + { + "acc": 0.76555414, + "epoch": 0.37165896265620674, + "grad_norm": 6.5625, + "learning_rate": 9.376609817436551e-06, + "loss": 0.86510744, + "memory(GiB)": 135.77, + "step": 15930, + "train_speed(iter/s)": 0.203548 + }, + { + "acc": 0.75843372, + "epoch": 0.3718922702284956, + "grad_norm": 9.6875, + "learning_rate": 9.375696043004425e-06, + "loss": 0.90398607, + "memory(GiB)": 135.77, + "step": 15940, + "train_speed(iter/s)": 0.203613 + }, + { + "acc": 0.75562124, + "epoch": 0.3721255778007845, + "grad_norm": 4.71875, + "learning_rate": 9.374781643942961e-06, + "loss": 0.88584518, + "memory(GiB)": 135.77, + "step": 15950, + "train_speed(iter/s)": 0.20368 + }, + { + "acc": 0.76519918, + "epoch": 0.3723588853730734, + "grad_norm": 7.25, + "learning_rate": 9.373866620382686e-06, + "loss": 0.8289814, + "memory(GiB)": 135.77, + "step": 15960, + "train_speed(iter/s)": 0.203744 + }, + { + "acc": 0.78620424, + "epoch": 0.3725921929453623, + "grad_norm": 13.9375, + "learning_rate": 9.372950972454222e-06, + "loss": 0.74287148, + "memory(GiB)": 135.77, + "step": 15970, + "train_speed(iter/s)": 0.203812 + }, + { + "acc": 0.77351294, + "epoch": 0.3728255005176512, + "grad_norm": 4.84375, + "learning_rate": 9.372034700288278e-06, + "loss": 0.79458714, + "memory(GiB)": 135.77, + "step": 15980, + "train_speed(iter/s)": 0.20388 + }, + { + "acc": 0.74867344, + "epoch": 0.3730588080899401, + "grad_norm": 6.75, + "learning_rate": 9.37111780401565e-06, + "loss": 0.92256651, + "memory(GiB)": 135.77, + "step": 15990, + "train_speed(iter/s)": 0.203948 + }, + { + "acc": 0.74160004, + "epoch": 0.373292115662229, + "grad_norm": 5.8125, + "learning_rate": 9.370200283767225e-06, + "loss": 0.94474297, + "memory(GiB)": 135.77, + "step": 16000, + "train_speed(iter/s)": 0.204015 + }, + { + "epoch": 0.373292115662229, + "eval_acc": 0.7363508670448953, + "eval_loss": 0.8348709940910339, + "eval_runtime": 1270.1462, + "eval_samples_per_second": 28.336, + "eval_steps_per_second": 14.168, + "step": 16000 + }, + { + "acc": 0.78337979, + "epoch": 0.3735254232345179, + "grad_norm": 5.5, + "learning_rate": 9.369282139673979e-06, + "loss": 0.79470425, + "memory(GiB)": 135.77, + "step": 16010, + "train_speed(iter/s)": 0.200783 + }, + { + "acc": 0.76890593, + "epoch": 0.37375873080680677, + "grad_norm": 4.6875, + "learning_rate": 9.368363371866978e-06, + "loss": 0.82831898, + "memory(GiB)": 135.77, + "step": 16020, + "train_speed(iter/s)": 0.200845 + }, + { + "acc": 0.75959797, + "epoch": 0.3739920383790956, + "grad_norm": 3.90625, + "learning_rate": 9.367443980477374e-06, + "loss": 0.86754074, + "memory(GiB)": 135.77, + "step": 16030, + "train_speed(iter/s)": 0.200911 + }, + { + "acc": 0.77851305, + "epoch": 0.3742253459513845, + "grad_norm": 5.6875, + "learning_rate": 9.366523965636412e-06, + "loss": 0.77482352, + "memory(GiB)": 135.77, + "step": 16040, + "train_speed(iter/s)": 0.200977 + }, + { + "acc": 0.75851483, + "epoch": 0.3744586535236734, + "grad_norm": 6.1875, + "learning_rate": 9.36560332747542e-06, + "loss": 0.86383018, + "memory(GiB)": 135.77, + "step": 16050, + "train_speed(iter/s)": 0.201043 + }, + { + "acc": 0.77050686, + "epoch": 0.3746919610959623, + "grad_norm": 7.25, + "learning_rate": 9.364682066125822e-06, + "loss": 0.84514685, + "memory(GiB)": 135.77, + "step": 16060, + "train_speed(iter/s)": 0.201107 + }, + { + "acc": 0.7671258, + "epoch": 0.3749252686682512, + "grad_norm": 7.90625, + "learning_rate": 9.363760181719127e-06, + "loss": 0.86259499, + "memory(GiB)": 135.77, + "step": 16070, + "train_speed(iter/s)": 0.201169 + }, + { + "acc": 0.76694169, + "epoch": 0.3751585762405401, + "grad_norm": 10.0, + "learning_rate": 9.362837674386934e-06, + "loss": 0.85537701, + "memory(GiB)": 135.77, + "step": 16080, + "train_speed(iter/s)": 0.201237 + }, + { + "acc": 0.79346738, + "epoch": 0.375391883812829, + "grad_norm": 6.4375, + "learning_rate": 9.36191454426093e-06, + "loss": 0.74549022, + "memory(GiB)": 135.77, + "step": 16090, + "train_speed(iter/s)": 0.2013 + }, + { + "acc": 0.76404152, + "epoch": 0.3756251913851179, + "grad_norm": 5.21875, + "learning_rate": 9.360990791472893e-06, + "loss": 0.85988121, + "memory(GiB)": 135.77, + "step": 16100, + "train_speed(iter/s)": 0.20136 + }, + { + "acc": 0.77790794, + "epoch": 0.3758584989574068, + "grad_norm": 7.1875, + "learning_rate": 9.360066416154687e-06, + "loss": 0.82874393, + "memory(GiB)": 135.77, + "step": 16110, + "train_speed(iter/s)": 0.201424 + }, + { + "acc": 0.76713996, + "epoch": 0.3760918065296957, + "grad_norm": 6.34375, + "learning_rate": 9.359141418438266e-06, + "loss": 0.82495213, + "memory(GiB)": 135.77, + "step": 16120, + "train_speed(iter/s)": 0.201487 + }, + { + "acc": 0.76586213, + "epoch": 0.37632511410198455, + "grad_norm": 7.125, + "learning_rate": 9.358215798455674e-06, + "loss": 0.83144331, + "memory(GiB)": 135.77, + "step": 16130, + "train_speed(iter/s)": 0.201555 + }, + { + "acc": 0.7565609, + "epoch": 0.37655842167427345, + "grad_norm": 6.0, + "learning_rate": 9.357289556339044e-06, + "loss": 0.8857419, + "memory(GiB)": 135.77, + "step": 16140, + "train_speed(iter/s)": 0.201622 + }, + { + "acc": 0.76512766, + "epoch": 0.37679172924656235, + "grad_norm": 10.5, + "learning_rate": 9.356362692220593e-06, + "loss": 0.88596659, + "memory(GiB)": 135.77, + "step": 16150, + "train_speed(iter/s)": 0.201686 + }, + { + "acc": 0.77160578, + "epoch": 0.37702503681885124, + "grad_norm": 7.6875, + "learning_rate": 9.355435206232635e-06, + "loss": 0.81468468, + "memory(GiB)": 135.77, + "step": 16160, + "train_speed(iter/s)": 0.201747 + }, + { + "acc": 0.76007071, + "epoch": 0.37725834439114014, + "grad_norm": 5.1875, + "learning_rate": 9.354507098507568e-06, + "loss": 0.86362362, + "memory(GiB)": 135.77, + "step": 16170, + "train_speed(iter/s)": 0.201815 + }, + { + "acc": 0.75717325, + "epoch": 0.37749165196342904, + "grad_norm": 5.78125, + "learning_rate": 9.353578369177876e-06, + "loss": 0.88568306, + "memory(GiB)": 135.77, + "step": 16180, + "train_speed(iter/s)": 0.201881 + }, + { + "acc": 0.77196169, + "epoch": 0.37772495953571794, + "grad_norm": 5.78125, + "learning_rate": 9.352649018376136e-06, + "loss": 0.82890453, + "memory(GiB)": 135.77, + "step": 16190, + "train_speed(iter/s)": 0.201948 + }, + { + "acc": 0.75310216, + "epoch": 0.37795826710800684, + "grad_norm": 5.8125, + "learning_rate": 9.351719046235013e-06, + "loss": 0.88745995, + "memory(GiB)": 135.77, + "step": 16200, + "train_speed(iter/s)": 0.202008 + }, + { + "acc": 0.78186665, + "epoch": 0.37819157468029574, + "grad_norm": 4.03125, + "learning_rate": 9.350788452887262e-06, + "loss": 0.75842514, + "memory(GiB)": 135.77, + "step": 16210, + "train_speed(iter/s)": 0.202072 + }, + { + "acc": 0.7654644, + "epoch": 0.37842488225258464, + "grad_norm": 7.9375, + "learning_rate": 9.349857238465723e-06, + "loss": 0.8322834, + "memory(GiB)": 135.77, + "step": 16220, + "train_speed(iter/s)": 0.202134 + }, + { + "acc": 0.77073183, + "epoch": 0.3786581898248735, + "grad_norm": 5.125, + "learning_rate": 9.348925403103326e-06, + "loss": 0.84538736, + "memory(GiB)": 135.77, + "step": 16230, + "train_speed(iter/s)": 0.202201 + }, + { + "acc": 0.76439257, + "epoch": 0.3788914973971624, + "grad_norm": 6.25, + "learning_rate": 9.347992946933091e-06, + "loss": 0.86807938, + "memory(GiB)": 135.77, + "step": 16240, + "train_speed(iter/s)": 0.202266 + }, + { + "acc": 0.77966743, + "epoch": 0.3791248049694513, + "grad_norm": 5.0625, + "learning_rate": 9.347059870088127e-06, + "loss": 0.75784845, + "memory(GiB)": 135.77, + "step": 16250, + "train_speed(iter/s)": 0.202327 + }, + { + "acc": 0.76927428, + "epoch": 0.3793581125417402, + "grad_norm": 5.96875, + "learning_rate": 9.346126172701629e-06, + "loss": 0.83154087, + "memory(GiB)": 135.77, + "step": 16260, + "train_speed(iter/s)": 0.202393 + }, + { + "acc": 0.77350688, + "epoch": 0.3795914201140291, + "grad_norm": 3.765625, + "learning_rate": 9.345191854906881e-06, + "loss": 0.80910931, + "memory(GiB)": 135.77, + "step": 16270, + "train_speed(iter/s)": 0.202457 + }, + { + "acc": 0.7722403, + "epoch": 0.379824727686318, + "grad_norm": 4.125, + "learning_rate": 9.344256916837259e-06, + "loss": 0.83394699, + "memory(GiB)": 135.77, + "step": 16280, + "train_speed(iter/s)": 0.202518 + }, + { + "acc": 0.77347498, + "epoch": 0.3800580352586069, + "grad_norm": 8.625, + "learning_rate": 9.343321358626225e-06, + "loss": 0.78846674, + "memory(GiB)": 135.77, + "step": 16290, + "train_speed(iter/s)": 0.202587 + }, + { + "acc": 0.76977482, + "epoch": 0.3802913428308958, + "grad_norm": 6.1875, + "learning_rate": 9.342385180407328e-06, + "loss": 0.83136139, + "memory(GiB)": 135.77, + "step": 16300, + "train_speed(iter/s)": 0.202652 + }, + { + "acc": 0.77511702, + "epoch": 0.38052465040318467, + "grad_norm": 4.34375, + "learning_rate": 9.341448382314207e-06, + "loss": 0.80090103, + "memory(GiB)": 135.77, + "step": 16310, + "train_speed(iter/s)": 0.202714 + }, + { + "acc": 0.7784832, + "epoch": 0.3807579579754735, + "grad_norm": 4.28125, + "learning_rate": 9.340510964480591e-06, + "loss": 0.77812891, + "memory(GiB)": 135.77, + "step": 16320, + "train_speed(iter/s)": 0.202777 + }, + { + "acc": 0.78746824, + "epoch": 0.3809912655477624, + "grad_norm": 5.40625, + "learning_rate": 9.339572927040298e-06, + "loss": 0.76021338, + "memory(GiB)": 135.77, + "step": 16330, + "train_speed(iter/s)": 0.202838 + }, + { + "acc": 0.77678819, + "epoch": 0.3812245731200513, + "grad_norm": 5.46875, + "learning_rate": 9.338634270127227e-06, + "loss": 0.81745186, + "memory(GiB)": 135.77, + "step": 16340, + "train_speed(iter/s)": 0.202905 + }, + { + "acc": 0.75448432, + "epoch": 0.3814578806923402, + "grad_norm": 4.875, + "learning_rate": 9.337694993875376e-06, + "loss": 0.89182949, + "memory(GiB)": 135.77, + "step": 16350, + "train_speed(iter/s)": 0.202971 + }, + { + "acc": 0.7632596, + "epoch": 0.3816911882646291, + "grad_norm": 7.53125, + "learning_rate": 9.336755098418824e-06, + "loss": 0.86298809, + "memory(GiB)": 135.77, + "step": 16360, + "train_speed(iter/s)": 0.203036 + }, + { + "acc": 0.756043, + "epoch": 0.381924495836918, + "grad_norm": 5.5, + "learning_rate": 9.335814583891743e-06, + "loss": 0.89383011, + "memory(GiB)": 135.77, + "step": 16370, + "train_speed(iter/s)": 0.203095 + }, + { + "acc": 0.76111898, + "epoch": 0.3821578034092069, + "grad_norm": 4.65625, + "learning_rate": 9.33487345042839e-06, + "loss": 0.89134369, + "memory(GiB)": 135.77, + "step": 16380, + "train_speed(iter/s)": 0.203158 + }, + { + "acc": 0.75423412, + "epoch": 0.3823911109814958, + "grad_norm": 7.34375, + "learning_rate": 9.333931698163107e-06, + "loss": 0.89891577, + "memory(GiB)": 135.77, + "step": 16390, + "train_speed(iter/s)": 0.203226 + }, + { + "acc": 0.77302418, + "epoch": 0.3826244185537847, + "grad_norm": 5.6875, + "learning_rate": 9.332989327230337e-06, + "loss": 0.80847969, + "memory(GiB)": 135.77, + "step": 16400, + "train_speed(iter/s)": 0.203289 + }, + { + "acc": 0.75313158, + "epoch": 0.3828577261260736, + "grad_norm": 6.71875, + "learning_rate": 9.3320463377646e-06, + "loss": 0.87891369, + "memory(GiB)": 135.77, + "step": 16410, + "train_speed(iter/s)": 0.20336 + }, + { + "acc": 0.76052837, + "epoch": 0.38309103369836245, + "grad_norm": 5.0625, + "learning_rate": 9.331102729900505e-06, + "loss": 0.87168789, + "memory(GiB)": 135.77, + "step": 16420, + "train_speed(iter/s)": 0.203426 + }, + { + "acc": 0.75323372, + "epoch": 0.38332434127065135, + "grad_norm": 4.8125, + "learning_rate": 9.330158503772753e-06, + "loss": 0.9150569, + "memory(GiB)": 135.77, + "step": 16430, + "train_speed(iter/s)": 0.203491 + }, + { + "acc": 0.7864645, + "epoch": 0.38355764884294025, + "grad_norm": 5.90625, + "learning_rate": 9.329213659516134e-06, + "loss": 0.79203343, + "memory(GiB)": 135.77, + "step": 16440, + "train_speed(iter/s)": 0.203553 + }, + { + "acc": 0.77691231, + "epoch": 0.38379095641522915, + "grad_norm": 4.40625, + "learning_rate": 9.328268197265523e-06, + "loss": 0.79185615, + "memory(GiB)": 135.77, + "step": 16450, + "train_speed(iter/s)": 0.203619 + }, + { + "acc": 0.77853565, + "epoch": 0.38402426398751804, + "grad_norm": 6.59375, + "learning_rate": 9.327322117155881e-06, + "loss": 0.78379364, + "memory(GiB)": 135.77, + "step": 16460, + "train_speed(iter/s)": 0.20368 + }, + { + "acc": 0.78826647, + "epoch": 0.38425757155980694, + "grad_norm": 4.21875, + "learning_rate": 9.326375419322267e-06, + "loss": 0.81094532, + "memory(GiB)": 135.77, + "step": 16470, + "train_speed(iter/s)": 0.203742 + }, + { + "acc": 0.76101255, + "epoch": 0.38449087913209584, + "grad_norm": 4.59375, + "learning_rate": 9.325428103899818e-06, + "loss": 0.88430595, + "memory(GiB)": 135.77, + "step": 16480, + "train_speed(iter/s)": 0.203798 + }, + { + "acc": 0.774227, + "epoch": 0.38472418670438474, + "grad_norm": 5.75, + "learning_rate": 9.324480171023764e-06, + "loss": 0.81697226, + "memory(GiB)": 135.77, + "step": 16490, + "train_speed(iter/s)": 0.203856 + }, + { + "acc": 0.76122689, + "epoch": 0.38495749427667364, + "grad_norm": 4.5, + "learning_rate": 9.32353162082942e-06, + "loss": 0.85380974, + "memory(GiB)": 135.77, + "step": 16500, + "train_speed(iter/s)": 0.203918 + }, + { + "epoch": 0.38495749427667364, + "eval_acc": 0.736815605607034, + "eval_loss": 0.8338123559951782, + "eval_runtime": 1271.5777, + "eval_samples_per_second": 28.304, + "eval_steps_per_second": 14.152, + "step": 16500 + }, + { + "acc": 0.76048388, + "epoch": 0.3851908018489625, + "grad_norm": 7.8125, + "learning_rate": 9.322582453452195e-06, + "loss": 0.85223875, + "memory(GiB)": 135.77, + "step": 16510, + "train_speed(iter/s)": 0.200786 + }, + { + "acc": 0.78726511, + "epoch": 0.3854241094212514, + "grad_norm": 5.5625, + "learning_rate": 9.32163266902758e-06, + "loss": 0.76919699, + "memory(GiB)": 135.77, + "step": 16520, + "train_speed(iter/s)": 0.200847 + }, + { + "acc": 0.76031055, + "epoch": 0.3856574169935403, + "grad_norm": 6.0625, + "learning_rate": 9.320682267691157e-06, + "loss": 0.87715168, + "memory(GiB)": 135.77, + "step": 16530, + "train_speed(iter/s)": 0.200912 + }, + { + "acc": 0.76955204, + "epoch": 0.3858907245658292, + "grad_norm": 8.6875, + "learning_rate": 9.319731249578595e-06, + "loss": 0.84494944, + "memory(GiB)": 135.77, + "step": 16540, + "train_speed(iter/s)": 0.200977 + }, + { + "acc": 0.77983017, + "epoch": 0.3861240321381181, + "grad_norm": 5.4375, + "learning_rate": 9.318779614825653e-06, + "loss": 0.78632112, + "memory(GiB)": 135.77, + "step": 16550, + "train_speed(iter/s)": 0.201038 + }, + { + "acc": 0.77373309, + "epoch": 0.386357339710407, + "grad_norm": 5.15625, + "learning_rate": 9.317827363568176e-06, + "loss": 0.8040472, + "memory(GiB)": 135.77, + "step": 16560, + "train_speed(iter/s)": 0.2011 + }, + { + "acc": 0.77826786, + "epoch": 0.3865906472826959, + "grad_norm": 6.34375, + "learning_rate": 9.316874495942095e-06, + "loss": 0.7839695, + "memory(GiB)": 135.77, + "step": 16570, + "train_speed(iter/s)": 0.20116 + }, + { + "acc": 0.77872405, + "epoch": 0.3868239548549848, + "grad_norm": 6.0, + "learning_rate": 9.315921012083436e-06, + "loss": 0.77586727, + "memory(GiB)": 135.77, + "step": 16580, + "train_speed(iter/s)": 0.201222 + }, + { + "acc": 0.7748662, + "epoch": 0.3870572624272737, + "grad_norm": 11.875, + "learning_rate": 9.314966912128305e-06, + "loss": 0.81466694, + "memory(GiB)": 135.77, + "step": 16590, + "train_speed(iter/s)": 0.201285 + }, + { + "acc": 0.77469873, + "epoch": 0.3872905699995626, + "grad_norm": 6.21875, + "learning_rate": 9.3140121962129e-06, + "loss": 0.81014376, + "memory(GiB)": 135.77, + "step": 16600, + "train_speed(iter/s)": 0.201349 + }, + { + "acc": 0.78045406, + "epoch": 0.3875238775718514, + "grad_norm": 6.5625, + "learning_rate": 9.313056864473508e-06, + "loss": 0.77907581, + "memory(GiB)": 135.77, + "step": 16610, + "train_speed(iter/s)": 0.20141 + }, + { + "acc": 0.76823573, + "epoch": 0.3877571851441403, + "grad_norm": 6.46875, + "learning_rate": 9.312100917046502e-06, + "loss": 0.8444459, + "memory(GiB)": 135.77, + "step": 16620, + "train_speed(iter/s)": 0.201474 + }, + { + "acc": 0.76610289, + "epoch": 0.3879904927164292, + "grad_norm": 6.03125, + "learning_rate": 9.311144354068342e-06, + "loss": 0.84226179, + "memory(GiB)": 135.77, + "step": 16630, + "train_speed(iter/s)": 0.20154 + }, + { + "acc": 0.78322606, + "epoch": 0.3882238002887181, + "grad_norm": 5.84375, + "learning_rate": 9.310187175675579e-06, + "loss": 0.78812022, + "memory(GiB)": 135.77, + "step": 16640, + "train_speed(iter/s)": 0.201607 + }, + { + "acc": 0.75634503, + "epoch": 0.388457107861007, + "grad_norm": 6.78125, + "learning_rate": 9.309229382004847e-06, + "loss": 0.90151043, + "memory(GiB)": 135.77, + "step": 16650, + "train_speed(iter/s)": 0.20167 + }, + { + "acc": 0.75544233, + "epoch": 0.3886904154332959, + "grad_norm": 5.0625, + "learning_rate": 9.308270973192875e-06, + "loss": 0.89699974, + "memory(GiB)": 135.77, + "step": 16660, + "train_speed(iter/s)": 0.201732 + }, + { + "acc": 0.74633512, + "epoch": 0.3889237230055848, + "grad_norm": 5.96875, + "learning_rate": 9.307311949376472e-06, + "loss": 0.92126493, + "memory(GiB)": 135.77, + "step": 16670, + "train_speed(iter/s)": 0.201796 + }, + { + "acc": 0.77705793, + "epoch": 0.3891570305778737, + "grad_norm": 6.125, + "learning_rate": 9.306352310692539e-06, + "loss": 0.80012226, + "memory(GiB)": 135.77, + "step": 16680, + "train_speed(iter/s)": 0.201858 + }, + { + "acc": 0.77201366, + "epoch": 0.3893903381501626, + "grad_norm": 4.84375, + "learning_rate": 9.305392057278066e-06, + "loss": 0.8111311, + "memory(GiB)": 135.77, + "step": 16690, + "train_speed(iter/s)": 0.201919 + }, + { + "acc": 0.76858664, + "epoch": 0.3896236457224515, + "grad_norm": 5.28125, + "learning_rate": 9.304431189270127e-06, + "loss": 0.8267766, + "memory(GiB)": 135.77, + "step": 16700, + "train_speed(iter/s)": 0.201983 + }, + { + "acc": 0.75889578, + "epoch": 0.38985695329474035, + "grad_norm": 4.59375, + "learning_rate": 9.303469706805886e-06, + "loss": 0.84821739, + "memory(GiB)": 135.77, + "step": 16710, + "train_speed(iter/s)": 0.202043 + }, + { + "acc": 0.76242075, + "epoch": 0.39009026086702925, + "grad_norm": 6.125, + "learning_rate": 9.302507610022593e-06, + "loss": 0.87555027, + "memory(GiB)": 135.77, + "step": 16720, + "train_speed(iter/s)": 0.202104 + }, + { + "acc": 0.79397368, + "epoch": 0.39032356843931815, + "grad_norm": 8.0625, + "learning_rate": 9.30154489905759e-06, + "loss": 0.72974205, + "memory(GiB)": 135.77, + "step": 16730, + "train_speed(iter/s)": 0.202163 + }, + { + "acc": 0.74320917, + "epoch": 0.39055687601160705, + "grad_norm": 5.5, + "learning_rate": 9.300581574048303e-06, + "loss": 0.93350315, + "memory(GiB)": 135.77, + "step": 16740, + "train_speed(iter/s)": 0.202225 + }, + { + "acc": 0.77537889, + "epoch": 0.39079018358389594, + "grad_norm": 5.59375, + "learning_rate": 9.299617635132243e-06, + "loss": 0.78615303, + "memory(GiB)": 135.77, + "step": 16750, + "train_speed(iter/s)": 0.202286 + }, + { + "acc": 0.77060065, + "epoch": 0.39102349115618484, + "grad_norm": 5.6875, + "learning_rate": 9.298653082447019e-06, + "loss": 0.84748516, + "memory(GiB)": 135.77, + "step": 16760, + "train_speed(iter/s)": 0.202351 + }, + { + "acc": 0.78617654, + "epoch": 0.39125679872847374, + "grad_norm": 5.40625, + "learning_rate": 9.29768791613031e-06, + "loss": 0.76330194, + "memory(GiB)": 135.77, + "step": 16770, + "train_speed(iter/s)": 0.202413 + }, + { + "acc": 0.7793808, + "epoch": 0.39149010630076264, + "grad_norm": 4.96875, + "learning_rate": 9.296722136319904e-06, + "loss": 0.79507647, + "memory(GiB)": 135.77, + "step": 16780, + "train_speed(iter/s)": 0.202476 + }, + { + "acc": 0.7710515, + "epoch": 0.39172341387305154, + "grad_norm": 6.25, + "learning_rate": 9.29575574315366e-06, + "loss": 0.8227356, + "memory(GiB)": 135.77, + "step": 16790, + "train_speed(iter/s)": 0.202539 + }, + { + "acc": 0.77599983, + "epoch": 0.3919567214453404, + "grad_norm": 5.53125, + "learning_rate": 9.294788736769534e-06, + "loss": 0.80106039, + "memory(GiB)": 135.77, + "step": 16800, + "train_speed(iter/s)": 0.202601 + }, + { + "acc": 0.77693524, + "epoch": 0.3921900290176293, + "grad_norm": 7.5625, + "learning_rate": 9.293821117305562e-06, + "loss": 0.79049306, + "memory(GiB)": 135.77, + "step": 16810, + "train_speed(iter/s)": 0.202663 + }, + { + "acc": 0.7763052, + "epoch": 0.3924233365899182, + "grad_norm": 6.28125, + "learning_rate": 9.29285288489987e-06, + "loss": 0.81076241, + "memory(GiB)": 135.77, + "step": 16820, + "train_speed(iter/s)": 0.202723 + }, + { + "acc": 0.77143803, + "epoch": 0.3926566441622071, + "grad_norm": 9.8125, + "learning_rate": 9.29188403969068e-06, + "loss": 0.81417742, + "memory(GiB)": 135.77, + "step": 16830, + "train_speed(iter/s)": 0.202787 + }, + { + "acc": 0.79286661, + "epoch": 0.392889951734496, + "grad_norm": 8.375, + "learning_rate": 9.290914581816287e-06, + "loss": 0.72028284, + "memory(GiB)": 135.77, + "step": 16840, + "train_speed(iter/s)": 0.20285 + }, + { + "acc": 0.77796297, + "epoch": 0.3931232593067849, + "grad_norm": 6.1875, + "learning_rate": 9.289944511415086e-06, + "loss": 0.80314026, + "memory(GiB)": 135.77, + "step": 16850, + "train_speed(iter/s)": 0.202906 + }, + { + "acc": 0.75296588, + "epoch": 0.3933565668790738, + "grad_norm": 4.28125, + "learning_rate": 9.28897382862555e-06, + "loss": 0.88018427, + "memory(GiB)": 135.77, + "step": 16860, + "train_speed(iter/s)": 0.202965 + }, + { + "acc": 0.74555116, + "epoch": 0.3935898744513627, + "grad_norm": 5.53125, + "learning_rate": 9.288002533586247e-06, + "loss": 0.92258759, + "memory(GiB)": 135.77, + "step": 16870, + "train_speed(iter/s)": 0.203027 + }, + { + "acc": 0.77875471, + "epoch": 0.3938231820236516, + "grad_norm": 5.125, + "learning_rate": 9.287030626435828e-06, + "loss": 0.81162071, + "memory(GiB)": 135.77, + "step": 16880, + "train_speed(iter/s)": 0.20309 + }, + { + "acc": 0.77456217, + "epoch": 0.3940564895959405, + "grad_norm": 5.1875, + "learning_rate": 9.286058107313034e-06, + "loss": 0.79409857, + "memory(GiB)": 135.77, + "step": 16890, + "train_speed(iter/s)": 0.203154 + }, + { + "acc": 0.76377172, + "epoch": 0.3942897971682293, + "grad_norm": 5.8125, + "learning_rate": 9.285084976356689e-06, + "loss": 0.84241772, + "memory(GiB)": 135.77, + "step": 16900, + "train_speed(iter/s)": 0.203216 + }, + { + "acc": 0.76552515, + "epoch": 0.3945231047405182, + "grad_norm": 4.9375, + "learning_rate": 9.284111233705709e-06, + "loss": 0.86844196, + "memory(GiB)": 135.77, + "step": 16910, + "train_speed(iter/s)": 0.20328 + }, + { + "acc": 0.7652967, + "epoch": 0.3947564123128071, + "grad_norm": 4.28125, + "learning_rate": 9.283136879499094e-06, + "loss": 0.85671463, + "memory(GiB)": 135.77, + "step": 16920, + "train_speed(iter/s)": 0.203337 + }, + { + "acc": 0.77319775, + "epoch": 0.394989719885096, + "grad_norm": 4.84375, + "learning_rate": 9.282161913875933e-06, + "loss": 0.83186893, + "memory(GiB)": 135.77, + "step": 16930, + "train_speed(iter/s)": 0.203398 + }, + { + "acc": 0.76866665, + "epoch": 0.3952230274573849, + "grad_norm": 4.71875, + "learning_rate": 9.281186336975406e-06, + "loss": 0.83866825, + "memory(GiB)": 135.77, + "step": 16940, + "train_speed(iter/s)": 0.203462 + }, + { + "acc": 0.76994901, + "epoch": 0.3954563350296738, + "grad_norm": 5.78125, + "learning_rate": 9.28021014893677e-06, + "loss": 0.83855286, + "memory(GiB)": 135.77, + "step": 16950, + "train_speed(iter/s)": 0.203521 + }, + { + "acc": 0.77082605, + "epoch": 0.3956896426019627, + "grad_norm": 6.09375, + "learning_rate": 9.27923334989938e-06, + "loss": 0.83198881, + "memory(GiB)": 135.77, + "step": 16960, + "train_speed(iter/s)": 0.203576 + }, + { + "acc": 0.75358386, + "epoch": 0.3959229501742516, + "grad_norm": 5.28125, + "learning_rate": 9.278255940002671e-06, + "loss": 0.89232082, + "memory(GiB)": 135.77, + "step": 16970, + "train_speed(iter/s)": 0.20364 + }, + { + "acc": 0.76478891, + "epoch": 0.3961562577465405, + "grad_norm": 9.0625, + "learning_rate": 9.27727791938617e-06, + "loss": 0.84635849, + "memory(GiB)": 135.77, + "step": 16980, + "train_speed(iter/s)": 0.203705 + }, + { + "acc": 0.7693079, + "epoch": 0.3963895653188294, + "grad_norm": 7.15625, + "learning_rate": 9.27629928818949e-06, + "loss": 0.83523731, + "memory(GiB)": 135.77, + "step": 16990, + "train_speed(iter/s)": 0.203767 + }, + { + "acc": 0.76514502, + "epoch": 0.39662287289111825, + "grad_norm": 6.40625, + "learning_rate": 9.275320046552328e-06, + "loss": 0.85309467, + "memory(GiB)": 135.77, + "step": 17000, + "train_speed(iter/s)": 0.203827 + }, + { + "epoch": 0.39662287289111825, + "eval_acc": 0.7371956278702851, + "eval_loss": 0.8326179385185242, + "eval_runtime": 1270.543, + "eval_samples_per_second": 28.327, + "eval_steps_per_second": 14.164, + "step": 17000 + }, + { + "acc": 0.76229954, + "epoch": 0.39685618046340715, + "grad_norm": 6.625, + "learning_rate": 9.274340194614471e-06, + "loss": 0.83733425, + "memory(GiB)": 135.77, + "step": 17010, + "train_speed(iter/s)": 0.200789 + }, + { + "acc": 0.77126536, + "epoch": 0.39708948803569605, + "grad_norm": 8.75, + "learning_rate": 9.273359732515793e-06, + "loss": 0.83545189, + "memory(GiB)": 135.77, + "step": 17020, + "train_speed(iter/s)": 0.200853 + }, + { + "acc": 0.76169815, + "epoch": 0.39732279560798495, + "grad_norm": 5.84375, + "learning_rate": 9.272378660396255e-06, + "loss": 0.87710171, + "memory(GiB)": 135.77, + "step": 17030, + "train_speed(iter/s)": 0.200915 + }, + { + "acc": 0.77218542, + "epoch": 0.39755610318027385, + "grad_norm": 5.0, + "learning_rate": 9.271396978395904e-06, + "loss": 0.83802376, + "memory(GiB)": 135.77, + "step": 17040, + "train_speed(iter/s)": 0.200972 + }, + { + "acc": 0.76550169, + "epoch": 0.39778941075256274, + "grad_norm": 5.25, + "learning_rate": 9.270414686654875e-06, + "loss": 0.85740089, + "memory(GiB)": 135.77, + "step": 17050, + "train_speed(iter/s)": 0.201033 + }, + { + "acc": 0.7832613, + "epoch": 0.39802271832485164, + "grad_norm": 7.125, + "learning_rate": 9.269431785313391e-06, + "loss": 0.76584721, + "memory(GiB)": 135.77, + "step": 17060, + "train_speed(iter/s)": 0.201094 + }, + { + "acc": 0.77678032, + "epoch": 0.39825602589714054, + "grad_norm": 7.3125, + "learning_rate": 9.268448274511759e-06, + "loss": 0.81068668, + "memory(GiB)": 135.77, + "step": 17070, + "train_speed(iter/s)": 0.201157 + }, + { + "acc": 0.77908564, + "epoch": 0.39848933346942944, + "grad_norm": 5.96875, + "learning_rate": 9.267464154390375e-06, + "loss": 0.79487085, + "memory(GiB)": 135.77, + "step": 17080, + "train_speed(iter/s)": 0.201218 + }, + { + "acc": 0.80031281, + "epoch": 0.3987226410417183, + "grad_norm": 4.46875, + "learning_rate": 9.266479425089725e-06, + "loss": 0.71730728, + "memory(GiB)": 135.77, + "step": 17090, + "train_speed(iter/s)": 0.20128 + }, + { + "acc": 0.77682505, + "epoch": 0.3989559486140072, + "grad_norm": 10.0625, + "learning_rate": 9.265494086750375e-06, + "loss": 0.80053043, + "memory(GiB)": 135.77, + "step": 17100, + "train_speed(iter/s)": 0.201344 + }, + { + "acc": 0.77470179, + "epoch": 0.3991892561862961, + "grad_norm": 5.125, + "learning_rate": 9.264508139512985e-06, + "loss": 0.81498718, + "memory(GiB)": 135.77, + "step": 17110, + "train_speed(iter/s)": 0.201402 + }, + { + "acc": 0.77046309, + "epoch": 0.399422563758585, + "grad_norm": 6.25, + "learning_rate": 9.263521583518293e-06, + "loss": 0.82258339, + "memory(GiB)": 135.77, + "step": 17120, + "train_speed(iter/s)": 0.20146 + }, + { + "acc": 0.79407024, + "epoch": 0.3996558713308739, + "grad_norm": 8.25, + "learning_rate": 9.262534418907137e-06, + "loss": 0.72604542, + "memory(GiB)": 135.77, + "step": 17130, + "train_speed(iter/s)": 0.20152 + }, + { + "acc": 0.7635807, + "epoch": 0.3998891789031628, + "grad_norm": 5.6875, + "learning_rate": 9.26154664582043e-06, + "loss": 0.8567337, + "memory(GiB)": 135.77, + "step": 17140, + "train_speed(iter/s)": 0.201581 + }, + { + "acc": 0.78804669, + "epoch": 0.4001224864754517, + "grad_norm": 6.65625, + "learning_rate": 9.260558264399177e-06, + "loss": 0.73743448, + "memory(GiB)": 135.77, + "step": 17150, + "train_speed(iter/s)": 0.201638 + }, + { + "acc": 0.76328459, + "epoch": 0.4003557940477406, + "grad_norm": 5.5, + "learning_rate": 9.25956927478447e-06, + "loss": 0.85945473, + "memory(GiB)": 135.77, + "step": 17160, + "train_speed(iter/s)": 0.2017 + }, + { + "acc": 0.77089682, + "epoch": 0.4005891016200295, + "grad_norm": 5.4375, + "learning_rate": 9.258579677117486e-06, + "loss": 0.81945114, + "memory(GiB)": 135.77, + "step": 17170, + "train_speed(iter/s)": 0.201758 + }, + { + "acc": 0.7839241, + "epoch": 0.4008224091923184, + "grad_norm": 5.125, + "learning_rate": 9.25758947153949e-06, + "loss": 0.77010636, + "memory(GiB)": 135.77, + "step": 17180, + "train_speed(iter/s)": 0.201814 + }, + { + "acc": 0.76271248, + "epoch": 0.4010557167646072, + "grad_norm": 3.828125, + "learning_rate": 9.256598658191834e-06, + "loss": 0.85306063, + "memory(GiB)": 135.77, + "step": 17190, + "train_speed(iter/s)": 0.201875 + }, + { + "acc": 0.77881155, + "epoch": 0.4012890243368961, + "grad_norm": 6.875, + "learning_rate": 9.255607237215957e-06, + "loss": 0.7829484, + "memory(GiB)": 135.77, + "step": 17200, + "train_speed(iter/s)": 0.201937 + }, + { + "acc": 0.75986853, + "epoch": 0.401522331909185, + "grad_norm": 5.96875, + "learning_rate": 9.254615208753381e-06, + "loss": 0.8725729, + "memory(GiB)": 135.77, + "step": 17210, + "train_speed(iter/s)": 0.202 + }, + { + "acc": 0.77503152, + "epoch": 0.4017556394814739, + "grad_norm": 4.0, + "learning_rate": 9.253622572945722e-06, + "loss": 0.82128544, + "memory(GiB)": 135.77, + "step": 17220, + "train_speed(iter/s)": 0.202063 + }, + { + "acc": 0.75122519, + "epoch": 0.4019889470537628, + "grad_norm": 5.53125, + "learning_rate": 9.252629329934676e-06, + "loss": 0.91885424, + "memory(GiB)": 135.77, + "step": 17230, + "train_speed(iter/s)": 0.202123 + }, + { + "acc": 0.78238068, + "epoch": 0.4022222546260517, + "grad_norm": 4.25, + "learning_rate": 9.251635479862029e-06, + "loss": 0.7822403, + "memory(GiB)": 135.77, + "step": 17240, + "train_speed(iter/s)": 0.202187 + }, + { + "acc": 0.7734271, + "epoch": 0.4024555621983406, + "grad_norm": 5.9375, + "learning_rate": 9.25064102286965e-06, + "loss": 0.83377533, + "memory(GiB)": 135.77, + "step": 17250, + "train_speed(iter/s)": 0.202249 + }, + { + "acc": 0.78481631, + "epoch": 0.4026888697706295, + "grad_norm": 8.6875, + "learning_rate": 9.249645959099503e-06, + "loss": 0.76742496, + "memory(GiB)": 135.77, + "step": 17260, + "train_speed(iter/s)": 0.202311 + }, + { + "acc": 0.78067999, + "epoch": 0.4029221773429184, + "grad_norm": 6.4375, + "learning_rate": 9.248650288693628e-06, + "loss": 0.78222075, + "memory(GiB)": 135.77, + "step": 17270, + "train_speed(iter/s)": 0.202371 + }, + { + "acc": 0.78589168, + "epoch": 0.40315548491520725, + "grad_norm": 4.6875, + "learning_rate": 9.247654011794158e-06, + "loss": 0.7706872, + "memory(GiB)": 135.77, + "step": 17280, + "train_speed(iter/s)": 0.20243 + }, + { + "acc": 0.76477909, + "epoch": 0.40338879248749615, + "grad_norm": 9.0, + "learning_rate": 9.246657128543313e-06, + "loss": 0.85425339, + "memory(GiB)": 135.77, + "step": 17290, + "train_speed(iter/s)": 0.202491 + }, + { + "acc": 0.77842503, + "epoch": 0.40362210005978505, + "grad_norm": 5.4375, + "learning_rate": 9.245659639083396e-06, + "loss": 0.79319229, + "memory(GiB)": 135.77, + "step": 17300, + "train_speed(iter/s)": 0.202549 + }, + { + "acc": 0.78424797, + "epoch": 0.40385540763207395, + "grad_norm": 4.375, + "learning_rate": 9.244661543556799e-06, + "loss": 0.7634676, + "memory(GiB)": 135.77, + "step": 17310, + "train_speed(iter/s)": 0.202609 + }, + { + "acc": 0.77571602, + "epoch": 0.40408871520436285, + "grad_norm": 8.375, + "learning_rate": 9.243662842106e-06, + "loss": 0.80258932, + "memory(GiB)": 135.77, + "step": 17320, + "train_speed(iter/s)": 0.202667 + }, + { + "acc": 0.76631498, + "epoch": 0.40432202277665175, + "grad_norm": 4.25, + "learning_rate": 9.242663534873562e-06, + "loss": 0.85109797, + "memory(GiB)": 135.77, + "step": 17330, + "train_speed(iter/s)": 0.202731 + }, + { + "acc": 0.76990891, + "epoch": 0.40455533034894064, + "grad_norm": 5.40625, + "learning_rate": 9.241663622002137e-06, + "loss": 0.84066992, + "memory(GiB)": 135.77, + "step": 17340, + "train_speed(iter/s)": 0.202789 + }, + { + "acc": 0.76096568, + "epoch": 0.40478863792122954, + "grad_norm": 5.59375, + "learning_rate": 9.240663103634464e-06, + "loss": 0.86097231, + "memory(GiB)": 135.77, + "step": 17350, + "train_speed(iter/s)": 0.202851 + }, + { + "acc": 0.78643513, + "epoch": 0.40502194549351844, + "grad_norm": 7.78125, + "learning_rate": 9.239661979913364e-06, + "loss": 0.7529027, + "memory(GiB)": 135.77, + "step": 17360, + "train_speed(iter/s)": 0.20291 + }, + { + "acc": 0.78234057, + "epoch": 0.40525525306580734, + "grad_norm": 5.71875, + "learning_rate": 9.238660250981748e-06, + "loss": 0.76194448, + "memory(GiB)": 135.77, + "step": 17370, + "train_speed(iter/s)": 0.202971 + }, + { + "acc": 0.7768539, + "epoch": 0.4054885606380962, + "grad_norm": 5.0, + "learning_rate": 9.237657916982612e-06, + "loss": 0.79744596, + "memory(GiB)": 135.77, + "step": 17380, + "train_speed(iter/s)": 0.203032 + }, + { + "acc": 0.76710577, + "epoch": 0.4057218682103851, + "grad_norm": 13.4375, + "learning_rate": 9.236654978059039e-06, + "loss": 0.82225361, + "memory(GiB)": 135.77, + "step": 17390, + "train_speed(iter/s)": 0.203092 + }, + { + "acc": 0.78320732, + "epoch": 0.405955175782674, + "grad_norm": 4.53125, + "learning_rate": 9.2356514343542e-06, + "loss": 0.79602137, + "memory(GiB)": 135.77, + "step": 17400, + "train_speed(iter/s)": 0.203151 + }, + { + "acc": 0.7971508, + "epoch": 0.4061884833549629, + "grad_norm": 3.9375, + "learning_rate": 9.234647286011347e-06, + "loss": 0.73624454, + "memory(GiB)": 135.77, + "step": 17410, + "train_speed(iter/s)": 0.203208 + }, + { + "acc": 0.79400434, + "epoch": 0.4064217909272518, + "grad_norm": 5.75, + "learning_rate": 9.233642533173827e-06, + "loss": 0.73229074, + "memory(GiB)": 135.77, + "step": 17420, + "train_speed(iter/s)": 0.203266 + }, + { + "acc": 0.78245573, + "epoch": 0.4066550984995407, + "grad_norm": 7.125, + "learning_rate": 9.232637175985064e-06, + "loss": 0.79270024, + "memory(GiB)": 135.77, + "step": 17430, + "train_speed(iter/s)": 0.203326 + }, + { + "acc": 0.7708353, + "epoch": 0.4068884060718296, + "grad_norm": 5.40625, + "learning_rate": 9.231631214588572e-06, + "loss": 0.80469551, + "memory(GiB)": 135.77, + "step": 17440, + "train_speed(iter/s)": 0.203388 + }, + { + "acc": 0.77979951, + "epoch": 0.4071217136441185, + "grad_norm": 5.3125, + "learning_rate": 9.230624649127956e-06, + "loss": 0.7915205, + "memory(GiB)": 135.77, + "step": 17450, + "train_speed(iter/s)": 0.203451 + }, + { + "acc": 0.78297606, + "epoch": 0.4073550212164074, + "grad_norm": 5.21875, + "learning_rate": 9.2296174797469e-06, + "loss": 0.78460536, + "memory(GiB)": 135.77, + "step": 17460, + "train_speed(iter/s)": 0.20351 + }, + { + "acc": 0.78100643, + "epoch": 0.4075883287886963, + "grad_norm": 4.1875, + "learning_rate": 9.228609706589175e-06, + "loss": 0.80171537, + "memory(GiB)": 135.77, + "step": 17470, + "train_speed(iter/s)": 0.203567 + }, + { + "acc": 0.76642833, + "epoch": 0.4078216363609851, + "grad_norm": 5.1875, + "learning_rate": 9.227601329798645e-06, + "loss": 0.86399326, + "memory(GiB)": 135.77, + "step": 17480, + "train_speed(iter/s)": 0.203629 + }, + { + "acc": 0.76468382, + "epoch": 0.408054943933274, + "grad_norm": 5.15625, + "learning_rate": 9.226592349519254e-06, + "loss": 0.84064198, + "memory(GiB)": 135.77, + "step": 17490, + "train_speed(iter/s)": 0.203689 + }, + { + "acc": 0.757623, + "epoch": 0.4082882515055629, + "grad_norm": 7.5, + "learning_rate": 9.225582765895032e-06, + "loss": 0.88375015, + "memory(GiB)": 135.77, + "step": 17500, + "train_speed(iter/s)": 0.203747 + }, + { + "epoch": 0.4082882515055629, + "eval_acc": 0.7373759502758565, + "eval_loss": 0.8317380547523499, + "eval_runtime": 1270.3611, + "eval_samples_per_second": 28.331, + "eval_steps_per_second": 14.166, + "step": 17500 + }, + { + "acc": 0.76951265, + "epoch": 0.4085215590778518, + "grad_norm": 6.9375, + "learning_rate": 9.224572579070097e-06, + "loss": 0.83398447, + "memory(GiB)": 135.77, + "step": 17510, + "train_speed(iter/s)": 0.200799 + }, + { + "acc": 0.77614574, + "epoch": 0.4087548666501407, + "grad_norm": 5.53125, + "learning_rate": 9.223561789188655e-06, + "loss": 0.80068007, + "memory(GiB)": 135.77, + "step": 17520, + "train_speed(iter/s)": 0.200853 + }, + { + "acc": 0.76483154, + "epoch": 0.4089881742224296, + "grad_norm": 6.59375, + "learning_rate": 9.222550396394994e-06, + "loss": 0.82644291, + "memory(GiB)": 135.77, + "step": 17530, + "train_speed(iter/s)": 0.200914 + }, + { + "acc": 0.76731434, + "epoch": 0.4092214817947185, + "grad_norm": 11.25, + "learning_rate": 9.221538400833489e-06, + "loss": 0.84105368, + "memory(GiB)": 135.77, + "step": 17540, + "train_speed(iter/s)": 0.200968 + }, + { + "acc": 0.79679441, + "epoch": 0.4094547893670074, + "grad_norm": 5.625, + "learning_rate": 9.220525802648605e-06, + "loss": 0.72740088, + "memory(GiB)": 135.77, + "step": 17550, + "train_speed(iter/s)": 0.201025 + }, + { + "acc": 0.77622986, + "epoch": 0.4096880969392963, + "grad_norm": 5.5625, + "learning_rate": 9.219512601984889e-06, + "loss": 0.81656227, + "memory(GiB)": 135.77, + "step": 17560, + "train_speed(iter/s)": 0.20108 + }, + { + "acc": 0.78093081, + "epoch": 0.40992140451158515, + "grad_norm": 5.8125, + "learning_rate": 9.218498798986975e-06, + "loss": 0.75747585, + "memory(GiB)": 135.77, + "step": 17570, + "train_speed(iter/s)": 0.201138 + }, + { + "acc": 0.75355787, + "epoch": 0.41015471208387405, + "grad_norm": 4.84375, + "learning_rate": 9.217484393799582e-06, + "loss": 0.91436329, + "memory(GiB)": 135.77, + "step": 17580, + "train_speed(iter/s)": 0.201199 + }, + { + "acc": 0.76489058, + "epoch": 0.41038801965616295, + "grad_norm": 5.125, + "learning_rate": 9.216469386567517e-06, + "loss": 0.86888409, + "memory(GiB)": 135.77, + "step": 17590, + "train_speed(iter/s)": 0.201259 + }, + { + "acc": 0.79336662, + "epoch": 0.41062132722845185, + "grad_norm": 4.875, + "learning_rate": 9.215453777435672e-06, + "loss": 0.71763039, + "memory(GiB)": 135.77, + "step": 17600, + "train_speed(iter/s)": 0.201315 + }, + { + "acc": 0.80592194, + "epoch": 0.41085463480074075, + "grad_norm": 7.0, + "learning_rate": 9.214437566549026e-06, + "loss": 0.7039835, + "memory(GiB)": 135.77, + "step": 17610, + "train_speed(iter/s)": 0.201373 + }, + { + "acc": 0.75960789, + "epoch": 0.41108794237302965, + "grad_norm": 5.3125, + "learning_rate": 9.21342075405264e-06, + "loss": 0.88718882, + "memory(GiB)": 135.77, + "step": 17620, + "train_speed(iter/s)": 0.201435 + }, + { + "acc": 0.75959663, + "epoch": 0.41132124994531855, + "grad_norm": 5.09375, + "learning_rate": 9.212403340091667e-06, + "loss": 0.88778267, + "memory(GiB)": 135.77, + "step": 17630, + "train_speed(iter/s)": 0.201493 + }, + { + "acc": 0.76623917, + "epoch": 0.41155455751760744, + "grad_norm": 5.65625, + "learning_rate": 9.21138532481134e-06, + "loss": 0.84202518, + "memory(GiB)": 135.77, + "step": 17640, + "train_speed(iter/s)": 0.201554 + }, + { + "acc": 0.76951289, + "epoch": 0.41178786508989634, + "grad_norm": 5.375, + "learning_rate": 9.210366708356982e-06, + "loss": 0.81451988, + "memory(GiB)": 135.77, + "step": 17650, + "train_speed(iter/s)": 0.201614 + }, + { + "acc": 0.76205425, + "epoch": 0.41202117266218524, + "grad_norm": 5.0625, + "learning_rate": 9.209347490874e-06, + "loss": 0.86257401, + "memory(GiB)": 135.77, + "step": 17660, + "train_speed(iter/s)": 0.201674 + }, + { + "acc": 0.76792526, + "epoch": 0.4122544802344741, + "grad_norm": 4.46875, + "learning_rate": 9.208327672507883e-06, + "loss": 0.82535515, + "memory(GiB)": 135.77, + "step": 17670, + "train_speed(iter/s)": 0.201736 + }, + { + "acc": 0.77031803, + "epoch": 0.412487787806763, + "grad_norm": 6.40625, + "learning_rate": 9.207307253404216e-06, + "loss": 0.83331718, + "memory(GiB)": 135.77, + "step": 17680, + "train_speed(iter/s)": 0.201795 + }, + { + "acc": 0.78183222, + "epoch": 0.4127210953790519, + "grad_norm": 5.59375, + "learning_rate": 9.20628623370866e-06, + "loss": 0.7861207, + "memory(GiB)": 135.77, + "step": 17690, + "train_speed(iter/s)": 0.201854 + }, + { + "acc": 0.78258052, + "epoch": 0.4129544029513408, + "grad_norm": 4.71875, + "learning_rate": 9.205264613566968e-06, + "loss": 0.78774872, + "memory(GiB)": 135.77, + "step": 17700, + "train_speed(iter/s)": 0.201912 + }, + { + "acc": 0.76604571, + "epoch": 0.4131877105236297, + "grad_norm": 5.78125, + "learning_rate": 9.204242393124973e-06, + "loss": 0.85110035, + "memory(GiB)": 135.77, + "step": 17710, + "train_speed(iter/s)": 0.201968 + }, + { + "acc": 0.77456961, + "epoch": 0.4134210180959186, + "grad_norm": 4.25, + "learning_rate": 9.203219572528597e-06, + "loss": 0.7974988, + "memory(GiB)": 135.77, + "step": 17720, + "train_speed(iter/s)": 0.202026 + }, + { + "acc": 0.78755922, + "epoch": 0.4136543256682075, + "grad_norm": 6.59375, + "learning_rate": 9.202196151923849e-06, + "loss": 0.75697212, + "memory(GiB)": 135.77, + "step": 17730, + "train_speed(iter/s)": 0.202085 + }, + { + "acc": 0.75414586, + "epoch": 0.4138876332404964, + "grad_norm": 6.90625, + "learning_rate": 9.201172131456821e-06, + "loss": 0.8852272, + "memory(GiB)": 135.77, + "step": 17740, + "train_speed(iter/s)": 0.202146 + }, + { + "acc": 0.75268192, + "epoch": 0.4141209408127853, + "grad_norm": 6.1875, + "learning_rate": 9.20014751127369e-06, + "loss": 0.89101114, + "memory(GiB)": 135.77, + "step": 17750, + "train_speed(iter/s)": 0.202204 + }, + { + "acc": 0.76879907, + "epoch": 0.4143542483850742, + "grad_norm": 4.65625, + "learning_rate": 9.199122291520724e-06, + "loss": 0.82659073, + "memory(GiB)": 135.77, + "step": 17760, + "train_speed(iter/s)": 0.202266 + }, + { + "acc": 0.78848228, + "epoch": 0.414587555957363, + "grad_norm": 4.625, + "learning_rate": 9.198096472344269e-06, + "loss": 0.76872091, + "memory(GiB)": 135.77, + "step": 17770, + "train_speed(iter/s)": 0.202325 + }, + { + "acc": 0.76771512, + "epoch": 0.4148208635296519, + "grad_norm": 4.34375, + "learning_rate": 9.197070053890764e-06, + "loss": 0.82841167, + "memory(GiB)": 135.77, + "step": 17780, + "train_speed(iter/s)": 0.202385 + }, + { + "acc": 0.78823538, + "epoch": 0.4150541711019408, + "grad_norm": 3.953125, + "learning_rate": 9.196043036306726e-06, + "loss": 0.76270905, + "memory(GiB)": 135.77, + "step": 17790, + "train_speed(iter/s)": 0.202446 + }, + { + "acc": 0.78680201, + "epoch": 0.4152874786742297, + "grad_norm": 4.9375, + "learning_rate": 9.195015419738765e-06, + "loss": 0.77416258, + "memory(GiB)": 135.77, + "step": 17800, + "train_speed(iter/s)": 0.202509 + }, + { + "acc": 0.77593956, + "epoch": 0.4155207862465186, + "grad_norm": 6.6875, + "learning_rate": 9.193987204333573e-06, + "loss": 0.79424219, + "memory(GiB)": 135.77, + "step": 17810, + "train_speed(iter/s)": 0.202572 + }, + { + "acc": 0.76831007, + "epoch": 0.4157540938188075, + "grad_norm": 7.0, + "learning_rate": 9.192958390237923e-06, + "loss": 0.84353199, + "memory(GiB)": 135.77, + "step": 17820, + "train_speed(iter/s)": 0.202631 + }, + { + "acc": 0.7490344, + "epoch": 0.4159874013910964, + "grad_norm": 4.9375, + "learning_rate": 9.19192897759868e-06, + "loss": 0.92404823, + "memory(GiB)": 135.77, + "step": 17830, + "train_speed(iter/s)": 0.202692 + }, + { + "acc": 0.76985192, + "epoch": 0.4162207089633853, + "grad_norm": 9.25, + "learning_rate": 9.190898966562796e-06, + "loss": 0.83063097, + "memory(GiB)": 135.77, + "step": 17840, + "train_speed(iter/s)": 0.202753 + }, + { + "acc": 0.76316862, + "epoch": 0.4164540165356742, + "grad_norm": 5.8125, + "learning_rate": 9.1898683572773e-06, + "loss": 0.8611824, + "memory(GiB)": 135.77, + "step": 17850, + "train_speed(iter/s)": 0.202812 + }, + { + "acc": 0.75131006, + "epoch": 0.41668732410796305, + "grad_norm": 7.21875, + "learning_rate": 9.188837149889316e-06, + "loss": 0.92140636, + "memory(GiB)": 135.77, + "step": 17860, + "train_speed(iter/s)": 0.202873 + }, + { + "acc": 0.77328281, + "epoch": 0.41692063168025195, + "grad_norm": 5.0625, + "learning_rate": 9.187805344546044e-06, + "loss": 0.81961975, + "memory(GiB)": 135.77, + "step": 17870, + "train_speed(iter/s)": 0.202932 + }, + { + "acc": 0.78284941, + "epoch": 0.41715393925254085, + "grad_norm": 5.375, + "learning_rate": 9.186772941394776e-06, + "loss": 0.78479848, + "memory(GiB)": 135.77, + "step": 17880, + "train_speed(iter/s)": 0.202994 + }, + { + "acc": 0.77911134, + "epoch": 0.41738724682482975, + "grad_norm": 6.78125, + "learning_rate": 9.185739940582885e-06, + "loss": 0.82526703, + "memory(GiB)": 135.77, + "step": 17890, + "train_speed(iter/s)": 0.203057 + }, + { + "acc": 0.77189975, + "epoch": 0.41762055439711865, + "grad_norm": 7.34375, + "learning_rate": 9.184706342257835e-06, + "loss": 0.80744982, + "memory(GiB)": 135.77, + "step": 17900, + "train_speed(iter/s)": 0.20312 + }, + { + "acc": 0.77462916, + "epoch": 0.41785386196940755, + "grad_norm": 6.46875, + "learning_rate": 9.183672146567171e-06, + "loss": 0.81883631, + "memory(GiB)": 135.77, + "step": 17910, + "train_speed(iter/s)": 0.203176 + }, + { + "acc": 0.75889959, + "epoch": 0.41808716954169645, + "grad_norm": 4.78125, + "learning_rate": 9.182637353658523e-06, + "loss": 0.85708733, + "memory(GiB)": 135.77, + "step": 17920, + "train_speed(iter/s)": 0.203236 + }, + { + "acc": 0.77901268, + "epoch": 0.41832047711398535, + "grad_norm": 11.125, + "learning_rate": 9.181601963679607e-06, + "loss": 0.80118008, + "memory(GiB)": 135.77, + "step": 17930, + "train_speed(iter/s)": 0.203298 + }, + { + "acc": 0.76691198, + "epoch": 0.41855378468627424, + "grad_norm": 6.09375, + "learning_rate": 9.180565976778226e-06, + "loss": 0.82437811, + "memory(GiB)": 135.77, + "step": 17940, + "train_speed(iter/s)": 0.203358 + }, + { + "acc": 0.79058146, + "epoch": 0.41878709225856314, + "grad_norm": 5.59375, + "learning_rate": 9.179529393102265e-06, + "loss": 0.75918694, + "memory(GiB)": 135.77, + "step": 17950, + "train_speed(iter/s)": 0.203415 + }, + { + "acc": 0.78140459, + "epoch": 0.419020399830852, + "grad_norm": 4.46875, + "learning_rate": 9.1784922127997e-06, + "loss": 0.77219858, + "memory(GiB)": 135.77, + "step": 17960, + "train_speed(iter/s)": 0.203476 + }, + { + "acc": 0.76335707, + "epoch": 0.4192537074031409, + "grad_norm": 4.625, + "learning_rate": 9.177454436018584e-06, + "loss": 0.84384384, + "memory(GiB)": 135.77, + "step": 17970, + "train_speed(iter/s)": 0.203534 + }, + { + "acc": 0.77697759, + "epoch": 0.4194870149754298, + "grad_norm": 6.5625, + "learning_rate": 9.17641606290706e-06, + "loss": 0.85184278, + "memory(GiB)": 135.77, + "step": 17980, + "train_speed(iter/s)": 0.203595 + }, + { + "acc": 0.75867949, + "epoch": 0.4197203225477187, + "grad_norm": 7.40625, + "learning_rate": 9.175377093613359e-06, + "loss": 0.84790859, + "memory(GiB)": 135.77, + "step": 17990, + "train_speed(iter/s)": 0.203654 + }, + { + "acc": 0.77024422, + "epoch": 0.4199536301200076, + "grad_norm": 6.71875, + "learning_rate": 9.174337528285787e-06, + "loss": 0.83740625, + "memory(GiB)": 135.77, + "step": 18000, + "train_speed(iter/s)": 0.203712 + }, + { + "epoch": 0.4199536301200076, + "eval_acc": 0.7375597958545385, + "eval_loss": 0.8316662907600403, + "eval_runtime": 1270.1578, + "eval_samples_per_second": 28.336, + "eval_steps_per_second": 14.168, + "step": 18000 + }, + { + "acc": 0.75532227, + "epoch": 0.4201869376922965, + "grad_norm": 4.65625, + "learning_rate": 9.173297367072748e-06, + "loss": 0.87728586, + "memory(GiB)": 135.77, + "step": 18010, + "train_speed(iter/s)": 0.200842 + }, + { + "acc": 0.78155527, + "epoch": 0.4204202452645854, + "grad_norm": 6.9375, + "learning_rate": 9.172256610122721e-06, + "loss": 0.79191427, + "memory(GiB)": 135.77, + "step": 18020, + "train_speed(iter/s)": 0.200902 + }, + { + "acc": 0.7721571, + "epoch": 0.4206535528368743, + "grad_norm": 7.78125, + "learning_rate": 9.171215257584277e-06, + "loss": 0.82012405, + "memory(GiB)": 135.77, + "step": 18030, + "train_speed(iter/s)": 0.200964 + }, + { + "acc": 0.74781427, + "epoch": 0.4208868604091632, + "grad_norm": 5.1875, + "learning_rate": 9.170173309606063e-06, + "loss": 0.92548695, + "memory(GiB)": 135.77, + "step": 18040, + "train_speed(iter/s)": 0.201023 + }, + { + "acc": 0.77438364, + "epoch": 0.421120167981452, + "grad_norm": 5.3125, + "learning_rate": 9.169130766336824e-06, + "loss": 0.8166708, + "memory(GiB)": 135.77, + "step": 18050, + "train_speed(iter/s)": 0.201079 + }, + { + "acc": 0.76485076, + "epoch": 0.4213534755537409, + "grad_norm": 4.90625, + "learning_rate": 9.168087627925377e-06, + "loss": 0.85983076, + "memory(GiB)": 135.77, + "step": 18060, + "train_speed(iter/s)": 0.201133 + }, + { + "acc": 0.77190371, + "epoch": 0.4215867831260298, + "grad_norm": 6.40625, + "learning_rate": 9.167043894520633e-06, + "loss": 0.81317501, + "memory(GiB)": 135.77, + "step": 18070, + "train_speed(iter/s)": 0.201191 + }, + { + "acc": 0.77723899, + "epoch": 0.4218200906983187, + "grad_norm": 4.6875, + "learning_rate": 9.165999566271584e-06, + "loss": 0.81175861, + "memory(GiB)": 135.77, + "step": 18080, + "train_speed(iter/s)": 0.201248 + }, + { + "acc": 0.77668056, + "epoch": 0.4220533982706076, + "grad_norm": 5.8125, + "learning_rate": 9.164954643327306e-06, + "loss": 0.79176364, + "memory(GiB)": 135.77, + "step": 18090, + "train_speed(iter/s)": 0.201302 + }, + { + "acc": 0.78481464, + "epoch": 0.4222867058428965, + "grad_norm": 4.5625, + "learning_rate": 9.163909125836965e-06, + "loss": 0.7773984, + "memory(GiB)": 135.77, + "step": 18100, + "train_speed(iter/s)": 0.201359 + }, + { + "acc": 0.77743702, + "epoch": 0.4225200134151854, + "grad_norm": 6.1875, + "learning_rate": 9.162863013949803e-06, + "loss": 0.7793623, + "memory(GiB)": 135.77, + "step": 18110, + "train_speed(iter/s)": 0.201417 + }, + { + "acc": 0.75463772, + "epoch": 0.4227533209874743, + "grad_norm": 5.96875, + "learning_rate": 9.161816307815157e-06, + "loss": 0.88482685, + "memory(GiB)": 135.77, + "step": 18120, + "train_speed(iter/s)": 0.201472 + }, + { + "acc": 0.78212109, + "epoch": 0.4229866285597632, + "grad_norm": 5.53125, + "learning_rate": 9.160769007582441e-06, + "loss": 0.78088417, + "memory(GiB)": 135.77, + "step": 18130, + "train_speed(iter/s)": 0.201529 + }, + { + "acc": 0.77992072, + "epoch": 0.4232199361320521, + "grad_norm": 5.21875, + "learning_rate": 9.15972111340116e-06, + "loss": 0.80578728, + "memory(GiB)": 135.77, + "step": 18140, + "train_speed(iter/s)": 0.201586 + }, + { + "acc": 0.75823526, + "epoch": 0.42345324370434095, + "grad_norm": 7.90625, + "learning_rate": 9.158672625420894e-06, + "loss": 0.88358002, + "memory(GiB)": 135.77, + "step": 18150, + "train_speed(iter/s)": 0.201641 + }, + { + "acc": 0.78155651, + "epoch": 0.42368655127662985, + "grad_norm": 6.40625, + "learning_rate": 9.157623543791323e-06, + "loss": 0.7862711, + "memory(GiB)": 135.77, + "step": 18160, + "train_speed(iter/s)": 0.201697 + }, + { + "acc": 0.77112303, + "epoch": 0.42391985884891875, + "grad_norm": 6.96875, + "learning_rate": 9.156573868662197e-06, + "loss": 0.83190708, + "memory(GiB)": 135.77, + "step": 18170, + "train_speed(iter/s)": 0.201751 + }, + { + "acc": 0.78510017, + "epoch": 0.42415316642120765, + "grad_norm": 5.3125, + "learning_rate": 9.155523600183359e-06, + "loss": 0.75841799, + "memory(GiB)": 135.77, + "step": 18180, + "train_speed(iter/s)": 0.201805 + }, + { + "acc": 0.76126904, + "epoch": 0.42438647399349655, + "grad_norm": 6.5625, + "learning_rate": 9.154472738504735e-06, + "loss": 0.84513931, + "memory(GiB)": 135.77, + "step": 18190, + "train_speed(iter/s)": 0.201858 + }, + { + "acc": 0.77650328, + "epoch": 0.42461978156578545, + "grad_norm": 5.09375, + "learning_rate": 9.153421283776334e-06, + "loss": 0.80113697, + "memory(GiB)": 135.77, + "step": 18200, + "train_speed(iter/s)": 0.201916 + }, + { + "acc": 0.78944378, + "epoch": 0.42485308913807435, + "grad_norm": 4.0625, + "learning_rate": 9.152369236148252e-06, + "loss": 0.76571293, + "memory(GiB)": 135.77, + "step": 18210, + "train_speed(iter/s)": 0.201971 + }, + { + "acc": 0.77950716, + "epoch": 0.42508639671036325, + "grad_norm": 5.1875, + "learning_rate": 9.151316595770665e-06, + "loss": 0.78431196, + "memory(GiB)": 135.77, + "step": 18220, + "train_speed(iter/s)": 0.202028 + }, + { + "acc": 0.75776768, + "epoch": 0.42531970428265214, + "grad_norm": 6.96875, + "learning_rate": 9.150263362793844e-06, + "loss": 0.88814754, + "memory(GiB)": 135.77, + "step": 18230, + "train_speed(iter/s)": 0.20209 + }, + { + "acc": 0.7760685, + "epoch": 0.42555301185494104, + "grad_norm": 4.65625, + "learning_rate": 9.14920953736813e-06, + "loss": 0.81063766, + "memory(GiB)": 135.77, + "step": 18240, + "train_speed(iter/s)": 0.202147 + }, + { + "acc": 0.77113094, + "epoch": 0.4257863194272299, + "grad_norm": 5.03125, + "learning_rate": 9.148155119643963e-06, + "loss": 0.82983465, + "memory(GiB)": 135.77, + "step": 18250, + "train_speed(iter/s)": 0.202206 + }, + { + "acc": 0.76979413, + "epoch": 0.4260196269995188, + "grad_norm": 4.9375, + "learning_rate": 9.147100109771856e-06, + "loss": 0.83212032, + "memory(GiB)": 135.77, + "step": 18260, + "train_speed(iter/s)": 0.202259 + }, + { + "acc": 0.7747787, + "epoch": 0.4262529345718077, + "grad_norm": 4.78125, + "learning_rate": 9.146044507902411e-06, + "loss": 0.80783243, + "memory(GiB)": 135.77, + "step": 18270, + "train_speed(iter/s)": 0.202315 + }, + { + "acc": 0.78634143, + "epoch": 0.4264862421440966, + "grad_norm": 7.71875, + "learning_rate": 9.144988314186321e-06, + "loss": 0.76813116, + "memory(GiB)": 135.77, + "step": 18280, + "train_speed(iter/s)": 0.202366 + }, + { + "acc": 0.76954098, + "epoch": 0.4267195497163855, + "grad_norm": 5.75, + "learning_rate": 9.143931528774351e-06, + "loss": 0.82664013, + "memory(GiB)": 135.77, + "step": 18290, + "train_speed(iter/s)": 0.202423 + }, + { + "acc": 0.74711275, + "epoch": 0.4269528572886744, + "grad_norm": 5.65625, + "learning_rate": 9.14287415181736e-06, + "loss": 0.90971508, + "memory(GiB)": 135.77, + "step": 18300, + "train_speed(iter/s)": 0.202481 + }, + { + "acc": 0.75965652, + "epoch": 0.4271861648609633, + "grad_norm": 5.6875, + "learning_rate": 9.141816183466286e-06, + "loss": 0.85946274, + "memory(GiB)": 135.77, + "step": 18310, + "train_speed(iter/s)": 0.202539 + }, + { + "acc": 0.76663742, + "epoch": 0.4274194724332522, + "grad_norm": 5.5625, + "learning_rate": 9.140757623872156e-06, + "loss": 0.86080341, + "memory(GiB)": 135.77, + "step": 18320, + "train_speed(iter/s)": 0.202596 + }, + { + "acc": 0.76160979, + "epoch": 0.4276527800055411, + "grad_norm": 4.34375, + "learning_rate": 9.139698473186079e-06, + "loss": 0.85901699, + "memory(GiB)": 135.77, + "step": 18330, + "train_speed(iter/s)": 0.202652 + }, + { + "acc": 0.77596149, + "epoch": 0.4278860875778299, + "grad_norm": 5.03125, + "learning_rate": 9.138638731559246e-06, + "loss": 0.80930233, + "memory(GiB)": 135.77, + "step": 18340, + "train_speed(iter/s)": 0.20271 + }, + { + "acc": 0.78887205, + "epoch": 0.4281193951501188, + "grad_norm": 5.0, + "learning_rate": 9.137578399142936e-06, + "loss": 0.73909388, + "memory(GiB)": 135.77, + "step": 18350, + "train_speed(iter/s)": 0.202766 + }, + { + "acc": 0.77916021, + "epoch": 0.4283527027224077, + "grad_norm": 5.0, + "learning_rate": 9.136517476088513e-06, + "loss": 0.80833597, + "memory(GiB)": 135.77, + "step": 18360, + "train_speed(iter/s)": 0.202821 + }, + { + "acc": 0.77610526, + "epoch": 0.4285860102946966, + "grad_norm": 6.5625, + "learning_rate": 9.135455962547422e-06, + "loss": 0.80960493, + "memory(GiB)": 135.77, + "step": 18370, + "train_speed(iter/s)": 0.202876 + }, + { + "acc": 0.75241766, + "epoch": 0.4288193178669855, + "grad_norm": 6.25, + "learning_rate": 9.134393858671193e-06, + "loss": 0.87778807, + "memory(GiB)": 135.77, + "step": 18380, + "train_speed(iter/s)": 0.202933 + }, + { + "acc": 0.7702713, + "epoch": 0.4290526254392744, + "grad_norm": 7.0, + "learning_rate": 9.13333116461144e-06, + "loss": 0.82612343, + "memory(GiB)": 135.77, + "step": 18390, + "train_speed(iter/s)": 0.20299 + }, + { + "acc": 0.78864408, + "epoch": 0.4292859330115633, + "grad_norm": 5.15625, + "learning_rate": 9.132267880519867e-06, + "loss": 0.75824766, + "memory(GiB)": 135.77, + "step": 18400, + "train_speed(iter/s)": 0.203043 + }, + { + "acc": 0.76866798, + "epoch": 0.4295192405838522, + "grad_norm": 6.1875, + "learning_rate": 9.131204006548253e-06, + "loss": 0.83031731, + "memory(GiB)": 135.77, + "step": 18410, + "train_speed(iter/s)": 0.203101 + }, + { + "acc": 0.7711411, + "epoch": 0.4297525481561411, + "grad_norm": 6.90625, + "learning_rate": 9.130139542848468e-06, + "loss": 0.83066397, + "memory(GiB)": 135.77, + "step": 18420, + "train_speed(iter/s)": 0.203161 + }, + { + "acc": 0.76493034, + "epoch": 0.42998585572843, + "grad_norm": 5.75, + "learning_rate": 9.129074489572463e-06, + "loss": 0.85996132, + "memory(GiB)": 135.77, + "step": 18430, + "train_speed(iter/s)": 0.203214 + }, + { + "acc": 0.76129556, + "epoch": 0.43021916330071885, + "grad_norm": 4.15625, + "learning_rate": 9.128008846872273e-06, + "loss": 0.87012196, + "memory(GiB)": 135.77, + "step": 18440, + "train_speed(iter/s)": 0.203266 + }, + { + "acc": 0.77166157, + "epoch": 0.43045247087300775, + "grad_norm": 9.5625, + "learning_rate": 9.126942614900021e-06, + "loss": 0.80364647, + "memory(GiB)": 135.77, + "step": 18450, + "train_speed(iter/s)": 0.203324 + }, + { + "acc": 0.76030464, + "epoch": 0.43068577844529665, + "grad_norm": 4.1875, + "learning_rate": 9.125875793807908e-06, + "loss": 0.88488598, + "memory(GiB)": 135.77, + "step": 18460, + "train_speed(iter/s)": 0.203381 + }, + { + "acc": 0.77922316, + "epoch": 0.43091908601758555, + "grad_norm": 9.4375, + "learning_rate": 9.124808383748226e-06, + "loss": 0.79430113, + "memory(GiB)": 135.77, + "step": 18470, + "train_speed(iter/s)": 0.203433 + }, + { + "acc": 0.76354847, + "epoch": 0.43115239358987445, + "grad_norm": 7.46875, + "learning_rate": 9.123740384873343e-06, + "loss": 0.87230701, + "memory(GiB)": 135.77, + "step": 18480, + "train_speed(iter/s)": 0.203489 + }, + { + "acc": 0.76531634, + "epoch": 0.43138570116216335, + "grad_norm": 6.3125, + "learning_rate": 9.122671797335719e-06, + "loss": 0.86186409, + "memory(GiB)": 135.77, + "step": 18490, + "train_speed(iter/s)": 0.203546 + }, + { + "acc": 0.76710591, + "epoch": 0.43161900873445225, + "grad_norm": 5.96875, + "learning_rate": 9.121602621287892e-06, + "loss": 0.85632124, + "memory(GiB)": 135.77, + "step": 18500, + "train_speed(iter/s)": 0.203601 + }, + { + "epoch": 0.43161900873445225, + "eval_acc": 0.737777431866236, + "eval_loss": 0.8302583694458008, + "eval_runtime": 1269.9297, + "eval_samples_per_second": 28.341, + "eval_steps_per_second": 14.171, + "step": 18500 + }, + { + "acc": 0.79308367, + "epoch": 0.43185231630674115, + "grad_norm": 5.53125, + "learning_rate": 9.120532856882491e-06, + "loss": 0.74874711, + "memory(GiB)": 135.77, + "step": 18510, + "train_speed(iter/s)": 0.200815 + }, + { + "acc": 0.77777281, + "epoch": 0.43208562387903005, + "grad_norm": 5.375, + "learning_rate": 9.119462504272221e-06, + "loss": 0.8051053, + "memory(GiB)": 135.77, + "step": 18520, + "train_speed(iter/s)": 0.200868 + }, + { + "acc": 0.74930582, + "epoch": 0.43231893145131894, + "grad_norm": 5.9375, + "learning_rate": 9.118391563609875e-06, + "loss": 0.8819828, + "memory(GiB)": 135.77, + "step": 18530, + "train_speed(iter/s)": 0.200927 + }, + { + "acc": 0.77309723, + "epoch": 0.4325522390236078, + "grad_norm": 6.34375, + "learning_rate": 9.117320035048329e-06, + "loss": 0.82186155, + "memory(GiB)": 135.77, + "step": 18540, + "train_speed(iter/s)": 0.200984 + }, + { + "acc": 0.77171998, + "epoch": 0.4327855465958967, + "grad_norm": 6.65625, + "learning_rate": 9.116247918740544e-06, + "loss": 0.80551882, + "memory(GiB)": 135.77, + "step": 18550, + "train_speed(iter/s)": 0.201042 + }, + { + "acc": 0.76656771, + "epoch": 0.4330188541681856, + "grad_norm": 4.59375, + "learning_rate": 9.115175214839565e-06, + "loss": 0.87863846, + "memory(GiB)": 135.77, + "step": 18560, + "train_speed(iter/s)": 0.201095 + }, + { + "acc": 0.78267555, + "epoch": 0.4332521617404745, + "grad_norm": 5.46875, + "learning_rate": 9.114101923498519e-06, + "loss": 0.77156582, + "memory(GiB)": 135.77, + "step": 18570, + "train_speed(iter/s)": 0.201152 + }, + { + "acc": 0.76703129, + "epoch": 0.4334854693127634, + "grad_norm": 8.625, + "learning_rate": 9.113028044870619e-06, + "loss": 0.84351616, + "memory(GiB)": 135.77, + "step": 18580, + "train_speed(iter/s)": 0.201206 + }, + { + "acc": 0.76306887, + "epoch": 0.4337187768850523, + "grad_norm": 5.625, + "learning_rate": 9.11195357910916e-06, + "loss": 0.86529598, + "memory(GiB)": 135.77, + "step": 18590, + "train_speed(iter/s)": 0.201258 + }, + { + "acc": 0.77179461, + "epoch": 0.4339520844573412, + "grad_norm": 5.03125, + "learning_rate": 9.110878526367523e-06, + "loss": 0.8225605, + "memory(GiB)": 135.77, + "step": 18600, + "train_speed(iter/s)": 0.201317 + }, + { + "acc": 0.77156925, + "epoch": 0.4341853920296301, + "grad_norm": 6.03125, + "learning_rate": 9.10980288679917e-06, + "loss": 0.81905994, + "memory(GiB)": 135.77, + "step": 18610, + "train_speed(iter/s)": 0.201374 + }, + { + "acc": 0.76532478, + "epoch": 0.434418699601919, + "grad_norm": 8.8125, + "learning_rate": 9.10872666055765e-06, + "loss": 0.85075331, + "memory(GiB)": 135.77, + "step": 18620, + "train_speed(iter/s)": 0.201432 + }, + { + "acc": 0.76773767, + "epoch": 0.4346520071742078, + "grad_norm": 6.34375, + "learning_rate": 9.107649847796591e-06, + "loss": 0.8390007, + "memory(GiB)": 135.77, + "step": 18630, + "train_speed(iter/s)": 0.201484 + }, + { + "acc": 0.76637259, + "epoch": 0.4348853147464967, + "grad_norm": 7.53125, + "learning_rate": 9.10657244866971e-06, + "loss": 0.83106155, + "memory(GiB)": 135.77, + "step": 18640, + "train_speed(iter/s)": 0.201536 + }, + { + "acc": 0.76556454, + "epoch": 0.4351186223187856, + "grad_norm": 6.5625, + "learning_rate": 9.105494463330805e-06, + "loss": 0.82924232, + "memory(GiB)": 135.77, + "step": 18650, + "train_speed(iter/s)": 0.201594 + }, + { + "acc": 0.76742105, + "epoch": 0.4353519298910745, + "grad_norm": 4.0, + "learning_rate": 9.104415891933757e-06, + "loss": 0.83789377, + "memory(GiB)": 135.77, + "step": 18660, + "train_speed(iter/s)": 0.20165 + }, + { + "acc": 0.77874622, + "epoch": 0.4355852374633634, + "grad_norm": 5.1875, + "learning_rate": 9.103336734632536e-06, + "loss": 0.78959827, + "memory(GiB)": 135.77, + "step": 18670, + "train_speed(iter/s)": 0.2017 + }, + { + "acc": 0.7700861, + "epoch": 0.4358185450356523, + "grad_norm": 4.5, + "learning_rate": 9.102256991581185e-06, + "loss": 0.82032681, + "memory(GiB)": 135.77, + "step": 18680, + "train_speed(iter/s)": 0.201755 + }, + { + "acc": 0.76898155, + "epoch": 0.4360518526079412, + "grad_norm": 5.65625, + "learning_rate": 9.101176662933842e-06, + "loss": 0.85672741, + "memory(GiB)": 135.77, + "step": 18690, + "train_speed(iter/s)": 0.201813 + }, + { + "acc": 0.76275616, + "epoch": 0.4362851601802301, + "grad_norm": 5.09375, + "learning_rate": 9.10009574884472e-06, + "loss": 0.86024952, + "memory(GiB)": 135.77, + "step": 18700, + "train_speed(iter/s)": 0.201867 + }, + { + "acc": 0.76601181, + "epoch": 0.436518467752519, + "grad_norm": 6.375, + "learning_rate": 9.099014249468124e-06, + "loss": 0.86094589, + "memory(GiB)": 135.77, + "step": 18710, + "train_speed(iter/s)": 0.201923 + }, + { + "acc": 0.77703285, + "epoch": 0.4367517753248079, + "grad_norm": 7.375, + "learning_rate": 9.097932164958432e-06, + "loss": 0.79216099, + "memory(GiB)": 135.77, + "step": 18720, + "train_speed(iter/s)": 0.201982 + }, + { + "acc": 0.76565948, + "epoch": 0.43698508289709675, + "grad_norm": 5.28125, + "learning_rate": 9.096849495470113e-06, + "loss": 0.86634254, + "memory(GiB)": 135.77, + "step": 18730, + "train_speed(iter/s)": 0.202039 + }, + { + "acc": 0.77021027, + "epoch": 0.43721839046938565, + "grad_norm": 5.5, + "learning_rate": 9.095766241157721e-06, + "loss": 0.81364603, + "memory(GiB)": 135.77, + "step": 18740, + "train_speed(iter/s)": 0.202094 + }, + { + "acc": 0.78136158, + "epoch": 0.43745169804167455, + "grad_norm": 4.8125, + "learning_rate": 9.094682402175887e-06, + "loss": 0.79765949, + "memory(GiB)": 135.77, + "step": 18750, + "train_speed(iter/s)": 0.202146 + }, + { + "acc": 0.76939726, + "epoch": 0.43768500561396345, + "grad_norm": 9.6875, + "learning_rate": 9.093597978679329e-06, + "loss": 0.84548798, + "memory(GiB)": 135.77, + "step": 18760, + "train_speed(iter/s)": 0.202202 + }, + { + "acc": 0.77349901, + "epoch": 0.43791831318625235, + "grad_norm": 7.65625, + "learning_rate": 9.09251297082285e-06, + "loss": 0.81046124, + "memory(GiB)": 135.77, + "step": 18770, + "train_speed(iter/s)": 0.202258 + }, + { + "acc": 0.76422777, + "epoch": 0.43815162075854125, + "grad_norm": 4.5, + "learning_rate": 9.091427378761333e-06, + "loss": 0.86868763, + "memory(GiB)": 135.77, + "step": 18780, + "train_speed(iter/s)": 0.202317 + }, + { + "acc": 0.77027721, + "epoch": 0.43838492833083015, + "grad_norm": 5.15625, + "learning_rate": 9.090341202649746e-06, + "loss": 0.84670544, + "memory(GiB)": 135.77, + "step": 18790, + "train_speed(iter/s)": 0.202371 + }, + { + "acc": 0.76263161, + "epoch": 0.43861823590311905, + "grad_norm": 5.9375, + "learning_rate": 9.08925444264314e-06, + "loss": 0.86874142, + "memory(GiB)": 135.77, + "step": 18800, + "train_speed(iter/s)": 0.202424 + }, + { + "acc": 0.75490875, + "epoch": 0.43885154347540795, + "grad_norm": 6.78125, + "learning_rate": 9.088167098896652e-06, + "loss": 0.89412689, + "memory(GiB)": 135.77, + "step": 18810, + "train_speed(iter/s)": 0.202481 + }, + { + "acc": 0.77074194, + "epoch": 0.4390848510476968, + "grad_norm": 6.15625, + "learning_rate": 9.087079171565496e-06, + "loss": 0.84905615, + "memory(GiB)": 135.77, + "step": 18820, + "train_speed(iter/s)": 0.202537 + }, + { + "acc": 0.76013765, + "epoch": 0.4393181586199857, + "grad_norm": 4.59375, + "learning_rate": 9.085990660804976e-06, + "loss": 0.84425535, + "memory(GiB)": 135.77, + "step": 18830, + "train_speed(iter/s)": 0.202595 + }, + { + "acc": 0.78072777, + "epoch": 0.4395514661922746, + "grad_norm": 5.96875, + "learning_rate": 9.084901566770476e-06, + "loss": 0.78052444, + "memory(GiB)": 135.77, + "step": 18840, + "train_speed(iter/s)": 0.20265 + }, + { + "acc": 0.78506041, + "epoch": 0.4397847737645635, + "grad_norm": 7.21875, + "learning_rate": 9.083811889617467e-06, + "loss": 0.76543379, + "memory(GiB)": 135.77, + "step": 18850, + "train_speed(iter/s)": 0.202704 + }, + { + "acc": 0.7516593, + "epoch": 0.4400180813368524, + "grad_norm": 5.46875, + "learning_rate": 9.082721629501494e-06, + "loss": 0.88883514, + "memory(GiB)": 135.77, + "step": 18860, + "train_speed(iter/s)": 0.202756 + }, + { + "acc": 0.77115822, + "epoch": 0.4402513889091413, + "grad_norm": 8.1875, + "learning_rate": 9.081630786578195e-06, + "loss": 0.8167778, + "memory(GiB)": 135.77, + "step": 18870, + "train_speed(iter/s)": 0.202811 + }, + { + "acc": 0.76743846, + "epoch": 0.4404846964814302, + "grad_norm": 6.125, + "learning_rate": 9.080539361003288e-06, + "loss": 0.85453176, + "memory(GiB)": 135.77, + "step": 18880, + "train_speed(iter/s)": 0.202866 + }, + { + "acc": 0.75334711, + "epoch": 0.4407180040537191, + "grad_norm": 4.375, + "learning_rate": 9.079447352932571e-06, + "loss": 0.89333754, + "memory(GiB)": 135.77, + "step": 18890, + "train_speed(iter/s)": 0.202919 + }, + { + "acc": 0.78106527, + "epoch": 0.440951311626008, + "grad_norm": 6.21875, + "learning_rate": 9.078354762521931e-06, + "loss": 0.80243101, + "memory(GiB)": 135.77, + "step": 18900, + "train_speed(iter/s)": 0.202975 + }, + { + "acc": 0.76703472, + "epoch": 0.4411846191982969, + "grad_norm": 5.8125, + "learning_rate": 9.077261589927333e-06, + "loss": 0.84479599, + "memory(GiB)": 135.77, + "step": 18910, + "train_speed(iter/s)": 0.203031 + }, + { + "acc": 0.77735329, + "epoch": 0.4414179267705857, + "grad_norm": 11.625, + "learning_rate": 9.076167835304828e-06, + "loss": 0.80349712, + "memory(GiB)": 135.77, + "step": 18920, + "train_speed(iter/s)": 0.203086 + }, + { + "acc": 0.79136057, + "epoch": 0.4416512343428746, + "grad_norm": 5.25, + "learning_rate": 9.075073498810547e-06, + "loss": 0.76573734, + "memory(GiB)": 135.77, + "step": 18930, + "train_speed(iter/s)": 0.203143 + }, + { + "acc": 0.78350835, + "epoch": 0.4418845419151635, + "grad_norm": 5.09375, + "learning_rate": 9.073978580600709e-06, + "loss": 0.78551674, + "memory(GiB)": 135.77, + "step": 18940, + "train_speed(iter/s)": 0.203194 + }, + { + "acc": 0.76640816, + "epoch": 0.4421178494874524, + "grad_norm": 5.5625, + "learning_rate": 9.072883080831611e-06, + "loss": 0.8355978, + "memory(GiB)": 135.77, + "step": 18950, + "train_speed(iter/s)": 0.203252 + }, + { + "acc": 0.75378032, + "epoch": 0.4423511570597413, + "grad_norm": 6.25, + "learning_rate": 9.071786999659638e-06, + "loss": 0.88689022, + "memory(GiB)": 135.77, + "step": 18960, + "train_speed(iter/s)": 0.203313 + }, + { + "acc": 0.74607992, + "epoch": 0.4425844646320302, + "grad_norm": 31.375, + "learning_rate": 9.070690337241252e-06, + "loss": 0.98085604, + "memory(GiB)": 135.77, + "step": 18970, + "train_speed(iter/s)": 0.203368 + }, + { + "acc": 0.78964634, + "epoch": 0.4428177722043191, + "grad_norm": 5.71875, + "learning_rate": 9.069593093733004e-06, + "loss": 0.76366062, + "memory(GiB)": 135.77, + "step": 18980, + "train_speed(iter/s)": 0.203426 + }, + { + "acc": 0.77851391, + "epoch": 0.443051079776608, + "grad_norm": 7.03125, + "learning_rate": 9.068495269291524e-06, + "loss": 0.81780148, + "memory(GiB)": 135.77, + "step": 18990, + "train_speed(iter/s)": 0.203481 + }, + { + "acc": 0.75752182, + "epoch": 0.4432843873488969, + "grad_norm": 6.84375, + "learning_rate": 9.067396864073527e-06, + "loss": 0.87670937, + "memory(GiB)": 135.77, + "step": 19000, + "train_speed(iter/s)": 0.203539 + }, + { + "epoch": 0.4432843873488969, + "eval_acc": 0.7378800843191412, + "eval_loss": 0.8296611905097961, + "eval_runtime": 1269.212, + "eval_samples_per_second": 28.357, + "eval_steps_per_second": 14.179, + "step": 19000 + }, + { + "acc": 0.77129135, + "epoch": 0.4435176949211858, + "grad_norm": 4.09375, + "learning_rate": 9.066297878235808e-06, + "loss": 0.82974644, + "memory(GiB)": 135.77, + "step": 19010, + "train_speed(iter/s)": 0.200829 + }, + { + "acc": 0.77290907, + "epoch": 0.44375100249347466, + "grad_norm": 3.796875, + "learning_rate": 9.065198311935248e-06, + "loss": 0.82878008, + "memory(GiB)": 135.77, + "step": 19020, + "train_speed(iter/s)": 0.200885 + }, + { + "acc": 0.77206917, + "epoch": 0.44398431006576355, + "grad_norm": 7.4375, + "learning_rate": 9.06409816532881e-06, + "loss": 0.84106894, + "memory(GiB)": 135.77, + "step": 19030, + "train_speed(iter/s)": 0.20094 + }, + { + "acc": 0.75648594, + "epoch": 0.44421761763805245, + "grad_norm": 4.9375, + "learning_rate": 9.06299743857354e-06, + "loss": 0.90824041, + "memory(GiB)": 135.77, + "step": 19040, + "train_speed(iter/s)": 0.200992 + }, + { + "acc": 0.76354513, + "epoch": 0.44445092521034135, + "grad_norm": 4.8125, + "learning_rate": 9.061896131826566e-06, + "loss": 0.86085148, + "memory(GiB)": 135.77, + "step": 19050, + "train_speed(iter/s)": 0.201042 + }, + { + "acc": 0.76192708, + "epoch": 0.44468423278263025, + "grad_norm": 6.8125, + "learning_rate": 9.0607942452451e-06, + "loss": 0.8651083, + "memory(GiB)": 135.77, + "step": 19060, + "train_speed(iter/s)": 0.201098 + }, + { + "acc": 0.76140442, + "epoch": 0.44491754035491915, + "grad_norm": 6.28125, + "learning_rate": 9.059691778986433e-06, + "loss": 0.85103846, + "memory(GiB)": 135.77, + "step": 19070, + "train_speed(iter/s)": 0.201155 + }, + { + "acc": 0.76223292, + "epoch": 0.44515084792720805, + "grad_norm": 8.5, + "learning_rate": 9.058588733207945e-06, + "loss": 0.83704605, + "memory(GiB)": 135.77, + "step": 19080, + "train_speed(iter/s)": 0.201211 + }, + { + "acc": 0.76429772, + "epoch": 0.44538415549949695, + "grad_norm": 6.09375, + "learning_rate": 9.057485108067094e-06, + "loss": 0.86491184, + "memory(GiB)": 135.77, + "step": 19090, + "train_speed(iter/s)": 0.201266 + }, + { + "acc": 0.77062263, + "epoch": 0.44561746307178585, + "grad_norm": 4.875, + "learning_rate": 9.056380903721424e-06, + "loss": 0.81978521, + "memory(GiB)": 135.77, + "step": 19100, + "train_speed(iter/s)": 0.201321 + }, + { + "acc": 0.77301741, + "epoch": 0.4458507706440747, + "grad_norm": 5.78125, + "learning_rate": 9.055276120328557e-06, + "loss": 0.81939163, + "memory(GiB)": 135.77, + "step": 19110, + "train_speed(iter/s)": 0.201375 + }, + { + "acc": 0.77150536, + "epoch": 0.4460840782163636, + "grad_norm": 4.4375, + "learning_rate": 9.054170758046204e-06, + "loss": 0.81701164, + "memory(GiB)": 135.77, + "step": 19120, + "train_speed(iter/s)": 0.201429 + }, + { + "acc": 0.77243052, + "epoch": 0.4463173857886525, + "grad_norm": 5.375, + "learning_rate": 9.05306481703215e-06, + "loss": 0.83213739, + "memory(GiB)": 135.77, + "step": 19130, + "train_speed(iter/s)": 0.201484 + }, + { + "acc": 0.76942472, + "epoch": 0.4465506933609414, + "grad_norm": 6.0625, + "learning_rate": 9.051958297444272e-06, + "loss": 0.8267148, + "memory(GiB)": 135.77, + "step": 19140, + "train_speed(iter/s)": 0.201538 + }, + { + "acc": 0.76962104, + "epoch": 0.4467840009332303, + "grad_norm": 5.28125, + "learning_rate": 9.050851199440524e-06, + "loss": 0.81351061, + "memory(GiB)": 135.77, + "step": 19150, + "train_speed(iter/s)": 0.201591 + }, + { + "acc": 0.7752852, + "epoch": 0.4470173085055192, + "grad_norm": 5.25, + "learning_rate": 9.049743523178945e-06, + "loss": 0.81729002, + "memory(GiB)": 135.77, + "step": 19160, + "train_speed(iter/s)": 0.201644 + }, + { + "acc": 0.76699119, + "epoch": 0.4472506160778081, + "grad_norm": 4.6875, + "learning_rate": 9.048635268817653e-06, + "loss": 0.85345955, + "memory(GiB)": 135.77, + "step": 19170, + "train_speed(iter/s)": 0.201701 + }, + { + "acc": 0.76262298, + "epoch": 0.447483923650097, + "grad_norm": 8.9375, + "learning_rate": 9.047526436514854e-06, + "loss": 0.85194921, + "memory(GiB)": 135.77, + "step": 19180, + "train_speed(iter/s)": 0.201754 + }, + { + "acc": 0.76009088, + "epoch": 0.4477172312223859, + "grad_norm": 4.90625, + "learning_rate": 9.04641702642883e-06, + "loss": 0.86009045, + "memory(GiB)": 135.77, + "step": 19190, + "train_speed(iter/s)": 0.201813 + }, + { + "acc": 0.77364101, + "epoch": 0.4479505387946748, + "grad_norm": 4.90625, + "learning_rate": 9.045307038717954e-06, + "loss": 0.81192675, + "memory(GiB)": 135.77, + "step": 19200, + "train_speed(iter/s)": 0.201865 + }, + { + "acc": 0.77397594, + "epoch": 0.4481838463669636, + "grad_norm": 6.625, + "learning_rate": 9.044196473540672e-06, + "loss": 0.8263483, + "memory(GiB)": 135.77, + "step": 19210, + "train_speed(iter/s)": 0.201919 + }, + { + "acc": 0.753967, + "epoch": 0.4484171539392525, + "grad_norm": 5.0625, + "learning_rate": 9.043085331055516e-06, + "loss": 0.89315586, + "memory(GiB)": 135.77, + "step": 19220, + "train_speed(iter/s)": 0.201975 + }, + { + "acc": 0.78117127, + "epoch": 0.4486504615115414, + "grad_norm": 3.390625, + "learning_rate": 9.041973611421106e-06, + "loss": 0.77627773, + "memory(GiB)": 135.77, + "step": 19230, + "train_speed(iter/s)": 0.20203 + }, + { + "acc": 0.76547561, + "epoch": 0.4488837690838303, + "grad_norm": 5.5625, + "learning_rate": 9.040861314796137e-06, + "loss": 0.84824905, + "memory(GiB)": 135.77, + "step": 19240, + "train_speed(iter/s)": 0.20209 + }, + { + "acc": 0.79242058, + "epoch": 0.4491170766561192, + "grad_norm": 4.40625, + "learning_rate": 9.039748441339389e-06, + "loss": 0.73243713, + "memory(GiB)": 135.77, + "step": 19250, + "train_speed(iter/s)": 0.202146 + }, + { + "acc": 0.74393578, + "epoch": 0.4493503842284081, + "grad_norm": 7.0625, + "learning_rate": 9.038634991209725e-06, + "loss": 0.93040276, + "memory(GiB)": 135.77, + "step": 19260, + "train_speed(iter/s)": 0.202197 + }, + { + "acc": 0.7861661, + "epoch": 0.449583691800697, + "grad_norm": 7.78125, + "learning_rate": 9.03752096456609e-06, + "loss": 0.77835784, + "memory(GiB)": 135.77, + "step": 19270, + "train_speed(iter/s)": 0.20225 + }, + { + "acc": 0.77567, + "epoch": 0.4498169993729859, + "grad_norm": 5.0, + "learning_rate": 9.036406361567506e-06, + "loss": 0.7994051, + "memory(GiB)": 135.77, + "step": 19280, + "train_speed(iter/s)": 0.202304 + }, + { + "acc": 0.76911907, + "epoch": 0.4500503069452748, + "grad_norm": 5.5, + "learning_rate": 9.035291182373092e-06, + "loss": 0.82964163, + "memory(GiB)": 135.77, + "step": 19290, + "train_speed(iter/s)": 0.202358 + }, + { + "acc": 0.76818333, + "epoch": 0.4502836145175637, + "grad_norm": 5.65625, + "learning_rate": 9.03417542714203e-06, + "loss": 0.8365284, + "memory(GiB)": 135.77, + "step": 19300, + "train_speed(iter/s)": 0.202412 + }, + { + "acc": 0.77055397, + "epoch": 0.45051692208985256, + "grad_norm": 5.53125, + "learning_rate": 9.033059096033598e-06, + "loss": 0.82133121, + "memory(GiB)": 135.77, + "step": 19310, + "train_speed(iter/s)": 0.202465 + }, + { + "acc": 0.769734, + "epoch": 0.45075022966214146, + "grad_norm": 4.65625, + "learning_rate": 9.031942189207154e-06, + "loss": 0.81667995, + "memory(GiB)": 135.77, + "step": 19320, + "train_speed(iter/s)": 0.202515 + }, + { + "acc": 0.7695981, + "epoch": 0.45098353723443035, + "grad_norm": 5.25, + "learning_rate": 9.030824706822132e-06, + "loss": 0.83406506, + "memory(GiB)": 135.77, + "step": 19330, + "train_speed(iter/s)": 0.202565 + }, + { + "acc": 0.77217684, + "epoch": 0.45121684480671925, + "grad_norm": 5.0625, + "learning_rate": 9.029706649038055e-06, + "loss": 0.80703802, + "memory(GiB)": 135.77, + "step": 19340, + "train_speed(iter/s)": 0.202615 + }, + { + "acc": 0.76901155, + "epoch": 0.45145015237900815, + "grad_norm": 6.9375, + "learning_rate": 9.028588016014524e-06, + "loss": 0.8353384, + "memory(GiB)": 135.77, + "step": 19350, + "train_speed(iter/s)": 0.202672 + }, + { + "acc": 0.78265429, + "epoch": 0.45168345995129705, + "grad_norm": 5.53125, + "learning_rate": 9.027468807911223e-06, + "loss": 0.78769207, + "memory(GiB)": 135.77, + "step": 19360, + "train_speed(iter/s)": 0.202725 + }, + { + "acc": 0.79214249, + "epoch": 0.45191676752358595, + "grad_norm": 11.1875, + "learning_rate": 9.026349024887921e-06, + "loss": 0.72728148, + "memory(GiB)": 135.77, + "step": 19370, + "train_speed(iter/s)": 0.202779 + }, + { + "acc": 0.76945453, + "epoch": 0.45215007509587485, + "grad_norm": 5.875, + "learning_rate": 9.025228667104465e-06, + "loss": 0.83437386, + "memory(GiB)": 135.77, + "step": 19380, + "train_speed(iter/s)": 0.202835 + }, + { + "acc": 0.76589389, + "epoch": 0.45238338266816375, + "grad_norm": 5.5, + "learning_rate": 9.024107734720786e-06, + "loss": 0.85786552, + "memory(GiB)": 135.77, + "step": 19390, + "train_speed(iter/s)": 0.20289 + }, + { + "acc": 0.76294546, + "epoch": 0.4526166902404526, + "grad_norm": 6.5, + "learning_rate": 9.022986227896898e-06, + "loss": 0.8357048, + "memory(GiB)": 135.77, + "step": 19400, + "train_speed(iter/s)": 0.202945 + }, + { + "acc": 0.78395748, + "epoch": 0.4528499978127415, + "grad_norm": 7.28125, + "learning_rate": 9.021864146792894e-06, + "loss": 0.74856625, + "memory(GiB)": 135.77, + "step": 19410, + "train_speed(iter/s)": 0.203001 + }, + { + "acc": 0.78489389, + "epoch": 0.4530833053850304, + "grad_norm": 15.25, + "learning_rate": 9.02074149156895e-06, + "loss": 0.76024318, + "memory(GiB)": 135.77, + "step": 19420, + "train_speed(iter/s)": 0.203057 + }, + { + "acc": 0.75939164, + "epoch": 0.4533166129573193, + "grad_norm": 7.75, + "learning_rate": 9.019618262385328e-06, + "loss": 0.85180931, + "memory(GiB)": 135.77, + "step": 19430, + "train_speed(iter/s)": 0.203111 + }, + { + "acc": 0.78605642, + "epoch": 0.4535499205296082, + "grad_norm": 5.4375, + "learning_rate": 9.018494459402365e-06, + "loss": 0.76549592, + "memory(GiB)": 135.77, + "step": 19440, + "train_speed(iter/s)": 0.203163 + }, + { + "acc": 0.76347561, + "epoch": 0.4537832281018971, + "grad_norm": 7.0625, + "learning_rate": 9.017370082780485e-06, + "loss": 0.85260391, + "memory(GiB)": 135.77, + "step": 19450, + "train_speed(iter/s)": 0.203218 + }, + { + "acc": 0.78153772, + "epoch": 0.454016535674186, + "grad_norm": 4.8125, + "learning_rate": 9.016245132680195e-06, + "loss": 0.77672606, + "memory(GiB)": 135.77, + "step": 19460, + "train_speed(iter/s)": 0.203268 + }, + { + "acc": 0.78809853, + "epoch": 0.4542498432464749, + "grad_norm": 5.4375, + "learning_rate": 9.015119609262078e-06, + "loss": 0.74799476, + "memory(GiB)": 135.77, + "step": 19470, + "train_speed(iter/s)": 0.203321 + }, + { + "acc": 0.78086472, + "epoch": 0.4544831508187638, + "grad_norm": 6.03125, + "learning_rate": 9.013993512686803e-06, + "loss": 0.7939446, + "memory(GiB)": 135.77, + "step": 19480, + "train_speed(iter/s)": 0.203372 + }, + { + "acc": 0.77702465, + "epoch": 0.4547164583910527, + "grad_norm": 5.78125, + "learning_rate": 9.01286684311512e-06, + "loss": 0.81716118, + "memory(GiB)": 135.77, + "step": 19490, + "train_speed(iter/s)": 0.203422 + }, + { + "acc": 0.75863905, + "epoch": 0.4549497659633415, + "grad_norm": 6.78125, + "learning_rate": 9.011739600707862e-06, + "loss": 0.89850245, + "memory(GiB)": 135.77, + "step": 19500, + "train_speed(iter/s)": 0.203476 + }, + { + "epoch": 0.4549497659633415, + "eval_acc": 0.7380889123980622, + "eval_loss": 0.829414427280426, + "eval_runtime": 1270.491, + "eval_samples_per_second": 28.328, + "eval_steps_per_second": 14.165, + "step": 19500 + }, + { + "acc": 0.7731576, + "epoch": 0.4551830735356304, + "grad_norm": 5.09375, + "learning_rate": 9.01061178562594e-06, + "loss": 0.81027994, + "memory(GiB)": 135.77, + "step": 19510, + "train_speed(iter/s)": 0.200831 + }, + { + "acc": 0.76880264, + "epoch": 0.4554163811079193, + "grad_norm": 5.0625, + "learning_rate": 9.009483398030353e-06, + "loss": 0.81138697, + "memory(GiB)": 135.77, + "step": 19520, + "train_speed(iter/s)": 0.200881 + }, + { + "acc": 0.78158731, + "epoch": 0.4556496886802082, + "grad_norm": 5.8125, + "learning_rate": 9.008354438082173e-06, + "loss": 0.79674578, + "memory(GiB)": 135.77, + "step": 19530, + "train_speed(iter/s)": 0.200933 + }, + { + "acc": 0.75377884, + "epoch": 0.4558829962524971, + "grad_norm": 7.09375, + "learning_rate": 9.007224905942562e-06, + "loss": 0.89388762, + "memory(GiB)": 135.77, + "step": 19540, + "train_speed(iter/s)": 0.200988 + }, + { + "acc": 0.77003942, + "epoch": 0.456116303824786, + "grad_norm": 6.125, + "learning_rate": 9.00609480177276e-06, + "loss": 0.83401814, + "memory(GiB)": 135.77, + "step": 19550, + "train_speed(iter/s)": 0.201041 + }, + { + "acc": 0.76566763, + "epoch": 0.4563496113970749, + "grad_norm": 4.6875, + "learning_rate": 9.00496412573409e-06, + "loss": 0.86685047, + "memory(GiB)": 135.77, + "step": 19560, + "train_speed(iter/s)": 0.201093 + }, + { + "acc": 0.79012947, + "epoch": 0.4565829189693638, + "grad_norm": 4.8125, + "learning_rate": 9.003832877987952e-06, + "loss": 0.74963512, + "memory(GiB)": 135.77, + "step": 19570, + "train_speed(iter/s)": 0.201142 + }, + { + "acc": 0.76897526, + "epoch": 0.4568162265416527, + "grad_norm": 3.921875, + "learning_rate": 9.002701058695836e-06, + "loss": 0.82385035, + "memory(GiB)": 135.77, + "step": 19580, + "train_speed(iter/s)": 0.201196 + }, + { + "acc": 0.76111898, + "epoch": 0.45704953411394156, + "grad_norm": 5.375, + "learning_rate": 9.001568668019306e-06, + "loss": 0.85867405, + "memory(GiB)": 135.77, + "step": 19590, + "train_speed(iter/s)": 0.201252 + }, + { + "acc": 0.78389206, + "epoch": 0.45728284168623046, + "grad_norm": 6.125, + "learning_rate": 9.000435706120011e-06, + "loss": 0.78628263, + "memory(GiB)": 135.77, + "step": 19600, + "train_speed(iter/s)": 0.201305 + }, + { + "acc": 0.77073708, + "epoch": 0.45751614925851936, + "grad_norm": 6.28125, + "learning_rate": 8.999302173159681e-06, + "loss": 0.83480806, + "memory(GiB)": 135.77, + "step": 19610, + "train_speed(iter/s)": 0.201358 + }, + { + "acc": 0.76435275, + "epoch": 0.45774945683080825, + "grad_norm": 5.15625, + "learning_rate": 8.998168069300128e-06, + "loss": 0.86908226, + "memory(GiB)": 135.77, + "step": 19620, + "train_speed(iter/s)": 0.20141 + }, + { + "acc": 0.77386503, + "epoch": 0.45798276440309715, + "grad_norm": 4.78125, + "learning_rate": 8.997033394703246e-06, + "loss": 0.82412939, + "memory(GiB)": 135.77, + "step": 19630, + "train_speed(iter/s)": 0.201461 + }, + { + "acc": 0.75808735, + "epoch": 0.45821607197538605, + "grad_norm": 7.875, + "learning_rate": 8.995898149531005e-06, + "loss": 0.87254372, + "memory(GiB)": 135.77, + "step": 19640, + "train_speed(iter/s)": 0.201514 + }, + { + "acc": 0.77585478, + "epoch": 0.45844937954767495, + "grad_norm": 6.125, + "learning_rate": 8.994762333945465e-06, + "loss": 0.8103075, + "memory(GiB)": 135.77, + "step": 19650, + "train_speed(iter/s)": 0.20157 + }, + { + "acc": 0.76372824, + "epoch": 0.45868268711996385, + "grad_norm": 5.21875, + "learning_rate": 8.993625948108764e-06, + "loss": 0.86949158, + "memory(GiB)": 135.77, + "step": 19660, + "train_speed(iter/s)": 0.201625 + }, + { + "acc": 0.76708436, + "epoch": 0.45891599469225275, + "grad_norm": 5.65625, + "learning_rate": 8.992488992183116e-06, + "loss": 0.84779406, + "memory(GiB)": 135.77, + "step": 19670, + "train_speed(iter/s)": 0.201678 + }, + { + "acc": 0.7899591, + "epoch": 0.45914930226454165, + "grad_norm": 5.03125, + "learning_rate": 8.991351466330827e-06, + "loss": 0.7441288, + "memory(GiB)": 135.77, + "step": 19680, + "train_speed(iter/s)": 0.201731 + }, + { + "acc": 0.78236446, + "epoch": 0.4593826098368305, + "grad_norm": 4.96875, + "learning_rate": 8.990213370714274e-06, + "loss": 0.79019766, + "memory(GiB)": 135.77, + "step": 19690, + "train_speed(iter/s)": 0.201785 + }, + { + "acc": 0.7680593, + "epoch": 0.4596159174091194, + "grad_norm": 7.71875, + "learning_rate": 8.989074705495921e-06, + "loss": 0.8316782, + "memory(GiB)": 135.77, + "step": 19700, + "train_speed(iter/s)": 0.201834 + }, + { + "acc": 0.77131128, + "epoch": 0.4598492249814083, + "grad_norm": 8.875, + "learning_rate": 8.987935470838315e-06, + "loss": 0.82049313, + "memory(GiB)": 135.77, + "step": 19710, + "train_speed(iter/s)": 0.201885 + }, + { + "acc": 0.78124671, + "epoch": 0.4600825325536972, + "grad_norm": 5.71875, + "learning_rate": 8.986795666904077e-06, + "loss": 0.80133085, + "memory(GiB)": 135.77, + "step": 19720, + "train_speed(iter/s)": 0.201938 + }, + { + "acc": 0.75701828, + "epoch": 0.4603158401259861, + "grad_norm": 9.125, + "learning_rate": 8.985655293855917e-06, + "loss": 0.89116259, + "memory(GiB)": 135.77, + "step": 19730, + "train_speed(iter/s)": 0.201995 + }, + { + "acc": 0.76774426, + "epoch": 0.460549147698275, + "grad_norm": 5.34375, + "learning_rate": 8.98451435185662e-06, + "loss": 0.87877426, + "memory(GiB)": 135.77, + "step": 19740, + "train_speed(iter/s)": 0.202049 + }, + { + "acc": 0.77983117, + "epoch": 0.4607824552705639, + "grad_norm": 5.34375, + "learning_rate": 8.983372841069059e-06, + "loss": 0.79148512, + "memory(GiB)": 135.77, + "step": 19750, + "train_speed(iter/s)": 0.202102 + }, + { + "acc": 0.74972019, + "epoch": 0.4610157628428528, + "grad_norm": 5.84375, + "learning_rate": 8.98223076165618e-06, + "loss": 0.89190731, + "memory(GiB)": 135.77, + "step": 19760, + "train_speed(iter/s)": 0.202153 + }, + { + "acc": 0.78662281, + "epoch": 0.4612490704151417, + "grad_norm": 5.71875, + "learning_rate": 8.981088113781018e-06, + "loss": 0.75874639, + "memory(GiB)": 135.77, + "step": 19770, + "train_speed(iter/s)": 0.202207 + }, + { + "acc": 0.76131806, + "epoch": 0.4614823779874306, + "grad_norm": 7.21875, + "learning_rate": 8.979944897606685e-06, + "loss": 0.87373924, + "memory(GiB)": 135.77, + "step": 19780, + "train_speed(iter/s)": 0.20226 + }, + { + "acc": 0.76455545, + "epoch": 0.4617156855597194, + "grad_norm": 6.15625, + "learning_rate": 8.978801113296371e-06, + "loss": 0.87849846, + "memory(GiB)": 135.77, + "step": 19790, + "train_speed(iter/s)": 0.20231 + }, + { + "acc": 0.76590109, + "epoch": 0.4619489931320083, + "grad_norm": 5.15625, + "learning_rate": 8.977656761013357e-06, + "loss": 0.83625021, + "memory(GiB)": 135.77, + "step": 19800, + "train_speed(iter/s)": 0.202362 + }, + { + "acc": 0.75346403, + "epoch": 0.4621823007042972, + "grad_norm": 15.125, + "learning_rate": 8.976511840920994e-06, + "loss": 0.90793972, + "memory(GiB)": 135.77, + "step": 19810, + "train_speed(iter/s)": 0.202415 + }, + { + "acc": 0.77614479, + "epoch": 0.4624156082765861, + "grad_norm": 5.9375, + "learning_rate": 8.975366353182721e-06, + "loss": 0.81695967, + "memory(GiB)": 135.77, + "step": 19820, + "train_speed(iter/s)": 0.202469 + }, + { + "acc": 0.77849016, + "epoch": 0.462648915848875, + "grad_norm": 5.46875, + "learning_rate": 8.974220297962058e-06, + "loss": 0.79997635, + "memory(GiB)": 135.77, + "step": 19830, + "train_speed(iter/s)": 0.202525 + }, + { + "acc": 0.77346191, + "epoch": 0.4628822234211639, + "grad_norm": 6.0625, + "learning_rate": 8.973073675422602e-06, + "loss": 0.80886145, + "memory(GiB)": 135.77, + "step": 19840, + "train_speed(iter/s)": 0.202581 + }, + { + "acc": 0.81118984, + "epoch": 0.4631155309934528, + "grad_norm": 5.6875, + "learning_rate": 8.97192648572803e-06, + "loss": 0.68877487, + "memory(GiB)": 135.77, + "step": 19850, + "train_speed(iter/s)": 0.202632 + }, + { + "acc": 0.75902133, + "epoch": 0.4633488385657417, + "grad_norm": 4.53125, + "learning_rate": 8.970778729042109e-06, + "loss": 0.86830082, + "memory(GiB)": 135.77, + "step": 19860, + "train_speed(iter/s)": 0.202682 + }, + { + "acc": 0.75023003, + "epoch": 0.4635821461380306, + "grad_norm": 6.3125, + "learning_rate": 8.969630405528675e-06, + "loss": 0.90686493, + "memory(GiB)": 135.77, + "step": 19870, + "train_speed(iter/s)": 0.202737 + }, + { + "acc": 0.78217545, + "epoch": 0.46381545371031946, + "grad_norm": 5.875, + "learning_rate": 8.968481515351656e-06, + "loss": 0.77547541, + "memory(GiB)": 135.77, + "step": 19880, + "train_speed(iter/s)": 0.202788 + }, + { + "acc": 0.77224998, + "epoch": 0.46404876128260836, + "grad_norm": 5.4375, + "learning_rate": 8.967332058675054e-06, + "loss": 0.81753349, + "memory(GiB)": 135.77, + "step": 19890, + "train_speed(iter/s)": 0.20284 + }, + { + "acc": 0.76978784, + "epoch": 0.46428206885489726, + "grad_norm": 5.40625, + "learning_rate": 8.96618203566295e-06, + "loss": 0.83251038, + "memory(GiB)": 135.77, + "step": 19900, + "train_speed(iter/s)": 0.202894 + }, + { + "acc": 0.78358245, + "epoch": 0.46451537642718616, + "grad_norm": 5.125, + "learning_rate": 8.965031446479516e-06, + "loss": 0.77494287, + "memory(GiB)": 135.77, + "step": 19910, + "train_speed(iter/s)": 0.202944 + }, + { + "acc": 0.75092211, + "epoch": 0.46474868399947505, + "grad_norm": 6.6875, + "learning_rate": 8.963880291288992e-06, + "loss": 0.90174236, + "memory(GiB)": 135.77, + "step": 19920, + "train_speed(iter/s)": 0.202996 + }, + { + "acc": 0.76736884, + "epoch": 0.46498199157176395, + "grad_norm": 5.8125, + "learning_rate": 8.96272857025571e-06, + "loss": 0.833918, + "memory(GiB)": 135.77, + "step": 19930, + "train_speed(iter/s)": 0.203051 + }, + { + "acc": 0.75787697, + "epoch": 0.46521529914405285, + "grad_norm": 5.375, + "learning_rate": 8.961576283544076e-06, + "loss": 0.89234486, + "memory(GiB)": 135.77, + "step": 19940, + "train_speed(iter/s)": 0.203107 + }, + { + "acc": 0.77328529, + "epoch": 0.46544860671634175, + "grad_norm": 6.15625, + "learning_rate": 8.960423431318576e-06, + "loss": 0.8312933, + "memory(GiB)": 135.77, + "step": 19950, + "train_speed(iter/s)": 0.20316 + }, + { + "acc": 0.77509122, + "epoch": 0.46568191428863065, + "grad_norm": 6.59375, + "learning_rate": 8.959270013743784e-06, + "loss": 0.81753225, + "memory(GiB)": 135.77, + "step": 19960, + "train_speed(iter/s)": 0.203211 + }, + { + "acc": 0.76919918, + "epoch": 0.46591522186091955, + "grad_norm": 5.25, + "learning_rate": 8.958116030984347e-06, + "loss": 0.84067249, + "memory(GiB)": 135.77, + "step": 19970, + "train_speed(iter/s)": 0.203265 + }, + { + "acc": 0.77842779, + "epoch": 0.4661485294332084, + "grad_norm": 4.125, + "learning_rate": 8.956961483204996e-06, + "loss": 0.78261929, + "memory(GiB)": 135.77, + "step": 19980, + "train_speed(iter/s)": 0.203316 + }, + { + "acc": 0.75851555, + "epoch": 0.4663818370054973, + "grad_norm": 6.78125, + "learning_rate": 8.955806370570543e-06, + "loss": 0.87037487, + "memory(GiB)": 135.77, + "step": 19990, + "train_speed(iter/s)": 0.203366 + }, + { + "acc": 0.77085943, + "epoch": 0.4666151445777862, + "grad_norm": 4.84375, + "learning_rate": 8.954650693245882e-06, + "loss": 0.81232281, + "memory(GiB)": 135.77, + "step": 20000, + "train_speed(iter/s)": 0.203415 + }, + { + "epoch": 0.4666151445777862, + "eval_acc": 0.7384622086035567, + "eval_loss": 0.8278706669807434, + "eval_runtime": 1271.2712, + "eval_samples_per_second": 28.311, + "eval_steps_per_second": 14.156, + "step": 20000 + }, + { + "acc": 0.77572637, + "epoch": 0.4668484521500751, + "grad_norm": 5.65625, + "learning_rate": 8.953494451395979e-06, + "loss": 0.8144228, + "memory(GiB)": 135.77, + "step": 20010, + "train_speed(iter/s)": 0.200838 + }, + { + "acc": 0.79318457, + "epoch": 0.467081759722364, + "grad_norm": 9.25, + "learning_rate": 8.952337645185894e-06, + "loss": 0.74365864, + "memory(GiB)": 135.77, + "step": 20020, + "train_speed(iter/s)": 0.20089 + }, + { + "acc": 0.7678813, + "epoch": 0.4673150672946529, + "grad_norm": 4.6875, + "learning_rate": 8.951180274780758e-06, + "loss": 0.8354866, + "memory(GiB)": 135.77, + "step": 20030, + "train_speed(iter/s)": 0.20094 + }, + { + "acc": 0.76604643, + "epoch": 0.4675483748669418, + "grad_norm": 9.0, + "learning_rate": 8.950022340345786e-06, + "loss": 0.85739231, + "memory(GiB)": 135.77, + "step": 20040, + "train_speed(iter/s)": 0.200993 + }, + { + "acc": 0.76447287, + "epoch": 0.4677816824392307, + "grad_norm": 5.75, + "learning_rate": 8.948863842046272e-06, + "loss": 0.84905338, + "memory(GiB)": 135.77, + "step": 20050, + "train_speed(iter/s)": 0.201048 + }, + { + "acc": 0.77429447, + "epoch": 0.4680149900115196, + "grad_norm": 3.671875, + "learning_rate": 8.947704780047593e-06, + "loss": 0.81440744, + "memory(GiB)": 135.77, + "step": 20060, + "train_speed(iter/s)": 0.201097 + }, + { + "acc": 0.78105116, + "epoch": 0.4682482975838085, + "grad_norm": 5.40625, + "learning_rate": 8.946545154515201e-06, + "loss": 0.78914485, + "memory(GiB)": 135.77, + "step": 20070, + "train_speed(iter/s)": 0.201152 + }, + { + "acc": 0.7645546, + "epoch": 0.4684816051560973, + "grad_norm": 7.3125, + "learning_rate": 8.945384965614636e-06, + "loss": 0.87174253, + "memory(GiB)": 135.77, + "step": 20080, + "train_speed(iter/s)": 0.201205 + }, + { + "acc": 0.79997892, + "epoch": 0.4687149127283862, + "grad_norm": 5.6875, + "learning_rate": 8.944224213511514e-06, + "loss": 0.71954641, + "memory(GiB)": 135.77, + "step": 20090, + "train_speed(iter/s)": 0.20126 + }, + { + "acc": 0.762815, + "epoch": 0.4689482203006751, + "grad_norm": 8.1875, + "learning_rate": 8.943062898371531e-06, + "loss": 0.85633907, + "memory(GiB)": 135.77, + "step": 20100, + "train_speed(iter/s)": 0.201312 + }, + { + "acc": 0.76686854, + "epoch": 0.469181527872964, + "grad_norm": 13.0, + "learning_rate": 8.941901020360464e-06, + "loss": 0.82543678, + "memory(GiB)": 135.77, + "step": 20110, + "train_speed(iter/s)": 0.201361 + }, + { + "acc": 0.77254, + "epoch": 0.4694148354452529, + "grad_norm": 5.09375, + "learning_rate": 8.940738579644171e-06, + "loss": 0.80971737, + "memory(GiB)": 135.77, + "step": 20120, + "train_speed(iter/s)": 0.201415 + }, + { + "acc": 0.78281898, + "epoch": 0.4696481430175418, + "grad_norm": 6.0, + "learning_rate": 8.939575576388592e-06, + "loss": 0.78121605, + "memory(GiB)": 135.77, + "step": 20130, + "train_speed(iter/s)": 0.201468 + }, + { + "acc": 0.77683525, + "epoch": 0.4698814505898307, + "grad_norm": 5.0625, + "learning_rate": 8.938412010759743e-06, + "loss": 0.82987223, + "memory(GiB)": 135.77, + "step": 20140, + "train_speed(iter/s)": 0.20152 + }, + { + "acc": 0.78844047, + "epoch": 0.4701147581621196, + "grad_norm": 8.6875, + "learning_rate": 8.937247882923724e-06, + "loss": 0.76155491, + "memory(GiB)": 135.77, + "step": 20150, + "train_speed(iter/s)": 0.201573 + }, + { + "acc": 0.78030567, + "epoch": 0.4703480657344085, + "grad_norm": 5.9375, + "learning_rate": 8.936083193046712e-06, + "loss": 0.78899822, + "memory(GiB)": 135.77, + "step": 20160, + "train_speed(iter/s)": 0.201621 + }, + { + "acc": 0.76187115, + "epoch": 0.47058137330669736, + "grad_norm": 7.34375, + "learning_rate": 8.93491794129497e-06, + "loss": 0.83778381, + "memory(GiB)": 135.77, + "step": 20170, + "train_speed(iter/s)": 0.201673 + }, + { + "acc": 0.76804895, + "epoch": 0.47081468087898626, + "grad_norm": 4.1875, + "learning_rate": 8.933752127834834e-06, + "loss": 0.85201845, + "memory(GiB)": 135.77, + "step": 20180, + "train_speed(iter/s)": 0.201725 + }, + { + "acc": 0.764326, + "epoch": 0.47104798845127516, + "grad_norm": 6.3125, + "learning_rate": 8.932585752832725e-06, + "loss": 0.83452625, + "memory(GiB)": 135.77, + "step": 20190, + "train_speed(iter/s)": 0.201778 + }, + { + "acc": 0.76837258, + "epoch": 0.47128129602356406, + "grad_norm": 5.0, + "learning_rate": 8.931418816455142e-06, + "loss": 0.83890285, + "memory(GiB)": 135.77, + "step": 20200, + "train_speed(iter/s)": 0.201827 + }, + { + "acc": 0.76820593, + "epoch": 0.47151460359585295, + "grad_norm": 5.03125, + "learning_rate": 8.930251318868664e-06, + "loss": 0.82331924, + "memory(GiB)": 135.77, + "step": 20210, + "train_speed(iter/s)": 0.201878 + }, + { + "acc": 0.78207231, + "epoch": 0.47174791116814185, + "grad_norm": 4.96875, + "learning_rate": 8.929083260239952e-06, + "loss": 0.78689303, + "memory(GiB)": 135.77, + "step": 20220, + "train_speed(iter/s)": 0.201928 + }, + { + "acc": 0.7715971, + "epoch": 0.47198121874043075, + "grad_norm": 5.625, + "learning_rate": 8.927914640735748e-06, + "loss": 0.82081528, + "memory(GiB)": 135.77, + "step": 20230, + "train_speed(iter/s)": 0.201983 + }, + { + "acc": 0.76612329, + "epoch": 0.47221452631271965, + "grad_norm": 5.90625, + "learning_rate": 8.926745460522867e-06, + "loss": 0.85213652, + "memory(GiB)": 135.77, + "step": 20240, + "train_speed(iter/s)": 0.202032 + }, + { + "acc": 0.77809095, + "epoch": 0.47244783388500855, + "grad_norm": 6.9375, + "learning_rate": 8.925575719768215e-06, + "loss": 0.80060663, + "memory(GiB)": 135.77, + "step": 20250, + "train_speed(iter/s)": 0.202087 + }, + { + "acc": 0.77375336, + "epoch": 0.47268114145729745, + "grad_norm": 5.4375, + "learning_rate": 8.92440541863877e-06, + "loss": 0.81226845, + "memory(GiB)": 135.77, + "step": 20260, + "train_speed(iter/s)": 0.202137 + }, + { + "acc": 0.80193977, + "epoch": 0.4729144490295863, + "grad_norm": 6.34375, + "learning_rate": 8.923234557301588e-06, + "loss": 0.72059889, + "memory(GiB)": 135.77, + "step": 20270, + "train_speed(iter/s)": 0.202186 + }, + { + "acc": 0.79480009, + "epoch": 0.4731477566018752, + "grad_norm": 5.21875, + "learning_rate": 8.922063135923815e-06, + "loss": 0.72272043, + "memory(GiB)": 135.77, + "step": 20280, + "train_speed(iter/s)": 0.202237 + }, + { + "acc": 0.79387074, + "epoch": 0.4733810641741641, + "grad_norm": 4.9375, + "learning_rate": 8.920891154672668e-06, + "loss": 0.75309858, + "memory(GiB)": 135.77, + "step": 20290, + "train_speed(iter/s)": 0.202289 + }, + { + "acc": 0.7726409, + "epoch": 0.473614371746453, + "grad_norm": 8.4375, + "learning_rate": 8.91971861371545e-06, + "loss": 0.83473721, + "memory(GiB)": 135.77, + "step": 20300, + "train_speed(iter/s)": 0.202341 + }, + { + "acc": 0.77393756, + "epoch": 0.4738476793187419, + "grad_norm": 3.65625, + "learning_rate": 8.918545513219535e-06, + "loss": 0.79616671, + "memory(GiB)": 135.77, + "step": 20310, + "train_speed(iter/s)": 0.202391 + }, + { + "acc": 0.76672659, + "epoch": 0.4740809868910308, + "grad_norm": 5.90625, + "learning_rate": 8.917371853352388e-06, + "loss": 0.85512495, + "memory(GiB)": 135.77, + "step": 20320, + "train_speed(iter/s)": 0.202442 + }, + { + "acc": 0.765168, + "epoch": 0.4743142944633197, + "grad_norm": 8.625, + "learning_rate": 8.916197634281547e-06, + "loss": 0.82549314, + "memory(GiB)": 135.77, + "step": 20330, + "train_speed(iter/s)": 0.202492 + }, + { + "acc": 0.77795401, + "epoch": 0.4745476020356086, + "grad_norm": 5.28125, + "learning_rate": 8.91502285617463e-06, + "loss": 0.79356041, + "memory(GiB)": 135.77, + "step": 20340, + "train_speed(iter/s)": 0.202544 + }, + { + "acc": 0.75690231, + "epoch": 0.4747809096078975, + "grad_norm": 5.875, + "learning_rate": 8.913847519199341e-06, + "loss": 0.89064808, + "memory(GiB)": 135.77, + "step": 20350, + "train_speed(iter/s)": 0.202595 + }, + { + "acc": 0.76705751, + "epoch": 0.4750142171801863, + "grad_norm": 6.65625, + "learning_rate": 8.912671623523452e-06, + "loss": 0.84812698, + "memory(GiB)": 135.77, + "step": 20360, + "train_speed(iter/s)": 0.202641 + }, + { + "acc": 0.78243647, + "epoch": 0.4752475247524752, + "grad_norm": 6.40625, + "learning_rate": 8.911495169314828e-06, + "loss": 0.77283859, + "memory(GiB)": 135.77, + "step": 20370, + "train_speed(iter/s)": 0.20269 + }, + { + "acc": 0.78674965, + "epoch": 0.4754808323247641, + "grad_norm": 5.59375, + "learning_rate": 8.910318156741401e-06, + "loss": 0.76450891, + "memory(GiB)": 135.77, + "step": 20380, + "train_speed(iter/s)": 0.20274 + }, + { + "acc": 0.7955699, + "epoch": 0.475714139897053, + "grad_norm": 4.5, + "learning_rate": 8.909140585971198e-06, + "loss": 0.73192253, + "memory(GiB)": 135.77, + "step": 20390, + "train_speed(iter/s)": 0.202788 + }, + { + "acc": 0.78440504, + "epoch": 0.4759474474693419, + "grad_norm": 5.28125, + "learning_rate": 8.90796245717231e-06, + "loss": 0.77477818, + "memory(GiB)": 135.77, + "step": 20400, + "train_speed(iter/s)": 0.202837 + }, + { + "acc": 0.76490378, + "epoch": 0.4761807550416308, + "grad_norm": 8.9375, + "learning_rate": 8.906783770512915e-06, + "loss": 0.86861458, + "memory(GiB)": 135.77, + "step": 20410, + "train_speed(iter/s)": 0.202886 + }, + { + "acc": 0.76525488, + "epoch": 0.4764140626139197, + "grad_norm": 5.4375, + "learning_rate": 8.905604526161274e-06, + "loss": 0.82954245, + "memory(GiB)": 135.77, + "step": 20420, + "train_speed(iter/s)": 0.20294 + }, + { + "acc": 0.7579958, + "epoch": 0.4766473701862086, + "grad_norm": 6.25, + "learning_rate": 8.904424724285721e-06, + "loss": 0.87198391, + "memory(GiB)": 135.77, + "step": 20430, + "train_speed(iter/s)": 0.202991 + }, + { + "acc": 0.76998606, + "epoch": 0.4768806777584975, + "grad_norm": 6.78125, + "learning_rate": 8.903244365054671e-06, + "loss": 0.81940556, + "memory(GiB)": 135.77, + "step": 20440, + "train_speed(iter/s)": 0.20304 + }, + { + "acc": 0.76503363, + "epoch": 0.4771139853307864, + "grad_norm": 5.8125, + "learning_rate": 8.902063448636624e-06, + "loss": 0.86698151, + "memory(GiB)": 135.77, + "step": 20450, + "train_speed(iter/s)": 0.203093 + }, + { + "acc": 0.77026491, + "epoch": 0.47734729290307526, + "grad_norm": 5.71875, + "learning_rate": 8.900881975200151e-06, + "loss": 0.82462835, + "memory(GiB)": 135.77, + "step": 20460, + "train_speed(iter/s)": 0.203142 + }, + { + "acc": 0.76255193, + "epoch": 0.47758060047536416, + "grad_norm": 4.3125, + "learning_rate": 8.89969994491391e-06, + "loss": 0.84601803, + "memory(GiB)": 135.77, + "step": 20470, + "train_speed(iter/s)": 0.203189 + }, + { + "acc": 0.75619602, + "epoch": 0.47781390804765306, + "grad_norm": 6.28125, + "learning_rate": 8.898517357946636e-06, + "loss": 0.88812332, + "memory(GiB)": 135.77, + "step": 20480, + "train_speed(iter/s)": 0.203241 + }, + { + "acc": 0.77179279, + "epoch": 0.47804721561994196, + "grad_norm": 6.3125, + "learning_rate": 8.897334214467141e-06, + "loss": 0.82135019, + "memory(GiB)": 135.77, + "step": 20490, + "train_speed(iter/s)": 0.203288 + }, + { + "acc": 0.77499766, + "epoch": 0.47828052319223086, + "grad_norm": 5.53125, + "learning_rate": 8.89615051464432e-06, + "loss": 0.79335108, + "memory(GiB)": 135.77, + "step": 20500, + "train_speed(iter/s)": 0.203339 + }, + { + "epoch": 0.47828052319223086, + "eval_acc": 0.7387652014910709, + "eval_loss": 0.827155590057373, + "eval_runtime": 1271.1109, + "eval_samples_per_second": 28.315, + "eval_steps_per_second": 14.158, + "step": 20500 + }, + { + "acc": 0.76934633, + "epoch": 0.47851383076451975, + "grad_norm": 5.34375, + "learning_rate": 8.894966258647144e-06, + "loss": 0.82157841, + "memory(GiB)": 135.77, + "step": 20510, + "train_speed(iter/s)": 0.200826 + }, + { + "acc": 0.76049023, + "epoch": 0.47874713833680865, + "grad_norm": 6.0625, + "learning_rate": 8.893781446644667e-06, + "loss": 0.87835636, + "memory(GiB)": 135.77, + "step": 20520, + "train_speed(iter/s)": 0.200874 + }, + { + "acc": 0.77881689, + "epoch": 0.47898044590909755, + "grad_norm": 4.375, + "learning_rate": 8.892596078806017e-06, + "loss": 0.81451454, + "memory(GiB)": 135.77, + "step": 20530, + "train_speed(iter/s)": 0.200925 + }, + { + "acc": 0.77435408, + "epoch": 0.47921375348138645, + "grad_norm": 4.8125, + "learning_rate": 8.89141015530041e-06, + "loss": 0.8198391, + "memory(GiB)": 135.77, + "step": 20540, + "train_speed(iter/s)": 0.200977 + }, + { + "acc": 0.77629027, + "epoch": 0.47944706105367535, + "grad_norm": 4.34375, + "learning_rate": 8.890223676297132e-06, + "loss": 0.80017614, + "memory(GiB)": 135.77, + "step": 20550, + "train_speed(iter/s)": 0.201028 + }, + { + "acc": 0.78644643, + "epoch": 0.4796803686259642, + "grad_norm": 6.3125, + "learning_rate": 8.889036641965557e-06, + "loss": 0.77309122, + "memory(GiB)": 135.77, + "step": 20560, + "train_speed(iter/s)": 0.201078 + }, + { + "acc": 0.77234049, + "epoch": 0.4799136761982531, + "grad_norm": 10.125, + "learning_rate": 8.887849052475128e-06, + "loss": 0.82501678, + "memory(GiB)": 135.77, + "step": 20570, + "train_speed(iter/s)": 0.201129 + }, + { + "acc": 0.78163843, + "epoch": 0.480146983770542, + "grad_norm": 4.6875, + "learning_rate": 8.886660907995379e-06, + "loss": 0.80244398, + "memory(GiB)": 135.77, + "step": 20580, + "train_speed(iter/s)": 0.20118 + }, + { + "acc": 0.77460861, + "epoch": 0.4803802913428309, + "grad_norm": 6.0, + "learning_rate": 8.885472208695911e-06, + "loss": 0.82104893, + "memory(GiB)": 135.77, + "step": 20590, + "train_speed(iter/s)": 0.201231 + }, + { + "acc": 0.7811389, + "epoch": 0.4806135989151198, + "grad_norm": 6.78125, + "learning_rate": 8.884282954746417e-06, + "loss": 0.76911659, + "memory(GiB)": 135.77, + "step": 20600, + "train_speed(iter/s)": 0.201284 + }, + { + "acc": 0.77763205, + "epoch": 0.4808469064874087, + "grad_norm": 5.1875, + "learning_rate": 8.88309314631666e-06, + "loss": 0.78176918, + "memory(GiB)": 135.77, + "step": 20610, + "train_speed(iter/s)": 0.201334 + }, + { + "acc": 0.76166954, + "epoch": 0.4810802140596976, + "grad_norm": 3.90625, + "learning_rate": 8.881902783576482e-06, + "loss": 0.86578398, + "memory(GiB)": 135.77, + "step": 20620, + "train_speed(iter/s)": 0.201383 + }, + { + "acc": 0.7865201, + "epoch": 0.4813135216319865, + "grad_norm": 4.96875, + "learning_rate": 8.88071186669581e-06, + "loss": 0.77439818, + "memory(GiB)": 135.77, + "step": 20630, + "train_speed(iter/s)": 0.201435 + }, + { + "acc": 0.76671553, + "epoch": 0.4815468292042754, + "grad_norm": 5.03125, + "learning_rate": 8.879520395844648e-06, + "loss": 0.84458523, + "memory(GiB)": 135.77, + "step": 20640, + "train_speed(iter/s)": 0.201488 + }, + { + "acc": 0.75628233, + "epoch": 0.4817801367765642, + "grad_norm": 5.25, + "learning_rate": 8.878328371193074e-06, + "loss": 0.86866341, + "memory(GiB)": 135.77, + "step": 20650, + "train_speed(iter/s)": 0.201541 + }, + { + "acc": 0.76081595, + "epoch": 0.4820134443488531, + "grad_norm": 5.09375, + "learning_rate": 8.877135792911253e-06, + "loss": 0.86255798, + "memory(GiB)": 135.77, + "step": 20660, + "train_speed(iter/s)": 0.201591 + }, + { + "acc": 0.75712156, + "epoch": 0.482246751921142, + "grad_norm": 6.03125, + "learning_rate": 8.875942661169423e-06, + "loss": 0.89423008, + "memory(GiB)": 135.77, + "step": 20670, + "train_speed(iter/s)": 0.201641 + }, + { + "acc": 0.77293806, + "epoch": 0.4824800594934309, + "grad_norm": 5.5, + "learning_rate": 8.874748976137905e-06, + "loss": 0.79875817, + "memory(GiB)": 135.77, + "step": 20680, + "train_speed(iter/s)": 0.201693 + }, + { + "acc": 0.74118772, + "epoch": 0.4827133670657198, + "grad_norm": 5.5625, + "learning_rate": 8.873554737987098e-06, + "loss": 0.91008282, + "memory(GiB)": 135.77, + "step": 20690, + "train_speed(iter/s)": 0.201745 + }, + { + "acc": 0.7834497, + "epoch": 0.4829466746380087, + "grad_norm": 7.0625, + "learning_rate": 8.872359946887474e-06, + "loss": 0.76405153, + "memory(GiB)": 135.77, + "step": 20700, + "train_speed(iter/s)": 0.201796 + }, + { + "acc": 0.73779564, + "epoch": 0.4831799822102976, + "grad_norm": 5.78125, + "learning_rate": 8.871164603009595e-06, + "loss": 0.97299786, + "memory(GiB)": 135.77, + "step": 20710, + "train_speed(iter/s)": 0.201848 + }, + { + "acc": 0.76881113, + "epoch": 0.4834132897825865, + "grad_norm": 5.6875, + "learning_rate": 8.869968706524092e-06, + "loss": 0.92142582, + "memory(GiB)": 135.77, + "step": 20720, + "train_speed(iter/s)": 0.201901 + }, + { + "acc": 0.78435059, + "epoch": 0.4836465973548754, + "grad_norm": 4.375, + "learning_rate": 8.868772257601682e-06, + "loss": 0.79667149, + "memory(GiB)": 135.77, + "step": 20730, + "train_speed(iter/s)": 0.201952 + }, + { + "acc": 0.75337543, + "epoch": 0.4838799049271643, + "grad_norm": 7.15625, + "learning_rate": 8.867575256413154e-06, + "loss": 0.91704159, + "memory(GiB)": 135.77, + "step": 20740, + "train_speed(iter/s)": 0.202003 + }, + { + "acc": 0.76679363, + "epoch": 0.48411321249945316, + "grad_norm": 6.15625, + "learning_rate": 8.866377703129382e-06, + "loss": 0.82663794, + "memory(GiB)": 135.77, + "step": 20750, + "train_speed(iter/s)": 0.202056 + }, + { + "acc": 0.77863007, + "epoch": 0.48434652007174206, + "grad_norm": 4.8125, + "learning_rate": 8.865179597921318e-06, + "loss": 0.79373541, + "memory(GiB)": 135.77, + "step": 20760, + "train_speed(iter/s)": 0.202104 + }, + { + "acc": 0.75620103, + "epoch": 0.48457982764403096, + "grad_norm": 6.0625, + "learning_rate": 8.863980940959989e-06, + "loss": 0.92305183, + "memory(GiB)": 135.77, + "step": 20770, + "train_speed(iter/s)": 0.20215 + }, + { + "acc": 0.7790472, + "epoch": 0.48481313521631986, + "grad_norm": 4.78125, + "learning_rate": 8.862781732416502e-06, + "loss": 0.77640362, + "memory(GiB)": 135.77, + "step": 20780, + "train_speed(iter/s)": 0.2022 + }, + { + "acc": 0.77119627, + "epoch": 0.48504644278860876, + "grad_norm": 6.0625, + "learning_rate": 8.861581972462045e-06, + "loss": 0.82055054, + "memory(GiB)": 135.77, + "step": 20790, + "train_speed(iter/s)": 0.202249 + }, + { + "acc": 0.76551666, + "epoch": 0.48527975036089765, + "grad_norm": 5.5, + "learning_rate": 8.860381661267882e-06, + "loss": 0.82655621, + "memory(GiB)": 135.77, + "step": 20800, + "train_speed(iter/s)": 0.202298 + }, + { + "acc": 0.78621545, + "epoch": 0.48551305793318655, + "grad_norm": 8.9375, + "learning_rate": 8.859180799005361e-06, + "loss": 0.76219521, + "memory(GiB)": 135.77, + "step": 20810, + "train_speed(iter/s)": 0.202346 + }, + { + "acc": 0.77965999, + "epoch": 0.48574636550547545, + "grad_norm": 5.34375, + "learning_rate": 8.857979385845901e-06, + "loss": 0.781145, + "memory(GiB)": 135.77, + "step": 20820, + "train_speed(iter/s)": 0.202396 + }, + { + "acc": 0.76239419, + "epoch": 0.48597967307776435, + "grad_norm": 4.75, + "learning_rate": 8.856777421961004e-06, + "loss": 0.87815866, + "memory(GiB)": 135.77, + "step": 20830, + "train_speed(iter/s)": 0.202448 + }, + { + "acc": 0.77660542, + "epoch": 0.48621298065005325, + "grad_norm": 5.1875, + "learning_rate": 8.855574907522251e-06, + "loss": 0.79042578, + "memory(GiB)": 135.77, + "step": 20840, + "train_speed(iter/s)": 0.202497 + }, + { + "acc": 0.7720644, + "epoch": 0.4864462882223421, + "grad_norm": 6.375, + "learning_rate": 8.854371842701299e-06, + "loss": 0.82205925, + "memory(GiB)": 135.77, + "step": 20850, + "train_speed(iter/s)": 0.202548 + }, + { + "acc": 0.77594128, + "epoch": 0.486679595794631, + "grad_norm": 5.625, + "learning_rate": 8.853168227669886e-06, + "loss": 0.81534863, + "memory(GiB)": 135.77, + "step": 20860, + "train_speed(iter/s)": 0.202599 + }, + { + "acc": 0.77423177, + "epoch": 0.4869129033669199, + "grad_norm": 5.0, + "learning_rate": 8.851964062599828e-06, + "loss": 0.79051189, + "memory(GiB)": 135.77, + "step": 20870, + "train_speed(iter/s)": 0.202649 + }, + { + "acc": 0.77537422, + "epoch": 0.4871462109392088, + "grad_norm": 15.1875, + "learning_rate": 8.850759347663021e-06, + "loss": 0.80190439, + "memory(GiB)": 135.77, + "step": 20880, + "train_speed(iter/s)": 0.202703 + }, + { + "acc": 0.76119142, + "epoch": 0.4873795185114977, + "grad_norm": 6.40625, + "learning_rate": 8.849554083031435e-06, + "loss": 0.86152458, + "memory(GiB)": 135.77, + "step": 20890, + "train_speed(iter/s)": 0.202756 + }, + { + "acc": 0.75670996, + "epoch": 0.4876128260837866, + "grad_norm": 5.1875, + "learning_rate": 8.84834826887712e-06, + "loss": 0.87847652, + "memory(GiB)": 135.77, + "step": 20900, + "train_speed(iter/s)": 0.20281 + }, + { + "acc": 0.76710215, + "epoch": 0.4878461336560755, + "grad_norm": 6.84375, + "learning_rate": 8.84714190537221e-06, + "loss": 0.84475441, + "memory(GiB)": 135.77, + "step": 20910, + "train_speed(iter/s)": 0.20286 + }, + { + "acc": 0.77066274, + "epoch": 0.4880794412283644, + "grad_norm": 5.6875, + "learning_rate": 8.84593499268891e-06, + "loss": 0.82889204, + "memory(GiB)": 135.77, + "step": 20920, + "train_speed(iter/s)": 0.202908 + }, + { + "acc": 0.76561365, + "epoch": 0.4883127488006533, + "grad_norm": 4.75, + "learning_rate": 8.844727530999506e-06, + "loss": 0.8475029, + "memory(GiB)": 135.77, + "step": 20930, + "train_speed(iter/s)": 0.202954 + }, + { + "acc": 0.77417688, + "epoch": 0.48854605637294213, + "grad_norm": 4.59375, + "learning_rate": 8.843519520476365e-06, + "loss": 0.82181873, + "memory(GiB)": 135.77, + "step": 20940, + "train_speed(iter/s)": 0.203003 + }, + { + "acc": 0.78241935, + "epoch": 0.488779363945231, + "grad_norm": 5.1875, + "learning_rate": 8.842310961291926e-06, + "loss": 0.80962143, + "memory(GiB)": 135.77, + "step": 20950, + "train_speed(iter/s)": 0.203051 + }, + { + "acc": 0.79182229, + "epoch": 0.4890126715175199, + "grad_norm": 7.59375, + "learning_rate": 8.841101853618717e-06, + "loss": 0.74582729, + "memory(GiB)": 135.77, + "step": 20960, + "train_speed(iter/s)": 0.203103 + }, + { + "acc": 0.78107166, + "epoch": 0.4892459790898088, + "grad_norm": 4.21875, + "learning_rate": 8.839892197629334e-06, + "loss": 0.78661366, + "memory(GiB)": 135.77, + "step": 20970, + "train_speed(iter/s)": 0.203151 + }, + { + "acc": 0.77803307, + "epoch": 0.4894792866620977, + "grad_norm": 6.875, + "learning_rate": 8.838681993496454e-06, + "loss": 0.79262047, + "memory(GiB)": 135.77, + "step": 20980, + "train_speed(iter/s)": 0.203205 + }, + { + "acc": 0.74206686, + "epoch": 0.4897125942343866, + "grad_norm": 7.4375, + "learning_rate": 8.837471241392835e-06, + "loss": 0.94257679, + "memory(GiB)": 135.77, + "step": 20990, + "train_speed(iter/s)": 0.203256 + }, + { + "acc": 0.76296959, + "epoch": 0.4899459018066755, + "grad_norm": 6.03125, + "learning_rate": 8.83625994149131e-06, + "loss": 0.87966471, + "memory(GiB)": 135.77, + "step": 21000, + "train_speed(iter/s)": 0.203307 + }, + { + "epoch": 0.4899459018066755, + "eval_acc": 0.7386233137012519, + "eval_loss": 0.8264919519424438, + "eval_runtime": 1269.5678, + "eval_samples_per_second": 28.349, + "eval_steps_per_second": 14.175, + "step": 21000 + }, + { + "acc": 0.76228848, + "epoch": 0.4901792093789644, + "grad_norm": 5.1875, + "learning_rate": 8.835048093964796e-06, + "loss": 0.86440468, + "memory(GiB)": 135.77, + "step": 21010, + "train_speed(iter/s)": 0.200857 + }, + { + "acc": 0.7821867, + "epoch": 0.4904125169512533, + "grad_norm": 4.21875, + "learning_rate": 8.833835698986276e-06, + "loss": 0.78023453, + "memory(GiB)": 135.77, + "step": 21020, + "train_speed(iter/s)": 0.200905 + }, + { + "acc": 0.77446461, + "epoch": 0.4906458245235422, + "grad_norm": 5.3125, + "learning_rate": 8.832622756728828e-06, + "loss": 0.81071396, + "memory(GiB)": 135.77, + "step": 21030, + "train_speed(iter/s)": 0.200954 + }, + { + "acc": 0.75736957, + "epoch": 0.49087913209583106, + "grad_norm": 5.9375, + "learning_rate": 8.831409267365594e-06, + "loss": 0.87871695, + "memory(GiB)": 135.77, + "step": 21040, + "train_speed(iter/s)": 0.201004 + }, + { + "acc": 0.7824193, + "epoch": 0.49111243966811996, + "grad_norm": 5.21875, + "learning_rate": 8.830195231069799e-06, + "loss": 0.77914681, + "memory(GiB)": 135.77, + "step": 21050, + "train_speed(iter/s)": 0.201054 + }, + { + "acc": 0.77138987, + "epoch": 0.49134574724040886, + "grad_norm": 7.03125, + "learning_rate": 8.828980648014747e-06, + "loss": 0.82984867, + "memory(GiB)": 135.77, + "step": 21060, + "train_speed(iter/s)": 0.201107 + }, + { + "acc": 0.76764355, + "epoch": 0.49157905481269776, + "grad_norm": 6.15625, + "learning_rate": 8.82776551837382e-06, + "loss": 0.83743019, + "memory(GiB)": 135.77, + "step": 21070, + "train_speed(iter/s)": 0.201157 + }, + { + "acc": 0.7618494, + "epoch": 0.49181236238498666, + "grad_norm": 5.40625, + "learning_rate": 8.826549842320478e-06, + "loss": 0.87587261, + "memory(GiB)": 135.77, + "step": 21080, + "train_speed(iter/s)": 0.201209 + }, + { + "acc": 0.77934647, + "epoch": 0.49204566995727556, + "grad_norm": 4.4375, + "learning_rate": 8.825333620028257e-06, + "loss": 0.80535469, + "memory(GiB)": 135.77, + "step": 21090, + "train_speed(iter/s)": 0.201258 + }, + { + "acc": 0.76986637, + "epoch": 0.49227897752956445, + "grad_norm": 4.375, + "learning_rate": 8.824116851670772e-06, + "loss": 0.83490791, + "memory(GiB)": 135.77, + "step": 21100, + "train_speed(iter/s)": 0.201309 + }, + { + "acc": 0.7731987, + "epoch": 0.49251228510185335, + "grad_norm": 6.5625, + "learning_rate": 8.822899537421721e-06, + "loss": 0.83892517, + "memory(GiB)": 135.77, + "step": 21110, + "train_speed(iter/s)": 0.201358 + }, + { + "acc": 0.76397038, + "epoch": 0.49274559267414225, + "grad_norm": 6.46875, + "learning_rate": 8.821681677454868e-06, + "loss": 0.84116879, + "memory(GiB)": 135.77, + "step": 21120, + "train_speed(iter/s)": 0.201407 + }, + { + "acc": 0.76126223, + "epoch": 0.49297890024643115, + "grad_norm": 4.8125, + "learning_rate": 8.820463271944066e-06, + "loss": 0.85854349, + "memory(GiB)": 135.77, + "step": 21130, + "train_speed(iter/s)": 0.201458 + }, + { + "acc": 0.77129207, + "epoch": 0.49321220781872, + "grad_norm": 8.4375, + "learning_rate": 8.819244321063243e-06, + "loss": 0.82659998, + "memory(GiB)": 135.77, + "step": 21140, + "train_speed(iter/s)": 0.201505 + }, + { + "acc": 0.76816425, + "epoch": 0.4934455153910089, + "grad_norm": 7.4375, + "learning_rate": 8.818024824986404e-06, + "loss": 0.84741154, + "memory(GiB)": 135.77, + "step": 21150, + "train_speed(iter/s)": 0.201552 + }, + { + "acc": 0.76245174, + "epoch": 0.4936788229632978, + "grad_norm": 8.6875, + "learning_rate": 8.816804783887628e-06, + "loss": 0.86243572, + "memory(GiB)": 135.77, + "step": 21160, + "train_speed(iter/s)": 0.2016 + }, + { + "acc": 0.78625603, + "epoch": 0.4939121305355867, + "grad_norm": 6.09375, + "learning_rate": 8.815584197941078e-06, + "loss": 0.76783695, + "memory(GiB)": 135.77, + "step": 21170, + "train_speed(iter/s)": 0.201648 + }, + { + "acc": 0.79220161, + "epoch": 0.4941454381078756, + "grad_norm": 6.96875, + "learning_rate": 8.814363067320995e-06, + "loss": 0.75065899, + "memory(GiB)": 135.77, + "step": 21180, + "train_speed(iter/s)": 0.201696 + }, + { + "acc": 0.78739977, + "epoch": 0.4943787456801645, + "grad_norm": 5.5, + "learning_rate": 8.81314139220169e-06, + "loss": 0.77436671, + "memory(GiB)": 135.77, + "step": 21190, + "train_speed(iter/s)": 0.201746 + }, + { + "acc": 0.75706043, + "epoch": 0.4946120532524534, + "grad_norm": 4.375, + "learning_rate": 8.811919172757558e-06, + "loss": 0.86744957, + "memory(GiB)": 135.77, + "step": 21200, + "train_speed(iter/s)": 0.201796 + }, + { + "acc": 0.77850995, + "epoch": 0.4948453608247423, + "grad_norm": 5.03125, + "learning_rate": 8.810696409163073e-06, + "loss": 0.81761723, + "memory(GiB)": 135.77, + "step": 21210, + "train_speed(iter/s)": 0.201845 + }, + { + "acc": 0.79036131, + "epoch": 0.4950786683970312, + "grad_norm": 4.65625, + "learning_rate": 8.809473101592783e-06, + "loss": 0.77943802, + "memory(GiB)": 135.77, + "step": 21220, + "train_speed(iter/s)": 0.201896 + }, + { + "acc": 0.77873907, + "epoch": 0.49531197596932003, + "grad_norm": 5.75, + "learning_rate": 8.808249250221312e-06, + "loss": 0.79133549, + "memory(GiB)": 135.77, + "step": 21230, + "train_speed(iter/s)": 0.201945 + }, + { + "acc": 0.77417302, + "epoch": 0.4955452835416089, + "grad_norm": 5.84375, + "learning_rate": 8.807024855223369e-06, + "loss": 0.81216822, + "memory(GiB)": 135.77, + "step": 21240, + "train_speed(iter/s)": 0.201996 + }, + { + "acc": 0.76840887, + "epoch": 0.4957785911138978, + "grad_norm": 7.0625, + "learning_rate": 8.805799916773734e-06, + "loss": 0.82953939, + "memory(GiB)": 135.77, + "step": 21250, + "train_speed(iter/s)": 0.202046 + }, + { + "acc": 0.74933271, + "epoch": 0.4960118986861867, + "grad_norm": 6.4375, + "learning_rate": 8.804574435047265e-06, + "loss": 0.90532026, + "memory(GiB)": 135.77, + "step": 21260, + "train_speed(iter/s)": 0.202093 + }, + { + "acc": 0.77972379, + "epoch": 0.4962452062584756, + "grad_norm": 6.4375, + "learning_rate": 8.803348410218902e-06, + "loss": 0.80596933, + "memory(GiB)": 135.77, + "step": 21270, + "train_speed(iter/s)": 0.202143 + }, + { + "acc": 0.77269158, + "epoch": 0.4964785138307645, + "grad_norm": 4.53125, + "learning_rate": 8.802121842463658e-06, + "loss": 0.81263514, + "memory(GiB)": 135.77, + "step": 21280, + "train_speed(iter/s)": 0.202195 + }, + { + "acc": 0.77480335, + "epoch": 0.4967118214030534, + "grad_norm": 6.21875, + "learning_rate": 8.800894731956624e-06, + "loss": 0.81107597, + "memory(GiB)": 135.77, + "step": 21290, + "train_speed(iter/s)": 0.202243 + }, + { + "acc": 0.75720205, + "epoch": 0.4969451289753423, + "grad_norm": 8.5625, + "learning_rate": 8.799667078872973e-06, + "loss": 0.88299694, + "memory(GiB)": 135.77, + "step": 21300, + "train_speed(iter/s)": 0.202292 + }, + { + "acc": 0.75752621, + "epoch": 0.4971784365476312, + "grad_norm": 6.65625, + "learning_rate": 8.79843888338795e-06, + "loss": 0.90188789, + "memory(GiB)": 135.77, + "step": 21310, + "train_speed(iter/s)": 0.202342 + }, + { + "acc": 0.77910862, + "epoch": 0.4974117441199201, + "grad_norm": 4.84375, + "learning_rate": 8.797210145676879e-06, + "loss": 0.78877602, + "memory(GiB)": 135.77, + "step": 21320, + "train_speed(iter/s)": 0.202388 + }, + { + "acc": 0.78843384, + "epoch": 0.49764505169220896, + "grad_norm": 5.15625, + "learning_rate": 8.795980865915164e-06, + "loss": 0.75855894, + "memory(GiB)": 135.77, + "step": 21330, + "train_speed(iter/s)": 0.202436 + }, + { + "acc": 0.78700695, + "epoch": 0.49787835926449786, + "grad_norm": 6.125, + "learning_rate": 8.794751044278282e-06, + "loss": 0.76552219, + "memory(GiB)": 135.77, + "step": 21340, + "train_speed(iter/s)": 0.202487 + }, + { + "acc": 0.77964926, + "epoch": 0.49811166683678676, + "grad_norm": 5.53125, + "learning_rate": 8.793520680941792e-06, + "loss": 0.78910999, + "memory(GiB)": 135.77, + "step": 21350, + "train_speed(iter/s)": 0.202533 + }, + { + "acc": 0.76680984, + "epoch": 0.49834497440907566, + "grad_norm": 6.96875, + "learning_rate": 8.792289776081326e-06, + "loss": 0.84402256, + "memory(GiB)": 135.77, + "step": 21360, + "train_speed(iter/s)": 0.20258 + }, + { + "acc": 0.7667099, + "epoch": 0.49857828198136456, + "grad_norm": 4.25, + "learning_rate": 8.791058329872595e-06, + "loss": 0.85665445, + "memory(GiB)": 135.77, + "step": 21370, + "train_speed(iter/s)": 0.20263 + }, + { + "acc": 0.77208252, + "epoch": 0.49881158955365346, + "grad_norm": 4.71875, + "learning_rate": 8.78982634249139e-06, + "loss": 0.82627926, + "memory(GiB)": 135.77, + "step": 21380, + "train_speed(iter/s)": 0.202672 + }, + { + "acc": 0.76971836, + "epoch": 0.49904489712594235, + "grad_norm": 5.03125, + "learning_rate": 8.788593814113576e-06, + "loss": 0.82828751, + "memory(GiB)": 135.77, + "step": 21390, + "train_speed(iter/s)": 0.20272 + }, + { + "acc": 0.77777624, + "epoch": 0.49927820469823125, + "grad_norm": 7.25, + "learning_rate": 8.787360744915096e-06, + "loss": 0.78845906, + "memory(GiB)": 135.77, + "step": 21400, + "train_speed(iter/s)": 0.202769 + }, + { + "acc": 0.76814055, + "epoch": 0.49951151227052015, + "grad_norm": 4.9375, + "learning_rate": 8.786127135071968e-06, + "loss": 0.82526455, + "memory(GiB)": 135.77, + "step": 21410, + "train_speed(iter/s)": 0.202816 + }, + { + "acc": 0.76935434, + "epoch": 0.499744819842809, + "grad_norm": 4.84375, + "learning_rate": 8.784892984760292e-06, + "loss": 0.82196589, + "memory(GiB)": 135.77, + "step": 21420, + "train_speed(iter/s)": 0.20286 + }, + { + "acc": 0.76840973, + "epoch": 0.4999781274150979, + "grad_norm": 4.65625, + "learning_rate": 8.783658294156244e-06, + "loss": 0.84509468, + "memory(GiB)": 135.77, + "step": 21430, + "train_speed(iter/s)": 0.202908 + }, + { + "acc": 0.78127012, + "epoch": 0.5002114349873868, + "grad_norm": 5.0, + "learning_rate": 8.782423063436072e-06, + "loss": 0.79705486, + "memory(GiB)": 135.77, + "step": 21440, + "train_speed(iter/s)": 0.202956 + }, + { + "acc": 0.79307652, + "epoch": 0.5004447425596757, + "grad_norm": 6.28125, + "learning_rate": 8.781187292776106e-06, + "loss": 0.75553842, + "memory(GiB)": 135.77, + "step": 21450, + "train_speed(iter/s)": 0.203003 + }, + { + "acc": 0.77026339, + "epoch": 0.5006780501319646, + "grad_norm": 5.28125, + "learning_rate": 8.779950982352751e-06, + "loss": 0.82962399, + "memory(GiB)": 135.77, + "step": 21460, + "train_speed(iter/s)": 0.203051 + }, + { + "acc": 0.78340578, + "epoch": 0.5009113577042534, + "grad_norm": 4.40625, + "learning_rate": 8.778714132342494e-06, + "loss": 0.76500807, + "memory(GiB)": 135.77, + "step": 21470, + "train_speed(iter/s)": 0.203099 + }, + { + "acc": 0.76358261, + "epoch": 0.5011446652765423, + "grad_norm": 5.3125, + "learning_rate": 8.777476742921893e-06, + "loss": 0.86745148, + "memory(GiB)": 135.77, + "step": 21480, + "train_speed(iter/s)": 0.203144 + }, + { + "acc": 0.76443934, + "epoch": 0.5013779728488312, + "grad_norm": 6.3125, + "learning_rate": 8.776238814267581e-06, + "loss": 0.85055971, + "memory(GiB)": 135.77, + "step": 21490, + "train_speed(iter/s)": 0.203195 + }, + { + "acc": 0.7856565, + "epoch": 0.5016112804211201, + "grad_norm": 4.84375, + "learning_rate": 8.775000346556278e-06, + "loss": 0.76976533, + "memory(GiB)": 135.77, + "step": 21500, + "train_speed(iter/s)": 0.203245 + }, + { + "epoch": 0.5016112804211201, + "eval_acc": 0.7388905944249629, + "eval_loss": 0.8257662653923035, + "eval_runtime": 1270.2669, + "eval_samples_per_second": 28.333, + "eval_steps_per_second": 14.167, + "step": 21500 + }, + { + "acc": 0.77753158, + "epoch": 0.501844587993409, + "grad_norm": 5.96875, + "learning_rate": 8.773761339964773e-06, + "loss": 0.79236035, + "memory(GiB)": 135.77, + "step": 21510, + "train_speed(iter/s)": 0.20085 + }, + { + "acc": 0.77459497, + "epoch": 0.5020778955656979, + "grad_norm": 6.34375, + "learning_rate": 8.77252179466993e-06, + "loss": 0.80510149, + "memory(GiB)": 135.77, + "step": 21520, + "train_speed(iter/s)": 0.200901 + }, + { + "acc": 0.79065723, + "epoch": 0.5023112031379868, + "grad_norm": 5.875, + "learning_rate": 8.771281710848697e-06, + "loss": 0.75441141, + "memory(GiB)": 135.77, + "step": 21530, + "train_speed(iter/s)": 0.200951 + }, + { + "acc": 0.78152294, + "epoch": 0.5025445107102757, + "grad_norm": 5.6875, + "learning_rate": 8.770041088678098e-06, + "loss": 0.78967323, + "memory(GiB)": 135.77, + "step": 21540, + "train_speed(iter/s)": 0.200999 + }, + { + "acc": 0.76808047, + "epoch": 0.5027778182825646, + "grad_norm": 4.875, + "learning_rate": 8.768799928335227e-06, + "loss": 0.83531723, + "memory(GiB)": 135.77, + "step": 21550, + "train_speed(iter/s)": 0.201047 + }, + { + "acc": 0.78148546, + "epoch": 0.5030111258548535, + "grad_norm": 6.8125, + "learning_rate": 8.76755822999726e-06, + "loss": 0.77231493, + "memory(GiB)": 135.77, + "step": 21560, + "train_speed(iter/s)": 0.201097 + }, + { + "acc": 0.78652892, + "epoch": 0.5032444334271424, + "grad_norm": 6.5, + "learning_rate": 8.766315993841452e-06, + "loss": 0.76357327, + "memory(GiB)": 135.77, + "step": 21570, + "train_speed(iter/s)": 0.201146 + }, + { + "acc": 0.74624329, + "epoch": 0.5034777409994313, + "grad_norm": 4.90625, + "learning_rate": 8.76507322004513e-06, + "loss": 0.91759214, + "memory(GiB)": 135.77, + "step": 21580, + "train_speed(iter/s)": 0.201192 + }, + { + "acc": 0.77871332, + "epoch": 0.5037110485717202, + "grad_norm": 5.46875, + "learning_rate": 8.7638299087857e-06, + "loss": 0.78241935, + "memory(GiB)": 135.77, + "step": 21590, + "train_speed(iter/s)": 0.201238 + }, + { + "acc": 0.7683857, + "epoch": 0.5039443561440091, + "grad_norm": 6.59375, + "learning_rate": 8.762586060240642e-06, + "loss": 0.83219185, + "memory(GiB)": 135.77, + "step": 21600, + "train_speed(iter/s)": 0.201284 + }, + { + "acc": 0.78917575, + "epoch": 0.504177663716298, + "grad_norm": 10.875, + "learning_rate": 8.761341674587518e-06, + "loss": 0.76239634, + "memory(GiB)": 135.77, + "step": 21610, + "train_speed(iter/s)": 0.201331 + }, + { + "acc": 0.78974929, + "epoch": 0.5044109712885869, + "grad_norm": 4.96875, + "learning_rate": 8.760096752003962e-06, + "loss": 0.77535329, + "memory(GiB)": 135.77, + "step": 21620, + "train_speed(iter/s)": 0.201378 + }, + { + "acc": 0.77866402, + "epoch": 0.5046442788608758, + "grad_norm": 4.9375, + "learning_rate": 8.758851292667687e-06, + "loss": 0.78650284, + "memory(GiB)": 135.77, + "step": 21630, + "train_speed(iter/s)": 0.201426 + }, + { + "acc": 0.77446556, + "epoch": 0.5048775864331647, + "grad_norm": 6.6875, + "learning_rate": 8.757605296756483e-06, + "loss": 0.80399456, + "memory(GiB)": 135.77, + "step": 21640, + "train_speed(iter/s)": 0.201474 + }, + { + "acc": 0.76340103, + "epoch": 0.5051108940054536, + "grad_norm": 5.0, + "learning_rate": 8.756358764448214e-06, + "loss": 0.85637417, + "memory(GiB)": 135.77, + "step": 21650, + "train_speed(iter/s)": 0.201523 + }, + { + "acc": 0.78347025, + "epoch": 0.5053442015777424, + "grad_norm": 4.875, + "learning_rate": 8.755111695920823e-06, + "loss": 0.79445, + "memory(GiB)": 135.77, + "step": 21660, + "train_speed(iter/s)": 0.201571 + }, + { + "acc": 0.77764778, + "epoch": 0.5055775091500313, + "grad_norm": 4.125, + "learning_rate": 8.753864091352326e-06, + "loss": 0.80275316, + "memory(GiB)": 135.77, + "step": 21670, + "train_speed(iter/s)": 0.201619 + }, + { + "acc": 0.78915906, + "epoch": 0.5058108167223202, + "grad_norm": 3.84375, + "learning_rate": 8.752615950920824e-06, + "loss": 0.75332527, + "memory(GiB)": 135.77, + "step": 21680, + "train_speed(iter/s)": 0.201664 + }, + { + "acc": 0.77775726, + "epoch": 0.5060441242946091, + "grad_norm": 10.625, + "learning_rate": 8.751367274804483e-06, + "loss": 0.79482059, + "memory(GiB)": 135.77, + "step": 21690, + "train_speed(iter/s)": 0.201709 + }, + { + "acc": 0.77287965, + "epoch": 0.506277431866898, + "grad_norm": 4.53125, + "learning_rate": 8.750118063181553e-06, + "loss": 0.8031868, + "memory(GiB)": 135.77, + "step": 21700, + "train_speed(iter/s)": 0.201762 + }, + { + "acc": 0.77395134, + "epoch": 0.5065107394391869, + "grad_norm": 4.8125, + "learning_rate": 8.74886831623036e-06, + "loss": 0.79606371, + "memory(GiB)": 135.77, + "step": 21710, + "train_speed(iter/s)": 0.20181 + }, + { + "acc": 0.75434523, + "epoch": 0.5067440470114758, + "grad_norm": 5.375, + "learning_rate": 8.747618034129304e-06, + "loss": 0.90953197, + "memory(GiB)": 135.77, + "step": 21720, + "train_speed(iter/s)": 0.20186 + }, + { + "acc": 0.78446827, + "epoch": 0.5069773545837647, + "grad_norm": 6.5625, + "learning_rate": 8.746367217056861e-06, + "loss": 0.79035029, + "memory(GiB)": 135.77, + "step": 21730, + "train_speed(iter/s)": 0.201906 + }, + { + "acc": 0.76340156, + "epoch": 0.5072106621560536, + "grad_norm": 6.40625, + "learning_rate": 8.745115865191587e-06, + "loss": 0.8527956, + "memory(GiB)": 135.77, + "step": 21740, + "train_speed(iter/s)": 0.201955 + }, + { + "acc": 0.77388973, + "epoch": 0.5074439697283425, + "grad_norm": 5.46875, + "learning_rate": 8.743863978712111e-06, + "loss": 0.81416397, + "memory(GiB)": 135.77, + "step": 21750, + "train_speed(iter/s)": 0.202002 + }, + { + "acc": 0.77853451, + "epoch": 0.5076772773006314, + "grad_norm": 5.0, + "learning_rate": 8.74261155779714e-06, + "loss": 0.78336382, + "memory(GiB)": 135.77, + "step": 21760, + "train_speed(iter/s)": 0.202047 + }, + { + "acc": 0.75856724, + "epoch": 0.5079105848729203, + "grad_norm": 7.21875, + "learning_rate": 8.741358602625455e-06, + "loss": 0.87376547, + "memory(GiB)": 135.77, + "step": 21770, + "train_speed(iter/s)": 0.202089 + }, + { + "acc": 0.78273201, + "epoch": 0.5081438924452092, + "grad_norm": 5.65625, + "learning_rate": 8.740105113375919e-06, + "loss": 0.76739621, + "memory(GiB)": 135.77, + "step": 21780, + "train_speed(iter/s)": 0.202136 + }, + { + "acc": 0.77240195, + "epoch": 0.5083772000174981, + "grad_norm": 4.21875, + "learning_rate": 8.738851090227462e-06, + "loss": 0.80845757, + "memory(GiB)": 135.77, + "step": 21790, + "train_speed(iter/s)": 0.202185 + }, + { + "acc": 0.78104792, + "epoch": 0.508610507589787, + "grad_norm": 5.625, + "learning_rate": 8.737596533359101e-06, + "loss": 0.78938627, + "memory(GiB)": 135.77, + "step": 21800, + "train_speed(iter/s)": 0.202235 + }, + { + "acc": 0.76249051, + "epoch": 0.5088438151620759, + "grad_norm": 5.96875, + "learning_rate": 8.736341442949919e-06, + "loss": 0.8694706, + "memory(GiB)": 135.77, + "step": 21810, + "train_speed(iter/s)": 0.202282 + }, + { + "acc": 0.77197785, + "epoch": 0.5090771227343648, + "grad_norm": 6.6875, + "learning_rate": 8.73508581917908e-06, + "loss": 0.80897655, + "memory(GiB)": 135.77, + "step": 21820, + "train_speed(iter/s)": 0.202328 + }, + { + "acc": 0.75979204, + "epoch": 0.5093104303066537, + "grad_norm": 7.5625, + "learning_rate": 8.733829662225825e-06, + "loss": 0.86723404, + "memory(GiB)": 135.77, + "step": 21830, + "train_speed(iter/s)": 0.202377 + }, + { + "acc": 0.77098808, + "epoch": 0.5095437378789426, + "grad_norm": 7.1875, + "learning_rate": 8.732572972269472e-06, + "loss": 0.84290762, + "memory(GiB)": 135.77, + "step": 21840, + "train_speed(iter/s)": 0.202422 + }, + { + "acc": 0.78516512, + "epoch": 0.5097770454512315, + "grad_norm": 5.5, + "learning_rate": 8.731315749489412e-06, + "loss": 0.77529469, + "memory(GiB)": 135.77, + "step": 21850, + "train_speed(iter/s)": 0.202471 + }, + { + "acc": 0.77012315, + "epoch": 0.5100103530235203, + "grad_norm": 5.625, + "learning_rate": 8.730057994065113e-06, + "loss": 0.81857462, + "memory(GiB)": 135.77, + "step": 21860, + "train_speed(iter/s)": 0.202521 + }, + { + "acc": 0.76024265, + "epoch": 0.5102436605958092, + "grad_norm": 7.84375, + "learning_rate": 8.728799706176117e-06, + "loss": 0.89396629, + "memory(GiB)": 135.77, + "step": 21870, + "train_speed(iter/s)": 0.20257 + }, + { + "acc": 0.79580889, + "epoch": 0.5104769681680981, + "grad_norm": 4.96875, + "learning_rate": 8.727540886002048e-06, + "loss": 0.71652207, + "memory(GiB)": 135.77, + "step": 21880, + "train_speed(iter/s)": 0.20262 + }, + { + "acc": 0.7760314, + "epoch": 0.510710275740387, + "grad_norm": 4.96875, + "learning_rate": 8.7262815337226e-06, + "loss": 0.81335468, + "memory(GiB)": 135.77, + "step": 21890, + "train_speed(iter/s)": 0.202666 + }, + { + "acc": 0.7670917, + "epoch": 0.5109435833126759, + "grad_norm": 5.21875, + "learning_rate": 8.725021649517545e-06, + "loss": 0.82741623, + "memory(GiB)": 135.77, + "step": 21900, + "train_speed(iter/s)": 0.202713 + }, + { + "acc": 0.76540327, + "epoch": 0.5111768908849648, + "grad_norm": 7.03125, + "learning_rate": 8.723761233566732e-06, + "loss": 0.85327883, + "memory(GiB)": 135.77, + "step": 21910, + "train_speed(iter/s)": 0.202761 + }, + { + "acc": 0.76294174, + "epoch": 0.5114101984572537, + "grad_norm": 7.59375, + "learning_rate": 8.722500286050084e-06, + "loss": 0.8585103, + "memory(GiB)": 135.77, + "step": 21920, + "train_speed(iter/s)": 0.202807 + }, + { + "acc": 0.77434635, + "epoch": 0.5116435060295426, + "grad_norm": 6.40625, + "learning_rate": 8.721238807147602e-06, + "loss": 0.83060961, + "memory(GiB)": 135.77, + "step": 21930, + "train_speed(iter/s)": 0.202851 + }, + { + "acc": 0.76418648, + "epoch": 0.5118768136018315, + "grad_norm": 10.4375, + "learning_rate": 8.71997679703936e-06, + "loss": 0.84364891, + "memory(GiB)": 135.77, + "step": 21940, + "train_speed(iter/s)": 0.202899 + }, + { + "acc": 0.77589755, + "epoch": 0.5121101211741204, + "grad_norm": 6.53125, + "learning_rate": 8.718714255905514e-06, + "loss": 0.80891504, + "memory(GiB)": 135.77, + "step": 21950, + "train_speed(iter/s)": 0.202942 + }, + { + "acc": 0.78276296, + "epoch": 0.5123434287464093, + "grad_norm": 4.96875, + "learning_rate": 8.717451183926286e-06, + "loss": 0.78558817, + "memory(GiB)": 135.77, + "step": 21960, + "train_speed(iter/s)": 0.20299 + }, + { + "acc": 0.76813164, + "epoch": 0.5125767363186982, + "grad_norm": 5.0625, + "learning_rate": 8.716187581281982e-06, + "loss": 0.82764816, + "memory(GiB)": 135.77, + "step": 21970, + "train_speed(iter/s)": 0.203035 + }, + { + "acc": 0.76752729, + "epoch": 0.512810043890987, + "grad_norm": 10.5625, + "learning_rate": 8.71492344815298e-06, + "loss": 0.84671097, + "memory(GiB)": 135.77, + "step": 21980, + "train_speed(iter/s)": 0.203084 + }, + { + "acc": 0.7599195, + "epoch": 0.513043351463276, + "grad_norm": 5.5625, + "learning_rate": 8.713658784719735e-06, + "loss": 0.85972528, + "memory(GiB)": 135.77, + "step": 21990, + "train_speed(iter/s)": 0.203127 + }, + { + "acc": 0.77313404, + "epoch": 0.5132766590355649, + "grad_norm": 5.90625, + "learning_rate": 8.712393591162779e-06, + "loss": 0.82830524, + "memory(GiB)": 135.77, + "step": 22000, + "train_speed(iter/s)": 0.203176 + }, + { + "epoch": 0.5132766590355649, + "eval_acc": 0.739051699522658, + "eval_loss": 0.8252963423728943, + "eval_runtime": 1269.798, + "eval_samples_per_second": 28.344, + "eval_steps_per_second": 14.172, + "step": 22000 + }, + { + "acc": 0.76468811, + "epoch": 0.5135099666078538, + "grad_norm": 6.0, + "learning_rate": 8.711127867662715e-06, + "loss": 0.85957546, + "memory(GiB)": 135.77, + "step": 22010, + "train_speed(iter/s)": 0.200838 + }, + { + "acc": 0.77129335, + "epoch": 0.5137432741801427, + "grad_norm": 4.25, + "learning_rate": 8.709861614400223e-06, + "loss": 0.83853226, + "memory(GiB)": 135.77, + "step": 22020, + "train_speed(iter/s)": 0.200887 + }, + { + "acc": 0.78872757, + "epoch": 0.5139765817524315, + "grad_norm": 4.9375, + "learning_rate": 8.708594831556068e-06, + "loss": 0.76557589, + "memory(GiB)": 135.77, + "step": 22030, + "train_speed(iter/s)": 0.200934 + }, + { + "acc": 0.76171837, + "epoch": 0.5142098893247204, + "grad_norm": 5.75, + "learning_rate": 8.707327519311075e-06, + "loss": 0.8551321, + "memory(GiB)": 135.77, + "step": 22040, + "train_speed(iter/s)": 0.200982 + }, + { + "acc": 0.76743393, + "epoch": 0.5144431968970092, + "grad_norm": 8.3125, + "learning_rate": 8.706059677846157e-06, + "loss": 0.85651779, + "memory(GiB)": 135.77, + "step": 22050, + "train_speed(iter/s)": 0.201031 + }, + { + "acc": 0.77318702, + "epoch": 0.5146765044692981, + "grad_norm": 4.8125, + "learning_rate": 8.704791307342297e-06, + "loss": 0.79379597, + "memory(GiB)": 135.77, + "step": 22060, + "train_speed(iter/s)": 0.201077 + }, + { + "acc": 0.7719686, + "epoch": 0.514909812041587, + "grad_norm": 5.8125, + "learning_rate": 8.703522407980554e-06, + "loss": 0.81712446, + "memory(GiB)": 135.77, + "step": 22070, + "train_speed(iter/s)": 0.201124 + }, + { + "acc": 0.77638397, + "epoch": 0.5151431196138759, + "grad_norm": 10.875, + "learning_rate": 8.702252979942063e-06, + "loss": 0.80130062, + "memory(GiB)": 135.77, + "step": 22080, + "train_speed(iter/s)": 0.201174 + }, + { + "acc": 0.77209868, + "epoch": 0.5153764271861648, + "grad_norm": 5.21875, + "learning_rate": 8.700983023408034e-06, + "loss": 0.8194252, + "memory(GiB)": 135.77, + "step": 22090, + "train_speed(iter/s)": 0.201224 + }, + { + "acc": 0.76339512, + "epoch": 0.5156097347584537, + "grad_norm": 6.71875, + "learning_rate": 8.699712538559752e-06, + "loss": 0.85726337, + "memory(GiB)": 135.77, + "step": 22100, + "train_speed(iter/s)": 0.201271 + }, + { + "acc": 0.7645709, + "epoch": 0.5158430423307426, + "grad_norm": 4.9375, + "learning_rate": 8.698441525578582e-06, + "loss": 0.85991373, + "memory(GiB)": 135.77, + "step": 22110, + "train_speed(iter/s)": 0.201319 + }, + { + "acc": 0.79457126, + "epoch": 0.5160763499030315, + "grad_norm": 5.125, + "learning_rate": 8.697169984645959e-06, + "loss": 0.72133851, + "memory(GiB)": 135.77, + "step": 22120, + "train_speed(iter/s)": 0.201368 + }, + { + "acc": 0.75647445, + "epoch": 0.5163096574753204, + "grad_norm": 4.0625, + "learning_rate": 8.695897915943395e-06, + "loss": 0.86364326, + "memory(GiB)": 135.77, + "step": 22130, + "train_speed(iter/s)": 0.201417 + }, + { + "acc": 0.76443992, + "epoch": 0.5165429650476093, + "grad_norm": 5.15625, + "learning_rate": 8.694625319652477e-06, + "loss": 0.85923405, + "memory(GiB)": 135.77, + "step": 22140, + "train_speed(iter/s)": 0.201461 + }, + { + "acc": 0.79320621, + "epoch": 0.5167762726198982, + "grad_norm": 5.46875, + "learning_rate": 8.693352195954866e-06, + "loss": 0.74455805, + "memory(GiB)": 135.77, + "step": 22150, + "train_speed(iter/s)": 0.201504 + }, + { + "acc": 0.75500603, + "epoch": 0.5170095801921871, + "grad_norm": 4.90625, + "learning_rate": 8.692078545032304e-06, + "loss": 0.89729595, + "memory(GiB)": 135.77, + "step": 22160, + "train_speed(iter/s)": 0.201548 + }, + { + "acc": 0.78628874, + "epoch": 0.517242887764476, + "grad_norm": 6.65625, + "learning_rate": 8.6908043670666e-06, + "loss": 0.77559862, + "memory(GiB)": 135.77, + "step": 22170, + "train_speed(iter/s)": 0.201596 + }, + { + "acc": 0.7568099, + "epoch": 0.5174761953367649, + "grad_norm": 6.21875, + "learning_rate": 8.689529662239647e-06, + "loss": 0.91406498, + "memory(GiB)": 135.77, + "step": 22180, + "train_speed(iter/s)": 0.20164 + }, + { + "acc": 0.76510105, + "epoch": 0.5177095029090538, + "grad_norm": 4.40625, + "learning_rate": 8.688254430733405e-06, + "loss": 0.85134945, + "memory(GiB)": 135.77, + "step": 22190, + "train_speed(iter/s)": 0.201688 + }, + { + "acc": 0.77507906, + "epoch": 0.5179428104813427, + "grad_norm": 5.5, + "learning_rate": 8.686978672729916e-06, + "loss": 0.8101469, + "memory(GiB)": 135.77, + "step": 22200, + "train_speed(iter/s)": 0.201735 + }, + { + "acc": 0.75599632, + "epoch": 0.5181761180536316, + "grad_norm": 4.78125, + "learning_rate": 8.68570238841129e-06, + "loss": 0.86438799, + "memory(GiB)": 135.77, + "step": 22210, + "train_speed(iter/s)": 0.201783 + }, + { + "acc": 0.76995659, + "epoch": 0.5184094256259205, + "grad_norm": 6.125, + "learning_rate": 8.684425577959722e-06, + "loss": 0.82093, + "memory(GiB)": 135.77, + "step": 22220, + "train_speed(iter/s)": 0.201832 + }, + { + "acc": 0.77510152, + "epoch": 0.5186427331982094, + "grad_norm": 4.3125, + "learning_rate": 8.683148241557472e-06, + "loss": 0.80338688, + "memory(GiB)": 135.77, + "step": 22230, + "train_speed(iter/s)": 0.201878 + }, + { + "acc": 0.76440411, + "epoch": 0.5188760407704982, + "grad_norm": 6.125, + "learning_rate": 8.681870379386879e-06, + "loss": 0.83043871, + "memory(GiB)": 135.77, + "step": 22240, + "train_speed(iter/s)": 0.201929 + }, + { + "acc": 0.76180897, + "epoch": 0.5191093483427871, + "grad_norm": 5.78125, + "learning_rate": 8.68059199163036e-06, + "loss": 0.86176853, + "memory(GiB)": 135.77, + "step": 22250, + "train_speed(iter/s)": 0.201964 + }, + { + "acc": 0.77140579, + "epoch": 0.519342655915076, + "grad_norm": 6.4375, + "learning_rate": 8.679313078470403e-06, + "loss": 0.80798788, + "memory(GiB)": 135.77, + "step": 22260, + "train_speed(iter/s)": 0.202008 + }, + { + "acc": 0.76855969, + "epoch": 0.5195759634873649, + "grad_norm": 4.875, + "learning_rate": 8.678033640089574e-06, + "loss": 0.83115768, + "memory(GiB)": 135.77, + "step": 22270, + "train_speed(iter/s)": 0.202055 + }, + { + "acc": 0.78249931, + "epoch": 0.5198092710596538, + "grad_norm": 4.65625, + "learning_rate": 8.676753676670511e-06, + "loss": 0.78415298, + "memory(GiB)": 135.77, + "step": 22280, + "train_speed(iter/s)": 0.2021 + }, + { + "acc": 0.78204756, + "epoch": 0.5200425786319427, + "grad_norm": 4.15625, + "learning_rate": 8.67547318839593e-06, + "loss": 0.78528633, + "memory(GiB)": 135.77, + "step": 22290, + "train_speed(iter/s)": 0.202146 + }, + { + "acc": 0.76204596, + "epoch": 0.5202758862042316, + "grad_norm": 6.75, + "learning_rate": 8.674192175448617e-06, + "loss": 0.8319747, + "memory(GiB)": 135.77, + "step": 22300, + "train_speed(iter/s)": 0.202193 + }, + { + "acc": 0.76219587, + "epoch": 0.5205091937765205, + "grad_norm": 6.53125, + "learning_rate": 8.672910638011439e-06, + "loss": 0.84944124, + "memory(GiB)": 135.77, + "step": 22310, + "train_speed(iter/s)": 0.202241 + }, + { + "acc": 0.76580586, + "epoch": 0.5207425013488094, + "grad_norm": 5.25, + "learning_rate": 8.671628576267333e-06, + "loss": 0.83932171, + "memory(GiB)": 135.77, + "step": 22320, + "train_speed(iter/s)": 0.20229 + }, + { + "acc": 0.76935239, + "epoch": 0.5209758089210983, + "grad_norm": 5.34375, + "learning_rate": 8.670345990399317e-06, + "loss": 0.82982864, + "memory(GiB)": 135.77, + "step": 22330, + "train_speed(iter/s)": 0.202333 + }, + { + "acc": 0.79416113, + "epoch": 0.5212091164933872, + "grad_norm": 7.59375, + "learning_rate": 8.669062880590474e-06, + "loss": 0.73947349, + "memory(GiB)": 135.77, + "step": 22340, + "train_speed(iter/s)": 0.202381 + }, + { + "acc": 0.77279081, + "epoch": 0.5214424240656761, + "grad_norm": 4.71875, + "learning_rate": 8.667779247023974e-06, + "loss": 0.81007175, + "memory(GiB)": 135.77, + "step": 22350, + "train_speed(iter/s)": 0.202429 + }, + { + "acc": 0.7789588, + "epoch": 0.521675731637965, + "grad_norm": 5.03125, + "learning_rate": 8.666495089883049e-06, + "loss": 0.76399951, + "memory(GiB)": 135.77, + "step": 22360, + "train_speed(iter/s)": 0.202475 + }, + { + "acc": 0.78549161, + "epoch": 0.5219090392102539, + "grad_norm": 4.65625, + "learning_rate": 8.665210409351015e-06, + "loss": 0.77918396, + "memory(GiB)": 135.77, + "step": 22370, + "train_speed(iter/s)": 0.202519 + }, + { + "acc": 0.76314993, + "epoch": 0.5221423467825428, + "grad_norm": 6.125, + "learning_rate": 8.663925205611261e-06, + "loss": 0.87036915, + "memory(GiB)": 135.77, + "step": 22380, + "train_speed(iter/s)": 0.202566 + }, + { + "acc": 0.79042592, + "epoch": 0.5223756543548317, + "grad_norm": 4.875, + "learning_rate": 8.66263947884725e-06, + "loss": 0.77454262, + "memory(GiB)": 135.77, + "step": 22390, + "train_speed(iter/s)": 0.202605 + }, + { + "acc": 0.76517191, + "epoch": 0.5226089619271206, + "grad_norm": 3.859375, + "learning_rate": 8.661353229242514e-06, + "loss": 0.84707232, + "memory(GiB)": 135.77, + "step": 22400, + "train_speed(iter/s)": 0.202646 + }, + { + "acc": 0.77440958, + "epoch": 0.5228422694994095, + "grad_norm": 5.46875, + "learning_rate": 8.66006645698067e-06, + "loss": 0.80511379, + "memory(GiB)": 135.77, + "step": 22410, + "train_speed(iter/s)": 0.202689 + }, + { + "acc": 0.77160006, + "epoch": 0.5230755770716984, + "grad_norm": 6.71875, + "learning_rate": 8.658779162245404e-06, + "loss": 0.82624683, + "memory(GiB)": 135.77, + "step": 22420, + "train_speed(iter/s)": 0.202737 + }, + { + "acc": 0.79068604, + "epoch": 0.5233088846439872, + "grad_norm": 6.1875, + "learning_rate": 8.657491345220475e-06, + "loss": 0.74397564, + "memory(GiB)": 135.77, + "step": 22430, + "train_speed(iter/s)": 0.202783 + }, + { + "acc": 0.79531755, + "epoch": 0.5235421922162761, + "grad_norm": 4.34375, + "learning_rate": 8.656203006089716e-06, + "loss": 0.72965755, + "memory(GiB)": 135.77, + "step": 22440, + "train_speed(iter/s)": 0.202829 + }, + { + "acc": 0.78984456, + "epoch": 0.523775499788565, + "grad_norm": 6.46875, + "learning_rate": 8.654914145037044e-06, + "loss": 0.74049387, + "memory(GiB)": 135.77, + "step": 22450, + "train_speed(iter/s)": 0.202874 + }, + { + "acc": 0.76976719, + "epoch": 0.5240088073608539, + "grad_norm": 4.875, + "learning_rate": 8.653624762246437e-06, + "loss": 0.82113333, + "memory(GiB)": 135.77, + "step": 22460, + "train_speed(iter/s)": 0.202921 + }, + { + "acc": 0.76820931, + "epoch": 0.5242421149331428, + "grad_norm": 4.90625, + "learning_rate": 8.652334857901957e-06, + "loss": 0.8498045, + "memory(GiB)": 135.77, + "step": 22470, + "train_speed(iter/s)": 0.202959 + }, + { + "acc": 0.75914822, + "epoch": 0.5244754225054317, + "grad_norm": 6.53125, + "learning_rate": 8.651044432187736e-06, + "loss": 0.895998, + "memory(GiB)": 135.77, + "step": 22480, + "train_speed(iter/s)": 0.203006 + }, + { + "acc": 0.77738414, + "epoch": 0.5247087300777206, + "grad_norm": 4.75, + "learning_rate": 8.649753485287986e-06, + "loss": 0.79222326, + "memory(GiB)": 135.77, + "step": 22490, + "train_speed(iter/s)": 0.203052 + }, + { + "acc": 0.78186607, + "epoch": 0.5249420376500095, + "grad_norm": 7.0625, + "learning_rate": 8.648462017386982e-06, + "loss": 0.80105238, + "memory(GiB)": 135.77, + "step": 22500, + "train_speed(iter/s)": 0.203094 + }, + { + "epoch": 0.5249420376500095, + "eval_acc": 0.7395784139026973, + "eval_loss": 0.8238633871078491, + "eval_runtime": 1268.9883, + "eval_samples_per_second": 28.362, + "eval_steps_per_second": 14.181, + "step": 22500 + }, + { + "acc": 0.78160195, + "epoch": 0.5251753452222984, + "grad_norm": 4.875, + "learning_rate": 8.64717002866909e-06, + "loss": 0.77151155, + "memory(GiB)": 135.77, + "step": 22510, + "train_speed(iter/s)": 0.200805 + }, + { + "acc": 0.78638101, + "epoch": 0.5254086527945873, + "grad_norm": 4.71875, + "learning_rate": 8.64587751931873e-06, + "loss": 0.77090092, + "memory(GiB)": 135.77, + "step": 22520, + "train_speed(iter/s)": 0.200853 + }, + { + "acc": 0.78011241, + "epoch": 0.5256419603668762, + "grad_norm": 5.03125, + "learning_rate": 8.644584489520418e-06, + "loss": 0.78862772, + "memory(GiB)": 135.77, + "step": 22530, + "train_speed(iter/s)": 0.200897 + }, + { + "acc": 0.77731066, + "epoch": 0.5258752679391651, + "grad_norm": 4.8125, + "learning_rate": 8.643290939458728e-06, + "loss": 0.79162712, + "memory(GiB)": 135.77, + "step": 22540, + "train_speed(iter/s)": 0.200945 + }, + { + "acc": 0.77135563, + "epoch": 0.526108575511454, + "grad_norm": 4.84375, + "learning_rate": 8.641996869318313e-06, + "loss": 0.83071518, + "memory(GiB)": 135.77, + "step": 22550, + "train_speed(iter/s)": 0.200987 + }, + { + "acc": 0.78698025, + "epoch": 0.5263418830837429, + "grad_norm": 8.125, + "learning_rate": 8.640702279283904e-06, + "loss": 0.76378479, + "memory(GiB)": 135.77, + "step": 22560, + "train_speed(iter/s)": 0.201035 + }, + { + "acc": 0.75245285, + "epoch": 0.5265751906560318, + "grad_norm": 6.875, + "learning_rate": 8.639407169540302e-06, + "loss": 0.90592232, + "memory(GiB)": 135.77, + "step": 22570, + "train_speed(iter/s)": 0.201079 + }, + { + "acc": 0.76512051, + "epoch": 0.5268084982283207, + "grad_norm": 6.40625, + "learning_rate": 8.638111540272384e-06, + "loss": 0.84115753, + "memory(GiB)": 135.77, + "step": 22580, + "train_speed(iter/s)": 0.201125 + }, + { + "acc": 0.76233463, + "epoch": 0.5270418058006096, + "grad_norm": 5.09375, + "learning_rate": 8.636815391665102e-06, + "loss": 0.87490129, + "memory(GiB)": 135.77, + "step": 22590, + "train_speed(iter/s)": 0.201173 + }, + { + "acc": 0.77505226, + "epoch": 0.5272751133728985, + "grad_norm": 5.5625, + "learning_rate": 8.635518723903478e-06, + "loss": 0.82525368, + "memory(GiB)": 135.77, + "step": 22600, + "train_speed(iter/s)": 0.201217 + }, + { + "acc": 0.77524152, + "epoch": 0.5275084209451874, + "grad_norm": 5.375, + "learning_rate": 8.634221537172612e-06, + "loss": 0.81201534, + "memory(GiB)": 135.77, + "step": 22610, + "train_speed(iter/s)": 0.201261 + }, + { + "acc": 0.76617479, + "epoch": 0.5277417285174762, + "grad_norm": 6.625, + "learning_rate": 8.632923831657678e-06, + "loss": 0.85680828, + "memory(GiB)": 135.77, + "step": 22620, + "train_speed(iter/s)": 0.201309 + }, + { + "acc": 0.77629232, + "epoch": 0.527975036089765, + "grad_norm": 5.4375, + "learning_rate": 8.631625607543921e-06, + "loss": 0.7853322, + "memory(GiB)": 135.77, + "step": 22630, + "train_speed(iter/s)": 0.201353 + }, + { + "acc": 0.78849845, + "epoch": 0.5282083436620539, + "grad_norm": 4.1875, + "learning_rate": 8.630326865016663e-06, + "loss": 0.74947386, + "memory(GiB)": 135.77, + "step": 22640, + "train_speed(iter/s)": 0.201397 + }, + { + "acc": 0.76544561, + "epoch": 0.5284416512343428, + "grad_norm": 5.0625, + "learning_rate": 8.629027604261303e-06, + "loss": 0.84232635, + "memory(GiB)": 135.77, + "step": 22650, + "train_speed(iter/s)": 0.201444 + }, + { + "acc": 0.77632599, + "epoch": 0.5286749588066317, + "grad_norm": 6.21875, + "learning_rate": 8.627727825463303e-06, + "loss": 0.79433575, + "memory(GiB)": 135.77, + "step": 22660, + "train_speed(iter/s)": 0.201489 + }, + { + "acc": 0.78881607, + "epoch": 0.5289082663789206, + "grad_norm": 6.40625, + "learning_rate": 8.626427528808212e-06, + "loss": 0.75886679, + "memory(GiB)": 135.77, + "step": 22670, + "train_speed(iter/s)": 0.201535 + }, + { + "acc": 0.78701801, + "epoch": 0.5291415739512095, + "grad_norm": 14.4375, + "learning_rate": 8.625126714481645e-06, + "loss": 0.79751778, + "memory(GiB)": 135.77, + "step": 22680, + "train_speed(iter/s)": 0.201581 + }, + { + "acc": 0.76863942, + "epoch": 0.5293748815234984, + "grad_norm": 5.28125, + "learning_rate": 8.623825382669291e-06, + "loss": 0.82617855, + "memory(GiB)": 135.77, + "step": 22690, + "train_speed(iter/s)": 0.201626 + }, + { + "acc": 0.77472267, + "epoch": 0.5296081890957873, + "grad_norm": 6.40625, + "learning_rate": 8.622523533556916e-06, + "loss": 0.84323826, + "memory(GiB)": 135.77, + "step": 22700, + "train_speed(iter/s)": 0.201672 + }, + { + "acc": 0.76703949, + "epoch": 0.5298414966680762, + "grad_norm": 6.875, + "learning_rate": 8.621221167330363e-06, + "loss": 0.85303726, + "memory(GiB)": 135.77, + "step": 22710, + "train_speed(iter/s)": 0.201718 + }, + { + "acc": 0.76639805, + "epoch": 0.5300748042403651, + "grad_norm": 4.15625, + "learning_rate": 8.619918284175537e-06, + "loss": 0.82314072, + "memory(GiB)": 135.77, + "step": 22720, + "train_speed(iter/s)": 0.201763 + }, + { + "acc": 0.79025326, + "epoch": 0.530308111812654, + "grad_norm": 3.8125, + "learning_rate": 8.618614884278427e-06, + "loss": 0.73413811, + "memory(GiB)": 135.77, + "step": 22730, + "train_speed(iter/s)": 0.201808 + }, + { + "acc": 0.76510487, + "epoch": 0.5305414193849429, + "grad_norm": 4.625, + "learning_rate": 8.617310967825094e-06, + "loss": 0.82598991, + "memory(GiB)": 135.77, + "step": 22740, + "train_speed(iter/s)": 0.201853 + }, + { + "acc": 0.76874247, + "epoch": 0.5307747269572318, + "grad_norm": 4.9375, + "learning_rate": 8.616006535001673e-06, + "loss": 0.83377934, + "memory(GiB)": 135.77, + "step": 22750, + "train_speed(iter/s)": 0.201897 + }, + { + "acc": 0.76021204, + "epoch": 0.5310080345295207, + "grad_norm": 5.15625, + "learning_rate": 8.614701585994368e-06, + "loss": 0.87588558, + "memory(GiB)": 135.77, + "step": 22760, + "train_speed(iter/s)": 0.201943 + }, + { + "acc": 0.77022514, + "epoch": 0.5312413421018096, + "grad_norm": 5.1875, + "learning_rate": 8.613396120989463e-06, + "loss": 0.82545309, + "memory(GiB)": 135.77, + "step": 22770, + "train_speed(iter/s)": 0.201989 + }, + { + "acc": 0.7779541, + "epoch": 0.5314746496740985, + "grad_norm": 4.75, + "learning_rate": 8.61209014017331e-06, + "loss": 0.7887279, + "memory(GiB)": 135.77, + "step": 22780, + "train_speed(iter/s)": 0.202035 + }, + { + "acc": 0.77761946, + "epoch": 0.5317079572463874, + "grad_norm": 4.40625, + "learning_rate": 8.610783643732339e-06, + "loss": 0.78312531, + "memory(GiB)": 135.77, + "step": 22790, + "train_speed(iter/s)": 0.202078 + }, + { + "acc": 0.76519241, + "epoch": 0.5319412648186763, + "grad_norm": 9.9375, + "learning_rate": 8.60947663185305e-06, + "loss": 0.85003548, + "memory(GiB)": 135.77, + "step": 22800, + "train_speed(iter/s)": 0.202127 + }, + { + "acc": 0.770683, + "epoch": 0.5321745723909652, + "grad_norm": 4.90625, + "learning_rate": 8.608169104722024e-06, + "loss": 0.83203659, + "memory(GiB)": 135.77, + "step": 22810, + "train_speed(iter/s)": 0.202169 + }, + { + "acc": 0.75626473, + "epoch": 0.532407879963254, + "grad_norm": 6.0, + "learning_rate": 8.606861062525904e-06, + "loss": 0.88525591, + "memory(GiB)": 135.77, + "step": 22820, + "train_speed(iter/s)": 0.20221 + }, + { + "acc": 0.7737916, + "epoch": 0.5326411875355429, + "grad_norm": 4.59375, + "learning_rate": 8.605552505451417e-06, + "loss": 0.81642294, + "memory(GiB)": 135.77, + "step": 22830, + "train_speed(iter/s)": 0.202255 + }, + { + "acc": 0.77903752, + "epoch": 0.5328744951078318, + "grad_norm": 5.9375, + "learning_rate": 8.604243433685356e-06, + "loss": 0.8116827, + "memory(GiB)": 135.77, + "step": 22840, + "train_speed(iter/s)": 0.202302 + }, + { + "acc": 0.79416924, + "epoch": 0.5331078026801207, + "grad_norm": 6.96875, + "learning_rate": 8.602933847414592e-06, + "loss": 0.74621596, + "memory(GiB)": 135.77, + "step": 22850, + "train_speed(iter/s)": 0.202346 + }, + { + "acc": 0.77615061, + "epoch": 0.5333411102524096, + "grad_norm": 6.375, + "learning_rate": 8.601623746826068e-06, + "loss": 0.80405464, + "memory(GiB)": 135.77, + "step": 22860, + "train_speed(iter/s)": 0.20239 + }, + { + "acc": 0.75500526, + "epoch": 0.5335744178246985, + "grad_norm": 5.4375, + "learning_rate": 8.600313132106801e-06, + "loss": 0.92301064, + "memory(GiB)": 135.77, + "step": 22870, + "train_speed(iter/s)": 0.202435 + }, + { + "acc": 0.74417582, + "epoch": 0.5338077253969874, + "grad_norm": 14.125, + "learning_rate": 8.599002003443879e-06, + "loss": 0.93030882, + "memory(GiB)": 135.77, + "step": 22880, + "train_speed(iter/s)": 0.202484 + }, + { + "acc": 0.76454968, + "epoch": 0.5340410329692763, + "grad_norm": 5.21875, + "learning_rate": 8.597690361024468e-06, + "loss": 0.85199795, + "memory(GiB)": 135.77, + "step": 22890, + "train_speed(iter/s)": 0.202531 + }, + { + "acc": 0.77811437, + "epoch": 0.5342743405415652, + "grad_norm": 7.15625, + "learning_rate": 8.596378205035803e-06, + "loss": 0.80663748, + "memory(GiB)": 135.77, + "step": 22900, + "train_speed(iter/s)": 0.202578 + }, + { + "acc": 0.77453089, + "epoch": 0.5345076481138541, + "grad_norm": 5.21875, + "learning_rate": 8.595065535665192e-06, + "loss": 0.81512251, + "memory(GiB)": 135.77, + "step": 22910, + "train_speed(iter/s)": 0.202625 + }, + { + "acc": 0.77443113, + "epoch": 0.534740955686143, + "grad_norm": 5.53125, + "learning_rate": 8.593752353100022e-06, + "loss": 0.82128239, + "memory(GiB)": 135.77, + "step": 22920, + "train_speed(iter/s)": 0.202669 + }, + { + "acc": 0.78365164, + "epoch": 0.5349742632584319, + "grad_norm": 5.5, + "learning_rate": 8.592438657527746e-06, + "loss": 0.77799377, + "memory(GiB)": 135.77, + "step": 22930, + "train_speed(iter/s)": 0.202714 + }, + { + "acc": 0.76975231, + "epoch": 0.5352075708307208, + "grad_norm": 6.40625, + "learning_rate": 8.591124449135897e-06, + "loss": 0.81138725, + "memory(GiB)": 135.77, + "step": 22940, + "train_speed(iter/s)": 0.202762 + }, + { + "acc": 0.79545565, + "epoch": 0.5354408784030097, + "grad_norm": 13.875, + "learning_rate": 8.589809728112076e-06, + "loss": 0.72523451, + "memory(GiB)": 135.77, + "step": 22950, + "train_speed(iter/s)": 0.202809 + }, + { + "acc": 0.76595306, + "epoch": 0.5356741859752986, + "grad_norm": 11.1875, + "learning_rate": 8.588494494643959e-06, + "loss": 0.84729843, + "memory(GiB)": 135.77, + "step": 22960, + "train_speed(iter/s)": 0.202854 + }, + { + "acc": 0.77811699, + "epoch": 0.5359074935475875, + "grad_norm": 5.125, + "learning_rate": 8.587178748919294e-06, + "loss": 0.8056942, + "memory(GiB)": 135.77, + "step": 22970, + "train_speed(iter/s)": 0.202899 + }, + { + "acc": 0.77778416, + "epoch": 0.5361408011198764, + "grad_norm": 4.875, + "learning_rate": 8.585862491125906e-06, + "loss": 0.79754281, + "memory(GiB)": 135.77, + "step": 22980, + "train_speed(iter/s)": 0.202945 + }, + { + "acc": 0.77270217, + "epoch": 0.5363741086921653, + "grad_norm": 4.84375, + "learning_rate": 8.584545721451689e-06, + "loss": 0.81794977, + "memory(GiB)": 135.77, + "step": 22990, + "train_speed(iter/s)": 0.202991 + }, + { + "acc": 0.75620012, + "epoch": 0.5366074162644542, + "grad_norm": 5.03125, + "learning_rate": 8.583228440084612e-06, + "loss": 0.89408569, + "memory(GiB)": 135.77, + "step": 23000, + "train_speed(iter/s)": 0.203038 + }, + { + "epoch": 0.5366074162644542, + "eval_acc": 0.7395305307772392, + "eval_loss": 0.8233683705329895, + "eval_runtime": 1269.3275, + "eval_samples_per_second": 28.354, + "eval_steps_per_second": 14.178, + "step": 23000 + }, + { + "acc": 0.76377153, + "epoch": 0.536840723836743, + "grad_norm": 22.625, + "learning_rate": 8.581910647212714e-06, + "loss": 0.84084702, + "memory(GiB)": 135.77, + "step": 23010, + "train_speed(iter/s)": 0.200803 + }, + { + "acc": 0.77578378, + "epoch": 0.5370740314090319, + "grad_norm": 6.3125, + "learning_rate": 8.580592343024114e-06, + "loss": 0.81817398, + "memory(GiB)": 135.77, + "step": 23020, + "train_speed(iter/s)": 0.200847 + }, + { + "acc": 0.78007755, + "epoch": 0.5373073389813208, + "grad_norm": 4.375, + "learning_rate": 8.579273527706997e-06, + "loss": 0.79391298, + "memory(GiB)": 135.77, + "step": 23030, + "train_speed(iter/s)": 0.200887 + }, + { + "acc": 0.7673666, + "epoch": 0.5375406465536097, + "grad_norm": 5.0625, + "learning_rate": 8.577954201449621e-06, + "loss": 0.8341753, + "memory(GiB)": 135.77, + "step": 23040, + "train_speed(iter/s)": 0.200931 + }, + { + "acc": 0.75479908, + "epoch": 0.5377739541258986, + "grad_norm": 6.9375, + "learning_rate": 8.576634364440327e-06, + "loss": 0.87442303, + "memory(GiB)": 135.77, + "step": 23050, + "train_speed(iter/s)": 0.20097 + }, + { + "acc": 0.77252378, + "epoch": 0.5380072616981875, + "grad_norm": 6.65625, + "learning_rate": 8.575314016867512e-06, + "loss": 0.84094563, + "memory(GiB)": 135.77, + "step": 23060, + "train_speed(iter/s)": 0.201015 + }, + { + "acc": 0.76748762, + "epoch": 0.5382405692704764, + "grad_norm": 7.25, + "learning_rate": 8.573993158919661e-06, + "loss": 0.82697706, + "memory(GiB)": 135.77, + "step": 23070, + "train_speed(iter/s)": 0.201062 + }, + { + "acc": 0.77766724, + "epoch": 0.5384738768427653, + "grad_norm": 3.890625, + "learning_rate": 8.572671790785325e-06, + "loss": 0.78724775, + "memory(GiB)": 135.77, + "step": 23080, + "train_speed(iter/s)": 0.201105 + }, + { + "acc": 0.76558957, + "epoch": 0.5387071844150542, + "grad_norm": 4.8125, + "learning_rate": 8.57134991265313e-06, + "loss": 0.844098, + "memory(GiB)": 135.77, + "step": 23090, + "train_speed(iter/s)": 0.201148 + }, + { + "acc": 0.78299785, + "epoch": 0.5389404919873431, + "grad_norm": 5.90625, + "learning_rate": 8.57002752471177e-06, + "loss": 0.77799244, + "memory(GiB)": 135.77, + "step": 23100, + "train_speed(iter/s)": 0.201192 + }, + { + "acc": 0.78844042, + "epoch": 0.539173799559632, + "grad_norm": 4.78125, + "learning_rate": 8.56870462715002e-06, + "loss": 0.73825145, + "memory(GiB)": 135.77, + "step": 23110, + "train_speed(iter/s)": 0.201237 + }, + { + "acc": 0.78577042, + "epoch": 0.5394071071319209, + "grad_norm": 8.75, + "learning_rate": 8.567381220156721e-06, + "loss": 0.76423135, + "memory(GiB)": 135.77, + "step": 23120, + "train_speed(iter/s)": 0.201279 + }, + { + "acc": 0.76853571, + "epoch": 0.5396404147042098, + "grad_norm": 5.59375, + "learning_rate": 8.566057303920788e-06, + "loss": 0.82382536, + "memory(GiB)": 135.77, + "step": 23130, + "train_speed(iter/s)": 0.201326 + }, + { + "acc": 0.79181309, + "epoch": 0.5398737222764987, + "grad_norm": 3.703125, + "learning_rate": 8.564732878631212e-06, + "loss": 0.76008949, + "memory(GiB)": 135.77, + "step": 23140, + "train_speed(iter/s)": 0.20137 + }, + { + "acc": 0.77274303, + "epoch": 0.5401070298487876, + "grad_norm": 5.6875, + "learning_rate": 8.563407944477052e-06, + "loss": 0.82584839, + "memory(GiB)": 135.77, + "step": 23150, + "train_speed(iter/s)": 0.201413 + }, + { + "acc": 0.76535187, + "epoch": 0.5403403374210765, + "grad_norm": 5.8125, + "learning_rate": 8.562082501647445e-06, + "loss": 0.82021761, + "memory(GiB)": 135.77, + "step": 23160, + "train_speed(iter/s)": 0.201455 + }, + { + "acc": 0.76622362, + "epoch": 0.5405736449933654, + "grad_norm": 6.09375, + "learning_rate": 8.560756550331594e-06, + "loss": 0.82352085, + "memory(GiB)": 135.77, + "step": 23170, + "train_speed(iter/s)": 0.201499 + }, + { + "acc": 0.78566408, + "epoch": 0.5408069525656543, + "grad_norm": 5.34375, + "learning_rate": 8.55943009071878e-06, + "loss": 0.78319578, + "memory(GiB)": 135.77, + "step": 23180, + "train_speed(iter/s)": 0.201539 + }, + { + "acc": 0.79545965, + "epoch": 0.5410402601379432, + "grad_norm": 5.90625, + "learning_rate": 8.558103122998354e-06, + "loss": 0.74384508, + "memory(GiB)": 135.77, + "step": 23190, + "train_speed(iter/s)": 0.201582 + }, + { + "acc": 0.76623678, + "epoch": 0.5412735677102319, + "grad_norm": 6.3125, + "learning_rate": 8.556775647359744e-06, + "loss": 0.8515913, + "memory(GiB)": 135.77, + "step": 23200, + "train_speed(iter/s)": 0.201629 + }, + { + "acc": 0.76652737, + "epoch": 0.5415068752825208, + "grad_norm": 18.25, + "learning_rate": 8.55544766399244e-06, + "loss": 0.84666195, + "memory(GiB)": 135.77, + "step": 23210, + "train_speed(iter/s)": 0.201675 + }, + { + "acc": 0.77143021, + "epoch": 0.5417401828548097, + "grad_norm": 4.84375, + "learning_rate": 8.554119173086014e-06, + "loss": 0.82694244, + "memory(GiB)": 135.77, + "step": 23220, + "train_speed(iter/s)": 0.201722 + }, + { + "acc": 0.78573523, + "epoch": 0.5419734904270986, + "grad_norm": 4.4375, + "learning_rate": 8.552790174830112e-06, + "loss": 0.76147246, + "memory(GiB)": 135.77, + "step": 23230, + "train_speed(iter/s)": 0.201769 + }, + { + "acc": 0.77535591, + "epoch": 0.5422067979993875, + "grad_norm": 5.53125, + "learning_rate": 8.551460669414444e-06, + "loss": 0.80129585, + "memory(GiB)": 135.77, + "step": 23240, + "train_speed(iter/s)": 0.201817 + }, + { + "acc": 0.77153683, + "epoch": 0.5424401055716764, + "grad_norm": 5.84375, + "learning_rate": 8.550130657028797e-06, + "loss": 0.80181551, + "memory(GiB)": 135.77, + "step": 23250, + "train_speed(iter/s)": 0.201864 + }, + { + "acc": 0.78025579, + "epoch": 0.5426734131439653, + "grad_norm": 4.53125, + "learning_rate": 8.548800137863028e-06, + "loss": 0.78459616, + "memory(GiB)": 135.77, + "step": 23260, + "train_speed(iter/s)": 0.20191 + }, + { + "acc": 0.7832684, + "epoch": 0.5429067207162542, + "grad_norm": 5.25, + "learning_rate": 8.547469112107071e-06, + "loss": 0.79565368, + "memory(GiB)": 135.77, + "step": 23270, + "train_speed(iter/s)": 0.201959 + }, + { + "acc": 0.76917963, + "epoch": 0.5431400282885431, + "grad_norm": 5.96875, + "learning_rate": 8.54613757995093e-06, + "loss": 0.83835936, + "memory(GiB)": 135.77, + "step": 23280, + "train_speed(iter/s)": 0.202004 + }, + { + "acc": 0.76447086, + "epoch": 0.543373335860832, + "grad_norm": 7.375, + "learning_rate": 8.54480554158468e-06, + "loss": 0.83568411, + "memory(GiB)": 135.77, + "step": 23290, + "train_speed(iter/s)": 0.202048 + }, + { + "acc": 0.757164, + "epoch": 0.5436066434331209, + "grad_norm": 5.34375, + "learning_rate": 8.543472997198467e-06, + "loss": 0.87914925, + "memory(GiB)": 135.77, + "step": 23300, + "train_speed(iter/s)": 0.202092 + }, + { + "acc": 0.76810994, + "epoch": 0.5438399510054098, + "grad_norm": 5.96875, + "learning_rate": 8.542139946982516e-06, + "loss": 0.81503553, + "memory(GiB)": 135.77, + "step": 23310, + "train_speed(iter/s)": 0.202138 + }, + { + "acc": 0.7738739, + "epoch": 0.5440732585776987, + "grad_norm": 7.0625, + "learning_rate": 8.540806391127112e-06, + "loss": 0.82891731, + "memory(GiB)": 135.77, + "step": 23320, + "train_speed(iter/s)": 0.202183 + }, + { + "acc": 0.76221933, + "epoch": 0.5443065661499876, + "grad_norm": 6.1875, + "learning_rate": 8.539472329822627e-06, + "loss": 0.86402712, + "memory(GiB)": 135.77, + "step": 23330, + "train_speed(iter/s)": 0.20223 + }, + { + "acc": 0.79112253, + "epoch": 0.5445398737222765, + "grad_norm": 6.09375, + "learning_rate": 8.538137763259495e-06, + "loss": 0.75483222, + "memory(GiB)": 135.77, + "step": 23340, + "train_speed(iter/s)": 0.202277 + }, + { + "acc": 0.78928189, + "epoch": 0.5447731812945654, + "grad_norm": 5.4375, + "learning_rate": 8.536802691628226e-06, + "loss": 0.77416606, + "memory(GiB)": 135.77, + "step": 23350, + "train_speed(iter/s)": 0.202318 + }, + { + "acc": 0.77431383, + "epoch": 0.5450064888668543, + "grad_norm": 7.90625, + "learning_rate": 8.535467115119399e-06, + "loss": 0.79211545, + "memory(GiB)": 135.77, + "step": 23360, + "train_speed(iter/s)": 0.202362 + }, + { + "acc": 0.78563662, + "epoch": 0.5452397964391432, + "grad_norm": 4.40625, + "learning_rate": 8.534131033923668e-06, + "loss": 0.76018214, + "memory(GiB)": 135.77, + "step": 23370, + "train_speed(iter/s)": 0.202406 + }, + { + "acc": 0.7482007, + "epoch": 0.5454731040114321, + "grad_norm": 5.15625, + "learning_rate": 8.53279444823176e-06, + "loss": 0.90958967, + "memory(GiB)": 135.77, + "step": 23380, + "train_speed(iter/s)": 0.202452 + }, + { + "acc": 0.77748098, + "epoch": 0.545706411583721, + "grad_norm": 4.75, + "learning_rate": 8.531457358234469e-06, + "loss": 0.81523991, + "memory(GiB)": 135.77, + "step": 23390, + "train_speed(iter/s)": 0.202498 + }, + { + "acc": 0.78067293, + "epoch": 0.5459397191560098, + "grad_norm": 4.65625, + "learning_rate": 8.530119764122666e-06, + "loss": 0.80296059, + "memory(GiB)": 135.77, + "step": 23400, + "train_speed(iter/s)": 0.202546 + }, + { + "acc": 0.77213707, + "epoch": 0.5461730267282987, + "grad_norm": 6.3125, + "learning_rate": 8.528781666087294e-06, + "loss": 0.809834, + "memory(GiB)": 135.77, + "step": 23410, + "train_speed(iter/s)": 0.202589 + }, + { + "acc": 0.76574154, + "epoch": 0.5464063343005876, + "grad_norm": 5.75, + "learning_rate": 8.527443064319362e-06, + "loss": 0.82819872, + "memory(GiB)": 135.77, + "step": 23420, + "train_speed(iter/s)": 0.202637 + }, + { + "acc": 0.78312383, + "epoch": 0.5466396418728765, + "grad_norm": 5.4375, + "learning_rate": 8.526103959009959e-06, + "loss": 0.75177727, + "memory(GiB)": 135.77, + "step": 23430, + "train_speed(iter/s)": 0.202685 + }, + { + "acc": 0.77955656, + "epoch": 0.5468729494451654, + "grad_norm": 4.34375, + "learning_rate": 8.52476435035024e-06, + "loss": 0.78664408, + "memory(GiB)": 135.77, + "step": 23440, + "train_speed(iter/s)": 0.202731 + }, + { + "acc": 0.76553507, + "epoch": 0.5471062570174543, + "grad_norm": 5.8125, + "learning_rate": 8.523424238531435e-06, + "loss": 0.84986553, + "memory(GiB)": 135.77, + "step": 23450, + "train_speed(iter/s)": 0.202773 + }, + { + "acc": 0.77595615, + "epoch": 0.5473395645897432, + "grad_norm": 5.1875, + "learning_rate": 8.522083623744841e-06, + "loss": 0.80186443, + "memory(GiB)": 135.77, + "step": 23460, + "train_speed(iter/s)": 0.202818 + }, + { + "acc": 0.76666412, + "epoch": 0.5475728721620321, + "grad_norm": 7.65625, + "learning_rate": 8.520742506181834e-06, + "loss": 0.85667477, + "memory(GiB)": 135.77, + "step": 23470, + "train_speed(iter/s)": 0.202861 + }, + { + "acc": 0.78935471, + "epoch": 0.547806179734321, + "grad_norm": 5.625, + "learning_rate": 8.519400886033858e-06, + "loss": 0.77769156, + "memory(GiB)": 135.77, + "step": 23480, + "train_speed(iter/s)": 0.202904 + }, + { + "acc": 0.78004494, + "epoch": 0.5480394873066099, + "grad_norm": 5.46875, + "learning_rate": 8.518058763492428e-06, + "loss": 0.78160458, + "memory(GiB)": 135.77, + "step": 23490, + "train_speed(iter/s)": 0.202951 + }, + { + "acc": 0.7742156, + "epoch": 0.5482727948788988, + "grad_norm": 5.71875, + "learning_rate": 8.516716138749131e-06, + "loss": 0.82459059, + "memory(GiB)": 135.77, + "step": 23500, + "train_speed(iter/s)": 0.202994 + }, + { + "epoch": 0.5482727948788988, + "eval_acc": 0.7398569047226694, + "eval_loss": 0.8228998184204102, + "eval_runtime": 1269.4084, + "eval_samples_per_second": 28.353, + "eval_steps_per_second": 14.177, + "step": 23500 + }, + { + "acc": 0.76532078, + "epoch": 0.5485061024511877, + "grad_norm": 6.0, + "learning_rate": 8.515373011995624e-06, + "loss": 0.84984226, + "memory(GiB)": 135.77, + "step": 23510, + "train_speed(iter/s)": 0.200806 + }, + { + "acc": 0.78477125, + "epoch": 0.5487394100234766, + "grad_norm": 6.78125, + "learning_rate": 8.514029383423644e-06, + "loss": 0.75276413, + "memory(GiB)": 135.77, + "step": 23520, + "train_speed(iter/s)": 0.200846 + }, + { + "acc": 0.79451346, + "epoch": 0.5489727175957655, + "grad_norm": 6.03125, + "learning_rate": 8.51268525322499e-06, + "loss": 0.74686522, + "memory(GiB)": 135.77, + "step": 23530, + "train_speed(iter/s)": 0.200888 + }, + { + "acc": 0.76671562, + "epoch": 0.5492060251680544, + "grad_norm": 6.0625, + "learning_rate": 8.511340621591536e-06, + "loss": 0.84974279, + "memory(GiB)": 135.77, + "step": 23540, + "train_speed(iter/s)": 0.200932 + }, + { + "acc": 0.77346573, + "epoch": 0.5494393327403433, + "grad_norm": 5.78125, + "learning_rate": 8.509995488715228e-06, + "loss": 0.83321466, + "memory(GiB)": 135.77, + "step": 23550, + "train_speed(iter/s)": 0.200978 + }, + { + "acc": 0.7776125, + "epoch": 0.5496726403126322, + "grad_norm": 3.34375, + "learning_rate": 8.508649854788085e-06, + "loss": 0.80355291, + "memory(GiB)": 135.77, + "step": 23560, + "train_speed(iter/s)": 0.201019 + }, + { + "acc": 0.76173635, + "epoch": 0.5499059478849211, + "grad_norm": 7.90625, + "learning_rate": 8.507303720002194e-06, + "loss": 0.87492666, + "memory(GiB)": 135.77, + "step": 23570, + "train_speed(iter/s)": 0.201065 + }, + { + "acc": 0.77865009, + "epoch": 0.55013925545721, + "grad_norm": 5.0, + "learning_rate": 8.505957084549714e-06, + "loss": 0.82421484, + "memory(GiB)": 135.77, + "step": 23580, + "train_speed(iter/s)": 0.201109 + }, + { + "acc": 0.77421579, + "epoch": 0.5503725630294988, + "grad_norm": 5.1875, + "learning_rate": 8.50460994862288e-06, + "loss": 0.80633135, + "memory(GiB)": 135.77, + "step": 23590, + "train_speed(iter/s)": 0.201151 + }, + { + "acc": 0.77418938, + "epoch": 0.5506058706017877, + "grad_norm": 4.65625, + "learning_rate": 8.503262312413994e-06, + "loss": 0.82369556, + "memory(GiB)": 135.77, + "step": 23600, + "train_speed(iter/s)": 0.201192 + }, + { + "acc": 0.775385, + "epoch": 0.5508391781740766, + "grad_norm": 4.96875, + "learning_rate": 8.501914176115432e-06, + "loss": 0.81491432, + "memory(GiB)": 135.77, + "step": 23610, + "train_speed(iter/s)": 0.201234 + }, + { + "acc": 0.76755171, + "epoch": 0.5510724857463655, + "grad_norm": 7.46875, + "learning_rate": 8.500565539919636e-06, + "loss": 0.84395332, + "memory(GiB)": 135.77, + "step": 23620, + "train_speed(iter/s)": 0.201278 + }, + { + "acc": 0.77598143, + "epoch": 0.5513057933186544, + "grad_norm": 6.75, + "learning_rate": 8.499216404019129e-06, + "loss": 0.80111885, + "memory(GiB)": 135.77, + "step": 23630, + "train_speed(iter/s)": 0.201323 + }, + { + "acc": 0.7583106, + "epoch": 0.5515391008909433, + "grad_norm": 7.28125, + "learning_rate": 8.497866768606493e-06, + "loss": 0.86915989, + "memory(GiB)": 135.77, + "step": 23640, + "train_speed(iter/s)": 0.201369 + }, + { + "acc": 0.77197981, + "epoch": 0.5517724084632322, + "grad_norm": 5.4375, + "learning_rate": 8.496516633874395e-06, + "loss": 0.81613064, + "memory(GiB)": 135.77, + "step": 23650, + "train_speed(iter/s)": 0.201414 + }, + { + "acc": 0.7657783, + "epoch": 0.5520057160355211, + "grad_norm": 5.625, + "learning_rate": 8.495166000015562e-06, + "loss": 0.84468031, + "memory(GiB)": 135.77, + "step": 23660, + "train_speed(iter/s)": 0.201457 + }, + { + "acc": 0.77991076, + "epoch": 0.55223902360781, + "grad_norm": 6.15625, + "learning_rate": 8.493814867222799e-06, + "loss": 0.80958157, + "memory(GiB)": 135.77, + "step": 23670, + "train_speed(iter/s)": 0.201502 + }, + { + "acc": 0.77264977, + "epoch": 0.5524723311800989, + "grad_norm": 5.5625, + "learning_rate": 8.492463235688977e-06, + "loss": 0.79891062, + "memory(GiB)": 135.77, + "step": 23680, + "train_speed(iter/s)": 0.201547 + }, + { + "acc": 0.7838623, + "epoch": 0.5527056387523878, + "grad_norm": 5.65625, + "learning_rate": 8.491111105607044e-06, + "loss": 0.77505665, + "memory(GiB)": 135.77, + "step": 23690, + "train_speed(iter/s)": 0.20159 + }, + { + "acc": 0.77503614, + "epoch": 0.5529389463246767, + "grad_norm": 4.59375, + "learning_rate": 8.489758477170015e-06, + "loss": 0.80072899, + "memory(GiB)": 135.77, + "step": 23700, + "train_speed(iter/s)": 0.201631 + }, + { + "acc": 0.76735268, + "epoch": 0.5531722538969656, + "grad_norm": 6.40625, + "learning_rate": 8.488405350570976e-06, + "loss": 0.84666109, + "memory(GiB)": 135.77, + "step": 23710, + "train_speed(iter/s)": 0.201675 + }, + { + "acc": 0.76926999, + "epoch": 0.5534055614692545, + "grad_norm": 7.34375, + "learning_rate": 8.487051726003087e-06, + "loss": 0.81005325, + "memory(GiB)": 135.77, + "step": 23720, + "train_speed(iter/s)": 0.201719 + }, + { + "acc": 0.7819787, + "epoch": 0.5536388690415434, + "grad_norm": 6.34375, + "learning_rate": 8.485697603659578e-06, + "loss": 0.77697315, + "memory(GiB)": 135.77, + "step": 23730, + "train_speed(iter/s)": 0.201765 + }, + { + "acc": 0.76455793, + "epoch": 0.5538721766138323, + "grad_norm": 6.1875, + "learning_rate": 8.484342983733747e-06, + "loss": 0.8478878, + "memory(GiB)": 135.77, + "step": 23740, + "train_speed(iter/s)": 0.201811 + }, + { + "acc": 0.77278852, + "epoch": 0.5541054841861212, + "grad_norm": 5.375, + "learning_rate": 8.482987866418968e-06, + "loss": 0.82200966, + "memory(GiB)": 135.77, + "step": 23750, + "train_speed(iter/s)": 0.201851 + }, + { + "acc": 0.79056635, + "epoch": 0.55433879175841, + "grad_norm": 7.78125, + "learning_rate": 8.481632251908684e-06, + "loss": 0.75850916, + "memory(GiB)": 135.77, + "step": 23760, + "train_speed(iter/s)": 0.201893 + }, + { + "acc": 0.78154702, + "epoch": 0.554572099330699, + "grad_norm": 7.5, + "learning_rate": 8.480276140396406e-06, + "loss": 0.78442221, + "memory(GiB)": 135.77, + "step": 23770, + "train_speed(iter/s)": 0.201934 + }, + { + "acc": 0.77974281, + "epoch": 0.5548054069029877, + "grad_norm": 6.03125, + "learning_rate": 8.478919532075723e-06, + "loss": 0.80161047, + "memory(GiB)": 135.77, + "step": 23780, + "train_speed(iter/s)": 0.201977 + }, + { + "acc": 0.77860785, + "epoch": 0.5550387144752766, + "grad_norm": 5.53125, + "learning_rate": 8.477562427140283e-06, + "loss": 0.80894966, + "memory(GiB)": 135.77, + "step": 23790, + "train_speed(iter/s)": 0.20202 + }, + { + "acc": 0.77603893, + "epoch": 0.5552720220475655, + "grad_norm": 4.3125, + "learning_rate": 8.47620482578382e-06, + "loss": 0.81759472, + "memory(GiB)": 135.77, + "step": 23800, + "train_speed(iter/s)": 0.202062 + }, + { + "acc": 0.77046776, + "epoch": 0.5555053296198544, + "grad_norm": 4.90625, + "learning_rate": 8.474846728200125e-06, + "loss": 0.82772932, + "memory(GiB)": 135.77, + "step": 23810, + "train_speed(iter/s)": 0.202107 + }, + { + "acc": 0.77512856, + "epoch": 0.5557386371921433, + "grad_norm": 3.6875, + "learning_rate": 8.473488134583071e-06, + "loss": 0.80098133, + "memory(GiB)": 135.77, + "step": 23820, + "train_speed(iter/s)": 0.20215 + }, + { + "acc": 0.7914567, + "epoch": 0.5559719447644322, + "grad_norm": 7.59375, + "learning_rate": 8.472129045126596e-06, + "loss": 0.74370127, + "memory(GiB)": 135.77, + "step": 23830, + "train_speed(iter/s)": 0.202193 + }, + { + "acc": 0.78856716, + "epoch": 0.5562052523367211, + "grad_norm": 6.0, + "learning_rate": 8.470769460024705e-06, + "loss": 0.75198135, + "memory(GiB)": 135.77, + "step": 23840, + "train_speed(iter/s)": 0.202238 + }, + { + "acc": 0.76632452, + "epoch": 0.55643855990901, + "grad_norm": 5.9375, + "learning_rate": 8.469409379471486e-06, + "loss": 0.86098537, + "memory(GiB)": 135.77, + "step": 23850, + "train_speed(iter/s)": 0.202281 + }, + { + "acc": 0.77517405, + "epoch": 0.5566718674812989, + "grad_norm": 4.625, + "learning_rate": 8.468048803661083e-06, + "loss": 0.79920835, + "memory(GiB)": 135.77, + "step": 23860, + "train_speed(iter/s)": 0.202325 + }, + { + "acc": 0.78587723, + "epoch": 0.5569051750535878, + "grad_norm": 7.875, + "learning_rate": 8.466687732787721e-06, + "loss": 0.77098179, + "memory(GiB)": 135.77, + "step": 23870, + "train_speed(iter/s)": 0.202368 + }, + { + "acc": 0.76620317, + "epoch": 0.5571384826258767, + "grad_norm": 10.5625, + "learning_rate": 8.465326167045693e-06, + "loss": 0.86184235, + "memory(GiB)": 135.77, + "step": 23880, + "train_speed(iter/s)": 0.202409 + }, + { + "acc": 0.77027483, + "epoch": 0.5573717901981656, + "grad_norm": 5.875, + "learning_rate": 8.463964106629361e-06, + "loss": 0.81875496, + "memory(GiB)": 135.77, + "step": 23890, + "train_speed(iter/s)": 0.20245 + }, + { + "acc": 0.78449135, + "epoch": 0.5576050977704545, + "grad_norm": 4.625, + "learning_rate": 8.46260155173316e-06, + "loss": 0.77044706, + "memory(GiB)": 135.77, + "step": 23900, + "train_speed(iter/s)": 0.202491 + }, + { + "acc": 0.77500076, + "epoch": 0.5578384053427434, + "grad_norm": 4.90625, + "learning_rate": 8.461238502551592e-06, + "loss": 0.81618023, + "memory(GiB)": 135.77, + "step": 23910, + "train_speed(iter/s)": 0.202537 + }, + { + "acc": 0.7681963, + "epoch": 0.5580717129150323, + "grad_norm": 6.3125, + "learning_rate": 8.459874959279235e-06, + "loss": 0.85666189, + "memory(GiB)": 135.77, + "step": 23920, + "train_speed(iter/s)": 0.202581 + }, + { + "acc": 0.75990124, + "epoch": 0.5583050204873212, + "grad_norm": 4.40625, + "learning_rate": 8.45851092211073e-06, + "loss": 0.86498947, + "memory(GiB)": 135.77, + "step": 23930, + "train_speed(iter/s)": 0.202626 + }, + { + "acc": 0.77422037, + "epoch": 0.5585383280596101, + "grad_norm": 4.59375, + "learning_rate": 8.457146391240798e-06, + "loss": 0.81769314, + "memory(GiB)": 135.77, + "step": 23940, + "train_speed(iter/s)": 0.202668 + }, + { + "acc": 0.78469434, + "epoch": 0.558771635631899, + "grad_norm": 5.125, + "learning_rate": 8.455781366864223e-06, + "loss": 0.77262511, + "memory(GiB)": 135.77, + "step": 23950, + "train_speed(iter/s)": 0.202713 + }, + { + "acc": 0.77914438, + "epoch": 0.5590049432041879, + "grad_norm": 6.5, + "learning_rate": 8.45441584917586e-06, + "loss": 0.7662066, + "memory(GiB)": 135.77, + "step": 23960, + "train_speed(iter/s)": 0.202752 + }, + { + "acc": 0.78139782, + "epoch": 0.5592382507764767, + "grad_norm": 4.90625, + "learning_rate": 8.453049838370639e-06, + "loss": 0.78332119, + "memory(GiB)": 135.77, + "step": 23970, + "train_speed(iter/s)": 0.202796 + }, + { + "acc": 0.75784979, + "epoch": 0.5594715583487656, + "grad_norm": 7.40625, + "learning_rate": 8.451683334643557e-06, + "loss": 0.88706684, + "memory(GiB)": 135.77, + "step": 23980, + "train_speed(iter/s)": 0.20284 + }, + { + "acc": 0.79134464, + "epoch": 0.5597048659210545, + "grad_norm": 8.8125, + "learning_rate": 8.45031633818968e-06, + "loss": 0.73851681, + "memory(GiB)": 135.77, + "step": 23990, + "train_speed(iter/s)": 0.202884 + }, + { + "acc": 0.76034174, + "epoch": 0.5599381734933434, + "grad_norm": 5.25, + "learning_rate": 8.44894884920415e-06, + "loss": 0.87289333, + "memory(GiB)": 135.77, + "step": 24000, + "train_speed(iter/s)": 0.20293 + }, + { + "epoch": 0.5599381734933434, + "eval_acc": 0.7398836488094638, + "eval_loss": 0.822571873664856, + "eval_runtime": 1269.3247, + "eval_samples_per_second": 28.354, + "eval_steps_per_second": 14.178, + "step": 24000 + }, + { + "acc": 0.76281738, + "epoch": 0.5601714810656323, + "grad_norm": 19.375, + "learning_rate": 8.447580867882172e-06, + "loss": 0.85142126, + "memory(GiB)": 135.77, + "step": 24010, + "train_speed(iter/s)": 0.200792 + }, + { + "acc": 0.79376831, + "epoch": 0.5604047886379212, + "grad_norm": 4.65625, + "learning_rate": 8.446212394419028e-06, + "loss": 0.74882054, + "memory(GiB)": 135.77, + "step": 24020, + "train_speed(iter/s)": 0.200833 + }, + { + "acc": 0.78502007, + "epoch": 0.5606380962102101, + "grad_norm": 4.5625, + "learning_rate": 8.444843429010065e-06, + "loss": 0.7658041, + "memory(GiB)": 135.77, + "step": 24030, + "train_speed(iter/s)": 0.200876 + }, + { + "acc": 0.79705825, + "epoch": 0.560871403782499, + "grad_norm": 4.75, + "learning_rate": 8.443473971850703e-06, + "loss": 0.72210217, + "memory(GiB)": 135.77, + "step": 24040, + "train_speed(iter/s)": 0.200917 + }, + { + "acc": 0.76908588, + "epoch": 0.5611047113547879, + "grad_norm": 7.78125, + "learning_rate": 8.442104023136435e-06, + "loss": 0.83583965, + "memory(GiB)": 135.77, + "step": 24050, + "train_speed(iter/s)": 0.20096 + }, + { + "acc": 0.78580241, + "epoch": 0.5613380189270768, + "grad_norm": 6.5625, + "learning_rate": 8.440733583062814e-06, + "loss": 0.76494713, + "memory(GiB)": 135.77, + "step": 24060, + "train_speed(iter/s)": 0.201002 + }, + { + "acc": 0.75843534, + "epoch": 0.5615713264993657, + "grad_norm": 6.8125, + "learning_rate": 8.439362651825475e-06, + "loss": 0.8755847, + "memory(GiB)": 135.77, + "step": 24070, + "train_speed(iter/s)": 0.201046 + }, + { + "acc": 0.77627125, + "epoch": 0.5618046340716546, + "grad_norm": 9.9375, + "learning_rate": 8.437991229620117e-06, + "loss": 0.80955582, + "memory(GiB)": 135.77, + "step": 24080, + "train_speed(iter/s)": 0.201086 + }, + { + "acc": 0.77059631, + "epoch": 0.5620379416439435, + "grad_norm": 4.46875, + "learning_rate": 8.436619316642508e-06, + "loss": 0.82761564, + "memory(GiB)": 135.77, + "step": 24090, + "train_speed(iter/s)": 0.20113 + }, + { + "acc": 0.77916279, + "epoch": 0.5622712492162324, + "grad_norm": 4.625, + "learning_rate": 8.435246913088492e-06, + "loss": 0.80605087, + "memory(GiB)": 135.77, + "step": 24100, + "train_speed(iter/s)": 0.201173 + }, + { + "acc": 0.78393345, + "epoch": 0.5625045567885213, + "grad_norm": 5.9375, + "learning_rate": 8.433874019153976e-06, + "loss": 0.78385267, + "memory(GiB)": 135.77, + "step": 24110, + "train_speed(iter/s)": 0.201217 + }, + { + "acc": 0.78179646, + "epoch": 0.5627378643608102, + "grad_norm": 5.28125, + "learning_rate": 8.432500635034942e-06, + "loss": 0.77540627, + "memory(GiB)": 135.77, + "step": 24120, + "train_speed(iter/s)": 0.201259 + }, + { + "acc": 0.76452451, + "epoch": 0.5629711719330991, + "grad_norm": 6.65625, + "learning_rate": 8.43112676092744e-06, + "loss": 0.88243942, + "memory(GiB)": 135.77, + "step": 24130, + "train_speed(iter/s)": 0.201301 + }, + { + "acc": 0.77689223, + "epoch": 0.563204479505388, + "grad_norm": 5.375, + "learning_rate": 8.429752397027585e-06, + "loss": 0.79288564, + "memory(GiB)": 135.77, + "step": 24140, + "train_speed(iter/s)": 0.201342 + }, + { + "acc": 0.77824707, + "epoch": 0.5634377870776769, + "grad_norm": 7.28125, + "learning_rate": 8.428377543531577e-06, + "loss": 0.79980693, + "memory(GiB)": 135.77, + "step": 24150, + "train_speed(iter/s)": 0.201385 + }, + { + "acc": 0.77147899, + "epoch": 0.5636710946499658, + "grad_norm": 9.625, + "learning_rate": 8.427002200635669e-06, + "loss": 0.82787066, + "memory(GiB)": 135.77, + "step": 24160, + "train_speed(iter/s)": 0.201428 + }, + { + "acc": 0.75589886, + "epoch": 0.5639044022222546, + "grad_norm": 5.8125, + "learning_rate": 8.425626368536192e-06, + "loss": 0.88443241, + "memory(GiB)": 135.77, + "step": 24170, + "train_speed(iter/s)": 0.201471 + }, + { + "acc": 0.7736052, + "epoch": 0.5641377097945435, + "grad_norm": 6.0625, + "learning_rate": 8.424250047429547e-06, + "loss": 0.82636681, + "memory(GiB)": 135.77, + "step": 24180, + "train_speed(iter/s)": 0.201514 + }, + { + "acc": 0.76566911, + "epoch": 0.5643710173668324, + "grad_norm": 5.8125, + "learning_rate": 8.4228732375122e-06, + "loss": 0.85641251, + "memory(GiB)": 135.77, + "step": 24190, + "train_speed(iter/s)": 0.201557 + }, + { + "acc": 0.77742729, + "epoch": 0.5646043249391213, + "grad_norm": 5.375, + "learning_rate": 8.421495938980695e-06, + "loss": 0.78332605, + "memory(GiB)": 135.77, + "step": 24200, + "train_speed(iter/s)": 0.201598 + }, + { + "acc": 0.77955341, + "epoch": 0.5648376325114102, + "grad_norm": 4.40625, + "learning_rate": 8.420118152031638e-06, + "loss": 0.79840794, + "memory(GiB)": 135.77, + "step": 24210, + "train_speed(iter/s)": 0.201641 + }, + { + "acc": 0.79004874, + "epoch": 0.5650709400836991, + "grad_norm": 6.71875, + "learning_rate": 8.418739876861708e-06, + "loss": 0.75119066, + "memory(GiB)": 135.77, + "step": 24220, + "train_speed(iter/s)": 0.201681 + }, + { + "acc": 0.77495551, + "epoch": 0.565304247655988, + "grad_norm": 5.875, + "learning_rate": 8.417361113667654e-06, + "loss": 0.80294151, + "memory(GiB)": 135.77, + "step": 24230, + "train_speed(iter/s)": 0.201723 + }, + { + "acc": 0.77338634, + "epoch": 0.5655375552282769, + "grad_norm": 6.34375, + "learning_rate": 8.415981862646295e-06, + "loss": 0.80396366, + "memory(GiB)": 135.77, + "step": 24240, + "train_speed(iter/s)": 0.201765 + }, + { + "acc": 0.76142254, + "epoch": 0.5657708628005658, + "grad_norm": 5.53125, + "learning_rate": 8.414602123994517e-06, + "loss": 0.8642437, + "memory(GiB)": 135.77, + "step": 24250, + "train_speed(iter/s)": 0.201808 + }, + { + "acc": 0.77578001, + "epoch": 0.5660041703728547, + "grad_norm": 7.84375, + "learning_rate": 8.413221897909277e-06, + "loss": 0.80832329, + "memory(GiB)": 135.77, + "step": 24260, + "train_speed(iter/s)": 0.201849 + }, + { + "acc": 0.75002069, + "epoch": 0.5662374779451436, + "grad_norm": 27.875, + "learning_rate": 8.411841184587602e-06, + "loss": 0.94861403, + "memory(GiB)": 135.77, + "step": 24270, + "train_speed(iter/s)": 0.201892 + }, + { + "acc": 0.79898376, + "epoch": 0.5664707855174325, + "grad_norm": 6.53125, + "learning_rate": 8.41045998422659e-06, + "loss": 0.73172636, + "memory(GiB)": 135.77, + "step": 24280, + "train_speed(iter/s)": 0.201931 + }, + { + "acc": 0.79310303, + "epoch": 0.5667040930897214, + "grad_norm": 6.03125, + "learning_rate": 8.409078297023406e-06, + "loss": 0.71440797, + "memory(GiB)": 135.77, + "step": 24290, + "train_speed(iter/s)": 0.201972 + }, + { + "acc": 0.77959661, + "epoch": 0.5669374006620103, + "grad_norm": 4.5625, + "learning_rate": 8.407696123175285e-06, + "loss": 0.78759675, + "memory(GiB)": 135.77, + "step": 24300, + "train_speed(iter/s)": 0.202013 + }, + { + "acc": 0.79495506, + "epoch": 0.5671707082342992, + "grad_norm": 4.53125, + "learning_rate": 8.406313462879533e-06, + "loss": 0.73519535, + "memory(GiB)": 135.77, + "step": 24310, + "train_speed(iter/s)": 0.202053 + }, + { + "acc": 0.77956157, + "epoch": 0.5674040158065881, + "grad_norm": 4.875, + "learning_rate": 8.404930316333524e-06, + "loss": 0.80541306, + "memory(GiB)": 135.77, + "step": 24320, + "train_speed(iter/s)": 0.202096 + }, + { + "acc": 0.76412115, + "epoch": 0.567637323378877, + "grad_norm": 5.8125, + "learning_rate": 8.4035466837347e-06, + "loss": 0.86554079, + "memory(GiB)": 135.77, + "step": 24330, + "train_speed(iter/s)": 0.202139 + }, + { + "acc": 0.76175427, + "epoch": 0.5678706309511659, + "grad_norm": 4.46875, + "learning_rate": 8.402162565280577e-06, + "loss": 0.92497549, + "memory(GiB)": 135.77, + "step": 24340, + "train_speed(iter/s)": 0.202181 + }, + { + "acc": 0.77174816, + "epoch": 0.5681039385234548, + "grad_norm": 5.34375, + "learning_rate": 8.400777961168736e-06, + "loss": 0.83751183, + "memory(GiB)": 135.77, + "step": 24350, + "train_speed(iter/s)": 0.202224 + }, + { + "acc": 0.76675367, + "epoch": 0.5683372460957435, + "grad_norm": 4.5, + "learning_rate": 8.399392871596828e-06, + "loss": 0.82344303, + "memory(GiB)": 135.77, + "step": 24360, + "train_speed(iter/s)": 0.20227 + }, + { + "acc": 0.79096088, + "epoch": 0.5685705536680324, + "grad_norm": 6.0625, + "learning_rate": 8.398007296762576e-06, + "loss": 0.74808559, + "memory(GiB)": 135.77, + "step": 24370, + "train_speed(iter/s)": 0.202311 + }, + { + "acc": 0.78298712, + "epoch": 0.5688038612403213, + "grad_norm": 5.96875, + "learning_rate": 8.39662123686377e-06, + "loss": 0.7767374, + "memory(GiB)": 135.77, + "step": 24380, + "train_speed(iter/s)": 0.202355 + }, + { + "acc": 0.77403417, + "epoch": 0.5690371688126102, + "grad_norm": 5.65625, + "learning_rate": 8.395234692098267e-06, + "loss": 0.80425043, + "memory(GiB)": 135.77, + "step": 24390, + "train_speed(iter/s)": 0.202399 + }, + { + "acc": 0.76454625, + "epoch": 0.5692704763848991, + "grad_norm": 4.96875, + "learning_rate": 8.393847662663998e-06, + "loss": 0.83601761, + "memory(GiB)": 135.77, + "step": 24400, + "train_speed(iter/s)": 0.20244 + }, + { + "acc": 0.78817453, + "epoch": 0.569503783957188, + "grad_norm": 5.625, + "learning_rate": 8.392460148758962e-06, + "loss": 0.75539885, + "memory(GiB)": 135.77, + "step": 24410, + "train_speed(iter/s)": 0.202484 + }, + { + "acc": 0.77908316, + "epoch": 0.5697370915294769, + "grad_norm": 5.25, + "learning_rate": 8.391072150581228e-06, + "loss": 0.78923807, + "memory(GiB)": 135.77, + "step": 24420, + "train_speed(iter/s)": 0.202523 + }, + { + "acc": 0.77952948, + "epoch": 0.5699703991017658, + "grad_norm": 7.96875, + "learning_rate": 8.389683668328927e-06, + "loss": 0.79888625, + "memory(GiB)": 135.77, + "step": 24430, + "train_speed(iter/s)": 0.202564 + }, + { + "acc": 0.7668036, + "epoch": 0.5702037066740547, + "grad_norm": 5.21875, + "learning_rate": 8.388294702200267e-06, + "loss": 0.83973408, + "memory(GiB)": 135.77, + "step": 24440, + "train_speed(iter/s)": 0.202607 + }, + { + "acc": 0.78203883, + "epoch": 0.5704370142463436, + "grad_norm": 6.28125, + "learning_rate": 8.386905252393522e-06, + "loss": 0.7930686, + "memory(GiB)": 135.77, + "step": 24450, + "train_speed(iter/s)": 0.20265 + }, + { + "acc": 0.78412566, + "epoch": 0.5706703218186325, + "grad_norm": 10.0, + "learning_rate": 8.385515319107038e-06, + "loss": 0.77225199, + "memory(GiB)": 135.77, + "step": 24460, + "train_speed(iter/s)": 0.202691 + }, + { + "acc": 0.76364594, + "epoch": 0.5709036293909214, + "grad_norm": 6.0, + "learning_rate": 8.384124902539225e-06, + "loss": 0.83564758, + "memory(GiB)": 135.77, + "step": 24470, + "train_speed(iter/s)": 0.202734 + }, + { + "acc": 0.76822014, + "epoch": 0.5711369369632103, + "grad_norm": 4.6875, + "learning_rate": 8.382734002888565e-06, + "loss": 0.81515923, + "memory(GiB)": 135.77, + "step": 24480, + "train_speed(iter/s)": 0.202778 + }, + { + "acc": 0.78435955, + "epoch": 0.5713702445354992, + "grad_norm": 6.0625, + "learning_rate": 8.381342620353609e-06, + "loss": 0.7858181, + "memory(GiB)": 135.77, + "step": 24490, + "train_speed(iter/s)": 0.202821 + }, + { + "acc": 0.7721489, + "epoch": 0.5716035521077881, + "grad_norm": 6.1875, + "learning_rate": 8.379950755132975e-06, + "loss": 0.81357956, + "memory(GiB)": 135.77, + "step": 24500, + "train_speed(iter/s)": 0.202865 + }, + { + "epoch": 0.5716035521077881, + "eval_acc": 0.739892296598008, + "eval_loss": 0.8211784958839417, + "eval_runtime": 1271.0226, + "eval_samples_per_second": 28.317, + "eval_steps_per_second": 14.159, + "step": 24500 + }, + { + "acc": 0.79300923, + "epoch": 0.571836859680077, + "grad_norm": 6.03125, + "learning_rate": 8.378558407425355e-06, + "loss": 0.75731974, + "memory(GiB)": 135.77, + "step": 24510, + "train_speed(iter/s)": 0.200768 + }, + { + "acc": 0.7801857, + "epoch": 0.5720701672523659, + "grad_norm": 5.375, + "learning_rate": 8.377165577429502e-06, + "loss": 0.79804745, + "memory(GiB)": 135.77, + "step": 24520, + "train_speed(iter/s)": 0.20081 + }, + { + "acc": 0.78895264, + "epoch": 0.5723034748246548, + "grad_norm": 6.4375, + "learning_rate": 8.375772265344244e-06, + "loss": 0.74042835, + "memory(GiB)": 135.77, + "step": 24530, + "train_speed(iter/s)": 0.200848 + }, + { + "acc": 0.78116617, + "epoch": 0.5725367823969437, + "grad_norm": 6.0625, + "learning_rate": 8.374378471368476e-06, + "loss": 0.78446503, + "memory(GiB)": 135.77, + "step": 24540, + "train_speed(iter/s)": 0.200891 + }, + { + "acc": 0.78846378, + "epoch": 0.5727700899692325, + "grad_norm": 7.96875, + "learning_rate": 8.37298419570116e-06, + "loss": 0.76507263, + "memory(GiB)": 135.77, + "step": 24550, + "train_speed(iter/s)": 0.200932 + }, + { + "acc": 0.75134902, + "epoch": 0.5730033975415214, + "grad_norm": 8.1875, + "learning_rate": 8.371589438541333e-06, + "loss": 0.91664543, + "memory(GiB)": 135.77, + "step": 24560, + "train_speed(iter/s)": 0.200973 + }, + { + "acc": 0.77958789, + "epoch": 0.5732367051138103, + "grad_norm": 5.6875, + "learning_rate": 8.370194200088091e-06, + "loss": 0.78155866, + "memory(GiB)": 135.77, + "step": 24570, + "train_speed(iter/s)": 0.201014 + }, + { + "acc": 0.78482103, + "epoch": 0.5734700126860992, + "grad_norm": 5.375, + "learning_rate": 8.368798480540607e-06, + "loss": 0.76826344, + "memory(GiB)": 135.77, + "step": 24580, + "train_speed(iter/s)": 0.201055 + }, + { + "acc": 0.7931962, + "epoch": 0.5737033202583881, + "grad_norm": 5.125, + "learning_rate": 8.367402280098118e-06, + "loss": 0.71536484, + "memory(GiB)": 135.77, + "step": 24590, + "train_speed(iter/s)": 0.201093 + }, + { + "acc": 0.77020044, + "epoch": 0.573936627830677, + "grad_norm": 6.3125, + "learning_rate": 8.366005598959932e-06, + "loss": 0.83158503, + "memory(GiB)": 135.77, + "step": 24600, + "train_speed(iter/s)": 0.201134 + }, + { + "acc": 0.79495125, + "epoch": 0.5741699354029659, + "grad_norm": 5.09375, + "learning_rate": 8.364608437325426e-06, + "loss": 0.74491563, + "memory(GiB)": 135.77, + "step": 24610, + "train_speed(iter/s)": 0.201177 + }, + { + "acc": 0.75548391, + "epoch": 0.5744032429752548, + "grad_norm": 5.09375, + "learning_rate": 8.363210795394042e-06, + "loss": 0.88298302, + "memory(GiB)": 135.77, + "step": 24620, + "train_speed(iter/s)": 0.201221 + }, + { + "acc": 0.7842412, + "epoch": 0.5746365505475437, + "grad_norm": 6.53125, + "learning_rate": 8.361812673365292e-06, + "loss": 0.76854534, + "memory(GiB)": 135.77, + "step": 24630, + "train_speed(iter/s)": 0.201263 + }, + { + "acc": 0.7659914, + "epoch": 0.5748698581198326, + "grad_norm": 5.125, + "learning_rate": 8.360414071438761e-06, + "loss": 0.84088774, + "memory(GiB)": 135.77, + "step": 24640, + "train_speed(iter/s)": 0.201306 + }, + { + "acc": 0.78436427, + "epoch": 0.5751031656921215, + "grad_norm": 5.0625, + "learning_rate": 8.359014989814099e-06, + "loss": 0.79782057, + "memory(GiB)": 135.77, + "step": 24650, + "train_speed(iter/s)": 0.201352 + }, + { + "acc": 0.76118832, + "epoch": 0.5753364732644104, + "grad_norm": 8.0625, + "learning_rate": 8.35761542869102e-06, + "loss": 0.86918697, + "memory(GiB)": 135.77, + "step": 24660, + "train_speed(iter/s)": 0.201396 + }, + { + "acc": 0.76398478, + "epoch": 0.5755697808366993, + "grad_norm": 6.6875, + "learning_rate": 8.356215388269316e-06, + "loss": 0.84599447, + "memory(GiB)": 135.77, + "step": 24670, + "train_speed(iter/s)": 0.201435 + }, + { + "acc": 0.78095145, + "epoch": 0.5758030884089882, + "grad_norm": 5.3125, + "learning_rate": 8.354814868748839e-06, + "loss": 0.77294083, + "memory(GiB)": 135.77, + "step": 24680, + "train_speed(iter/s)": 0.201476 + }, + { + "acc": 0.76520529, + "epoch": 0.5760363959812771, + "grad_norm": 7.0625, + "learning_rate": 8.353413870329514e-06, + "loss": 0.86367378, + "memory(GiB)": 135.77, + "step": 24690, + "train_speed(iter/s)": 0.201518 + }, + { + "acc": 0.77567444, + "epoch": 0.576269703553566, + "grad_norm": 4.25, + "learning_rate": 8.352012393211336e-06, + "loss": 0.79126658, + "memory(GiB)": 135.77, + "step": 24700, + "train_speed(iter/s)": 0.20156 + }, + { + "acc": 0.7710907, + "epoch": 0.5765030111258549, + "grad_norm": 6.28125, + "learning_rate": 8.35061043759436e-06, + "loss": 0.83803148, + "memory(GiB)": 135.77, + "step": 24710, + "train_speed(iter/s)": 0.201599 + }, + { + "acc": 0.76869202, + "epoch": 0.5767363186981438, + "grad_norm": 5.21875, + "learning_rate": 8.349208003678716e-06, + "loss": 0.83444061, + "memory(GiB)": 135.77, + "step": 24720, + "train_speed(iter/s)": 0.201641 + }, + { + "acc": 0.77576461, + "epoch": 0.5769696262704327, + "grad_norm": 5.375, + "learning_rate": 8.347805091664606e-06, + "loss": 0.82614613, + "memory(GiB)": 135.77, + "step": 24730, + "train_speed(iter/s)": 0.201681 + }, + { + "acc": 0.77615957, + "epoch": 0.5772029338427216, + "grad_norm": 5.46875, + "learning_rate": 8.34640170175229e-06, + "loss": 0.79450846, + "memory(GiB)": 135.77, + "step": 24740, + "train_speed(iter/s)": 0.201723 + }, + { + "acc": 0.77073669, + "epoch": 0.5774362414150104, + "grad_norm": 7.9375, + "learning_rate": 8.344997834142103e-06, + "loss": 0.82609434, + "memory(GiB)": 135.77, + "step": 24750, + "train_speed(iter/s)": 0.201762 + }, + { + "acc": 0.79335299, + "epoch": 0.5776695489872993, + "grad_norm": 5.59375, + "learning_rate": 8.343593489034447e-06, + "loss": 0.74660645, + "memory(GiB)": 135.77, + "step": 24760, + "train_speed(iter/s)": 0.201803 + }, + { + "acc": 0.77887115, + "epoch": 0.5779028565595882, + "grad_norm": 6.34375, + "learning_rate": 8.342188666629793e-06, + "loss": 0.82571802, + "memory(GiB)": 135.77, + "step": 24770, + "train_speed(iter/s)": 0.201846 + }, + { + "acc": 0.78749199, + "epoch": 0.5781361641318771, + "grad_norm": 5.53125, + "learning_rate": 8.340783367128677e-06, + "loss": 0.7465394, + "memory(GiB)": 135.77, + "step": 24780, + "train_speed(iter/s)": 0.20189 + }, + { + "acc": 0.7764246, + "epoch": 0.578369471704166, + "grad_norm": 3.9375, + "learning_rate": 8.339377590731705e-06, + "loss": 0.78514347, + "memory(GiB)": 135.77, + "step": 24790, + "train_speed(iter/s)": 0.201931 + }, + { + "acc": 0.76679988, + "epoch": 0.5786027792764549, + "grad_norm": 6.8125, + "learning_rate": 8.337971337639552e-06, + "loss": 0.84274111, + "memory(GiB)": 135.77, + "step": 24800, + "train_speed(iter/s)": 0.201972 + }, + { + "acc": 0.77209625, + "epoch": 0.5788360868487438, + "grad_norm": 5.65625, + "learning_rate": 8.336564608052961e-06, + "loss": 0.80813961, + "memory(GiB)": 135.77, + "step": 24810, + "train_speed(iter/s)": 0.202017 + }, + { + "acc": 0.77081928, + "epoch": 0.5790693944210327, + "grad_norm": 5.0625, + "learning_rate": 8.335157402172743e-06, + "loss": 0.83020134, + "memory(GiB)": 135.77, + "step": 24820, + "train_speed(iter/s)": 0.20206 + }, + { + "acc": 0.78011541, + "epoch": 0.5793027019933216, + "grad_norm": 5.46875, + "learning_rate": 8.333749720199772e-06, + "loss": 0.80131474, + "memory(GiB)": 135.77, + "step": 24830, + "train_speed(iter/s)": 0.202103 + }, + { + "acc": 0.78027191, + "epoch": 0.5795360095656105, + "grad_norm": 7.375, + "learning_rate": 8.332341562334998e-06, + "loss": 0.78537641, + "memory(GiB)": 135.77, + "step": 24840, + "train_speed(iter/s)": 0.202141 + }, + { + "acc": 0.78687563, + "epoch": 0.5797693171378994, + "grad_norm": 4.90625, + "learning_rate": 8.330932928779434e-06, + "loss": 0.75612831, + "memory(GiB)": 135.77, + "step": 24850, + "train_speed(iter/s)": 0.202183 + }, + { + "acc": 0.78615999, + "epoch": 0.5800026247101883, + "grad_norm": 5.5625, + "learning_rate": 8.329523819734161e-06, + "loss": 0.74080048, + "memory(GiB)": 135.77, + "step": 24860, + "train_speed(iter/s)": 0.202226 + }, + { + "acc": 0.77790847, + "epoch": 0.5802359322824772, + "grad_norm": 7.1875, + "learning_rate": 8.328114235400331e-06, + "loss": 0.79633036, + "memory(GiB)": 135.77, + "step": 24870, + "train_speed(iter/s)": 0.202268 + }, + { + "acc": 0.76988811, + "epoch": 0.5804692398547661, + "grad_norm": 5.625, + "learning_rate": 8.326704175979162e-06, + "loss": 0.8448287, + "memory(GiB)": 135.77, + "step": 24880, + "train_speed(iter/s)": 0.20231 + }, + { + "acc": 0.78928442, + "epoch": 0.580702547427055, + "grad_norm": 4.65625, + "learning_rate": 8.325293641671936e-06, + "loss": 0.76444321, + "memory(GiB)": 135.77, + "step": 24890, + "train_speed(iter/s)": 0.202353 + }, + { + "acc": 0.77171555, + "epoch": 0.5809358549993439, + "grad_norm": 4.65625, + "learning_rate": 8.32388263268001e-06, + "loss": 0.83708153, + "memory(GiB)": 135.77, + "step": 24900, + "train_speed(iter/s)": 0.202393 + }, + { + "acc": 0.76470823, + "epoch": 0.5811691625716328, + "grad_norm": 6.5625, + "learning_rate": 8.322471149204804e-06, + "loss": 0.86168737, + "memory(GiB)": 135.77, + "step": 24910, + "train_speed(iter/s)": 0.202435 + }, + { + "acc": 0.76229396, + "epoch": 0.5814024701439217, + "grad_norm": 6.3125, + "learning_rate": 8.321059191447807e-06, + "loss": 0.8397192, + "memory(GiB)": 135.77, + "step": 24920, + "train_speed(iter/s)": 0.202476 + }, + { + "acc": 0.76328716, + "epoch": 0.5816357777162106, + "grad_norm": 4.65625, + "learning_rate": 8.319646759610573e-06, + "loss": 0.85694113, + "memory(GiB)": 135.77, + "step": 24930, + "train_speed(iter/s)": 0.202516 + }, + { + "acc": 0.76525183, + "epoch": 0.5818690852884993, + "grad_norm": 18.875, + "learning_rate": 8.31823385389473e-06, + "loss": 0.8547142, + "memory(GiB)": 135.77, + "step": 24940, + "train_speed(iter/s)": 0.202558 + }, + { + "acc": 0.79167852, + "epoch": 0.5821023928607882, + "grad_norm": 4.8125, + "learning_rate": 8.316820474501968e-06, + "loss": 0.74602165, + "memory(GiB)": 135.77, + "step": 24950, + "train_speed(iter/s)": 0.202601 + }, + { + "acc": 0.79028225, + "epoch": 0.5823357004330771, + "grad_norm": 14.0, + "learning_rate": 8.315406621634048e-06, + "loss": 0.76107903, + "memory(GiB)": 135.77, + "step": 24960, + "train_speed(iter/s)": 0.202643 + }, + { + "acc": 0.76247883, + "epoch": 0.582569008005366, + "grad_norm": 4.78125, + "learning_rate": 8.313992295492794e-06, + "loss": 0.87208462, + "memory(GiB)": 135.77, + "step": 24970, + "train_speed(iter/s)": 0.202687 + }, + { + "acc": 0.77712317, + "epoch": 0.5828023155776549, + "grad_norm": 4.59375, + "learning_rate": 8.312577496280103e-06, + "loss": 0.79026947, + "memory(GiB)": 135.77, + "step": 24980, + "train_speed(iter/s)": 0.202728 + }, + { + "acc": 0.76572886, + "epoch": 0.5830356231499438, + "grad_norm": 5.15625, + "learning_rate": 8.311162224197938e-06, + "loss": 0.84107924, + "memory(GiB)": 135.77, + "step": 24990, + "train_speed(iter/s)": 0.202769 + }, + { + "acc": 0.77434468, + "epoch": 0.5832689307222327, + "grad_norm": 5.53125, + "learning_rate": 8.309746479448324e-06, + "loss": 0.8128541, + "memory(GiB)": 135.77, + "step": 25000, + "train_speed(iter/s)": 0.202812 + }, + { + "epoch": 0.5832689307222327, + "eval_acc": 0.7402269980435179, + "eval_loss": 0.8210588693618774, + "eval_runtime": 1270.1769, + "eval_samples_per_second": 28.335, + "eval_steps_per_second": 14.168, + "step": 25000 + }, + { + "acc": 0.78035855, + "epoch": 0.5835022382945216, + "grad_norm": 4.78125, + "learning_rate": 8.308330262233366e-06, + "loss": 0.78389635, + "memory(GiB)": 135.77, + "step": 25010, + "train_speed(iter/s)": 0.200755 + }, + { + "acc": 0.77793059, + "epoch": 0.5837355458668105, + "grad_norm": 4.03125, + "learning_rate": 8.306913572755221e-06, + "loss": 0.77305503, + "memory(GiB)": 135.77, + "step": 25020, + "train_speed(iter/s)": 0.200798 + }, + { + "acc": 0.77444048, + "epoch": 0.5839688534390994, + "grad_norm": 12.3125, + "learning_rate": 8.305496411216125e-06, + "loss": 0.82593975, + "memory(GiB)": 135.77, + "step": 25030, + "train_speed(iter/s)": 0.200837 + }, + { + "acc": 0.79068418, + "epoch": 0.5842021610113883, + "grad_norm": 4.96875, + "learning_rate": 8.304078777818377e-06, + "loss": 0.76902728, + "memory(GiB)": 135.77, + "step": 25040, + "train_speed(iter/s)": 0.20088 + }, + { + "acc": 0.78544216, + "epoch": 0.5844354685836772, + "grad_norm": 6.4375, + "learning_rate": 8.302660672764343e-06, + "loss": 0.77539878, + "memory(GiB)": 135.77, + "step": 25050, + "train_speed(iter/s)": 0.200921 + }, + { + "acc": 0.7708941, + "epoch": 0.5846687761559661, + "grad_norm": 6.03125, + "learning_rate": 8.301242096256457e-06, + "loss": 0.82189264, + "memory(GiB)": 135.77, + "step": 25060, + "train_speed(iter/s)": 0.20096 + }, + { + "acc": 0.76837645, + "epoch": 0.584902083728255, + "grad_norm": 5.75, + "learning_rate": 8.299823048497221e-06, + "loss": 0.8290554, + "memory(GiB)": 135.77, + "step": 25070, + "train_speed(iter/s)": 0.201004 + }, + { + "acc": 0.77567482, + "epoch": 0.5851353913005439, + "grad_norm": 5.28125, + "learning_rate": 8.298403529689204e-06, + "loss": 0.81522255, + "memory(GiB)": 135.77, + "step": 25080, + "train_speed(iter/s)": 0.201047 + }, + { + "acc": 0.78811507, + "epoch": 0.5853686988728328, + "grad_norm": 5.15625, + "learning_rate": 8.296983540035041e-06, + "loss": 0.73775659, + "memory(GiB)": 135.77, + "step": 25090, + "train_speed(iter/s)": 0.201087 + }, + { + "acc": 0.74818373, + "epoch": 0.5856020064451217, + "grad_norm": 4.75, + "learning_rate": 8.295563079737436e-06, + "loss": 0.91113939, + "memory(GiB)": 135.77, + "step": 25100, + "train_speed(iter/s)": 0.201129 + }, + { + "acc": 0.7771193, + "epoch": 0.5858353140174106, + "grad_norm": 5.46875, + "learning_rate": 8.294142148999157e-06, + "loss": 0.78849659, + "memory(GiB)": 135.77, + "step": 25110, + "train_speed(iter/s)": 0.201173 + }, + { + "acc": 0.77626028, + "epoch": 0.5860686215896995, + "grad_norm": 4.9375, + "learning_rate": 8.292720748023045e-06, + "loss": 0.78736038, + "memory(GiB)": 135.77, + "step": 25120, + "train_speed(iter/s)": 0.201212 + }, + { + "acc": 0.78426456, + "epoch": 0.5863019291619883, + "grad_norm": 5.71875, + "learning_rate": 8.291298877012002e-06, + "loss": 0.78405647, + "memory(GiB)": 135.77, + "step": 25130, + "train_speed(iter/s)": 0.201251 + }, + { + "acc": 0.77909927, + "epoch": 0.5865352367342772, + "grad_norm": 5.9375, + "learning_rate": 8.289876536169002e-06, + "loss": 0.80689306, + "memory(GiB)": 135.77, + "step": 25140, + "train_speed(iter/s)": 0.201293 + }, + { + "acc": 0.77817984, + "epoch": 0.5867685443065661, + "grad_norm": 5.46875, + "learning_rate": 8.28845372569708e-06, + "loss": 0.80842056, + "memory(GiB)": 135.77, + "step": 25150, + "train_speed(iter/s)": 0.201335 + }, + { + "acc": 0.76585131, + "epoch": 0.587001851878855, + "grad_norm": 5.3125, + "learning_rate": 8.287030445799345e-06, + "loss": 0.84683504, + "memory(GiB)": 135.77, + "step": 25160, + "train_speed(iter/s)": 0.201377 + }, + { + "acc": 0.76378751, + "epoch": 0.5872351594511439, + "grad_norm": 5.9375, + "learning_rate": 8.285606696678969e-06, + "loss": 0.85097218, + "memory(GiB)": 135.77, + "step": 25170, + "train_speed(iter/s)": 0.201419 + }, + { + "acc": 0.76524734, + "epoch": 0.5874684670234328, + "grad_norm": 6.75, + "learning_rate": 8.28418247853919e-06, + "loss": 0.82923918, + "memory(GiB)": 135.77, + "step": 25180, + "train_speed(iter/s)": 0.201457 + }, + { + "acc": 0.7815011, + "epoch": 0.5877017745957217, + "grad_norm": 5.5, + "learning_rate": 8.282757791583316e-06, + "loss": 0.78978734, + "memory(GiB)": 135.77, + "step": 25190, + "train_speed(iter/s)": 0.201499 + }, + { + "acc": 0.75179167, + "epoch": 0.5879350821680106, + "grad_norm": 5.09375, + "learning_rate": 8.281332636014723e-06, + "loss": 0.89616728, + "memory(GiB)": 135.77, + "step": 25200, + "train_speed(iter/s)": 0.201541 + }, + { + "acc": 0.77656918, + "epoch": 0.5881683897402995, + "grad_norm": 5.09375, + "learning_rate": 8.279907012036849e-06, + "loss": 0.80306749, + "memory(GiB)": 135.77, + "step": 25210, + "train_speed(iter/s)": 0.201579 + }, + { + "acc": 0.77237129, + "epoch": 0.5884016973125884, + "grad_norm": 5.5625, + "learning_rate": 8.2784809198532e-06, + "loss": 0.8121336, + "memory(GiB)": 135.77, + "step": 25220, + "train_speed(iter/s)": 0.20162 + }, + { + "acc": 0.7861938, + "epoch": 0.5886350048848773, + "grad_norm": 5.125, + "learning_rate": 8.277054359667355e-06, + "loss": 0.76683536, + "memory(GiB)": 135.77, + "step": 25230, + "train_speed(iter/s)": 0.201662 + }, + { + "acc": 0.75934463, + "epoch": 0.5888683124571662, + "grad_norm": 5.8125, + "learning_rate": 8.27562733168295e-06, + "loss": 0.87756844, + "memory(GiB)": 135.77, + "step": 25240, + "train_speed(iter/s)": 0.201704 + }, + { + "acc": 0.76303396, + "epoch": 0.5891016200294551, + "grad_norm": 5.34375, + "learning_rate": 8.274199836103696e-06, + "loss": 0.84503393, + "memory(GiB)": 135.77, + "step": 25250, + "train_speed(iter/s)": 0.201746 + }, + { + "acc": 0.77054796, + "epoch": 0.589334927601744, + "grad_norm": 11.0, + "learning_rate": 8.272771873133365e-06, + "loss": 0.83606682, + "memory(GiB)": 135.77, + "step": 25260, + "train_speed(iter/s)": 0.201789 + }, + { + "acc": 0.78724136, + "epoch": 0.5895682351740329, + "grad_norm": 5.46875, + "learning_rate": 8.271343442975803e-06, + "loss": 0.76417646, + "memory(GiB)": 135.77, + "step": 25270, + "train_speed(iter/s)": 0.201829 + }, + { + "acc": 0.75213413, + "epoch": 0.5898015427463218, + "grad_norm": 5.375, + "learning_rate": 8.269914545834911e-06, + "loss": 0.90432196, + "memory(GiB)": 135.77, + "step": 25280, + "train_speed(iter/s)": 0.201869 + }, + { + "acc": 0.76026382, + "epoch": 0.5900348503186107, + "grad_norm": 5.34375, + "learning_rate": 8.26848518191467e-06, + "loss": 0.88067017, + "memory(GiB)": 135.77, + "step": 25290, + "train_speed(iter/s)": 0.201909 + }, + { + "acc": 0.77908268, + "epoch": 0.5902681578908996, + "grad_norm": 3.953125, + "learning_rate": 8.267055351419117e-06, + "loss": 0.78281631, + "memory(GiB)": 135.77, + "step": 25300, + "train_speed(iter/s)": 0.201951 + }, + { + "acc": 0.78081703, + "epoch": 0.5905014654631885, + "grad_norm": 5.6875, + "learning_rate": 8.265625054552363e-06, + "loss": 0.7916995, + "memory(GiB)": 135.77, + "step": 25310, + "train_speed(iter/s)": 0.201991 + }, + { + "acc": 0.78177204, + "epoch": 0.5907347730354773, + "grad_norm": 8.9375, + "learning_rate": 8.264194291518583e-06, + "loss": 0.79910288, + "memory(GiB)": 135.77, + "step": 25320, + "train_speed(iter/s)": 0.202033 + }, + { + "acc": 0.75857491, + "epoch": 0.5909680806077662, + "grad_norm": 7.46875, + "learning_rate": 8.262763062522013e-06, + "loss": 0.87688332, + "memory(GiB)": 135.77, + "step": 25330, + "train_speed(iter/s)": 0.202076 + }, + { + "acc": 0.76642208, + "epoch": 0.5912013881800551, + "grad_norm": 6.375, + "learning_rate": 8.261331367766965e-06, + "loss": 0.83190136, + "memory(GiB)": 135.77, + "step": 25340, + "train_speed(iter/s)": 0.202117 + }, + { + "acc": 0.77461624, + "epoch": 0.591434695752344, + "grad_norm": 5.3125, + "learning_rate": 8.25989920745781e-06, + "loss": 0.80886202, + "memory(GiB)": 135.77, + "step": 25350, + "train_speed(iter/s)": 0.202155 + }, + { + "acc": 0.75091953, + "epoch": 0.5916680033246329, + "grad_norm": 5.25, + "learning_rate": 8.258466581798992e-06, + "loss": 0.90273914, + "memory(GiB)": 135.77, + "step": 25360, + "train_speed(iter/s)": 0.202196 + }, + { + "acc": 0.77260246, + "epoch": 0.5919013108969218, + "grad_norm": 5.8125, + "learning_rate": 8.257033490995017e-06, + "loss": 0.81039619, + "memory(GiB)": 135.77, + "step": 25370, + "train_speed(iter/s)": 0.202235 + }, + { + "acc": 0.745508, + "epoch": 0.5921346184692107, + "grad_norm": 5.40625, + "learning_rate": 8.255599935250456e-06, + "loss": 0.90657196, + "memory(GiB)": 135.77, + "step": 25380, + "train_speed(iter/s)": 0.202278 + }, + { + "acc": 0.76827936, + "epoch": 0.5923679260414996, + "grad_norm": 6.0625, + "learning_rate": 8.254165914769949e-06, + "loss": 0.85670547, + "memory(GiB)": 135.77, + "step": 25390, + "train_speed(iter/s)": 0.202321 + }, + { + "acc": 0.76772423, + "epoch": 0.5926012336137885, + "grad_norm": 4.84375, + "learning_rate": 8.252731429758205e-06, + "loss": 0.83695488, + "memory(GiB)": 135.77, + "step": 25400, + "train_speed(iter/s)": 0.202362 + }, + { + "acc": 0.78266163, + "epoch": 0.5928345411860774, + "grad_norm": 5.65625, + "learning_rate": 8.251296480419992e-06, + "loss": 0.7788908, + "memory(GiB)": 135.77, + "step": 25410, + "train_speed(iter/s)": 0.202402 + }, + { + "acc": 0.77551551, + "epoch": 0.5930678487583663, + "grad_norm": 4.5625, + "learning_rate": 8.249861066960154e-06, + "loss": 0.80347424, + "memory(GiB)": 135.77, + "step": 25420, + "train_speed(iter/s)": 0.202442 + }, + { + "acc": 0.75031691, + "epoch": 0.5933011563306552, + "grad_norm": 5.25, + "learning_rate": 8.248425189583589e-06, + "loss": 0.90826054, + "memory(GiB)": 135.77, + "step": 25430, + "train_speed(iter/s)": 0.202484 + }, + { + "acc": 0.78029752, + "epoch": 0.5935344639029441, + "grad_norm": 6.40625, + "learning_rate": 8.246988848495275e-06, + "loss": 0.79855204, + "memory(GiB)": 135.77, + "step": 25440, + "train_speed(iter/s)": 0.202525 + }, + { + "acc": 0.77923479, + "epoch": 0.593767771475233, + "grad_norm": 4.5625, + "learning_rate": 8.245552043900245e-06, + "loss": 0.80320797, + "memory(GiB)": 135.77, + "step": 25450, + "train_speed(iter/s)": 0.202567 + }, + { + "acc": 0.76215806, + "epoch": 0.5940010790475219, + "grad_norm": 4.53125, + "learning_rate": 8.244114776003605e-06, + "loss": 0.88092327, + "memory(GiB)": 135.77, + "step": 25460, + "train_speed(iter/s)": 0.202603 + }, + { + "acc": 0.7797924, + "epoch": 0.5942343866198108, + "grad_norm": 5.6875, + "learning_rate": 8.24267704501052e-06, + "loss": 0.78126807, + "memory(GiB)": 135.77, + "step": 25470, + "train_speed(iter/s)": 0.202644 + }, + { + "acc": 0.78742337, + "epoch": 0.5944676941920997, + "grad_norm": 4.84375, + "learning_rate": 8.241238851126231e-06, + "loss": 0.78035583, + "memory(GiB)": 135.77, + "step": 25480, + "train_speed(iter/s)": 0.202687 + }, + { + "acc": 0.77751136, + "epoch": 0.5947010017643886, + "grad_norm": 7.8125, + "learning_rate": 8.239800194556036e-06, + "loss": 0.80985718, + "memory(GiB)": 135.77, + "step": 25490, + "train_speed(iter/s)": 0.202728 + }, + { + "acc": 0.76632509, + "epoch": 0.5949343093366775, + "grad_norm": 4.8125, + "learning_rate": 8.238361075505307e-06, + "loss": 0.8863204, + "memory(GiB)": 135.77, + "step": 25500, + "train_speed(iter/s)": 0.202766 + }, + { + "epoch": 0.5949343093366775, + "eval_acc": 0.7401671041006372, + "eval_loss": 0.8206676840782166, + "eval_runtime": 1268.9058, + "eval_samples_per_second": 28.364, + "eval_steps_per_second": 14.182, + "step": 25500 + }, + { + "acc": 0.77610083, + "epoch": 0.5951676169089664, + "grad_norm": 6.28125, + "learning_rate": 8.236921494179474e-06, + "loss": 0.79520016, + "memory(GiB)": 135.77, + "step": 25510, + "train_speed(iter/s)": 0.200756 + }, + { + "acc": 0.76967053, + "epoch": 0.5954009244812551, + "grad_norm": 4.5625, + "learning_rate": 8.235481450784037e-06, + "loss": 0.82552261, + "memory(GiB)": 135.77, + "step": 25520, + "train_speed(iter/s)": 0.200796 + }, + { + "acc": 0.78743782, + "epoch": 0.595634232053544, + "grad_norm": 5.78125, + "learning_rate": 8.234040945524563e-06, + "loss": 0.78872375, + "memory(GiB)": 135.77, + "step": 25530, + "train_speed(iter/s)": 0.200836 + }, + { + "acc": 0.77408667, + "epoch": 0.5958675396258329, + "grad_norm": 5.8125, + "learning_rate": 8.232599978606683e-06, + "loss": 0.82129602, + "memory(GiB)": 135.77, + "step": 25540, + "train_speed(iter/s)": 0.200877 + }, + { + "acc": 0.76378164, + "epoch": 0.5961008471981218, + "grad_norm": 7.125, + "learning_rate": 8.231158550236098e-06, + "loss": 0.86470642, + "memory(GiB)": 135.77, + "step": 25550, + "train_speed(iter/s)": 0.200919 + }, + { + "acc": 0.77116547, + "epoch": 0.5963341547704107, + "grad_norm": 4.8125, + "learning_rate": 8.229716660618567e-06, + "loss": 0.80563984, + "memory(GiB)": 135.77, + "step": 25560, + "train_speed(iter/s)": 0.200961 + }, + { + "acc": 0.78916759, + "epoch": 0.5965674623426996, + "grad_norm": 5.0, + "learning_rate": 8.22827430995992e-06, + "loss": 0.74210978, + "memory(GiB)": 135.77, + "step": 25570, + "train_speed(iter/s)": 0.201001 + }, + { + "acc": 0.76487999, + "epoch": 0.5968007699149885, + "grad_norm": 5.75, + "learning_rate": 8.226831498466054e-06, + "loss": 0.86971359, + "memory(GiB)": 135.77, + "step": 25580, + "train_speed(iter/s)": 0.201044 + }, + { + "acc": 0.77532277, + "epoch": 0.5970340774872774, + "grad_norm": 5.53125, + "learning_rate": 8.22538822634293e-06, + "loss": 0.79406919, + "memory(GiB)": 135.77, + "step": 25590, + "train_speed(iter/s)": 0.201083 + }, + { + "acc": 0.78031688, + "epoch": 0.5972673850595663, + "grad_norm": 4.90625, + "learning_rate": 8.223944493796572e-06, + "loss": 0.77325659, + "memory(GiB)": 135.77, + "step": 25600, + "train_speed(iter/s)": 0.201122 + }, + { + "acc": 0.78207927, + "epoch": 0.5975006926318552, + "grad_norm": 12.0625, + "learning_rate": 8.222500301033075e-06, + "loss": 0.78344164, + "memory(GiB)": 135.77, + "step": 25610, + "train_speed(iter/s)": 0.20116 + }, + { + "acc": 0.76561384, + "epoch": 0.5977340002041441, + "grad_norm": 5.1875, + "learning_rate": 8.221055648258596e-06, + "loss": 0.85672531, + "memory(GiB)": 135.77, + "step": 25620, + "train_speed(iter/s)": 0.201199 + }, + { + "acc": 0.76766024, + "epoch": 0.597967307776433, + "grad_norm": 5.0, + "learning_rate": 8.21961053567936e-06, + "loss": 0.82406807, + "memory(GiB)": 135.77, + "step": 25630, + "train_speed(iter/s)": 0.201242 + }, + { + "acc": 0.79372106, + "epoch": 0.5982006153487219, + "grad_norm": 6.25, + "learning_rate": 8.218164963501651e-06, + "loss": 0.71891031, + "memory(GiB)": 135.77, + "step": 25640, + "train_speed(iter/s)": 0.201286 + }, + { + "acc": 0.76460505, + "epoch": 0.5984339229210108, + "grad_norm": 5.375, + "learning_rate": 8.216718931931832e-06, + "loss": 0.83040361, + "memory(GiB)": 135.77, + "step": 25650, + "train_speed(iter/s)": 0.201327 + }, + { + "acc": 0.77256279, + "epoch": 0.5986672304932997, + "grad_norm": 10.75, + "learning_rate": 8.21527244117632e-06, + "loss": 0.82293129, + "memory(GiB)": 135.77, + "step": 25660, + "train_speed(iter/s)": 0.201372 + }, + { + "acc": 0.79064188, + "epoch": 0.5989005380655886, + "grad_norm": 5.90625, + "learning_rate": 8.2138254914416e-06, + "loss": 0.72057476, + "memory(GiB)": 135.77, + "step": 25670, + "train_speed(iter/s)": 0.201413 + }, + { + "acc": 0.77393141, + "epoch": 0.5991338456378775, + "grad_norm": 5.8125, + "learning_rate": 8.212378082934225e-06, + "loss": 0.84098873, + "memory(GiB)": 135.77, + "step": 25680, + "train_speed(iter/s)": 0.201451 + }, + { + "acc": 0.78354292, + "epoch": 0.5993671532101664, + "grad_norm": 4.9375, + "learning_rate": 8.210930215860812e-06, + "loss": 0.77515049, + "memory(GiB)": 135.77, + "step": 25690, + "train_speed(iter/s)": 0.201493 + }, + { + "acc": 0.76722078, + "epoch": 0.5996004607824553, + "grad_norm": 5.03125, + "learning_rate": 8.209481890428044e-06, + "loss": 0.85564814, + "memory(GiB)": 135.77, + "step": 25700, + "train_speed(iter/s)": 0.201533 + }, + { + "acc": 0.78583412, + "epoch": 0.5998337683547441, + "grad_norm": 4.25, + "learning_rate": 8.208033106842668e-06, + "loss": 0.79263468, + "memory(GiB)": 135.77, + "step": 25710, + "train_speed(iter/s)": 0.201569 + }, + { + "acc": 0.77676077, + "epoch": 0.600067075927033, + "grad_norm": 7.96875, + "learning_rate": 8.206583865311497e-06, + "loss": 0.79163513, + "memory(GiB)": 135.77, + "step": 25720, + "train_speed(iter/s)": 0.201609 + }, + { + "acc": 0.78243885, + "epoch": 0.6003003834993219, + "grad_norm": 4.96875, + "learning_rate": 8.205134166041412e-06, + "loss": 0.79570303, + "memory(GiB)": 135.77, + "step": 25730, + "train_speed(iter/s)": 0.201648 + }, + { + "acc": 0.7864254, + "epoch": 0.6005336910716108, + "grad_norm": 5.4375, + "learning_rate": 8.203684009239356e-06, + "loss": 0.76417632, + "memory(GiB)": 135.77, + "step": 25740, + "train_speed(iter/s)": 0.201687 + }, + { + "acc": 0.76960988, + "epoch": 0.6007669986438997, + "grad_norm": 5.75, + "learning_rate": 8.202233395112338e-06, + "loss": 0.82158051, + "memory(GiB)": 135.77, + "step": 25750, + "train_speed(iter/s)": 0.201728 + }, + { + "acc": 0.77994032, + "epoch": 0.6010003062161886, + "grad_norm": 6.03125, + "learning_rate": 8.200782323867432e-06, + "loss": 0.82229548, + "memory(GiB)": 135.77, + "step": 25760, + "train_speed(iter/s)": 0.201767 + }, + { + "acc": 0.77866874, + "epoch": 0.6012336137884775, + "grad_norm": 5.96875, + "learning_rate": 8.19933079571178e-06, + "loss": 0.78451872, + "memory(GiB)": 135.77, + "step": 25770, + "train_speed(iter/s)": 0.201805 + }, + { + "acc": 0.78407946, + "epoch": 0.6014669213607664, + "grad_norm": 4.84375, + "learning_rate": 8.197878810852587e-06, + "loss": 0.76278973, + "memory(GiB)": 135.77, + "step": 25780, + "train_speed(iter/s)": 0.201846 + }, + { + "acc": 0.78200378, + "epoch": 0.6017002289330553, + "grad_norm": 7.875, + "learning_rate": 8.196426369497121e-06, + "loss": 0.78061686, + "memory(GiB)": 135.77, + "step": 25790, + "train_speed(iter/s)": 0.201885 + }, + { + "acc": 0.76984148, + "epoch": 0.6019335365053442, + "grad_norm": 7.0625, + "learning_rate": 8.19497347185272e-06, + "loss": 0.82394571, + "memory(GiB)": 135.77, + "step": 25800, + "train_speed(iter/s)": 0.201928 + }, + { + "acc": 0.74774513, + "epoch": 0.6021668440776331, + "grad_norm": 4.90625, + "learning_rate": 8.193520118126785e-06, + "loss": 0.89157257, + "memory(GiB)": 135.77, + "step": 25810, + "train_speed(iter/s)": 0.201969 + }, + { + "acc": 0.77559814, + "epoch": 0.602400151649922, + "grad_norm": 6.15625, + "learning_rate": 8.19206630852678e-06, + "loss": 0.78932376, + "memory(GiB)": 135.77, + "step": 25820, + "train_speed(iter/s)": 0.202011 + }, + { + "acc": 0.76863947, + "epoch": 0.6026334592222109, + "grad_norm": 5.9375, + "learning_rate": 8.190612043260238e-06, + "loss": 0.83424625, + "memory(GiB)": 135.77, + "step": 25830, + "train_speed(iter/s)": 0.202052 + }, + { + "acc": 0.73870344, + "epoch": 0.6028667667944998, + "grad_norm": 5.15625, + "learning_rate": 8.189157322534753e-06, + "loss": 0.97275181, + "memory(GiB)": 135.77, + "step": 25840, + "train_speed(iter/s)": 0.202089 + }, + { + "acc": 0.77539172, + "epoch": 0.6031000743667887, + "grad_norm": 6.1875, + "learning_rate": 8.187702146557986e-06, + "loss": 0.78618755, + "memory(GiB)": 135.77, + "step": 25850, + "train_speed(iter/s)": 0.20213 + }, + { + "acc": 0.76942921, + "epoch": 0.6033333819390776, + "grad_norm": 4.21875, + "learning_rate": 8.186246515537664e-06, + "loss": 0.83434811, + "memory(GiB)": 135.77, + "step": 25860, + "train_speed(iter/s)": 0.202171 + }, + { + "acc": 0.78348265, + "epoch": 0.6035666895113665, + "grad_norm": 6.8125, + "learning_rate": 8.184790429681577e-06, + "loss": 0.77311735, + "memory(GiB)": 135.77, + "step": 25870, + "train_speed(iter/s)": 0.202209 + }, + { + "acc": 0.76982718, + "epoch": 0.6037999970836554, + "grad_norm": 5.125, + "learning_rate": 8.183333889197582e-06, + "loss": 0.84020672, + "memory(GiB)": 135.77, + "step": 25880, + "train_speed(iter/s)": 0.202249 + }, + { + "acc": 0.77111564, + "epoch": 0.6040333046559443, + "grad_norm": 5.75, + "learning_rate": 8.181876894293601e-06, + "loss": 0.80620203, + "memory(GiB)": 135.77, + "step": 25890, + "train_speed(iter/s)": 0.202291 + }, + { + "acc": 0.77545085, + "epoch": 0.6042666122282331, + "grad_norm": 5.8125, + "learning_rate": 8.180419445177614e-06, + "loss": 0.81389084, + "memory(GiB)": 135.77, + "step": 25900, + "train_speed(iter/s)": 0.202331 + }, + { + "acc": 0.77913561, + "epoch": 0.604499919800522, + "grad_norm": 5.875, + "learning_rate": 8.178961542057677e-06, + "loss": 0.79586811, + "memory(GiB)": 135.77, + "step": 25910, + "train_speed(iter/s)": 0.20237 + }, + { + "acc": 0.78647404, + "epoch": 0.6047332273728109, + "grad_norm": 6.15625, + "learning_rate": 8.177503185141904e-06, + "loss": 0.77673407, + "memory(GiB)": 135.77, + "step": 25920, + "train_speed(iter/s)": 0.202411 + }, + { + "acc": 0.78315163, + "epoch": 0.6049665349450998, + "grad_norm": 11.3125, + "learning_rate": 8.176044374638473e-06, + "loss": 0.767239, + "memory(GiB)": 135.77, + "step": 25930, + "train_speed(iter/s)": 0.202451 + }, + { + "acc": 0.76807127, + "epoch": 0.6051998425173887, + "grad_norm": 5.6875, + "learning_rate": 8.174585110755631e-06, + "loss": 0.86293449, + "memory(GiB)": 135.77, + "step": 25940, + "train_speed(iter/s)": 0.202491 + }, + { + "acc": 0.78088841, + "epoch": 0.6054331500896776, + "grad_norm": 4.5, + "learning_rate": 8.173125393701686e-06, + "loss": 0.7886466, + "memory(GiB)": 135.77, + "step": 25950, + "train_speed(iter/s)": 0.202533 + }, + { + "acc": 0.77306123, + "epoch": 0.6056664576619665, + "grad_norm": 7.28125, + "learning_rate": 8.171665223685014e-06, + "loss": 0.81952877, + "memory(GiB)": 135.77, + "step": 25960, + "train_speed(iter/s)": 0.202574 + }, + { + "acc": 0.78507872, + "epoch": 0.6058997652342554, + "grad_norm": 4.78125, + "learning_rate": 8.170204600914051e-06, + "loss": 0.76790247, + "memory(GiB)": 135.77, + "step": 25970, + "train_speed(iter/s)": 0.202612 + }, + { + "acc": 0.76558475, + "epoch": 0.6061330728065443, + "grad_norm": 6.09375, + "learning_rate": 8.168743525597304e-06, + "loss": 0.86151485, + "memory(GiB)": 135.77, + "step": 25980, + "train_speed(iter/s)": 0.20265 + }, + { + "acc": 0.75887089, + "epoch": 0.6063663803788332, + "grad_norm": 6.46875, + "learning_rate": 8.167281997943338e-06, + "loss": 0.87729855, + "memory(GiB)": 135.77, + "step": 25990, + "train_speed(iter/s)": 0.202691 + }, + { + "acc": 0.7482584, + "epoch": 0.6065996879511221, + "grad_norm": 7.3125, + "learning_rate": 8.165820018160787e-06, + "loss": 0.91938248, + "memory(GiB)": 135.77, + "step": 26000, + "train_speed(iter/s)": 0.202728 + }, + { + "epoch": 0.6065996879511221, + "eval_acc": 0.7404893142960276, + "eval_loss": 0.8198909163475037, + "eval_runtime": 1269.2964, + "eval_samples_per_second": 28.355, + "eval_steps_per_second": 14.178, + "step": 26000 + }, + { + "acc": 0.78726702, + "epoch": 0.606832995523411, + "grad_norm": 6.21875, + "learning_rate": 8.164357586458348e-06, + "loss": 0.75815754, + "memory(GiB)": 135.77, + "step": 26010, + "train_speed(iter/s)": 0.200755 + }, + { + "acc": 0.7841042, + "epoch": 0.6070663030956999, + "grad_norm": 20.0, + "learning_rate": 8.162894703044783e-06, + "loss": 0.7632184, + "memory(GiB)": 135.77, + "step": 26020, + "train_speed(iter/s)": 0.200792 + }, + { + "acc": 0.75316358, + "epoch": 0.6072996106679888, + "grad_norm": 4.625, + "learning_rate": 8.161431368128919e-06, + "loss": 0.89499702, + "memory(GiB)": 135.77, + "step": 26030, + "train_speed(iter/s)": 0.200832 + }, + { + "acc": 0.78723059, + "epoch": 0.6075329182402777, + "grad_norm": 6.0625, + "learning_rate": 8.159967581919644e-06, + "loss": 0.74924011, + "memory(GiB)": 135.77, + "step": 26040, + "train_speed(iter/s)": 0.200871 + }, + { + "acc": 0.79428697, + "epoch": 0.6077662258125666, + "grad_norm": 5.90625, + "learning_rate": 8.158503344625915e-06, + "loss": 0.73409123, + "memory(GiB)": 135.77, + "step": 26050, + "train_speed(iter/s)": 0.200911 + }, + { + "acc": 0.76840878, + "epoch": 0.6079995333848555, + "grad_norm": 5.53125, + "learning_rate": 8.157038656456752e-06, + "loss": 0.83939495, + "memory(GiB)": 135.77, + "step": 26060, + "train_speed(iter/s)": 0.200951 + }, + { + "acc": 0.77537632, + "epoch": 0.6082328409571444, + "grad_norm": 4.09375, + "learning_rate": 8.155573517621238e-06, + "loss": 0.81753826, + "memory(GiB)": 135.77, + "step": 26070, + "train_speed(iter/s)": 0.20099 + }, + { + "acc": 0.7842936, + "epoch": 0.6084661485294333, + "grad_norm": 6.59375, + "learning_rate": 8.154107928328521e-06, + "loss": 0.79942732, + "memory(GiB)": 135.77, + "step": 26080, + "train_speed(iter/s)": 0.201029 + }, + { + "acc": 0.77272925, + "epoch": 0.608699456101722, + "grad_norm": 6.71875, + "learning_rate": 8.152641888787812e-06, + "loss": 0.80734158, + "memory(GiB)": 135.77, + "step": 26090, + "train_speed(iter/s)": 0.201068 + }, + { + "acc": 0.76670575, + "epoch": 0.608932763674011, + "grad_norm": 7.75, + "learning_rate": 8.15117539920839e-06, + "loss": 0.85535202, + "memory(GiB)": 135.77, + "step": 26100, + "train_speed(iter/s)": 0.201109 + }, + { + "acc": 0.75531483, + "epoch": 0.6091660712462998, + "grad_norm": 6.09375, + "learning_rate": 8.149708459799595e-06, + "loss": 0.8802372, + "memory(GiB)": 135.77, + "step": 26110, + "train_speed(iter/s)": 0.20115 + }, + { + "acc": 0.76685176, + "epoch": 0.6093993788185887, + "grad_norm": 6.0, + "learning_rate": 8.148241070770834e-06, + "loss": 0.83814058, + "memory(GiB)": 135.77, + "step": 26120, + "train_speed(iter/s)": 0.20119 + }, + { + "acc": 0.7886631, + "epoch": 0.6096326863908776, + "grad_norm": 9.1875, + "learning_rate": 8.146773232331574e-06, + "loss": 0.77238207, + "memory(GiB)": 135.77, + "step": 26130, + "train_speed(iter/s)": 0.201229 + }, + { + "acc": 0.75037575, + "epoch": 0.6098659939631665, + "grad_norm": 5.65625, + "learning_rate": 8.145304944691347e-06, + "loss": 0.90184555, + "memory(GiB)": 135.77, + "step": 26140, + "train_speed(iter/s)": 0.201269 + }, + { + "acc": 0.7713768, + "epoch": 0.6100993015354554, + "grad_norm": 7.84375, + "learning_rate": 8.143836208059754e-06, + "loss": 0.80705109, + "memory(GiB)": 135.77, + "step": 26150, + "train_speed(iter/s)": 0.20131 + }, + { + "acc": 0.77182598, + "epoch": 0.6103326091077443, + "grad_norm": 4.78125, + "learning_rate": 8.142367022646457e-06, + "loss": 0.83266945, + "memory(GiB)": 135.77, + "step": 26160, + "train_speed(iter/s)": 0.201352 + }, + { + "acc": 0.77788215, + "epoch": 0.6105659166800332, + "grad_norm": 5.8125, + "learning_rate": 8.14089738866118e-06, + "loss": 0.80510206, + "memory(GiB)": 135.77, + "step": 26170, + "train_speed(iter/s)": 0.201388 + }, + { + "acc": 0.77358456, + "epoch": 0.6107992242523221, + "grad_norm": 4.59375, + "learning_rate": 8.139427306313713e-06, + "loss": 0.81454687, + "memory(GiB)": 135.77, + "step": 26180, + "train_speed(iter/s)": 0.201426 + }, + { + "acc": 0.78963194, + "epoch": 0.611032531824611, + "grad_norm": 7.6875, + "learning_rate": 8.137956775813909e-06, + "loss": 0.75747194, + "memory(GiB)": 135.77, + "step": 26190, + "train_speed(iter/s)": 0.201467 + }, + { + "acc": 0.76336627, + "epoch": 0.6112658393968999, + "grad_norm": 4.6875, + "learning_rate": 8.136485797371687e-06, + "loss": 0.84080048, + "memory(GiB)": 135.77, + "step": 26200, + "train_speed(iter/s)": 0.201506 + }, + { + "acc": 0.772651, + "epoch": 0.6114991469691888, + "grad_norm": 4.78125, + "learning_rate": 8.13501437119703e-06, + "loss": 0.81818466, + "memory(GiB)": 135.77, + "step": 26210, + "train_speed(iter/s)": 0.201547 + }, + { + "acc": 0.75983801, + "epoch": 0.6117324545414777, + "grad_norm": 7.03125, + "learning_rate": 8.133542497499981e-06, + "loss": 0.85904999, + "memory(GiB)": 135.77, + "step": 26220, + "train_speed(iter/s)": 0.201587 + }, + { + "acc": 0.77595029, + "epoch": 0.6119657621137666, + "grad_norm": 6.8125, + "learning_rate": 8.132070176490652e-06, + "loss": 0.79862223, + "memory(GiB)": 135.77, + "step": 26230, + "train_speed(iter/s)": 0.201625 + }, + { + "acc": 0.76928434, + "epoch": 0.6121990696860555, + "grad_norm": 6.28125, + "learning_rate": 8.130597408379214e-06, + "loss": 0.80921583, + "memory(GiB)": 135.77, + "step": 26240, + "train_speed(iter/s)": 0.201663 + }, + { + "acc": 0.76527472, + "epoch": 0.6124323772583444, + "grad_norm": 4.28125, + "learning_rate": 8.129124193375906e-06, + "loss": 0.83155708, + "memory(GiB)": 135.77, + "step": 26250, + "train_speed(iter/s)": 0.201702 + }, + { + "acc": 0.79154787, + "epoch": 0.6126656848306333, + "grad_norm": 5.53125, + "learning_rate": 8.127650531691028e-06, + "loss": 0.74951324, + "memory(GiB)": 135.77, + "step": 26260, + "train_speed(iter/s)": 0.201743 + }, + { + "acc": 0.78243771, + "epoch": 0.6128989924029222, + "grad_norm": 5.8125, + "learning_rate": 8.126176423534945e-06, + "loss": 0.76880646, + "memory(GiB)": 135.77, + "step": 26270, + "train_speed(iter/s)": 0.201785 + }, + { + "acc": 0.77010064, + "epoch": 0.6131322999752111, + "grad_norm": 5.34375, + "learning_rate": 8.124701869118086e-06, + "loss": 0.84182453, + "memory(GiB)": 135.77, + "step": 26280, + "train_speed(iter/s)": 0.201823 + }, + { + "acc": 0.7652318, + "epoch": 0.6133656075474999, + "grad_norm": 4.8125, + "learning_rate": 8.123226868650944e-06, + "loss": 0.8603693, + "memory(GiB)": 135.77, + "step": 26290, + "train_speed(iter/s)": 0.201861 + }, + { + "acc": 0.77005825, + "epoch": 0.6135989151197888, + "grad_norm": 8.875, + "learning_rate": 8.121751422344072e-06, + "loss": 0.81795826, + "memory(GiB)": 135.77, + "step": 26300, + "train_speed(iter/s)": 0.201901 + }, + { + "acc": 0.78708076, + "epoch": 0.6138322226920777, + "grad_norm": 6.1875, + "learning_rate": 8.120275530408092e-06, + "loss": 0.74994001, + "memory(GiB)": 135.77, + "step": 26310, + "train_speed(iter/s)": 0.201942 + }, + { + "acc": 0.78978472, + "epoch": 0.6140655302643666, + "grad_norm": 6.03125, + "learning_rate": 8.118799193053686e-06, + "loss": 0.74331207, + "memory(GiB)": 135.77, + "step": 26320, + "train_speed(iter/s)": 0.201979 + }, + { + "acc": 0.75865965, + "epoch": 0.6142988378366555, + "grad_norm": 5.6875, + "learning_rate": 8.117322410491602e-06, + "loss": 0.86435623, + "memory(GiB)": 135.77, + "step": 26330, + "train_speed(iter/s)": 0.202018 + }, + { + "acc": 0.77246876, + "epoch": 0.6145321454089444, + "grad_norm": 6.625, + "learning_rate": 8.11584518293265e-06, + "loss": 0.81503639, + "memory(GiB)": 135.77, + "step": 26340, + "train_speed(iter/s)": 0.202058 + }, + { + "acc": 0.76987753, + "epoch": 0.6147654529812333, + "grad_norm": 5.15625, + "learning_rate": 8.114367510587701e-06, + "loss": 0.82482548, + "memory(GiB)": 135.77, + "step": 26350, + "train_speed(iter/s)": 0.202097 + }, + { + "acc": 0.76622534, + "epoch": 0.6149987605535222, + "grad_norm": 6.28125, + "learning_rate": 8.112889393667698e-06, + "loss": 0.83077908, + "memory(GiB)": 135.77, + "step": 26360, + "train_speed(iter/s)": 0.202137 + }, + { + "acc": 0.77782397, + "epoch": 0.6152320681258111, + "grad_norm": 4.25, + "learning_rate": 8.111410832383635e-06, + "loss": 0.77891083, + "memory(GiB)": 135.77, + "step": 26370, + "train_speed(iter/s)": 0.202173 + }, + { + "acc": 0.79491034, + "epoch": 0.6154653756981, + "grad_norm": 4.21875, + "learning_rate": 8.109931826946582e-06, + "loss": 0.73813534, + "memory(GiB)": 135.77, + "step": 26380, + "train_speed(iter/s)": 0.202211 + }, + { + "acc": 0.78529496, + "epoch": 0.6156986832703889, + "grad_norm": 4.6875, + "learning_rate": 8.108452377567663e-06, + "loss": 0.77241392, + "memory(GiB)": 135.77, + "step": 26390, + "train_speed(iter/s)": 0.202246 + }, + { + "acc": 0.76942668, + "epoch": 0.6159319908426778, + "grad_norm": 6.875, + "learning_rate": 8.10697248445807e-06, + "loss": 0.82725449, + "memory(GiB)": 135.77, + "step": 26400, + "train_speed(iter/s)": 0.202286 + }, + { + "acc": 0.77350941, + "epoch": 0.6161652984149667, + "grad_norm": 4.5, + "learning_rate": 8.105492147829059e-06, + "loss": 0.81450577, + "memory(GiB)": 135.77, + "step": 26410, + "train_speed(iter/s)": 0.202328 + }, + { + "acc": 0.7737443, + "epoch": 0.6163986059872556, + "grad_norm": 5.6875, + "learning_rate": 8.104011367891944e-06, + "loss": 0.81999092, + "memory(GiB)": 135.77, + "step": 26420, + "train_speed(iter/s)": 0.202367 + }, + { + "acc": 0.78863039, + "epoch": 0.6166319135595445, + "grad_norm": 4.96875, + "learning_rate": 8.102530144858109e-06, + "loss": 0.77086744, + "memory(GiB)": 135.77, + "step": 26430, + "train_speed(iter/s)": 0.202406 + }, + { + "acc": 0.77370825, + "epoch": 0.6168652211318334, + "grad_norm": 4.46875, + "learning_rate": 8.101048478938997e-06, + "loss": 0.78342042, + "memory(GiB)": 135.77, + "step": 26440, + "train_speed(iter/s)": 0.202446 + }, + { + "acc": 0.78318248, + "epoch": 0.6170985287041223, + "grad_norm": 4.65625, + "learning_rate": 8.099566370346115e-06, + "loss": 0.78425016, + "memory(GiB)": 135.77, + "step": 26450, + "train_speed(iter/s)": 0.202487 + }, + { + "acc": 0.78459816, + "epoch": 0.6173318362764112, + "grad_norm": 5.4375, + "learning_rate": 8.098083819291034e-06, + "loss": 0.78062539, + "memory(GiB)": 135.77, + "step": 26460, + "train_speed(iter/s)": 0.202527 + }, + { + "acc": 0.77326651, + "epoch": 0.6175651438487001, + "grad_norm": 4.875, + "learning_rate": 8.096600825985388e-06, + "loss": 0.79871411, + "memory(GiB)": 135.77, + "step": 26470, + "train_speed(iter/s)": 0.202567 + }, + { + "acc": 0.75785589, + "epoch": 0.6177984514209889, + "grad_norm": 4.96875, + "learning_rate": 8.095117390640875e-06, + "loss": 0.87607994, + "memory(GiB)": 135.77, + "step": 26480, + "train_speed(iter/s)": 0.202605 + }, + { + "acc": 0.80151081, + "epoch": 0.6180317589932778, + "grad_norm": 7.53125, + "learning_rate": 8.093633513469252e-06, + "loss": 0.70373769, + "memory(GiB)": 135.77, + "step": 26490, + "train_speed(iter/s)": 0.202645 + }, + { + "acc": 0.78222547, + "epoch": 0.6182650665655667, + "grad_norm": 6.90625, + "learning_rate": 8.092149194682343e-06, + "loss": 0.78799133, + "memory(GiB)": 135.77, + "step": 26500, + "train_speed(iter/s)": 0.202684 + }, + { + "epoch": 0.6182650665655667, + "eval_acc": 0.7407008648268977, + "eval_loss": 0.8189985156059265, + "eval_runtime": 1271.9101, + "eval_samples_per_second": 28.297, + "eval_steps_per_second": 14.149, + "step": 26500 + }, + { + "acc": 0.77204461, + "epoch": 0.6184983741378556, + "grad_norm": 9.25, + "learning_rate": 8.090664434492037e-06, + "loss": 0.81106329, + "memory(GiB)": 135.77, + "step": 26510, + "train_speed(iter/s)": 0.200744 + }, + { + "acc": 0.79130363, + "epoch": 0.6187316817101445, + "grad_norm": 4.65625, + "learning_rate": 8.08917923311028e-06, + "loss": 0.7358408, + "memory(GiB)": 135.77, + "step": 26520, + "train_speed(iter/s)": 0.200781 + }, + { + "acc": 0.78020353, + "epoch": 0.6189649892824334, + "grad_norm": 6.3125, + "learning_rate": 8.087693590749083e-06, + "loss": 0.80418053, + "memory(GiB)": 135.77, + "step": 26530, + "train_speed(iter/s)": 0.200819 + }, + { + "acc": 0.79271488, + "epoch": 0.6191982968547223, + "grad_norm": 17.625, + "learning_rate": 8.086207507620524e-06, + "loss": 0.71984701, + "memory(GiB)": 135.77, + "step": 26540, + "train_speed(iter/s)": 0.200856 + }, + { + "acc": 0.7886724, + "epoch": 0.6194316044270112, + "grad_norm": 6.34375, + "learning_rate": 8.084720983936742e-06, + "loss": 0.75640163, + "memory(GiB)": 135.77, + "step": 26550, + "train_speed(iter/s)": 0.200895 + }, + { + "acc": 0.776264, + "epoch": 0.6196649119993001, + "grad_norm": 5.84375, + "learning_rate": 8.083234019909933e-06, + "loss": 0.79287157, + "memory(GiB)": 135.77, + "step": 26560, + "train_speed(iter/s)": 0.200932 + }, + { + "acc": 0.77901525, + "epoch": 0.619898219571589, + "grad_norm": 7.875, + "learning_rate": 8.081746615752365e-06, + "loss": 0.80261803, + "memory(GiB)": 135.77, + "step": 26570, + "train_speed(iter/s)": 0.20097 + }, + { + "acc": 0.77423306, + "epoch": 0.6201315271438779, + "grad_norm": 13.75, + "learning_rate": 8.080258771676363e-06, + "loss": 0.78158264, + "memory(GiB)": 135.77, + "step": 26580, + "train_speed(iter/s)": 0.20101 + }, + { + "acc": 0.79111242, + "epoch": 0.6203648347161668, + "grad_norm": 5.25, + "learning_rate": 8.078770487894314e-06, + "loss": 0.75197382, + "memory(GiB)": 135.77, + "step": 26590, + "train_speed(iter/s)": 0.201049 + }, + { + "acc": 0.76238708, + "epoch": 0.6205981422884557, + "grad_norm": 7.65625, + "learning_rate": 8.077281764618674e-06, + "loss": 0.85257568, + "memory(GiB)": 135.77, + "step": 26600, + "train_speed(iter/s)": 0.201088 + }, + { + "acc": 0.77836542, + "epoch": 0.6208314498607446, + "grad_norm": 4.625, + "learning_rate": 8.075792602061955e-06, + "loss": 0.79014702, + "memory(GiB)": 135.77, + "step": 26610, + "train_speed(iter/s)": 0.201128 + }, + { + "acc": 0.77676468, + "epoch": 0.6210647574330335, + "grad_norm": 4.90625, + "learning_rate": 8.074303000436737e-06, + "loss": 0.79348154, + "memory(GiB)": 135.77, + "step": 26620, + "train_speed(iter/s)": 0.201168 + }, + { + "acc": 0.7470377, + "epoch": 0.6212980650053224, + "grad_norm": 6.5625, + "learning_rate": 8.072812959955657e-06, + "loss": 0.92623911, + "memory(GiB)": 135.77, + "step": 26630, + "train_speed(iter/s)": 0.201208 + }, + { + "acc": 0.78787136, + "epoch": 0.6215313725776113, + "grad_norm": 5.0625, + "learning_rate": 8.071322480831422e-06, + "loss": 0.77648373, + "memory(GiB)": 135.77, + "step": 26640, + "train_speed(iter/s)": 0.201247 + }, + { + "acc": 0.7703825, + "epoch": 0.6217646801499002, + "grad_norm": 4.25, + "learning_rate": 8.069831563276793e-06, + "loss": 0.82653189, + "memory(GiB)": 135.77, + "step": 26650, + "train_speed(iter/s)": 0.201288 + }, + { + "acc": 0.76768293, + "epoch": 0.6219979877221891, + "grad_norm": 4.9375, + "learning_rate": 8.068340207504601e-06, + "loss": 0.84485979, + "memory(GiB)": 135.77, + "step": 26660, + "train_speed(iter/s)": 0.201326 + }, + { + "acc": 0.78323984, + "epoch": 0.6222312952944778, + "grad_norm": 5.9375, + "learning_rate": 8.066848413727736e-06, + "loss": 0.77385492, + "memory(GiB)": 135.77, + "step": 26670, + "train_speed(iter/s)": 0.201365 + }, + { + "acc": 0.77316294, + "epoch": 0.6224646028667667, + "grad_norm": 15.5, + "learning_rate": 8.06535618215915e-06, + "loss": 0.83223066, + "memory(GiB)": 135.77, + "step": 26680, + "train_speed(iter/s)": 0.201403 + }, + { + "acc": 0.77781687, + "epoch": 0.6226979104390556, + "grad_norm": 6.1875, + "learning_rate": 8.06386351301186e-06, + "loss": 0.80505581, + "memory(GiB)": 135.77, + "step": 26690, + "train_speed(iter/s)": 0.201442 + }, + { + "acc": 0.78201003, + "epoch": 0.6229312180113445, + "grad_norm": 4.96875, + "learning_rate": 8.062370406498944e-06, + "loss": 0.77910857, + "memory(GiB)": 135.77, + "step": 26700, + "train_speed(iter/s)": 0.20148 + }, + { + "acc": 0.75401936, + "epoch": 0.6231645255836334, + "grad_norm": 4.1875, + "learning_rate": 8.060876862833543e-06, + "loss": 0.88524904, + "memory(GiB)": 135.77, + "step": 26710, + "train_speed(iter/s)": 0.201519 + }, + { + "acc": 0.77211714, + "epoch": 0.6233978331559223, + "grad_norm": 4.5625, + "learning_rate": 8.059382882228857e-06, + "loss": 0.81682949, + "memory(GiB)": 135.77, + "step": 26720, + "train_speed(iter/s)": 0.201559 + }, + { + "acc": 0.77653065, + "epoch": 0.6236311407282112, + "grad_norm": 3.890625, + "learning_rate": 8.057888464898153e-06, + "loss": 0.81049767, + "memory(GiB)": 135.77, + "step": 26730, + "train_speed(iter/s)": 0.201597 + }, + { + "acc": 0.76580505, + "epoch": 0.6238644483005001, + "grad_norm": 6.96875, + "learning_rate": 8.056393611054761e-06, + "loss": 0.84048862, + "memory(GiB)": 135.77, + "step": 26740, + "train_speed(iter/s)": 0.201636 + }, + { + "acc": 0.77059517, + "epoch": 0.624097755872789, + "grad_norm": 6.0, + "learning_rate": 8.054898320912069e-06, + "loss": 0.84382706, + "memory(GiB)": 135.77, + "step": 26750, + "train_speed(iter/s)": 0.201676 + }, + { + "acc": 0.78693314, + "epoch": 0.6243310634450779, + "grad_norm": 7.25, + "learning_rate": 8.053402594683527e-06, + "loss": 0.76821909, + "memory(GiB)": 135.77, + "step": 26760, + "train_speed(iter/s)": 0.201715 + }, + { + "acc": 0.78143969, + "epoch": 0.6245643710173668, + "grad_norm": 6.21875, + "learning_rate": 8.051906432582651e-06, + "loss": 0.77775021, + "memory(GiB)": 135.77, + "step": 26770, + "train_speed(iter/s)": 0.201755 + }, + { + "acc": 0.78252888, + "epoch": 0.6247976785896557, + "grad_norm": 4.625, + "learning_rate": 8.050409834823021e-06, + "loss": 0.77935286, + "memory(GiB)": 135.77, + "step": 26780, + "train_speed(iter/s)": 0.201791 + }, + { + "acc": 0.78484788, + "epoch": 0.6250309861619446, + "grad_norm": 5.59375, + "learning_rate": 8.04891280161827e-06, + "loss": 0.7744689, + "memory(GiB)": 135.77, + "step": 26790, + "train_speed(iter/s)": 0.201828 + }, + { + "acc": 0.7690753, + "epoch": 0.6252642937342335, + "grad_norm": 4.84375, + "learning_rate": 8.047415333182105e-06, + "loss": 0.82785397, + "memory(GiB)": 135.77, + "step": 26800, + "train_speed(iter/s)": 0.201867 + }, + { + "acc": 0.78505082, + "epoch": 0.6254976013065224, + "grad_norm": 4.40625, + "learning_rate": 8.045917429728286e-06, + "loss": 0.79035482, + "memory(GiB)": 135.77, + "step": 26810, + "train_speed(iter/s)": 0.201908 + }, + { + "acc": 0.77882872, + "epoch": 0.6257309088788113, + "grad_norm": 5.0625, + "learning_rate": 8.044419091470638e-06, + "loss": 0.79798956, + "memory(GiB)": 135.77, + "step": 26820, + "train_speed(iter/s)": 0.201949 + }, + { + "acc": 0.80624542, + "epoch": 0.6259642164511002, + "grad_norm": 4.375, + "learning_rate": 8.042920318623051e-06, + "loss": 0.67850709, + "memory(GiB)": 135.77, + "step": 26830, + "train_speed(iter/s)": 0.201989 + }, + { + "acc": 0.77503967, + "epoch": 0.6261975240233891, + "grad_norm": 5.375, + "learning_rate": 8.04142111139947e-06, + "loss": 0.8061245, + "memory(GiB)": 135.77, + "step": 26840, + "train_speed(iter/s)": 0.202027 + }, + { + "acc": 0.76675572, + "epoch": 0.626430831595678, + "grad_norm": 4.75, + "learning_rate": 8.039921470013912e-06, + "loss": 0.85774889, + "memory(GiB)": 135.77, + "step": 26850, + "train_speed(iter/s)": 0.202064 + }, + { + "acc": 0.76741781, + "epoch": 0.6266641391679668, + "grad_norm": 9.8125, + "learning_rate": 8.038421394680445e-06, + "loss": 0.82042427, + "memory(GiB)": 135.77, + "step": 26860, + "train_speed(iter/s)": 0.202104 + }, + { + "acc": 0.78301311, + "epoch": 0.6268974467402557, + "grad_norm": 5.0, + "learning_rate": 8.036920885613206e-06, + "loss": 0.77580643, + "memory(GiB)": 135.77, + "step": 26870, + "train_speed(iter/s)": 0.202141 + }, + { + "acc": 0.78032942, + "epoch": 0.6271307543125446, + "grad_norm": 3.984375, + "learning_rate": 8.035419943026395e-06, + "loss": 0.7978806, + "memory(GiB)": 135.77, + "step": 26880, + "train_speed(iter/s)": 0.202178 + }, + { + "acc": 0.77603312, + "epoch": 0.6273640618848335, + "grad_norm": 4.28125, + "learning_rate": 8.033918567134266e-06, + "loss": 0.81758986, + "memory(GiB)": 135.77, + "step": 26890, + "train_speed(iter/s)": 0.202213 + }, + { + "acc": 0.76494641, + "epoch": 0.6275973694571224, + "grad_norm": 4.21875, + "learning_rate": 8.032416758151144e-06, + "loss": 0.83805542, + "memory(GiB)": 135.77, + "step": 26900, + "train_speed(iter/s)": 0.202252 + }, + { + "acc": 0.78844762, + "epoch": 0.6278306770294113, + "grad_norm": 6.625, + "learning_rate": 8.030914516291413e-06, + "loss": 0.73256688, + "memory(GiB)": 135.77, + "step": 26910, + "train_speed(iter/s)": 0.202292 + }, + { + "acc": 0.78790379, + "epoch": 0.6280639846017002, + "grad_norm": 4.84375, + "learning_rate": 8.029411841769515e-06, + "loss": 0.77325296, + "memory(GiB)": 135.77, + "step": 26920, + "train_speed(iter/s)": 0.20233 + }, + { + "acc": 0.77131805, + "epoch": 0.6282972921739891, + "grad_norm": 5.3125, + "learning_rate": 8.027908734799954e-06, + "loss": 0.82325258, + "memory(GiB)": 135.77, + "step": 26930, + "train_speed(iter/s)": 0.202368 + }, + { + "acc": 0.77684565, + "epoch": 0.628530599746278, + "grad_norm": 5.125, + "learning_rate": 8.026405195597302e-06, + "loss": 0.77707276, + "memory(GiB)": 135.77, + "step": 26940, + "train_speed(iter/s)": 0.202407 + }, + { + "acc": 0.78030272, + "epoch": 0.6287639073185669, + "grad_norm": 4.625, + "learning_rate": 8.024901224376186e-06, + "loss": 0.77970848, + "memory(GiB)": 135.77, + "step": 26950, + "train_speed(iter/s)": 0.202447 + }, + { + "acc": 0.78028889, + "epoch": 0.6289972148908558, + "grad_norm": 5.75, + "learning_rate": 8.023396821351302e-06, + "loss": 0.80391293, + "memory(GiB)": 135.77, + "step": 26960, + "train_speed(iter/s)": 0.202486 + }, + { + "acc": 0.76113973, + "epoch": 0.6292305224631447, + "grad_norm": 4.96875, + "learning_rate": 8.021891986737399e-06, + "loss": 0.84696064, + "memory(GiB)": 135.77, + "step": 26970, + "train_speed(iter/s)": 0.202525 + }, + { + "acc": 0.77851915, + "epoch": 0.6294638300354336, + "grad_norm": 12.1875, + "learning_rate": 8.020386720749292e-06, + "loss": 0.8118988, + "memory(GiB)": 135.77, + "step": 26980, + "train_speed(iter/s)": 0.202564 + }, + { + "acc": 0.78185272, + "epoch": 0.6296971376077225, + "grad_norm": 6.625, + "learning_rate": 8.018881023601858e-06, + "loss": 0.77297134, + "memory(GiB)": 135.77, + "step": 26990, + "train_speed(iter/s)": 0.202603 + }, + { + "acc": 0.77753963, + "epoch": 0.6299304451800114, + "grad_norm": 4.90625, + "learning_rate": 8.017374895510035e-06, + "loss": 0.80134878, + "memory(GiB)": 135.77, + "step": 27000, + "train_speed(iter/s)": 0.202641 + }, + { + "epoch": 0.6299304451800114, + "eval_acc": 0.7406633910765392, + "eval_loss": 0.8188360929489136, + "eval_runtime": 1270.4842, + "eval_samples_per_second": 28.329, + "eval_steps_per_second": 14.165, + "step": 27000 + }, + { + "acc": 0.78660893, + "epoch": 0.6301637527523003, + "grad_norm": 4.59375, + "learning_rate": 8.015868336688822e-06, + "loss": 0.79268298, + "memory(GiB)": 135.77, + "step": 27010, + "train_speed(iter/s)": 0.200742 + }, + { + "acc": 0.78694639, + "epoch": 0.6303970603245892, + "grad_norm": 10.5625, + "learning_rate": 8.01436134735328e-06, + "loss": 0.76775427, + "memory(GiB)": 135.77, + "step": 27020, + "train_speed(iter/s)": 0.20078 + }, + { + "acc": 0.79044333, + "epoch": 0.6306303678968781, + "grad_norm": 6.0, + "learning_rate": 8.012853927718532e-06, + "loss": 0.76151662, + "memory(GiB)": 135.77, + "step": 27030, + "train_speed(iter/s)": 0.200816 + }, + { + "acc": 0.77135372, + "epoch": 0.630863675469167, + "grad_norm": 5.28125, + "learning_rate": 8.011346077999762e-06, + "loss": 0.82462111, + "memory(GiB)": 135.77, + "step": 27040, + "train_speed(iter/s)": 0.200856 + }, + { + "acc": 0.76923323, + "epoch": 0.6310969830414559, + "grad_norm": 5.6875, + "learning_rate": 8.009837798412213e-06, + "loss": 0.81341896, + "memory(GiB)": 135.77, + "step": 27050, + "train_speed(iter/s)": 0.200892 + }, + { + "acc": 0.77677755, + "epoch": 0.6313302906137447, + "grad_norm": 5.9375, + "learning_rate": 8.008329089171192e-06, + "loss": 0.80337753, + "memory(GiB)": 135.77, + "step": 27060, + "train_speed(iter/s)": 0.20093 + }, + { + "acc": 0.77744932, + "epoch": 0.6315635981860336, + "grad_norm": 5.4375, + "learning_rate": 8.006819950492067e-06, + "loss": 0.79293747, + "memory(GiB)": 135.77, + "step": 27070, + "train_speed(iter/s)": 0.200969 + }, + { + "acc": 0.76862764, + "epoch": 0.6317969057583225, + "grad_norm": 5.9375, + "learning_rate": 8.00531038259027e-06, + "loss": 0.82000647, + "memory(GiB)": 135.77, + "step": 27080, + "train_speed(iter/s)": 0.201008 + }, + { + "acc": 0.79620075, + "epoch": 0.6320302133306114, + "grad_norm": 6.09375, + "learning_rate": 8.003800385681287e-06, + "loss": 0.73623171, + "memory(GiB)": 135.77, + "step": 27090, + "train_speed(iter/s)": 0.201047 + }, + { + "acc": 0.77440538, + "epoch": 0.6322635209029003, + "grad_norm": 5.75, + "learning_rate": 8.002289959980672e-06, + "loss": 0.81353531, + "memory(GiB)": 135.77, + "step": 27100, + "train_speed(iter/s)": 0.201087 + }, + { + "acc": 0.77269154, + "epoch": 0.6324968284751892, + "grad_norm": 7.9375, + "learning_rate": 8.000779105704037e-06, + "loss": 0.82167168, + "memory(GiB)": 135.77, + "step": 27110, + "train_speed(iter/s)": 0.201127 + }, + { + "acc": 0.78157363, + "epoch": 0.6327301360474781, + "grad_norm": 6.1875, + "learning_rate": 7.999267823067056e-06, + "loss": 0.79839826, + "memory(GiB)": 135.77, + "step": 27120, + "train_speed(iter/s)": 0.201168 + }, + { + "acc": 0.77923985, + "epoch": 0.632963443619767, + "grad_norm": 7.96875, + "learning_rate": 7.997756112285467e-06, + "loss": 0.77553253, + "memory(GiB)": 135.77, + "step": 27130, + "train_speed(iter/s)": 0.201207 + }, + { + "acc": 0.77168446, + "epoch": 0.6331967511920559, + "grad_norm": 3.703125, + "learning_rate": 7.996243973575062e-06, + "loss": 0.82124062, + "memory(GiB)": 135.77, + "step": 27140, + "train_speed(iter/s)": 0.201245 + }, + { + "acc": 0.77455969, + "epoch": 0.6334300587643448, + "grad_norm": 6.1875, + "learning_rate": 7.994731407151702e-06, + "loss": 0.80292664, + "memory(GiB)": 135.77, + "step": 27150, + "train_speed(iter/s)": 0.201281 + }, + { + "acc": 0.782617, + "epoch": 0.6336633663366337, + "grad_norm": 5.03125, + "learning_rate": 7.9932184132313e-06, + "loss": 0.77485504, + "memory(GiB)": 135.77, + "step": 27160, + "train_speed(iter/s)": 0.201316 + }, + { + "acc": 0.75979109, + "epoch": 0.6338966739089226, + "grad_norm": 4.46875, + "learning_rate": 7.99170499202984e-06, + "loss": 0.8872653, + "memory(GiB)": 135.77, + "step": 27170, + "train_speed(iter/s)": 0.201356 + }, + { + "acc": 0.7633131, + "epoch": 0.6341299814812115, + "grad_norm": 6.46875, + "learning_rate": 7.990191143763364e-06, + "loss": 0.84359808, + "memory(GiB)": 135.77, + "step": 27180, + "train_speed(iter/s)": 0.201395 + }, + { + "acc": 0.77210131, + "epoch": 0.6343632890535004, + "grad_norm": 7.96875, + "learning_rate": 7.988676868647969e-06, + "loss": 0.83302078, + "memory(GiB)": 135.77, + "step": 27190, + "train_speed(iter/s)": 0.201435 + }, + { + "acc": 0.76380386, + "epoch": 0.6345965966257893, + "grad_norm": 6.25, + "learning_rate": 7.98716216689982e-06, + "loss": 0.85607281, + "memory(GiB)": 135.77, + "step": 27200, + "train_speed(iter/s)": 0.201476 + }, + { + "acc": 0.77592916, + "epoch": 0.6348299041980782, + "grad_norm": 5.0625, + "learning_rate": 7.985647038735139e-06, + "loss": 0.812115, + "memory(GiB)": 135.77, + "step": 27210, + "train_speed(iter/s)": 0.201516 + }, + { + "acc": 0.77886858, + "epoch": 0.6350632117703671, + "grad_norm": 4.71875, + "learning_rate": 7.98413148437021e-06, + "loss": 0.79281015, + "memory(GiB)": 135.77, + "step": 27220, + "train_speed(iter/s)": 0.201551 + }, + { + "acc": 0.78781528, + "epoch": 0.635296519342656, + "grad_norm": 7.28125, + "learning_rate": 7.98261550402138e-06, + "loss": 0.75333433, + "memory(GiB)": 135.77, + "step": 27230, + "train_speed(iter/s)": 0.201589 + }, + { + "acc": 0.781744, + "epoch": 0.6355298269149449, + "grad_norm": 7.21875, + "learning_rate": 7.981099097905051e-06, + "loss": 0.78279748, + "memory(GiB)": 135.77, + "step": 27240, + "train_speed(iter/s)": 0.201626 + }, + { + "acc": 0.76686149, + "epoch": 0.6357631344872336, + "grad_norm": 6.34375, + "learning_rate": 7.979582266237695e-06, + "loss": 0.87493477, + "memory(GiB)": 135.77, + "step": 27250, + "train_speed(iter/s)": 0.201666 + }, + { + "acc": 0.7830512, + "epoch": 0.6359964420595225, + "grad_norm": 4.78125, + "learning_rate": 7.978065009235834e-06, + "loss": 0.78735223, + "memory(GiB)": 135.77, + "step": 27260, + "train_speed(iter/s)": 0.201705 + }, + { + "acc": 0.77184772, + "epoch": 0.6362297496318114, + "grad_norm": 5.53125, + "learning_rate": 7.976547327116058e-06, + "loss": 0.80568523, + "memory(GiB)": 135.77, + "step": 27270, + "train_speed(iter/s)": 0.201741 + }, + { + "acc": 0.76909132, + "epoch": 0.6364630572041003, + "grad_norm": 8.375, + "learning_rate": 7.975029220095016e-06, + "loss": 0.81954041, + "memory(GiB)": 135.77, + "step": 27280, + "train_speed(iter/s)": 0.20178 + }, + { + "acc": 0.77826443, + "epoch": 0.6366963647763892, + "grad_norm": 6.03125, + "learning_rate": 7.973510688389417e-06, + "loss": 0.7962677, + "memory(GiB)": 135.77, + "step": 27290, + "train_speed(iter/s)": 0.201818 + }, + { + "acc": 0.77817726, + "epoch": 0.6369296723486781, + "grad_norm": 5.6875, + "learning_rate": 7.971991732216032e-06, + "loss": 0.80456171, + "memory(GiB)": 135.77, + "step": 27300, + "train_speed(iter/s)": 0.201857 + }, + { + "acc": 0.76000805, + "epoch": 0.637162979920967, + "grad_norm": 5.375, + "learning_rate": 7.97047235179169e-06, + "loss": 0.8421195, + "memory(GiB)": 135.77, + "step": 27310, + "train_speed(iter/s)": 0.201896 + }, + { + "acc": 0.78686552, + "epoch": 0.6373962874932559, + "grad_norm": 5.71875, + "learning_rate": 7.968952547333281e-06, + "loss": 0.75428095, + "memory(GiB)": 135.77, + "step": 27320, + "train_speed(iter/s)": 0.201932 + }, + { + "acc": 0.76575708, + "epoch": 0.6376295950655448, + "grad_norm": 6.15625, + "learning_rate": 7.967432319057762e-06, + "loss": 0.84427853, + "memory(GiB)": 135.77, + "step": 27330, + "train_speed(iter/s)": 0.201969 + }, + { + "acc": 0.76280413, + "epoch": 0.6378629026378337, + "grad_norm": 4.1875, + "learning_rate": 7.965911667182138e-06, + "loss": 0.85895653, + "memory(GiB)": 135.77, + "step": 27340, + "train_speed(iter/s)": 0.202008 + }, + { + "acc": 0.78277311, + "epoch": 0.6380962102101226, + "grad_norm": 4.34375, + "learning_rate": 7.964390591923487e-06, + "loss": 0.77046175, + "memory(GiB)": 135.77, + "step": 27350, + "train_speed(iter/s)": 0.202046 + }, + { + "acc": 0.76807733, + "epoch": 0.6383295177824115, + "grad_norm": 4.25, + "learning_rate": 7.962869093498939e-06, + "loss": 0.84009323, + "memory(GiB)": 135.77, + "step": 27360, + "train_speed(iter/s)": 0.202083 + }, + { + "acc": 0.78158169, + "epoch": 0.6385628253547004, + "grad_norm": 5.34375, + "learning_rate": 7.961347172125689e-06, + "loss": 0.78689456, + "memory(GiB)": 135.77, + "step": 27370, + "train_speed(iter/s)": 0.202122 + }, + { + "acc": 0.75279794, + "epoch": 0.6387961329269893, + "grad_norm": 5.875, + "learning_rate": 7.959824828020991e-06, + "loss": 0.91323185, + "memory(GiB)": 135.77, + "step": 27380, + "train_speed(iter/s)": 0.202153 + }, + { + "acc": 0.78088951, + "epoch": 0.6390294404992782, + "grad_norm": 6.28125, + "learning_rate": 7.958302061402159e-06, + "loss": 0.76333561, + "memory(GiB)": 135.77, + "step": 27390, + "train_speed(iter/s)": 0.202192 + }, + { + "acc": 0.76975918, + "epoch": 0.6392627480715671, + "grad_norm": 4.5, + "learning_rate": 7.956778872486566e-06, + "loss": 0.82985058, + "memory(GiB)": 135.77, + "step": 27400, + "train_speed(iter/s)": 0.202226 + }, + { + "acc": 0.77449579, + "epoch": 0.639496055643856, + "grad_norm": 6.5, + "learning_rate": 7.955255261491648e-06, + "loss": 0.80823078, + "memory(GiB)": 135.77, + "step": 27410, + "train_speed(iter/s)": 0.202263 + }, + { + "acc": 0.78136916, + "epoch": 0.6397293632161449, + "grad_norm": 5.5, + "learning_rate": 7.9537312286349e-06, + "loss": 0.78419118, + "memory(GiB)": 135.77, + "step": 27420, + "train_speed(iter/s)": 0.202299 + }, + { + "acc": 0.7795558, + "epoch": 0.6399626707884338, + "grad_norm": 5.125, + "learning_rate": 7.952206774133878e-06, + "loss": 0.79317288, + "memory(GiB)": 135.77, + "step": 27430, + "train_speed(iter/s)": 0.202337 + }, + { + "acc": 0.76971769, + "epoch": 0.6401959783607226, + "grad_norm": 7.875, + "learning_rate": 7.950681898206197e-06, + "loss": 0.81421986, + "memory(GiB)": 135.77, + "step": 27440, + "train_speed(iter/s)": 0.202373 + }, + { + "acc": 0.76713629, + "epoch": 0.6404292859330115, + "grad_norm": 7.46875, + "learning_rate": 7.949156601069531e-06, + "loss": 0.86134014, + "memory(GiB)": 135.77, + "step": 27450, + "train_speed(iter/s)": 0.20241 + }, + { + "acc": 0.78460703, + "epoch": 0.6406625935053004, + "grad_norm": 5.40625, + "learning_rate": 7.947630882941617e-06, + "loss": 0.7690053, + "memory(GiB)": 135.77, + "step": 27460, + "train_speed(iter/s)": 0.202448 + }, + { + "acc": 0.77735882, + "epoch": 0.6408959010775893, + "grad_norm": 6.96875, + "learning_rate": 7.94610474404025e-06, + "loss": 0.82368259, + "memory(GiB)": 135.77, + "step": 27470, + "train_speed(iter/s)": 0.202484 + }, + { + "acc": 0.77271509, + "epoch": 0.6411292086498782, + "grad_norm": 6.15625, + "learning_rate": 7.944578184583289e-06, + "loss": 0.82098379, + "memory(GiB)": 135.77, + "step": 27480, + "train_speed(iter/s)": 0.202525 + }, + { + "acc": 0.77546682, + "epoch": 0.6413625162221671, + "grad_norm": 4.0625, + "learning_rate": 7.943051204788646e-06, + "loss": 0.81826458, + "memory(GiB)": 135.77, + "step": 27490, + "train_speed(iter/s)": 0.202563 + }, + { + "acc": 0.79403472, + "epoch": 0.641595823794456, + "grad_norm": 5.1875, + "learning_rate": 7.941523804874298e-06, + "loss": 0.73600636, + "memory(GiB)": 135.77, + "step": 27500, + "train_speed(iter/s)": 0.202599 + }, + { + "epoch": 0.641595823794456, + "eval_acc": 0.7411853011296093, + "eval_loss": 0.818037211894989, + "eval_runtime": 1271.0412, + "eval_samples_per_second": 28.316, + "eval_steps_per_second": 14.158, + "step": 27500 + }, + { + "acc": 0.757687, + "epoch": 0.6418291313667449, + "grad_norm": 6.21875, + "learning_rate": 7.939995985058282e-06, + "loss": 0.86413383, + "memory(GiB)": 135.77, + "step": 27510, + "train_speed(iter/s)": 0.200732 + }, + { + "acc": 0.7694766, + "epoch": 0.6420624389390338, + "grad_norm": 7.8125, + "learning_rate": 7.938467745558693e-06, + "loss": 0.82724009, + "memory(GiB)": 135.77, + "step": 27520, + "train_speed(iter/s)": 0.200767 + }, + { + "acc": 0.7876204, + "epoch": 0.6422957465113227, + "grad_norm": 8.8125, + "learning_rate": 7.936939086593688e-06, + "loss": 0.7523385, + "memory(GiB)": 135.77, + "step": 27530, + "train_speed(iter/s)": 0.200801 + }, + { + "acc": 0.81005259, + "epoch": 0.6425290540836116, + "grad_norm": 6.96875, + "learning_rate": 7.935410008381482e-06, + "loss": 0.67726383, + "memory(GiB)": 135.77, + "step": 27540, + "train_speed(iter/s)": 0.200839 + }, + { + "acc": 0.78100648, + "epoch": 0.6427623616559005, + "grad_norm": 5.625, + "learning_rate": 7.933880511140349e-06, + "loss": 0.77527313, + "memory(GiB)": 135.77, + "step": 27550, + "train_speed(iter/s)": 0.200879 + }, + { + "acc": 0.7760963, + "epoch": 0.6429956692281894, + "grad_norm": 5.5625, + "learning_rate": 7.932350595088623e-06, + "loss": 0.7997427, + "memory(GiB)": 135.77, + "step": 27560, + "train_speed(iter/s)": 0.200916 + }, + { + "acc": 0.78075814, + "epoch": 0.6432289768004783, + "grad_norm": 6.3125, + "learning_rate": 7.930820260444705e-06, + "loss": 0.78330669, + "memory(GiB)": 135.77, + "step": 27570, + "train_speed(iter/s)": 0.200952 + }, + { + "acc": 0.77887297, + "epoch": 0.6434622843727672, + "grad_norm": 11.3125, + "learning_rate": 7.929289507427044e-06, + "loss": 0.79019032, + "memory(GiB)": 135.77, + "step": 27580, + "train_speed(iter/s)": 0.200991 + }, + { + "acc": 0.78241224, + "epoch": 0.6436955919450561, + "grad_norm": 4.96875, + "learning_rate": 7.927758336254156e-06, + "loss": 0.79271832, + "memory(GiB)": 135.77, + "step": 27590, + "train_speed(iter/s)": 0.201029 + }, + { + "acc": 0.75557337, + "epoch": 0.643928899517345, + "grad_norm": 8.4375, + "learning_rate": 7.926226747144618e-06, + "loss": 0.85448341, + "memory(GiB)": 135.77, + "step": 27600, + "train_speed(iter/s)": 0.201065 + }, + { + "acc": 0.77307668, + "epoch": 0.6441622070896339, + "grad_norm": 5.34375, + "learning_rate": 7.924694740317063e-06, + "loss": 0.83668203, + "memory(GiB)": 135.77, + "step": 27610, + "train_speed(iter/s)": 0.201102 + }, + { + "acc": 0.76995897, + "epoch": 0.6443955146619228, + "grad_norm": 6.6875, + "learning_rate": 7.923162315990181e-06, + "loss": 0.82851915, + "memory(GiB)": 135.77, + "step": 27620, + "train_speed(iter/s)": 0.201139 + }, + { + "acc": 0.77117805, + "epoch": 0.6446288222342116, + "grad_norm": 5.65625, + "learning_rate": 7.92162947438273e-06, + "loss": 0.82947626, + "memory(GiB)": 135.77, + "step": 27630, + "train_speed(iter/s)": 0.201177 + }, + { + "acc": 0.76669693, + "epoch": 0.6448621298065005, + "grad_norm": 5.3125, + "learning_rate": 7.920096215713518e-06, + "loss": 0.83536053, + "memory(GiB)": 135.77, + "step": 27640, + "train_speed(iter/s)": 0.201211 + }, + { + "acc": 0.78462639, + "epoch": 0.6450954373787894, + "grad_norm": 6.09375, + "learning_rate": 7.91856254020142e-06, + "loss": 0.78443203, + "memory(GiB)": 135.77, + "step": 27650, + "train_speed(iter/s)": 0.201247 + }, + { + "acc": 0.78006172, + "epoch": 0.6453287449510783, + "grad_norm": 6.28125, + "learning_rate": 7.917028448065368e-06, + "loss": 0.8074728, + "memory(GiB)": 135.77, + "step": 27660, + "train_speed(iter/s)": 0.201283 + }, + { + "acc": 0.76845789, + "epoch": 0.6455620525233672, + "grad_norm": 4.1875, + "learning_rate": 7.915493939524352e-06, + "loss": 0.8165184, + "memory(GiB)": 135.77, + "step": 27670, + "train_speed(iter/s)": 0.201316 + }, + { + "acc": 0.78136225, + "epoch": 0.6457953600956561, + "grad_norm": 6.53125, + "learning_rate": 7.913959014797424e-06, + "loss": 0.77983022, + "memory(GiB)": 135.77, + "step": 27680, + "train_speed(iter/s)": 0.201352 + }, + { + "acc": 0.75951281, + "epoch": 0.646028667667945, + "grad_norm": 5.84375, + "learning_rate": 7.91242367410369e-06, + "loss": 0.85881014, + "memory(GiB)": 135.77, + "step": 27690, + "train_speed(iter/s)": 0.20139 + }, + { + "acc": 0.77766533, + "epoch": 0.6462619752402339, + "grad_norm": 5.28125, + "learning_rate": 7.910887917662326e-06, + "loss": 0.80150757, + "memory(GiB)": 135.77, + "step": 27700, + "train_speed(iter/s)": 0.201428 + }, + { + "acc": 0.77703729, + "epoch": 0.6464952828125228, + "grad_norm": 5.6875, + "learning_rate": 7.909351745692557e-06, + "loss": 0.83589764, + "memory(GiB)": 135.77, + "step": 27710, + "train_speed(iter/s)": 0.201465 + }, + { + "acc": 0.7554203, + "epoch": 0.6467285903848117, + "grad_norm": 4.6875, + "learning_rate": 7.907815158413669e-06, + "loss": 0.90056362, + "memory(GiB)": 135.77, + "step": 27720, + "train_speed(iter/s)": 0.201502 + }, + { + "acc": 0.77123346, + "epoch": 0.6469618979571006, + "grad_norm": 6.375, + "learning_rate": 7.906278156045015e-06, + "loss": 0.8136116, + "memory(GiB)": 135.77, + "step": 27730, + "train_speed(iter/s)": 0.20154 + }, + { + "acc": 0.78147049, + "epoch": 0.6471952055293895, + "grad_norm": 5.59375, + "learning_rate": 7.904740738805996e-06, + "loss": 0.79858246, + "memory(GiB)": 135.77, + "step": 27740, + "train_speed(iter/s)": 0.201576 + }, + { + "acc": 0.76583729, + "epoch": 0.6474285131016784, + "grad_norm": 19.5, + "learning_rate": 7.90320290691608e-06, + "loss": 0.8749052, + "memory(GiB)": 135.77, + "step": 27750, + "train_speed(iter/s)": 0.201611 + }, + { + "acc": 0.77532988, + "epoch": 0.6476618206739673, + "grad_norm": 5.3125, + "learning_rate": 7.901664660594794e-06, + "loss": 0.81616383, + "memory(GiB)": 135.77, + "step": 27760, + "train_speed(iter/s)": 0.20165 + }, + { + "acc": 0.78104067, + "epoch": 0.6478951282462562, + "grad_norm": 5.3125, + "learning_rate": 7.90012600006172e-06, + "loss": 0.76530261, + "memory(GiB)": 135.77, + "step": 27770, + "train_speed(iter/s)": 0.201684 + }, + { + "acc": 0.78051329, + "epoch": 0.6481284358185451, + "grad_norm": 4.1875, + "learning_rate": 7.898586925536504e-06, + "loss": 0.78549862, + "memory(GiB)": 135.77, + "step": 27780, + "train_speed(iter/s)": 0.201719 + }, + { + "acc": 0.77190304, + "epoch": 0.648361743390834, + "grad_norm": 5.1875, + "learning_rate": 7.897047437238845e-06, + "loss": 0.83245649, + "memory(GiB)": 135.77, + "step": 27790, + "train_speed(iter/s)": 0.201756 + }, + { + "acc": 0.76675649, + "epoch": 0.6485950509631229, + "grad_norm": 6.84375, + "learning_rate": 7.895507535388506e-06, + "loss": 0.85838509, + "memory(GiB)": 135.77, + "step": 27800, + "train_speed(iter/s)": 0.201794 + }, + { + "acc": 0.76727285, + "epoch": 0.6488283585354118, + "grad_norm": 4.9375, + "learning_rate": 7.893967220205307e-06, + "loss": 0.84388571, + "memory(GiB)": 135.77, + "step": 27810, + "train_speed(iter/s)": 0.201829 + }, + { + "acc": 0.77514563, + "epoch": 0.6490616661077007, + "grad_norm": 4.5625, + "learning_rate": 7.89242649190913e-06, + "loss": 0.80860777, + "memory(GiB)": 135.77, + "step": 27820, + "train_speed(iter/s)": 0.201866 + }, + { + "acc": 0.76715317, + "epoch": 0.6492949736799895, + "grad_norm": 5.53125, + "learning_rate": 7.890885350719907e-06, + "loss": 0.83706264, + "memory(GiB)": 135.77, + "step": 27830, + "train_speed(iter/s)": 0.201903 + }, + { + "acc": 0.77728567, + "epoch": 0.6495282812522783, + "grad_norm": 5.25, + "learning_rate": 7.889343796857645e-06, + "loss": 0.79823217, + "memory(GiB)": 135.77, + "step": 27840, + "train_speed(iter/s)": 0.201939 + }, + { + "acc": 0.78821392, + "epoch": 0.6497615888245672, + "grad_norm": 6.71875, + "learning_rate": 7.887801830542392e-06, + "loss": 0.75157042, + "memory(GiB)": 135.77, + "step": 27850, + "train_speed(iter/s)": 0.201976 + }, + { + "acc": 0.7779901, + "epoch": 0.6499948963968561, + "grad_norm": 4.84375, + "learning_rate": 7.886259451994267e-06, + "loss": 0.79298458, + "memory(GiB)": 135.77, + "step": 27860, + "train_speed(iter/s)": 0.202013 + }, + { + "acc": 0.76499109, + "epoch": 0.650228203969145, + "grad_norm": 5.25, + "learning_rate": 7.884716661433444e-06, + "loss": 0.87161884, + "memory(GiB)": 135.77, + "step": 27870, + "train_speed(iter/s)": 0.202049 + }, + { + "acc": 0.7676559, + "epoch": 0.650461511541434, + "grad_norm": 6.40625, + "learning_rate": 7.883173459080159e-06, + "loss": 0.83052654, + "memory(GiB)": 135.77, + "step": 27880, + "train_speed(iter/s)": 0.202088 + }, + { + "acc": 0.78522291, + "epoch": 0.6506948191137228, + "grad_norm": 5.46875, + "learning_rate": 7.881629845154696e-06, + "loss": 0.77097383, + "memory(GiB)": 135.77, + "step": 27890, + "train_speed(iter/s)": 0.202123 + }, + { + "acc": 0.76635356, + "epoch": 0.6509281266860117, + "grad_norm": 4.8125, + "learning_rate": 7.880085819877411e-06, + "loss": 0.84650917, + "memory(GiB)": 135.77, + "step": 27900, + "train_speed(iter/s)": 0.202162 + }, + { + "acc": 0.76490736, + "epoch": 0.6511614342583006, + "grad_norm": 4.71875, + "learning_rate": 7.878541383468712e-06, + "loss": 0.8467804, + "memory(GiB)": 135.77, + "step": 27910, + "train_speed(iter/s)": 0.202199 + }, + { + "acc": 0.76074095, + "epoch": 0.6513947418305895, + "grad_norm": 5.03125, + "learning_rate": 7.876996536149067e-06, + "loss": 0.85577497, + "memory(GiB)": 135.77, + "step": 27920, + "train_speed(iter/s)": 0.202233 + }, + { + "acc": 0.79905081, + "epoch": 0.6516280494028784, + "grad_norm": 5.9375, + "learning_rate": 7.875451278139001e-06, + "loss": 0.71797628, + "memory(GiB)": 135.77, + "step": 27930, + "train_speed(iter/s)": 0.202268 + }, + { + "acc": 0.77437353, + "epoch": 0.6518613569751673, + "grad_norm": 6.6875, + "learning_rate": 7.873905609659102e-06, + "loss": 0.80220547, + "memory(GiB)": 135.77, + "step": 27940, + "train_speed(iter/s)": 0.202306 + }, + { + "acc": 0.78056316, + "epoch": 0.6520946645474562, + "grad_norm": 5.25, + "learning_rate": 7.872359530930011e-06, + "loss": 0.76737919, + "memory(GiB)": 135.77, + "step": 27950, + "train_speed(iter/s)": 0.202343 + }, + { + "acc": 0.77328043, + "epoch": 0.6523279721197451, + "grad_norm": 6.5, + "learning_rate": 7.870813042172432e-06, + "loss": 0.8179678, + "memory(GiB)": 135.77, + "step": 27960, + "train_speed(iter/s)": 0.202381 + }, + { + "acc": 0.784305, + "epoch": 0.652561279692034, + "grad_norm": 4.0, + "learning_rate": 7.869266143607124e-06, + "loss": 0.76675787, + "memory(GiB)": 135.77, + "step": 27970, + "train_speed(iter/s)": 0.202421 + }, + { + "acc": 0.77683382, + "epoch": 0.6527945872643229, + "grad_norm": 4.6875, + "learning_rate": 7.86771883545491e-06, + "loss": 0.80360317, + "memory(GiB)": 135.77, + "step": 27980, + "train_speed(iter/s)": 0.202459 + }, + { + "acc": 0.76954408, + "epoch": 0.6530278948366118, + "grad_norm": 6.28125, + "learning_rate": 7.866171117936663e-06, + "loss": 0.83105335, + "memory(GiB)": 135.77, + "step": 27990, + "train_speed(iter/s)": 0.202497 + }, + { + "acc": 0.76160703, + "epoch": 0.6532612024089007, + "grad_norm": 6.3125, + "learning_rate": 7.864622991273322e-06, + "loss": 0.8450861, + "memory(GiB)": 135.77, + "step": 28000, + "train_speed(iter/s)": 0.202537 + }, + { + "epoch": 0.6532612024089007, + "eval_acc": 0.7410069004548256, + "eval_loss": 0.8178855180740356, + "eval_runtime": 1271.7089, + "eval_samples_per_second": 28.301, + "eval_steps_per_second": 14.151, + "step": 28000 + }, + { + "acc": 0.77182312, + "epoch": 0.6534945099811896, + "grad_norm": 8.375, + "learning_rate": 7.863074455685882e-06, + "loss": 0.83168869, + "memory(GiB)": 135.77, + "step": 28010, + "train_speed(iter/s)": 0.200706 + }, + { + "acc": 0.78294449, + "epoch": 0.6537278175534784, + "grad_norm": 4.375, + "learning_rate": 7.861525511395394e-06, + "loss": 0.76020308, + "memory(GiB)": 135.77, + "step": 28020, + "train_speed(iter/s)": 0.200743 + }, + { + "acc": 0.76998968, + "epoch": 0.6539611251257673, + "grad_norm": 4.71875, + "learning_rate": 7.859976158622971e-06, + "loss": 0.82431984, + "memory(GiB)": 135.77, + "step": 28030, + "train_speed(iter/s)": 0.200778 + }, + { + "acc": 0.75824432, + "epoch": 0.6541944326980562, + "grad_norm": 6.0625, + "learning_rate": 7.858426397589783e-06, + "loss": 0.87359171, + "memory(GiB)": 135.77, + "step": 28040, + "train_speed(iter/s)": 0.200818 + }, + { + "acc": 0.76430311, + "epoch": 0.6544277402703451, + "grad_norm": 10.0, + "learning_rate": 7.856876228517057e-06, + "loss": 0.84922371, + "memory(GiB)": 135.77, + "step": 28050, + "train_speed(iter/s)": 0.200859 + }, + { + "acc": 0.7763073, + "epoch": 0.654661047842634, + "grad_norm": 5.21875, + "learning_rate": 7.85532565162608e-06, + "loss": 0.81707954, + "memory(GiB)": 135.77, + "step": 28060, + "train_speed(iter/s)": 0.200896 + }, + { + "acc": 0.78287649, + "epoch": 0.6548943554149229, + "grad_norm": 3.984375, + "learning_rate": 7.853774667138192e-06, + "loss": 0.76915636, + "memory(GiB)": 135.77, + "step": 28070, + "train_speed(iter/s)": 0.200937 + }, + { + "acc": 0.78447509, + "epoch": 0.6551276629872118, + "grad_norm": 4.34375, + "learning_rate": 7.852223275274804e-06, + "loss": 0.77548103, + "memory(GiB)": 135.77, + "step": 28080, + "train_speed(iter/s)": 0.200974 + }, + { + "acc": 0.77808089, + "epoch": 0.6553609705595007, + "grad_norm": 4.625, + "learning_rate": 7.85067147625737e-06, + "loss": 0.79100828, + "memory(GiB)": 135.77, + "step": 28090, + "train_speed(iter/s)": 0.20101 + }, + { + "acc": 0.776159, + "epoch": 0.6555942781317896, + "grad_norm": 8.3125, + "learning_rate": 7.84911927030741e-06, + "loss": 0.79818797, + "memory(GiB)": 135.77, + "step": 28100, + "train_speed(iter/s)": 0.201047 + }, + { + "acc": 0.76561847, + "epoch": 0.6558275857040785, + "grad_norm": 3.890625, + "learning_rate": 7.847566657646502e-06, + "loss": 0.84216652, + "memory(GiB)": 135.77, + "step": 28110, + "train_speed(iter/s)": 0.201085 + }, + { + "acc": 0.77506824, + "epoch": 0.6560608932763674, + "grad_norm": 5.34375, + "learning_rate": 7.846013638496281e-06, + "loss": 0.82243309, + "memory(GiB)": 135.77, + "step": 28120, + "train_speed(iter/s)": 0.201123 + }, + { + "acc": 0.78300853, + "epoch": 0.6562942008486563, + "grad_norm": 4.75, + "learning_rate": 7.84446021307844e-06, + "loss": 0.76296067, + "memory(GiB)": 135.77, + "step": 28130, + "train_speed(iter/s)": 0.20116 + }, + { + "acc": 0.75912266, + "epoch": 0.6565275084209452, + "grad_norm": 4.40625, + "learning_rate": 7.842906381614732e-06, + "loss": 0.84529533, + "memory(GiB)": 135.77, + "step": 28140, + "train_speed(iter/s)": 0.201196 + }, + { + "acc": 0.78884659, + "epoch": 0.6567608159932341, + "grad_norm": 6.75, + "learning_rate": 7.841352144326962e-06, + "loss": 0.76649256, + "memory(GiB)": 135.77, + "step": 28150, + "train_speed(iter/s)": 0.201231 + }, + { + "acc": 0.78107457, + "epoch": 0.656994123565523, + "grad_norm": 3.78125, + "learning_rate": 7.839797501436999e-06, + "loss": 0.7952775, + "memory(GiB)": 135.77, + "step": 28160, + "train_speed(iter/s)": 0.201266 + }, + { + "acc": 0.7618453, + "epoch": 0.6572274311378119, + "grad_norm": 5.28125, + "learning_rate": 7.838242453166766e-06, + "loss": 0.86572094, + "memory(GiB)": 135.77, + "step": 28170, + "train_speed(iter/s)": 0.201305 + }, + { + "acc": 0.77299466, + "epoch": 0.6574607387101008, + "grad_norm": 6.59375, + "learning_rate": 7.83668699973825e-06, + "loss": 0.83398046, + "memory(GiB)": 135.77, + "step": 28180, + "train_speed(iter/s)": 0.201342 + }, + { + "acc": 0.77326827, + "epoch": 0.6576940462823897, + "grad_norm": 5.3125, + "learning_rate": 7.835131141373487e-06, + "loss": 0.80730629, + "memory(GiB)": 135.77, + "step": 28190, + "train_speed(iter/s)": 0.201379 + }, + { + "acc": 0.78218737, + "epoch": 0.6579273538546786, + "grad_norm": 5.65625, + "learning_rate": 7.833574878294578e-06, + "loss": 0.79669929, + "memory(GiB)": 135.77, + "step": 28200, + "train_speed(iter/s)": 0.201411 + }, + { + "acc": 0.76740227, + "epoch": 0.6581606614269674, + "grad_norm": 5.6875, + "learning_rate": 7.832018210723679e-06, + "loss": 0.84285278, + "memory(GiB)": 135.77, + "step": 28210, + "train_speed(iter/s)": 0.201447 + }, + { + "acc": 0.76748447, + "epoch": 0.6583939689992563, + "grad_norm": 4.46875, + "learning_rate": 7.830461138883e-06, + "loss": 0.84718819, + "memory(GiB)": 135.77, + "step": 28220, + "train_speed(iter/s)": 0.201482 + }, + { + "acc": 0.77524757, + "epoch": 0.6586272765715452, + "grad_norm": 6.375, + "learning_rate": 7.82890366299482e-06, + "loss": 0.81450605, + "memory(GiB)": 135.77, + "step": 28230, + "train_speed(iter/s)": 0.201522 + }, + { + "acc": 0.76804585, + "epoch": 0.6588605841438341, + "grad_norm": 4.8125, + "learning_rate": 7.827345783281462e-06, + "loss": 0.82887039, + "memory(GiB)": 135.77, + "step": 28240, + "train_speed(iter/s)": 0.201558 + }, + { + "acc": 0.77396336, + "epoch": 0.659093891716123, + "grad_norm": 5.5625, + "learning_rate": 7.825787499965315e-06, + "loss": 0.83359642, + "memory(GiB)": 135.77, + "step": 28250, + "train_speed(iter/s)": 0.201595 + }, + { + "acc": 0.74629498, + "epoch": 0.6593271992884119, + "grad_norm": 5.96875, + "learning_rate": 7.824228813268823e-06, + "loss": 0.92181091, + "memory(GiB)": 135.77, + "step": 28260, + "train_speed(iter/s)": 0.20163 + }, + { + "acc": 0.78333774, + "epoch": 0.6595605068607008, + "grad_norm": 4.71875, + "learning_rate": 7.822669723414488e-06, + "loss": 0.78887167, + "memory(GiB)": 135.77, + "step": 28270, + "train_speed(iter/s)": 0.201667 + }, + { + "acc": 0.75382862, + "epoch": 0.6597938144329897, + "grad_norm": 3.953125, + "learning_rate": 7.82111023062487e-06, + "loss": 0.89491863, + "memory(GiB)": 135.77, + "step": 28280, + "train_speed(iter/s)": 0.2017 + }, + { + "acc": 0.76489215, + "epoch": 0.6600271220052786, + "grad_norm": 5.5625, + "learning_rate": 7.819550335122587e-06, + "loss": 0.86177435, + "memory(GiB)": 135.77, + "step": 28290, + "train_speed(iter/s)": 0.201738 + }, + { + "acc": 0.76324902, + "epoch": 0.6602604295775675, + "grad_norm": 5.25, + "learning_rate": 7.817990037130312e-06, + "loss": 0.85025234, + "memory(GiB)": 135.77, + "step": 28300, + "train_speed(iter/s)": 0.201777 + }, + { + "acc": 0.77776289, + "epoch": 0.6604937371498564, + "grad_norm": 6.0, + "learning_rate": 7.816429336870778e-06, + "loss": 0.80502405, + "memory(GiB)": 135.77, + "step": 28310, + "train_speed(iter/s)": 0.201815 + }, + { + "acc": 0.78841567, + "epoch": 0.6607270447221453, + "grad_norm": 10.3125, + "learning_rate": 7.814868234566775e-06, + "loss": 0.72700367, + "memory(GiB)": 135.77, + "step": 28320, + "train_speed(iter/s)": 0.201851 + }, + { + "acc": 0.7645155, + "epoch": 0.6609603522944342, + "grad_norm": 4.75, + "learning_rate": 7.813306730441147e-06, + "loss": 0.84438887, + "memory(GiB)": 135.77, + "step": 28330, + "train_speed(iter/s)": 0.20189 + }, + { + "acc": 0.77188535, + "epoch": 0.6611936598667231, + "grad_norm": 4.9375, + "learning_rate": 7.811744824716803e-06, + "loss": 0.82249565, + "memory(GiB)": 135.77, + "step": 28340, + "train_speed(iter/s)": 0.201927 + }, + { + "acc": 0.79082394, + "epoch": 0.661426967439012, + "grad_norm": 6.4375, + "learning_rate": 7.810182517616702e-06, + "loss": 0.73457441, + "memory(GiB)": 135.77, + "step": 28350, + "train_speed(iter/s)": 0.201965 + }, + { + "acc": 0.77906938, + "epoch": 0.6616602750113009, + "grad_norm": 5.625, + "learning_rate": 7.808619809363863e-06, + "loss": 0.78538847, + "memory(GiB)": 135.77, + "step": 28360, + "train_speed(iter/s)": 0.202 + }, + { + "acc": 0.78205838, + "epoch": 0.6618935825835898, + "grad_norm": 9.25, + "learning_rate": 7.80705670018136e-06, + "loss": 0.79499912, + "memory(GiB)": 135.77, + "step": 28370, + "train_speed(iter/s)": 0.202036 + }, + { + "acc": 0.78381143, + "epoch": 0.6621268901558787, + "grad_norm": 5.96875, + "learning_rate": 7.805493190292327e-06, + "loss": 0.78962469, + "memory(GiB)": 135.77, + "step": 28380, + "train_speed(iter/s)": 0.202067 + }, + { + "acc": 0.77339664, + "epoch": 0.6623601977281676, + "grad_norm": 4.15625, + "learning_rate": 7.80392927991996e-06, + "loss": 0.79976535, + "memory(GiB)": 135.77, + "step": 28390, + "train_speed(iter/s)": 0.202101 + }, + { + "acc": 0.77922812, + "epoch": 0.6625935053004564, + "grad_norm": 5.25, + "learning_rate": 7.802364969287501e-06, + "loss": 0.77621737, + "memory(GiB)": 135.77, + "step": 28400, + "train_speed(iter/s)": 0.202139 + }, + { + "acc": 0.7975256, + "epoch": 0.6628268128727453, + "grad_norm": 5.1875, + "learning_rate": 7.80080025861826e-06, + "loss": 0.7247571, + "memory(GiB)": 135.77, + "step": 28410, + "train_speed(iter/s)": 0.202175 + }, + { + "acc": 0.79595299, + "epoch": 0.6630601204450342, + "grad_norm": 5.4375, + "learning_rate": 7.799235148135592e-06, + "loss": 0.71732335, + "memory(GiB)": 135.77, + "step": 28420, + "train_speed(iter/s)": 0.20221 + }, + { + "acc": 0.77594852, + "epoch": 0.663293428017323, + "grad_norm": 5.84375, + "learning_rate": 7.797669638062921e-06, + "loss": 0.79501762, + "memory(GiB)": 135.77, + "step": 28430, + "train_speed(iter/s)": 0.202245 + }, + { + "acc": 0.77370367, + "epoch": 0.663526735589612, + "grad_norm": 8.25, + "learning_rate": 7.796103728623723e-06, + "loss": 0.82125921, + "memory(GiB)": 135.77, + "step": 28440, + "train_speed(iter/s)": 0.202281 + }, + { + "acc": 0.78018131, + "epoch": 0.6637600431619008, + "grad_norm": 5.53125, + "learning_rate": 7.794537420041527e-06, + "loss": 0.80163622, + "memory(GiB)": 135.77, + "step": 28450, + "train_speed(iter/s)": 0.202317 + }, + { + "acc": 0.77699895, + "epoch": 0.6639933507341897, + "grad_norm": 7.96875, + "learning_rate": 7.792970712539929e-06, + "loss": 0.81302996, + "memory(GiB)": 135.77, + "step": 28460, + "train_speed(iter/s)": 0.202354 + }, + { + "acc": 0.78811188, + "epoch": 0.6642266583064786, + "grad_norm": 5.0625, + "learning_rate": 7.791403606342572e-06, + "loss": 0.77973518, + "memory(GiB)": 135.77, + "step": 28470, + "train_speed(iter/s)": 0.202388 + }, + { + "acc": 0.77782664, + "epoch": 0.6644599658787675, + "grad_norm": 7.90625, + "learning_rate": 7.78983610167316e-06, + "loss": 0.79536843, + "memory(GiB)": 135.77, + "step": 28480, + "train_speed(iter/s)": 0.202424 + }, + { + "acc": 0.78967228, + "epoch": 0.6646932734510564, + "grad_norm": 7.125, + "learning_rate": 7.788268198755456e-06, + "loss": 0.7731019, + "memory(GiB)": 135.77, + "step": 28490, + "train_speed(iter/s)": 0.202461 + }, + { + "acc": 0.76639204, + "epoch": 0.6649265810233453, + "grad_norm": 6.96875, + "learning_rate": 7.786699897813277e-06, + "loss": 0.83777695, + "memory(GiB)": 135.77, + "step": 28500, + "train_speed(iter/s)": 0.202497 + }, + { + "epoch": 0.6649265810233453, + "eval_acc": 0.7410190714164805, + "eval_loss": 0.8165611028671265, + "eval_runtime": 1270.5641, + "eval_samples_per_second": 28.327, + "eval_steps_per_second": 14.164, + "step": 28500 + }, + { + "acc": 0.75553088, + "epoch": 0.6651598885956342, + "grad_norm": 9.125, + "learning_rate": 7.785131199070497e-06, + "loss": 0.88834152, + "memory(GiB)": 135.77, + "step": 28510, + "train_speed(iter/s)": 0.200698 + }, + { + "acc": 0.76838694, + "epoch": 0.6653931961679231, + "grad_norm": 3.9375, + "learning_rate": 7.783562102751048e-06, + "loss": 0.82640839, + "memory(GiB)": 135.77, + "step": 28520, + "train_speed(iter/s)": 0.200734 + }, + { + "acc": 0.78102646, + "epoch": 0.665626503740212, + "grad_norm": 7.0, + "learning_rate": 7.781992609078916e-06, + "loss": 0.77242785, + "memory(GiB)": 135.77, + "step": 28530, + "train_speed(iter/s)": 0.200769 + }, + { + "acc": 0.78945589, + "epoch": 0.6658598113125009, + "grad_norm": 7.03125, + "learning_rate": 7.780422718278148e-06, + "loss": 0.77674723, + "memory(GiB)": 135.77, + "step": 28540, + "train_speed(iter/s)": 0.200804 + }, + { + "acc": 0.75834866, + "epoch": 0.6660931188847898, + "grad_norm": 5.5, + "learning_rate": 7.778852430572846e-06, + "loss": 0.89787149, + "memory(GiB)": 135.77, + "step": 28550, + "train_speed(iter/s)": 0.20084 + }, + { + "acc": 0.7631052, + "epoch": 0.6663264264570787, + "grad_norm": 6.75, + "learning_rate": 7.777281746187163e-06, + "loss": 0.86248188, + "memory(GiB)": 135.77, + "step": 28560, + "train_speed(iter/s)": 0.200877 + }, + { + "acc": 0.78461332, + "epoch": 0.6665597340293676, + "grad_norm": 6.25, + "learning_rate": 7.775710665345322e-06, + "loss": 0.78493948, + "memory(GiB)": 135.77, + "step": 28570, + "train_speed(iter/s)": 0.200913 + }, + { + "acc": 0.75799332, + "epoch": 0.6667930416016565, + "grad_norm": 4.71875, + "learning_rate": 7.774139188271588e-06, + "loss": 0.86719799, + "memory(GiB)": 135.77, + "step": 28580, + "train_speed(iter/s)": 0.200949 + }, + { + "acc": 0.78168793, + "epoch": 0.6670263491739454, + "grad_norm": 5.59375, + "learning_rate": 7.772567315190291e-06, + "loss": 0.79164333, + "memory(GiB)": 135.77, + "step": 28590, + "train_speed(iter/s)": 0.200984 + }, + { + "acc": 0.76213493, + "epoch": 0.6672596567462342, + "grad_norm": 4.65625, + "learning_rate": 7.770995046325813e-06, + "loss": 0.86090012, + "memory(GiB)": 135.77, + "step": 28600, + "train_speed(iter/s)": 0.201022 + }, + { + "acc": 0.76110024, + "epoch": 0.6674929643185231, + "grad_norm": 5.53125, + "learning_rate": 7.769422381902601e-06, + "loss": 0.84771814, + "memory(GiB)": 135.77, + "step": 28610, + "train_speed(iter/s)": 0.201061 + }, + { + "acc": 0.78071322, + "epoch": 0.667726271890812, + "grad_norm": 5.8125, + "learning_rate": 7.767849322145144e-06, + "loss": 0.79260278, + "memory(GiB)": 135.77, + "step": 28620, + "train_speed(iter/s)": 0.201098 + }, + { + "acc": 0.76000824, + "epoch": 0.6679595794631009, + "grad_norm": 5.96875, + "learning_rate": 7.766275867278004e-06, + "loss": 0.8921133, + "memory(GiB)": 135.77, + "step": 28630, + "train_speed(iter/s)": 0.201133 + }, + { + "acc": 0.77165904, + "epoch": 0.6681928870353898, + "grad_norm": 7.3125, + "learning_rate": 7.764702017525787e-06, + "loss": 0.80725384, + "memory(GiB)": 135.77, + "step": 28640, + "train_speed(iter/s)": 0.201167 + }, + { + "acc": 0.77876086, + "epoch": 0.6684261946076787, + "grad_norm": 5.15625, + "learning_rate": 7.763127773113159e-06, + "loss": 0.79236784, + "memory(GiB)": 135.77, + "step": 28650, + "train_speed(iter/s)": 0.201201 + }, + { + "acc": 0.79020371, + "epoch": 0.6686595021799676, + "grad_norm": 8.625, + "learning_rate": 7.761553134264844e-06, + "loss": 0.75915909, + "memory(GiB)": 135.77, + "step": 28660, + "train_speed(iter/s)": 0.201238 + }, + { + "acc": 0.76552114, + "epoch": 0.6688928097522565, + "grad_norm": 5.90625, + "learning_rate": 7.759978101205623e-06, + "loss": 0.84368448, + "memory(GiB)": 135.77, + "step": 28670, + "train_speed(iter/s)": 0.201275 + }, + { + "acc": 0.78740005, + "epoch": 0.6691261173245454, + "grad_norm": 5.71875, + "learning_rate": 7.758402674160328e-06, + "loss": 0.75369263, + "memory(GiB)": 135.77, + "step": 28680, + "train_speed(iter/s)": 0.201313 + }, + { + "acc": 0.76241765, + "epoch": 0.6693594248968343, + "grad_norm": 4.96875, + "learning_rate": 7.756826853353854e-06, + "loss": 0.85766029, + "memory(GiB)": 135.77, + "step": 28690, + "train_speed(iter/s)": 0.201349 + }, + { + "acc": 0.76723084, + "epoch": 0.6695927324691232, + "grad_norm": 5.625, + "learning_rate": 7.755250639011147e-06, + "loss": 0.84183302, + "memory(GiB)": 135.77, + "step": 28700, + "train_speed(iter/s)": 0.201389 + }, + { + "acc": 0.77271857, + "epoch": 0.6698260400414121, + "grad_norm": 4.65625, + "learning_rate": 7.75367403135721e-06, + "loss": 0.80901413, + "memory(GiB)": 135.77, + "step": 28710, + "train_speed(iter/s)": 0.201427 + }, + { + "acc": 0.75930796, + "epoch": 0.670059347613701, + "grad_norm": 6.78125, + "learning_rate": 7.752097030617107e-06, + "loss": 0.87077732, + "memory(GiB)": 135.77, + "step": 28720, + "train_speed(iter/s)": 0.201465 + }, + { + "acc": 0.77311296, + "epoch": 0.6702926551859899, + "grad_norm": 7.84375, + "learning_rate": 7.750519637015953e-06, + "loss": 0.80176973, + "memory(GiB)": 135.77, + "step": 28730, + "train_speed(iter/s)": 0.2015 + }, + { + "acc": 0.78030658, + "epoch": 0.6705259627582788, + "grad_norm": 4.78125, + "learning_rate": 7.748941850778917e-06, + "loss": 0.774928, + "memory(GiB)": 135.77, + "step": 28740, + "train_speed(iter/s)": 0.201535 + }, + { + "acc": 0.76701117, + "epoch": 0.6707592703305677, + "grad_norm": 5.625, + "learning_rate": 7.747363672131233e-06, + "loss": 0.83319283, + "memory(GiB)": 135.77, + "step": 28750, + "train_speed(iter/s)": 0.20157 + }, + { + "acc": 0.78679342, + "epoch": 0.6709925779028566, + "grad_norm": 5.75, + "learning_rate": 7.745785101298182e-06, + "loss": 0.75664225, + "memory(GiB)": 135.77, + "step": 28760, + "train_speed(iter/s)": 0.201606 + }, + { + "acc": 0.76925483, + "epoch": 0.6712258854751455, + "grad_norm": 7.3125, + "learning_rate": 7.744206138505106e-06, + "loss": 0.83768616, + "memory(GiB)": 135.77, + "step": 28770, + "train_speed(iter/s)": 0.201642 + }, + { + "acc": 0.80132675, + "epoch": 0.6714591930474344, + "grad_norm": 5.6875, + "learning_rate": 7.7426267839774e-06, + "loss": 0.70183382, + "memory(GiB)": 135.77, + "step": 28780, + "train_speed(iter/s)": 0.201677 + }, + { + "acc": 0.77842226, + "epoch": 0.6716925006197232, + "grad_norm": 6.09375, + "learning_rate": 7.741047037940516e-06, + "loss": 0.79888177, + "memory(GiB)": 135.77, + "step": 28790, + "train_speed(iter/s)": 0.201714 + }, + { + "acc": 0.77139082, + "epoch": 0.6719258081920121, + "grad_norm": 5.15625, + "learning_rate": 7.739466900619966e-06, + "loss": 0.81673717, + "memory(GiB)": 135.77, + "step": 28800, + "train_speed(iter/s)": 0.201753 + }, + { + "acc": 0.76923008, + "epoch": 0.672159115764301, + "grad_norm": 8.4375, + "learning_rate": 7.737886372241311e-06, + "loss": 0.84230728, + "memory(GiB)": 135.77, + "step": 28810, + "train_speed(iter/s)": 0.201789 + }, + { + "acc": 0.75902824, + "epoch": 0.6723924233365899, + "grad_norm": 8.0625, + "learning_rate": 7.736305453030172e-06, + "loss": 0.86554337, + "memory(GiB)": 135.77, + "step": 28820, + "train_speed(iter/s)": 0.201823 + }, + { + "acc": 0.78267469, + "epoch": 0.6726257309088788, + "grad_norm": 9.0625, + "learning_rate": 7.734724143212224e-06, + "loss": 0.79381328, + "memory(GiB)": 135.77, + "step": 28830, + "train_speed(iter/s)": 0.201857 + }, + { + "acc": 0.77059851, + "epoch": 0.6728590384811677, + "grad_norm": 5.8125, + "learning_rate": 7.733142443013199e-06, + "loss": 0.82365742, + "memory(GiB)": 135.77, + "step": 28840, + "train_speed(iter/s)": 0.201894 + }, + { + "acc": 0.7740037, + "epoch": 0.6730923460534566, + "grad_norm": 5.6875, + "learning_rate": 7.731560352658886e-06, + "loss": 0.81236382, + "memory(GiB)": 135.77, + "step": 28850, + "train_speed(iter/s)": 0.20193 + }, + { + "acc": 0.77031889, + "epoch": 0.6733256536257455, + "grad_norm": 8.0, + "learning_rate": 7.729977872375125e-06, + "loss": 0.83344927, + "memory(GiB)": 135.77, + "step": 28860, + "train_speed(iter/s)": 0.201964 + }, + { + "acc": 0.77817774, + "epoch": 0.6735589611980344, + "grad_norm": 5.25, + "learning_rate": 7.728395002387815e-06, + "loss": 0.79905758, + "memory(GiB)": 135.77, + "step": 28870, + "train_speed(iter/s)": 0.201999 + }, + { + "acc": 0.76579313, + "epoch": 0.6737922687703233, + "grad_norm": 5.4375, + "learning_rate": 7.726811742922912e-06, + "loss": 0.84702206, + "memory(GiB)": 135.77, + "step": 28880, + "train_speed(iter/s)": 0.202036 + }, + { + "acc": 0.77942319, + "epoch": 0.6740255763426122, + "grad_norm": 4.8125, + "learning_rate": 7.725228094206423e-06, + "loss": 0.78971949, + "memory(GiB)": 135.77, + "step": 28890, + "train_speed(iter/s)": 0.202071 + }, + { + "acc": 0.76637468, + "epoch": 0.6742588839149011, + "grad_norm": 4.9375, + "learning_rate": 7.723644056464416e-06, + "loss": 0.83664494, + "memory(GiB)": 135.77, + "step": 28900, + "train_speed(iter/s)": 0.202105 + }, + { + "acc": 0.77963085, + "epoch": 0.67449219148719, + "grad_norm": 4.46875, + "learning_rate": 7.722059629923014e-06, + "loss": 0.78608251, + "memory(GiB)": 135.77, + "step": 28910, + "train_speed(iter/s)": 0.202138 + }, + { + "acc": 0.78031402, + "epoch": 0.6747254990594789, + "grad_norm": 5.09375, + "learning_rate": 7.720474814808387e-06, + "loss": 0.77677441, + "memory(GiB)": 135.77, + "step": 28920, + "train_speed(iter/s)": 0.202173 + }, + { + "acc": 0.76079268, + "epoch": 0.6749588066317678, + "grad_norm": 5.375, + "learning_rate": 7.718889611346771e-06, + "loss": 0.8742177, + "memory(GiB)": 135.77, + "step": 28930, + "train_speed(iter/s)": 0.202208 + }, + { + "acc": 0.7644505, + "epoch": 0.6751921142040567, + "grad_norm": 5.9375, + "learning_rate": 7.717304019764456e-06, + "loss": 0.86214409, + "memory(GiB)": 135.77, + "step": 28940, + "train_speed(iter/s)": 0.202241 + }, + { + "acc": 0.77155561, + "epoch": 0.6754254217763456, + "grad_norm": 4.4375, + "learning_rate": 7.71571804028778e-06, + "loss": 0.81877155, + "memory(GiB)": 135.77, + "step": 28950, + "train_speed(iter/s)": 0.202275 + }, + { + "acc": 0.77610378, + "epoch": 0.6756587293486345, + "grad_norm": 5.84375, + "learning_rate": 7.714131673143139e-06, + "loss": 0.80574827, + "memory(GiB)": 135.77, + "step": 28960, + "train_speed(iter/s)": 0.202311 + }, + { + "acc": 0.77921677, + "epoch": 0.6758920369209234, + "grad_norm": 7.875, + "learning_rate": 7.712544918556994e-06, + "loss": 0.77683654, + "memory(GiB)": 135.77, + "step": 28970, + "train_speed(iter/s)": 0.202347 + }, + { + "acc": 0.76622157, + "epoch": 0.6761253444932122, + "grad_norm": 6.09375, + "learning_rate": 7.71095777675585e-06, + "loss": 0.87221556, + "memory(GiB)": 135.77, + "step": 28980, + "train_speed(iter/s)": 0.202383 + }, + { + "acc": 0.79096737, + "epoch": 0.676358652065501, + "grad_norm": 6.09375, + "learning_rate": 7.709370247966269e-06, + "loss": 0.7472538, + "memory(GiB)": 135.77, + "step": 28990, + "train_speed(iter/s)": 0.202418 + }, + { + "acc": 0.77579155, + "epoch": 0.67659195963779, + "grad_norm": 6.75, + "learning_rate": 7.707782332414873e-06, + "loss": 0.80246773, + "memory(GiB)": 135.77, + "step": 29000, + "train_speed(iter/s)": 0.202455 + }, + { + "epoch": 0.67659195963779, + "eval_acc": 0.7414119052183158, + "eval_loss": 0.8165162205696106, + "eval_runtime": 1270.8006, + "eval_samples_per_second": 28.322, + "eval_steps_per_second": 14.161, + "step": 29000 + }, + { + "acc": 0.76836233, + "epoch": 0.6768252672100789, + "grad_norm": 6.34375, + "learning_rate": 7.706194030328336e-06, + "loss": 0.81320305, + "memory(GiB)": 135.77, + "step": 29010, + "train_speed(iter/s)": 0.200687 + }, + { + "acc": 0.77826796, + "epoch": 0.6770585747823678, + "grad_norm": 4.1875, + "learning_rate": 7.704605341933385e-06, + "loss": 0.79997253, + "memory(GiB)": 135.77, + "step": 29020, + "train_speed(iter/s)": 0.200723 + }, + { + "acc": 0.76980276, + "epoch": 0.6772918823546566, + "grad_norm": 5.34375, + "learning_rate": 7.70301626745681e-06, + "loss": 0.82670012, + "memory(GiB)": 135.77, + "step": 29030, + "train_speed(iter/s)": 0.200759 + }, + { + "acc": 0.7695508, + "epoch": 0.6775251899269455, + "grad_norm": 7.53125, + "learning_rate": 7.701426807125447e-06, + "loss": 0.83088665, + "memory(GiB)": 135.77, + "step": 29040, + "train_speed(iter/s)": 0.200796 + }, + { + "acc": 0.77592602, + "epoch": 0.6777584974992344, + "grad_norm": 5.5, + "learning_rate": 7.699836961166192e-06, + "loss": 0.82842121, + "memory(GiB)": 135.77, + "step": 29050, + "train_speed(iter/s)": 0.20083 + }, + { + "acc": 0.78777199, + "epoch": 0.6779918050715233, + "grad_norm": 4.9375, + "learning_rate": 7.698246729805996e-06, + "loss": 0.76934967, + "memory(GiB)": 135.77, + "step": 29060, + "train_speed(iter/s)": 0.200867 + }, + { + "acc": 0.7728559, + "epoch": 0.6782251126438122, + "grad_norm": 4.9375, + "learning_rate": 7.696656113271863e-06, + "loss": 0.80948372, + "memory(GiB)": 135.77, + "step": 29070, + "train_speed(iter/s)": 0.200902 + }, + { + "acc": 0.79076042, + "epoch": 0.6784584202161011, + "grad_norm": 6.25, + "learning_rate": 7.695065111790852e-06, + "loss": 0.7413765, + "memory(GiB)": 135.77, + "step": 29080, + "train_speed(iter/s)": 0.20094 + }, + { + "acc": 0.76918368, + "epoch": 0.67869172778839, + "grad_norm": 5.28125, + "learning_rate": 7.693473725590079e-06, + "loss": 0.82540627, + "memory(GiB)": 135.77, + "step": 29090, + "train_speed(iter/s)": 0.200974 + }, + { + "acc": 0.77715392, + "epoch": 0.6789250353606789, + "grad_norm": 5.71875, + "learning_rate": 7.691881954896716e-06, + "loss": 0.77763295, + "memory(GiB)": 135.77, + "step": 29100, + "train_speed(iter/s)": 0.201008 + }, + { + "acc": 0.78276401, + "epoch": 0.6791583429329678, + "grad_norm": 5.0625, + "learning_rate": 7.690289799937985e-06, + "loss": 0.78063684, + "memory(GiB)": 135.77, + "step": 29110, + "train_speed(iter/s)": 0.20104 + }, + { + "acc": 0.76057873, + "epoch": 0.6793916505052567, + "grad_norm": 5.75, + "learning_rate": 7.688697260941164e-06, + "loss": 0.87658768, + "memory(GiB)": 135.77, + "step": 29120, + "train_speed(iter/s)": 0.201077 + }, + { + "acc": 0.78228416, + "epoch": 0.6796249580775456, + "grad_norm": 4.75, + "learning_rate": 7.687104338133595e-06, + "loss": 0.79393601, + "memory(GiB)": 135.77, + "step": 29130, + "train_speed(iter/s)": 0.201111 + }, + { + "acc": 0.7692297, + "epoch": 0.6798582656498345, + "grad_norm": 4.8125, + "learning_rate": 7.68551103174266e-06, + "loss": 0.82615576, + "memory(GiB)": 135.77, + "step": 29140, + "train_speed(iter/s)": 0.201147 + }, + { + "acc": 0.7818614, + "epoch": 0.6800915732221234, + "grad_norm": 4.3125, + "learning_rate": 7.683917341995806e-06, + "loss": 0.76776161, + "memory(GiB)": 135.77, + "step": 29150, + "train_speed(iter/s)": 0.201182 + }, + { + "acc": 0.75885959, + "epoch": 0.6803248807944123, + "grad_norm": 8.5, + "learning_rate": 7.68232326912053e-06, + "loss": 0.88169804, + "memory(GiB)": 135.77, + "step": 29160, + "train_speed(iter/s)": 0.201219 + }, + { + "acc": 0.77344918, + "epoch": 0.6805581883667011, + "grad_norm": 6.78125, + "learning_rate": 7.680728813344388e-06, + "loss": 0.81062412, + "memory(GiB)": 135.77, + "step": 29170, + "train_speed(iter/s)": 0.201256 + }, + { + "acc": 0.75116019, + "epoch": 0.68079149593899, + "grad_norm": 10.125, + "learning_rate": 7.679133974894984e-06, + "loss": 0.88641338, + "memory(GiB)": 135.77, + "step": 29180, + "train_speed(iter/s)": 0.201294 + }, + { + "acc": 0.77474976, + "epoch": 0.6810248035112789, + "grad_norm": 4.6875, + "learning_rate": 7.677538753999984e-06, + "loss": 0.81274004, + "memory(GiB)": 135.77, + "step": 29190, + "train_speed(iter/s)": 0.201331 + }, + { + "acc": 0.77214217, + "epoch": 0.6812581110835678, + "grad_norm": 5.875, + "learning_rate": 7.675943150887107e-06, + "loss": 0.83875799, + "memory(GiB)": 135.77, + "step": 29200, + "train_speed(iter/s)": 0.201368 + }, + { + "acc": 0.77097301, + "epoch": 0.6814914186558567, + "grad_norm": 6.8125, + "learning_rate": 7.674347165784122e-06, + "loss": 0.82045498, + "memory(GiB)": 135.77, + "step": 29210, + "train_speed(iter/s)": 0.201403 + }, + { + "acc": 0.77530422, + "epoch": 0.6817247262281456, + "grad_norm": 4.71875, + "learning_rate": 7.672750798918854e-06, + "loss": 0.81686249, + "memory(GiB)": 135.77, + "step": 29220, + "train_speed(iter/s)": 0.201439 + }, + { + "acc": 0.77789793, + "epoch": 0.6819580338004345, + "grad_norm": 7.09375, + "learning_rate": 7.671154050519187e-06, + "loss": 0.80894594, + "memory(GiB)": 135.77, + "step": 29230, + "train_speed(iter/s)": 0.201476 + }, + { + "acc": 0.77112041, + "epoch": 0.6821913413727234, + "grad_norm": 4.65625, + "learning_rate": 7.669556920813056e-06, + "loss": 0.82041979, + "memory(GiB)": 135.77, + "step": 29240, + "train_speed(iter/s)": 0.201512 + }, + { + "acc": 0.79637423, + "epoch": 0.6824246489450123, + "grad_norm": 7.46875, + "learning_rate": 7.66795941002845e-06, + "loss": 0.74020443, + "memory(GiB)": 135.77, + "step": 29250, + "train_speed(iter/s)": 0.201548 + }, + { + "acc": 0.77368479, + "epoch": 0.6826579565173012, + "grad_norm": 4.65625, + "learning_rate": 7.666361518393413e-06, + "loss": 0.83006983, + "memory(GiB)": 135.77, + "step": 29260, + "train_speed(iter/s)": 0.201584 + }, + { + "acc": 0.77014456, + "epoch": 0.6828912640895901, + "grad_norm": 4.25, + "learning_rate": 7.664763246136042e-06, + "loss": 0.830159, + "memory(GiB)": 135.77, + "step": 29270, + "train_speed(iter/s)": 0.201618 + }, + { + "acc": 0.78274126, + "epoch": 0.683124571661879, + "grad_norm": 15.0, + "learning_rate": 7.663164593484493e-06, + "loss": 0.80830526, + "memory(GiB)": 135.77, + "step": 29280, + "train_speed(iter/s)": 0.201656 + }, + { + "acc": 0.7715332, + "epoch": 0.6833578792341679, + "grad_norm": 6.09375, + "learning_rate": 7.661565560666973e-06, + "loss": 0.81128178, + "memory(GiB)": 135.77, + "step": 29290, + "train_speed(iter/s)": 0.201691 + }, + { + "acc": 0.78186312, + "epoch": 0.6835911868064568, + "grad_norm": 6.5625, + "learning_rate": 7.65996614791174e-06, + "loss": 0.76622462, + "memory(GiB)": 135.77, + "step": 29300, + "train_speed(iter/s)": 0.201725 + }, + { + "acc": 0.77319794, + "epoch": 0.6838244943787457, + "grad_norm": 5.09375, + "learning_rate": 7.658366355447115e-06, + "loss": 0.85582533, + "memory(GiB)": 135.77, + "step": 29310, + "train_speed(iter/s)": 0.201761 + }, + { + "acc": 0.74860148, + "epoch": 0.6840578019510346, + "grad_norm": 5.375, + "learning_rate": 7.656766183501465e-06, + "loss": 0.90369864, + "memory(GiB)": 135.77, + "step": 29320, + "train_speed(iter/s)": 0.201797 + }, + { + "acc": 0.78511214, + "epoch": 0.6842911095233235, + "grad_norm": 5.21875, + "learning_rate": 7.655165632303212e-06, + "loss": 0.77512712, + "memory(GiB)": 135.77, + "step": 29330, + "train_speed(iter/s)": 0.201833 + }, + { + "acc": 0.78140364, + "epoch": 0.6845244170956124, + "grad_norm": 6.34375, + "learning_rate": 7.653564702080837e-06, + "loss": 0.78134556, + "memory(GiB)": 135.77, + "step": 29340, + "train_speed(iter/s)": 0.201869 + }, + { + "acc": 0.77730131, + "epoch": 0.6847577246679013, + "grad_norm": 4.1875, + "learning_rate": 7.651963393062872e-06, + "loss": 0.80187654, + "memory(GiB)": 135.77, + "step": 29350, + "train_speed(iter/s)": 0.201905 + }, + { + "acc": 0.78949361, + "epoch": 0.6849910322401902, + "grad_norm": 7.34375, + "learning_rate": 7.650361705477903e-06, + "loss": 0.73667974, + "memory(GiB)": 135.77, + "step": 29360, + "train_speed(iter/s)": 0.201939 + }, + { + "acc": 0.78830619, + "epoch": 0.685224339812479, + "grad_norm": 5.875, + "learning_rate": 7.648759639554571e-06, + "loss": 0.74675617, + "memory(GiB)": 135.77, + "step": 29370, + "train_speed(iter/s)": 0.201976 + }, + { + "acc": 0.77039886, + "epoch": 0.6854576473847679, + "grad_norm": 5.34375, + "learning_rate": 7.647157195521568e-06, + "loss": 0.82584143, + "memory(GiB)": 135.77, + "step": 29380, + "train_speed(iter/s)": 0.202012 + }, + { + "acc": 0.78677912, + "epoch": 0.6856909549570568, + "grad_norm": 6.34375, + "learning_rate": 7.645554373607647e-06, + "loss": 0.74630208, + "memory(GiB)": 135.77, + "step": 29390, + "train_speed(iter/s)": 0.202047 + }, + { + "acc": 0.76948566, + "epoch": 0.6859242625293457, + "grad_norm": 4.5, + "learning_rate": 7.643951174041606e-06, + "loss": 0.82137985, + "memory(GiB)": 135.77, + "step": 29400, + "train_speed(iter/s)": 0.20208 + }, + { + "acc": 0.78999381, + "epoch": 0.6861575701016346, + "grad_norm": 4.78125, + "learning_rate": 7.642347597052303e-06, + "loss": 0.74593878, + "memory(GiB)": 135.77, + "step": 29410, + "train_speed(iter/s)": 0.202116 + }, + { + "acc": 0.76763272, + "epoch": 0.6863908776739235, + "grad_norm": 7.21875, + "learning_rate": 7.64074364286865e-06, + "loss": 0.84214306, + "memory(GiB)": 135.77, + "step": 29420, + "train_speed(iter/s)": 0.202153 + }, + { + "acc": 0.78076696, + "epoch": 0.6866241852462124, + "grad_norm": 6.65625, + "learning_rate": 7.639139311719605e-06, + "loss": 0.7823586, + "memory(GiB)": 135.77, + "step": 29430, + "train_speed(iter/s)": 0.202187 + }, + { + "acc": 0.77389212, + "epoch": 0.6868574928185013, + "grad_norm": 4.59375, + "learning_rate": 7.637534603834193e-06, + "loss": 0.8016942, + "memory(GiB)": 135.77, + "step": 29440, + "train_speed(iter/s)": 0.202222 + }, + { + "acc": 0.77874303, + "epoch": 0.6870908003907902, + "grad_norm": 6.28125, + "learning_rate": 7.635929519441483e-06, + "loss": 0.80179472, + "memory(GiB)": 135.77, + "step": 29450, + "train_speed(iter/s)": 0.202259 + }, + { + "acc": 0.76265059, + "epoch": 0.6873241079630791, + "grad_norm": 5.78125, + "learning_rate": 7.634324058770598e-06, + "loss": 0.86517248, + "memory(GiB)": 135.77, + "step": 29460, + "train_speed(iter/s)": 0.202293 + }, + { + "acc": 0.77614689, + "epoch": 0.687557415535368, + "grad_norm": 4.125, + "learning_rate": 7.632718222050719e-06, + "loss": 0.80183201, + "memory(GiB)": 135.77, + "step": 29470, + "train_speed(iter/s)": 0.202329 + }, + { + "acc": 0.76075153, + "epoch": 0.6877907231076569, + "grad_norm": 12.5, + "learning_rate": 7.63111200951108e-06, + "loss": 0.85887947, + "memory(GiB)": 135.77, + "step": 29480, + "train_speed(iter/s)": 0.202363 + }, + { + "acc": 0.76486063, + "epoch": 0.6880240306799458, + "grad_norm": 6.46875, + "learning_rate": 7.629505421380965e-06, + "loss": 0.83933964, + "memory(GiB)": 135.77, + "step": 29490, + "train_speed(iter/s)": 0.2024 + }, + { + "acc": 0.7917264, + "epoch": 0.6882573382522347, + "grad_norm": 5.4375, + "learning_rate": 7.627898457889717e-06, + "loss": 0.74840088, + "memory(GiB)": 135.77, + "step": 29500, + "train_speed(iter/s)": 0.202435 + }, + { + "epoch": 0.6882573382522347, + "eval_acc": 0.7415806972391614, + "eval_loss": 0.8159891366958618, + "eval_runtime": 1269.8855, + "eval_samples_per_second": 28.342, + "eval_steps_per_second": 14.171, + "step": 29500 + }, + { + "acc": 0.78372993, + "epoch": 0.6884906458245236, + "grad_norm": 4.625, + "learning_rate": 7.6262911192667245e-06, + "loss": 0.7768353, + "memory(GiB)": 135.77, + "step": 29510, + "train_speed(iter/s)": 0.200698 + }, + { + "acc": 0.78984094, + "epoch": 0.6887239533968125, + "grad_norm": 5.9375, + "learning_rate": 7.62468340574144e-06, + "loss": 0.75353394, + "memory(GiB)": 135.77, + "step": 29520, + "train_speed(iter/s)": 0.20073 + }, + { + "acc": 0.78566389, + "epoch": 0.6889572609691014, + "grad_norm": 4.46875, + "learning_rate": 7.623075317543361e-06, + "loss": 0.74840536, + "memory(GiB)": 135.77, + "step": 29530, + "train_speed(iter/s)": 0.200764 + }, + { + "acc": 0.79994617, + "epoch": 0.6891905685413903, + "grad_norm": 6.40625, + "learning_rate": 7.62146685490204e-06, + "loss": 0.70613475, + "memory(GiB)": 135.77, + "step": 29540, + "train_speed(iter/s)": 0.200797 + }, + { + "acc": 0.78054943, + "epoch": 0.6894238761136792, + "grad_norm": 5.28125, + "learning_rate": 7.6198580180470904e-06, + "loss": 0.80197172, + "memory(GiB)": 135.77, + "step": 29550, + "train_speed(iter/s)": 0.200835 + }, + { + "acc": 0.75962677, + "epoch": 0.689657183685968, + "grad_norm": 5.9375, + "learning_rate": 7.618248807208169e-06, + "loss": 0.85815258, + "memory(GiB)": 135.77, + "step": 29560, + "train_speed(iter/s)": 0.200868 + }, + { + "acc": 0.76958151, + "epoch": 0.6898904912582569, + "grad_norm": 5.65625, + "learning_rate": 7.61663922261499e-06, + "loss": 0.82609196, + "memory(GiB)": 135.77, + "step": 29570, + "train_speed(iter/s)": 0.200904 + }, + { + "acc": 0.7712954, + "epoch": 0.6901237988305458, + "grad_norm": 4.90625, + "learning_rate": 7.615029264497322e-06, + "loss": 0.82603159, + "memory(GiB)": 135.77, + "step": 29580, + "train_speed(iter/s)": 0.200938 + }, + { + "acc": 0.76029787, + "epoch": 0.6903571064028347, + "grad_norm": 5.15625, + "learning_rate": 7.6134189330849885e-06, + "loss": 0.86350775, + "memory(GiB)": 135.77, + "step": 29590, + "train_speed(iter/s)": 0.200973 + }, + { + "acc": 0.77219863, + "epoch": 0.6905904139751236, + "grad_norm": 5.34375, + "learning_rate": 7.611808228607859e-06, + "loss": 0.82623463, + "memory(GiB)": 135.77, + "step": 29600, + "train_speed(iter/s)": 0.201006 + }, + { + "acc": 0.78632302, + "epoch": 0.6908237215474125, + "grad_norm": 6.53125, + "learning_rate": 7.610197151295865e-06, + "loss": 0.76113815, + "memory(GiB)": 135.77, + "step": 29610, + "train_speed(iter/s)": 0.201042 + }, + { + "acc": 0.78305635, + "epoch": 0.6910570291197013, + "grad_norm": 5.375, + "learning_rate": 7.608585701378985e-06, + "loss": 0.76118941, + "memory(GiB)": 135.77, + "step": 29620, + "train_speed(iter/s)": 0.201074 + }, + { + "acc": 0.7888319, + "epoch": 0.6912903366919902, + "grad_norm": 4.03125, + "learning_rate": 7.6069738790872545e-06, + "loss": 0.7636095, + "memory(GiB)": 135.77, + "step": 29630, + "train_speed(iter/s)": 0.201108 + }, + { + "acc": 0.77132835, + "epoch": 0.6915236442642791, + "grad_norm": 4.9375, + "learning_rate": 7.6053616846507606e-06, + "loss": 0.79716229, + "memory(GiB)": 135.77, + "step": 29640, + "train_speed(iter/s)": 0.201143 + }, + { + "acc": 0.7883872, + "epoch": 0.691756951836568, + "grad_norm": 3.296875, + "learning_rate": 7.6037491182996415e-06, + "loss": 0.77416801, + "memory(GiB)": 135.77, + "step": 29650, + "train_speed(iter/s)": 0.201178 + }, + { + "acc": 0.78499637, + "epoch": 0.691990259408857, + "grad_norm": 4.3125, + "learning_rate": 7.602136180264094e-06, + "loss": 0.77752113, + "memory(GiB)": 135.77, + "step": 29660, + "train_speed(iter/s)": 0.201215 + }, + { + "acc": 0.76922569, + "epoch": 0.6922235669811458, + "grad_norm": 6.0, + "learning_rate": 7.6005228707743606e-06, + "loss": 0.83131847, + "memory(GiB)": 135.77, + "step": 29670, + "train_speed(iter/s)": 0.201242 + }, + { + "acc": 0.77869415, + "epoch": 0.6924568745534347, + "grad_norm": 6.84375, + "learning_rate": 7.598909190060744e-06, + "loss": 0.8007268, + "memory(GiB)": 135.77, + "step": 29680, + "train_speed(iter/s)": 0.201276 + }, + { + "acc": 0.78622375, + "epoch": 0.6926901821257236, + "grad_norm": 5.0, + "learning_rate": 7.597295138353596e-06, + "loss": 0.75257502, + "memory(GiB)": 135.77, + "step": 29690, + "train_speed(iter/s)": 0.201314 + }, + { + "acc": 0.77413979, + "epoch": 0.6929234896980125, + "grad_norm": 5.6875, + "learning_rate": 7.595680715883321e-06, + "loss": 0.80841618, + "memory(GiB)": 135.77, + "step": 29700, + "train_speed(iter/s)": 0.201349 + }, + { + "acc": 0.77899332, + "epoch": 0.6931567972703014, + "grad_norm": 5.4375, + "learning_rate": 7.594065922880378e-06, + "loss": 0.78428011, + "memory(GiB)": 135.77, + "step": 29710, + "train_speed(iter/s)": 0.201382 + }, + { + "acc": 0.78220787, + "epoch": 0.6933901048425903, + "grad_norm": 3.53125, + "learning_rate": 7.592450759575278e-06, + "loss": 0.77431474, + "memory(GiB)": 135.77, + "step": 29720, + "train_speed(iter/s)": 0.201414 + }, + { + "acc": 0.78128052, + "epoch": 0.6936234124148792, + "grad_norm": 4.3125, + "learning_rate": 7.590835226198585e-06, + "loss": 0.79154778, + "memory(GiB)": 135.77, + "step": 29730, + "train_speed(iter/s)": 0.201451 + }, + { + "acc": 0.77904191, + "epoch": 0.6938567199871681, + "grad_norm": 5.28125, + "learning_rate": 7.589219322980916e-06, + "loss": 0.79745936, + "memory(GiB)": 135.77, + "step": 29740, + "train_speed(iter/s)": 0.201485 + }, + { + "acc": 0.78388314, + "epoch": 0.6940900275594569, + "grad_norm": 5.34375, + "learning_rate": 7.587603050152941e-06, + "loss": 0.79330082, + "memory(GiB)": 135.77, + "step": 29750, + "train_speed(iter/s)": 0.20152 + }, + { + "acc": 0.77729387, + "epoch": 0.6943233351317458, + "grad_norm": 4.75, + "learning_rate": 7.585986407945383e-06, + "loss": 0.79987221, + "memory(GiB)": 135.77, + "step": 29760, + "train_speed(iter/s)": 0.201555 + }, + { + "acc": 0.76541524, + "epoch": 0.6945566427040347, + "grad_norm": 5.78125, + "learning_rate": 7.584369396589015e-06, + "loss": 0.8547636, + "memory(GiB)": 135.77, + "step": 29770, + "train_speed(iter/s)": 0.201591 + }, + { + "acc": 0.75868149, + "epoch": 0.6947899502763236, + "grad_norm": 5.78125, + "learning_rate": 7.582752016314669e-06, + "loss": 0.84939613, + "memory(GiB)": 135.77, + "step": 29780, + "train_speed(iter/s)": 0.201626 + }, + { + "acc": 0.80678043, + "epoch": 0.6950232578486125, + "grad_norm": 4.90625, + "learning_rate": 7.58113426735322e-06, + "loss": 0.68545928, + "memory(GiB)": 135.77, + "step": 29790, + "train_speed(iter/s)": 0.201662 + }, + { + "acc": 0.75615764, + "epoch": 0.6952565654209014, + "grad_norm": 6.9375, + "learning_rate": 7.579516149935606e-06, + "loss": 0.86680984, + "memory(GiB)": 135.77, + "step": 29800, + "train_speed(iter/s)": 0.201699 + }, + { + "acc": 0.77607203, + "epoch": 0.6954898729931903, + "grad_norm": 4.8125, + "learning_rate": 7.577897664292811e-06, + "loss": 0.80326805, + "memory(GiB)": 135.77, + "step": 29810, + "train_speed(iter/s)": 0.201734 + }, + { + "acc": 0.78733912, + "epoch": 0.6957231805654792, + "grad_norm": 5.40625, + "learning_rate": 7.57627881065587e-06, + "loss": 0.77941236, + "memory(GiB)": 135.77, + "step": 29820, + "train_speed(iter/s)": 0.201768 + }, + { + "acc": 0.75822802, + "epoch": 0.6959564881377681, + "grad_norm": 4.65625, + "learning_rate": 7.574659589255881e-06, + "loss": 0.86697035, + "memory(GiB)": 135.77, + "step": 29830, + "train_speed(iter/s)": 0.201804 + }, + { + "acc": 0.74630752, + "epoch": 0.696189795710057, + "grad_norm": 5.25, + "learning_rate": 7.573040000323984e-06, + "loss": 0.92794971, + "memory(GiB)": 135.77, + "step": 29840, + "train_speed(iter/s)": 0.201839 + }, + { + "acc": 0.79665966, + "epoch": 0.6964231032823459, + "grad_norm": 5.46875, + "learning_rate": 7.571420044091372e-06, + "loss": 0.70622387, + "memory(GiB)": 135.77, + "step": 29850, + "train_speed(iter/s)": 0.201874 + }, + { + "acc": 0.76552095, + "epoch": 0.6966564108546348, + "grad_norm": 3.84375, + "learning_rate": 7.569799720789297e-06, + "loss": 0.86479321, + "memory(GiB)": 135.77, + "step": 29860, + "train_speed(iter/s)": 0.201909 + }, + { + "acc": 0.78278189, + "epoch": 0.6968897184269237, + "grad_norm": 5.90625, + "learning_rate": 7.568179030649057e-06, + "loss": 0.76828461, + "memory(GiB)": 135.77, + "step": 29870, + "train_speed(iter/s)": 0.201945 + }, + { + "acc": 0.76996212, + "epoch": 0.6971230259992126, + "grad_norm": 4.84375, + "learning_rate": 7.566557973902007e-06, + "loss": 0.85869331, + "memory(GiB)": 135.77, + "step": 29880, + "train_speed(iter/s)": 0.201981 + }, + { + "acc": 0.79064722, + "epoch": 0.6973563335715015, + "grad_norm": 5.96875, + "learning_rate": 7.564936550779553e-06, + "loss": 0.73188696, + "memory(GiB)": 135.77, + "step": 29890, + "train_speed(iter/s)": 0.202018 + }, + { + "acc": 0.77210503, + "epoch": 0.6975896411437904, + "grad_norm": 5.375, + "learning_rate": 7.563314761513151e-06, + "loss": 0.81467781, + "memory(GiB)": 135.77, + "step": 29900, + "train_speed(iter/s)": 0.202053 + }, + { + "acc": 0.79614601, + "epoch": 0.6978229487160793, + "grad_norm": 4.15625, + "learning_rate": 7.56169260633431e-06, + "loss": 0.71139755, + "memory(GiB)": 135.77, + "step": 29910, + "train_speed(iter/s)": 0.202089 + }, + { + "acc": 0.80187359, + "epoch": 0.6980562562883682, + "grad_norm": 4.40625, + "learning_rate": 7.560070085474596e-06, + "loss": 0.69751916, + "memory(GiB)": 135.77, + "step": 29920, + "train_speed(iter/s)": 0.202123 + }, + { + "acc": 0.77271166, + "epoch": 0.6982895638606571, + "grad_norm": 5.125, + "learning_rate": 7.55844719916562e-06, + "loss": 0.82679062, + "memory(GiB)": 135.77, + "step": 29930, + "train_speed(iter/s)": 0.202157 + }, + { + "acc": 0.77691226, + "epoch": 0.6985228714329459, + "grad_norm": 4.6875, + "learning_rate": 7.556823947639048e-06, + "loss": 0.79194837, + "memory(GiB)": 135.77, + "step": 29940, + "train_speed(iter/s)": 0.202193 + }, + { + "acc": 0.77552891, + "epoch": 0.6987561790052348, + "grad_norm": 5.84375, + "learning_rate": 7.555200331126602e-06, + "loss": 0.82497501, + "memory(GiB)": 135.77, + "step": 29950, + "train_speed(iter/s)": 0.202227 + }, + { + "acc": 0.77520766, + "epoch": 0.6989894865775237, + "grad_norm": 5.6875, + "learning_rate": 7.55357634986005e-06, + "loss": 0.81748133, + "memory(GiB)": 135.77, + "step": 29960, + "train_speed(iter/s)": 0.202263 + }, + { + "acc": 0.78164778, + "epoch": 0.6992227941498126, + "grad_norm": 5.75, + "learning_rate": 7.551952004071217e-06, + "loss": 0.78227224, + "memory(GiB)": 135.77, + "step": 29970, + "train_speed(iter/s)": 0.202296 + }, + { + "acc": 0.77848063, + "epoch": 0.6994561017221015, + "grad_norm": 6.4375, + "learning_rate": 7.550327293991976e-06, + "loss": 0.78909693, + "memory(GiB)": 135.77, + "step": 29980, + "train_speed(iter/s)": 0.202326 + }, + { + "acc": 0.75997553, + "epoch": 0.6996894092943904, + "grad_norm": 5.65625, + "learning_rate": 7.5487022198542555e-06, + "loss": 0.8917922, + "memory(GiB)": 135.77, + "step": 29990, + "train_speed(iter/s)": 0.202361 + }, + { + "acc": 0.78532553, + "epoch": 0.6999227168666793, + "grad_norm": 6.84375, + "learning_rate": 7.547076781890032e-06, + "loss": 0.7683835, + "memory(GiB)": 135.77, + "step": 30000, + "train_speed(iter/s)": 0.202393 + }, + { + "epoch": 0.6999227168666793, + "eval_acc": 0.7416280999319227, + "eval_loss": 0.8156638741493225, + "eval_runtime": 1270.1597, + "eval_samples_per_second": 28.336, + "eval_steps_per_second": 14.168, + "step": 30000 + }, + { + "acc": 0.7785594, + "epoch": 0.7001560244389682, + "grad_norm": 7.5625, + "learning_rate": 7.5454509803313394e-06, + "loss": 0.79886799, + "memory(GiB)": 135.77, + "step": 30010, + "train_speed(iter/s)": 0.200687 + }, + { + "acc": 0.76497021, + "epoch": 0.7003893320112571, + "grad_norm": 4.71875, + "learning_rate": 7.543824815410259e-06, + "loss": 0.85206814, + "memory(GiB)": 135.77, + "step": 30020, + "train_speed(iter/s)": 0.200722 + }, + { + "acc": 0.78620939, + "epoch": 0.700622639583546, + "grad_norm": 5.46875, + "learning_rate": 7.542198287358924e-06, + "loss": 0.78464289, + "memory(GiB)": 135.77, + "step": 30030, + "train_speed(iter/s)": 0.200747 + }, + { + "acc": 0.76833134, + "epoch": 0.7008559471558349, + "grad_norm": 4.5625, + "learning_rate": 7.540571396409522e-06, + "loss": 0.8300333, + "memory(GiB)": 135.77, + "step": 30040, + "train_speed(iter/s)": 0.200782 + }, + { + "acc": 0.78864598, + "epoch": 0.7010892547281238, + "grad_norm": 4.0625, + "learning_rate": 7.538944142794291e-06, + "loss": 0.74310522, + "memory(GiB)": 135.77, + "step": 30050, + "train_speed(iter/s)": 0.200815 + }, + { + "acc": 0.76193199, + "epoch": 0.7013225623004127, + "grad_norm": 4.78125, + "learning_rate": 7.537316526745522e-06, + "loss": 0.86004868, + "memory(GiB)": 135.77, + "step": 30060, + "train_speed(iter/s)": 0.200848 + }, + { + "acc": 0.79334602, + "epoch": 0.7015558698727016, + "grad_norm": 5.125, + "learning_rate": 7.535688548495557e-06, + "loss": 0.73706589, + "memory(GiB)": 135.77, + "step": 30070, + "train_speed(iter/s)": 0.200881 + }, + { + "acc": 0.76868596, + "epoch": 0.7017891774449905, + "grad_norm": 7.25, + "learning_rate": 7.534060208276786e-06, + "loss": 0.81477661, + "memory(GiB)": 135.77, + "step": 30080, + "train_speed(iter/s)": 0.200914 + }, + { + "acc": 0.74454851, + "epoch": 0.7020224850172794, + "grad_norm": 5.78125, + "learning_rate": 7.532431506321657e-06, + "loss": 0.95340862, + "memory(GiB)": 135.77, + "step": 30090, + "train_speed(iter/s)": 0.200948 + }, + { + "acc": 0.76665716, + "epoch": 0.7022557925895683, + "grad_norm": 6.03125, + "learning_rate": 7.530802442862666e-06, + "loss": 0.84706726, + "memory(GiB)": 135.77, + "step": 30100, + "train_speed(iter/s)": 0.20098 + }, + { + "acc": 0.78391695, + "epoch": 0.7024891001618572, + "grad_norm": 5.1875, + "learning_rate": 7.529173018132362e-06, + "loss": 0.77013426, + "memory(GiB)": 135.77, + "step": 30110, + "train_speed(iter/s)": 0.201015 + }, + { + "acc": 0.80045862, + "epoch": 0.7027224077341461, + "grad_norm": 6.0, + "learning_rate": 7.5275432323633446e-06, + "loss": 0.71537471, + "memory(GiB)": 135.77, + "step": 30120, + "train_speed(iter/s)": 0.20105 + }, + { + "acc": 0.76974192, + "epoch": 0.702955715306435, + "grad_norm": 4.5, + "learning_rate": 7.525913085788264e-06, + "loss": 0.81845627, + "memory(GiB)": 135.77, + "step": 30130, + "train_speed(iter/s)": 0.201085 + }, + { + "acc": 0.7624217, + "epoch": 0.7031890228787238, + "grad_norm": 5.625, + "learning_rate": 7.524282578639825e-06, + "loss": 0.85741539, + "memory(GiB)": 135.77, + "step": 30140, + "train_speed(iter/s)": 0.201118 + }, + { + "acc": 0.77117977, + "epoch": 0.7034223304510127, + "grad_norm": 5.5, + "learning_rate": 7.522651711150781e-06, + "loss": 0.82536449, + "memory(GiB)": 135.77, + "step": 30150, + "train_speed(iter/s)": 0.201154 + }, + { + "acc": 0.76665888, + "epoch": 0.7036556380233016, + "grad_norm": 4.8125, + "learning_rate": 7.521020483553939e-06, + "loss": 0.82699442, + "memory(GiB)": 135.77, + "step": 30160, + "train_speed(iter/s)": 0.201188 + }, + { + "acc": 0.76989846, + "epoch": 0.7038889455955905, + "grad_norm": 4.75, + "learning_rate": 7.519388896082154e-06, + "loss": 0.83109045, + "memory(GiB)": 135.77, + "step": 30170, + "train_speed(iter/s)": 0.201223 + }, + { + "acc": 0.77924395, + "epoch": 0.7041222531678794, + "grad_norm": 4.78125, + "learning_rate": 7.517756948968338e-06, + "loss": 0.7907198, + "memory(GiB)": 135.77, + "step": 30180, + "train_speed(iter/s)": 0.201257 + }, + { + "acc": 0.76878471, + "epoch": 0.7043555607401683, + "grad_norm": 5.25, + "learning_rate": 7.516124642445447e-06, + "loss": 0.84302406, + "memory(GiB)": 135.77, + "step": 30190, + "train_speed(iter/s)": 0.201291 + }, + { + "acc": 0.77351613, + "epoch": 0.7045888683124572, + "grad_norm": 5.4375, + "learning_rate": 7.514491976746494e-06, + "loss": 0.81995602, + "memory(GiB)": 135.77, + "step": 30200, + "train_speed(iter/s)": 0.201326 + }, + { + "acc": 0.76795378, + "epoch": 0.704822175884746, + "grad_norm": 5.9375, + "learning_rate": 7.512858952104544e-06, + "loss": 0.83630714, + "memory(GiB)": 135.77, + "step": 30210, + "train_speed(iter/s)": 0.201361 + }, + { + "acc": 0.75973902, + "epoch": 0.705055483457035, + "grad_norm": 6.03125, + "learning_rate": 7.511225568752707e-06, + "loss": 0.854778, + "memory(GiB)": 135.77, + "step": 30220, + "train_speed(iter/s)": 0.201397 + }, + { + "acc": 0.7747612, + "epoch": 0.7052887910293238, + "grad_norm": 4.21875, + "learning_rate": 7.50959182692415e-06, + "loss": 0.81778402, + "memory(GiB)": 135.77, + "step": 30230, + "train_speed(iter/s)": 0.201434 + }, + { + "acc": 0.77487125, + "epoch": 0.7055220986016127, + "grad_norm": 3.984375, + "learning_rate": 7.507957726852087e-06, + "loss": 0.78481297, + "memory(GiB)": 135.77, + "step": 30240, + "train_speed(iter/s)": 0.201469 + }, + { + "acc": 0.77745609, + "epoch": 0.7057554061739016, + "grad_norm": 6.03125, + "learning_rate": 7.506323268769788e-06, + "loss": 0.7958005, + "memory(GiB)": 135.77, + "step": 30250, + "train_speed(iter/s)": 0.201506 + }, + { + "acc": 0.77071285, + "epoch": 0.7059887137461905, + "grad_norm": 4.8125, + "learning_rate": 7.504688452910571e-06, + "loss": 0.81425562, + "memory(GiB)": 135.77, + "step": 30260, + "train_speed(iter/s)": 0.201541 + }, + { + "acc": 0.77105627, + "epoch": 0.7062220213184794, + "grad_norm": 5.78125, + "learning_rate": 7.503053279507806e-06, + "loss": 0.82091007, + "memory(GiB)": 135.77, + "step": 30270, + "train_speed(iter/s)": 0.201575 + }, + { + "acc": 0.79748535, + "epoch": 0.7064553288907683, + "grad_norm": 5.21875, + "learning_rate": 7.501417748794911e-06, + "loss": 0.73630304, + "memory(GiB)": 135.77, + "step": 30280, + "train_speed(iter/s)": 0.20161 + }, + { + "acc": 0.78553853, + "epoch": 0.7066886364630572, + "grad_norm": 6.0625, + "learning_rate": 7.49978186100536e-06, + "loss": 0.77834482, + "memory(GiB)": 135.77, + "step": 30290, + "train_speed(iter/s)": 0.201643 + }, + { + "acc": 0.77966051, + "epoch": 0.7069219440353461, + "grad_norm": 4.90625, + "learning_rate": 7.498145616372674e-06, + "loss": 0.77920589, + "memory(GiB)": 135.77, + "step": 30300, + "train_speed(iter/s)": 0.201678 + }, + { + "acc": 0.78804102, + "epoch": 0.707155251607635, + "grad_norm": 4.875, + "learning_rate": 7.4965090151304265e-06, + "loss": 0.75938053, + "memory(GiB)": 135.77, + "step": 30310, + "train_speed(iter/s)": 0.201711 + }, + { + "acc": 0.7741745, + "epoch": 0.7073885591799239, + "grad_norm": 4.6875, + "learning_rate": 7.494872057512242e-06, + "loss": 0.82201767, + "memory(GiB)": 135.77, + "step": 30320, + "train_speed(iter/s)": 0.201744 + }, + { + "acc": 0.78020763, + "epoch": 0.7076218667522127, + "grad_norm": 4.25, + "learning_rate": 7.493234743751797e-06, + "loss": 0.77473249, + "memory(GiB)": 135.77, + "step": 30330, + "train_speed(iter/s)": 0.201778 + }, + { + "acc": 0.78065357, + "epoch": 0.7078551743245016, + "grad_norm": 8.8125, + "learning_rate": 7.491597074082817e-06, + "loss": 0.79642534, + "memory(GiB)": 135.77, + "step": 30340, + "train_speed(iter/s)": 0.201811 + }, + { + "acc": 0.77584829, + "epoch": 0.7080884818967905, + "grad_norm": 7.34375, + "learning_rate": 7.489959048739079e-06, + "loss": 0.81745863, + "memory(GiB)": 135.77, + "step": 30350, + "train_speed(iter/s)": 0.201842 + }, + { + "acc": 0.77412333, + "epoch": 0.7083217894690794, + "grad_norm": 9.125, + "learning_rate": 7.488320667954408e-06, + "loss": 0.81564293, + "memory(GiB)": 135.77, + "step": 30360, + "train_speed(iter/s)": 0.201876 + }, + { + "acc": 0.76567802, + "epoch": 0.7085550970413683, + "grad_norm": 4.96875, + "learning_rate": 7.486681931962686e-06, + "loss": 0.84731541, + "memory(GiB)": 135.77, + "step": 30370, + "train_speed(iter/s)": 0.20191 + }, + { + "acc": 0.76095247, + "epoch": 0.7087884046136572, + "grad_norm": 5.65625, + "learning_rate": 7.48504284099784e-06, + "loss": 0.87232685, + "memory(GiB)": 135.77, + "step": 30380, + "train_speed(iter/s)": 0.201943 + }, + { + "acc": 0.77653809, + "epoch": 0.7090217121859461, + "grad_norm": 6.53125, + "learning_rate": 7.48340339529385e-06, + "loss": 0.78932204, + "memory(GiB)": 135.77, + "step": 30390, + "train_speed(iter/s)": 0.201979 + }, + { + "acc": 0.78220081, + "epoch": 0.709255019758235, + "grad_norm": 5.34375, + "learning_rate": 7.481763595084747e-06, + "loss": 0.77936535, + "memory(GiB)": 135.77, + "step": 30400, + "train_speed(iter/s)": 0.202013 + }, + { + "acc": 0.7953701, + "epoch": 0.7094883273305239, + "grad_norm": 3.890625, + "learning_rate": 7.480123440604613e-06, + "loss": 0.73328466, + "memory(GiB)": 135.77, + "step": 30410, + "train_speed(iter/s)": 0.202048 + }, + { + "acc": 0.77866468, + "epoch": 0.7097216349028128, + "grad_norm": 6.71875, + "learning_rate": 7.478482932087577e-06, + "loss": 0.80838766, + "memory(GiB)": 135.77, + "step": 30420, + "train_speed(iter/s)": 0.202079 + }, + { + "acc": 0.78703556, + "epoch": 0.7099549424751017, + "grad_norm": 4.125, + "learning_rate": 7.476842069767824e-06, + "loss": 0.77653255, + "memory(GiB)": 135.77, + "step": 30430, + "train_speed(iter/s)": 0.202113 + }, + { + "acc": 0.77524529, + "epoch": 0.7101882500473906, + "grad_norm": 7.28125, + "learning_rate": 7.475200853879583e-06, + "loss": 0.79806619, + "memory(GiB)": 135.77, + "step": 30440, + "train_speed(iter/s)": 0.202148 + }, + { + "acc": 0.7719944, + "epoch": 0.7104215576196795, + "grad_norm": 5.21875, + "learning_rate": 7.473559284657139e-06, + "loss": 0.82557144, + "memory(GiB)": 135.77, + "step": 30450, + "train_speed(iter/s)": 0.202185 + }, + { + "acc": 0.77690635, + "epoch": 0.7106548651919684, + "grad_norm": 6.40625, + "learning_rate": 7.471917362334828e-06, + "loss": 0.79608321, + "memory(GiB)": 135.77, + "step": 30460, + "train_speed(iter/s)": 0.20222 + }, + { + "acc": 0.76047997, + "epoch": 0.7108881727642573, + "grad_norm": 5.75, + "learning_rate": 7.47027508714703e-06, + "loss": 0.8347681, + "memory(GiB)": 135.77, + "step": 30470, + "train_speed(iter/s)": 0.202255 + }, + { + "acc": 0.78284283, + "epoch": 0.7111214803365462, + "grad_norm": 4.53125, + "learning_rate": 7.468632459328181e-06, + "loss": 0.76897001, + "memory(GiB)": 135.77, + "step": 30480, + "train_speed(iter/s)": 0.202289 + }, + { + "acc": 0.77444568, + "epoch": 0.7113547879088351, + "grad_norm": 6.71875, + "learning_rate": 7.466989479112766e-06, + "loss": 0.81205473, + "memory(GiB)": 135.77, + "step": 30490, + "train_speed(iter/s)": 0.202323 + }, + { + "acc": 0.76428804, + "epoch": 0.711588095481124, + "grad_norm": 5.90625, + "learning_rate": 7.465346146735319e-06, + "loss": 0.83355408, + "memory(GiB)": 135.77, + "step": 30500, + "train_speed(iter/s)": 0.202361 + }, + { + "epoch": 0.711588095481124, + "eval_acc": 0.7417022467114782, + "eval_loss": 0.8149046301841736, + "eval_runtime": 1270.5935, + "eval_samples_per_second": 28.326, + "eval_steps_per_second": 14.163, + "step": 30500 + }, + { + "acc": 0.77408185, + "epoch": 0.7118214030534129, + "grad_norm": 7.84375, + "learning_rate": 7.463702462430427e-06, + "loss": 0.82522774, + "memory(GiB)": 135.77, + "step": 30510, + "train_speed(iter/s)": 0.20068 + }, + { + "acc": 0.78299146, + "epoch": 0.7120547106257017, + "grad_norm": 4.4375, + "learning_rate": 7.4620584264327236e-06, + "loss": 0.79395037, + "memory(GiB)": 135.77, + "step": 30520, + "train_speed(iter/s)": 0.200715 + }, + { + "acc": 0.7901659, + "epoch": 0.7122880181979906, + "grad_norm": 5.03125, + "learning_rate": 7.460414038976894e-06, + "loss": 0.73990803, + "memory(GiB)": 135.77, + "step": 30530, + "train_speed(iter/s)": 0.200747 + }, + { + "acc": 0.76877952, + "epoch": 0.7125213257702795, + "grad_norm": 4.9375, + "learning_rate": 7.458769300297676e-06, + "loss": 0.82868195, + "memory(GiB)": 135.77, + "step": 30540, + "train_speed(iter/s)": 0.200778 + }, + { + "acc": 0.77149434, + "epoch": 0.7127546333425684, + "grad_norm": 6.40625, + "learning_rate": 7.457124210629853e-06, + "loss": 0.80213509, + "memory(GiB)": 135.77, + "step": 30550, + "train_speed(iter/s)": 0.200813 + }, + { + "acc": 0.76947355, + "epoch": 0.7129879409148573, + "grad_norm": 8.5, + "learning_rate": 7.455478770208267e-06, + "loss": 0.84286346, + "memory(GiB)": 135.77, + "step": 30560, + "train_speed(iter/s)": 0.200847 + }, + { + "acc": 0.77109156, + "epoch": 0.7132212484871462, + "grad_norm": 4.3125, + "learning_rate": 7.453832979267796e-06, + "loss": 0.81874142, + "memory(GiB)": 135.77, + "step": 30570, + "train_speed(iter/s)": 0.200882 + }, + { + "acc": 0.77773933, + "epoch": 0.7134545560594351, + "grad_norm": 5.46875, + "learning_rate": 7.452186838043381e-06, + "loss": 0.79809036, + "memory(GiB)": 135.77, + "step": 30580, + "train_speed(iter/s)": 0.200915 + }, + { + "acc": 0.78017259, + "epoch": 0.713687863631724, + "grad_norm": 4.34375, + "learning_rate": 7.450540346770008e-06, + "loss": 0.79023871, + "memory(GiB)": 135.77, + "step": 30590, + "train_speed(iter/s)": 0.200952 + }, + { + "acc": 0.78224049, + "epoch": 0.7139211712040129, + "grad_norm": 5.5625, + "learning_rate": 7.4488935056827115e-06, + "loss": 0.79336486, + "memory(GiB)": 135.77, + "step": 30600, + "train_speed(iter/s)": 0.200985 + }, + { + "acc": 0.7667264, + "epoch": 0.7141544787763018, + "grad_norm": 5.53125, + "learning_rate": 7.447246315016579e-06, + "loss": 0.84322777, + "memory(GiB)": 135.77, + "step": 30610, + "train_speed(iter/s)": 0.201018 + }, + { + "acc": 0.78506231, + "epoch": 0.7143877863485907, + "grad_norm": 6.4375, + "learning_rate": 7.445598775006745e-06, + "loss": 0.77431684, + "memory(GiB)": 135.77, + "step": 30620, + "train_speed(iter/s)": 0.201053 + }, + { + "acc": 0.76365628, + "epoch": 0.7146210939208796, + "grad_norm": 6.59375, + "learning_rate": 7.443950885888398e-06, + "loss": 0.85304022, + "memory(GiB)": 135.77, + "step": 30630, + "train_speed(iter/s)": 0.201088 + }, + { + "acc": 0.7706419, + "epoch": 0.7148544014931685, + "grad_norm": 5.8125, + "learning_rate": 7.4423026478967706e-06, + "loss": 0.83546371, + "memory(GiB)": 135.77, + "step": 30640, + "train_speed(iter/s)": 0.201119 + }, + { + "acc": 0.76318879, + "epoch": 0.7150877090654574, + "grad_norm": 4.78125, + "learning_rate": 7.440654061267151e-06, + "loss": 0.84243946, + "memory(GiB)": 135.77, + "step": 30650, + "train_speed(iter/s)": 0.201148 + }, + { + "acc": 0.7682981, + "epoch": 0.7153210166377463, + "grad_norm": 5.5625, + "learning_rate": 7.439005126234872e-06, + "loss": 0.82606144, + "memory(GiB)": 135.77, + "step": 30660, + "train_speed(iter/s)": 0.201182 + }, + { + "acc": 0.76082344, + "epoch": 0.7155543242100352, + "grad_norm": 5.3125, + "learning_rate": 7.43735584303532e-06, + "loss": 0.86378193, + "memory(GiB)": 135.77, + "step": 30670, + "train_speed(iter/s)": 0.201216 + }, + { + "acc": 0.77880592, + "epoch": 0.7157876317823241, + "grad_norm": 5.34375, + "learning_rate": 7.435706211903929e-06, + "loss": 0.8006321, + "memory(GiB)": 135.77, + "step": 30680, + "train_speed(iter/s)": 0.201249 + }, + { + "acc": 0.78893566, + "epoch": 0.716020939354613, + "grad_norm": 6.03125, + "learning_rate": 7.434056233076184e-06, + "loss": 0.74485593, + "memory(GiB)": 135.77, + "step": 30690, + "train_speed(iter/s)": 0.201283 + }, + { + "acc": 0.77817774, + "epoch": 0.7162542469269019, + "grad_norm": 6.84375, + "learning_rate": 7.43240590678762e-06, + "loss": 0.76789885, + "memory(GiB)": 135.77, + "step": 30700, + "train_speed(iter/s)": 0.201319 + }, + { + "acc": 0.77524462, + "epoch": 0.7164875544991907, + "grad_norm": 5.9375, + "learning_rate": 7.4307552332738184e-06, + "loss": 0.79615641, + "memory(GiB)": 135.77, + "step": 30710, + "train_speed(iter/s)": 0.201355 + }, + { + "acc": 0.76029038, + "epoch": 0.7167208620714796, + "grad_norm": 5.53125, + "learning_rate": 7.429104212770414e-06, + "loss": 0.88227682, + "memory(GiB)": 135.77, + "step": 30720, + "train_speed(iter/s)": 0.201386 + }, + { + "acc": 0.7853837, + "epoch": 0.7169541696437685, + "grad_norm": 4.1875, + "learning_rate": 7.427452845513088e-06, + "loss": 0.78255091, + "memory(GiB)": 135.77, + "step": 30730, + "train_speed(iter/s)": 0.201419 + }, + { + "acc": 0.7675179, + "epoch": 0.7171874772160574, + "grad_norm": 10.1875, + "learning_rate": 7.4258011317375735e-06, + "loss": 0.83768845, + "memory(GiB)": 135.77, + "step": 30740, + "train_speed(iter/s)": 0.201452 + }, + { + "acc": 0.79261498, + "epoch": 0.7174207847883463, + "grad_norm": 6.875, + "learning_rate": 7.424149071679654e-06, + "loss": 0.74290218, + "memory(GiB)": 135.77, + "step": 30750, + "train_speed(iter/s)": 0.201484 + }, + { + "acc": 0.78618517, + "epoch": 0.7176540923606352, + "grad_norm": 4.46875, + "learning_rate": 7.422496665575156e-06, + "loss": 0.75908709, + "memory(GiB)": 135.77, + "step": 30760, + "train_speed(iter/s)": 0.201519 + }, + { + "acc": 0.79118176, + "epoch": 0.717887399932924, + "grad_norm": 4.71875, + "learning_rate": 7.420843913659965e-06, + "loss": 0.75808463, + "memory(GiB)": 135.77, + "step": 30770, + "train_speed(iter/s)": 0.201554 + }, + { + "acc": 0.77106438, + "epoch": 0.718120707505213, + "grad_norm": 5.4375, + "learning_rate": 7.419190816170008e-06, + "loss": 0.83572817, + "memory(GiB)": 135.77, + "step": 30780, + "train_speed(iter/s)": 0.201588 + }, + { + "acc": 0.76454983, + "epoch": 0.7183540150775019, + "grad_norm": 4.4375, + "learning_rate": 7.417537373341263e-06, + "loss": 0.86166372, + "memory(GiB)": 135.77, + "step": 30790, + "train_speed(iter/s)": 0.201623 + }, + { + "acc": 0.77741642, + "epoch": 0.7185873226497907, + "grad_norm": 4.59375, + "learning_rate": 7.415883585409762e-06, + "loss": 0.79155254, + "memory(GiB)": 135.77, + "step": 30800, + "train_speed(iter/s)": 0.201659 + }, + { + "acc": 0.78092527, + "epoch": 0.7188206302220796, + "grad_norm": 5.4375, + "learning_rate": 7.414229452611582e-06, + "loss": 0.78085165, + "memory(GiB)": 135.77, + "step": 30810, + "train_speed(iter/s)": 0.201694 + }, + { + "acc": 0.75764637, + "epoch": 0.7190539377943685, + "grad_norm": 4.59375, + "learning_rate": 7.412574975182848e-06, + "loss": 0.88091917, + "memory(GiB)": 135.77, + "step": 30820, + "train_speed(iter/s)": 0.201727 + }, + { + "acc": 0.77109008, + "epoch": 0.7192872453666574, + "grad_norm": 4.34375, + "learning_rate": 7.410920153359736e-06, + "loss": 0.8264492, + "memory(GiB)": 135.77, + "step": 30830, + "train_speed(iter/s)": 0.201761 + }, + { + "acc": 0.76509042, + "epoch": 0.7195205529389463, + "grad_norm": 11.5625, + "learning_rate": 7.409264987378473e-06, + "loss": 0.83569603, + "memory(GiB)": 135.77, + "step": 30840, + "train_speed(iter/s)": 0.201795 + }, + { + "acc": 0.75531335, + "epoch": 0.7197538605112352, + "grad_norm": 6.625, + "learning_rate": 7.407609477475334e-06, + "loss": 0.87194748, + "memory(GiB)": 135.77, + "step": 30850, + "train_speed(iter/s)": 0.201829 + }, + { + "acc": 0.78215508, + "epoch": 0.7199871680835241, + "grad_norm": 4.90625, + "learning_rate": 7.405953623886642e-06, + "loss": 0.78222771, + "memory(GiB)": 135.77, + "step": 30860, + "train_speed(iter/s)": 0.201862 + }, + { + "acc": 0.77151766, + "epoch": 0.720220475655813, + "grad_norm": 9.6875, + "learning_rate": 7.404297426848768e-06, + "loss": 0.82328587, + "memory(GiB)": 135.77, + "step": 30870, + "train_speed(iter/s)": 0.201896 + }, + { + "acc": 0.74816885, + "epoch": 0.7204537832281019, + "grad_norm": 7.03125, + "learning_rate": 7.4026408865981335e-06, + "loss": 0.90460873, + "memory(GiB)": 135.77, + "step": 30880, + "train_speed(iter/s)": 0.201931 + }, + { + "acc": 0.77453132, + "epoch": 0.7206870908003908, + "grad_norm": 6.71875, + "learning_rate": 7.400984003371211e-06, + "loss": 0.83123531, + "memory(GiB)": 135.77, + "step": 30890, + "train_speed(iter/s)": 0.201965 + }, + { + "acc": 0.78442779, + "epoch": 0.7209203983726797, + "grad_norm": 5.75, + "learning_rate": 7.3993267774045206e-06, + "loss": 0.7607954, + "memory(GiB)": 135.77, + "step": 30900, + "train_speed(iter/s)": 0.201998 + }, + { + "acc": 0.79025621, + "epoch": 0.7211537059449685, + "grad_norm": 13.8125, + "learning_rate": 7.397669208934628e-06, + "loss": 0.77729826, + "memory(GiB)": 135.77, + "step": 30910, + "train_speed(iter/s)": 0.202029 + }, + { + "acc": 0.7896234, + "epoch": 0.7213870135172574, + "grad_norm": 4.25, + "learning_rate": 7.396011298198155e-06, + "loss": 0.74766607, + "memory(GiB)": 135.77, + "step": 30920, + "train_speed(iter/s)": 0.202061 + }, + { + "acc": 0.76902571, + "epoch": 0.7216203210895463, + "grad_norm": 5.625, + "learning_rate": 7.394353045431765e-06, + "loss": 0.83471451, + "memory(GiB)": 135.77, + "step": 30930, + "train_speed(iter/s)": 0.202094 + }, + { + "acc": 0.76524734, + "epoch": 0.7218536286618352, + "grad_norm": 6.84375, + "learning_rate": 7.392694450872171e-06, + "loss": 0.84251699, + "memory(GiB)": 135.77, + "step": 30940, + "train_speed(iter/s)": 0.202129 + }, + { + "acc": 0.75952892, + "epoch": 0.7220869362341241, + "grad_norm": 10.0625, + "learning_rate": 7.3910355147561394e-06, + "loss": 0.86127234, + "memory(GiB)": 135.77, + "step": 30950, + "train_speed(iter/s)": 0.202164 + }, + { + "acc": 0.78890076, + "epoch": 0.722320243806413, + "grad_norm": 5.09375, + "learning_rate": 7.389376237320485e-06, + "loss": 0.73546257, + "memory(GiB)": 135.77, + "step": 30960, + "train_speed(iter/s)": 0.202198 + }, + { + "acc": 0.79085941, + "epoch": 0.7225535513787019, + "grad_norm": 4.46875, + "learning_rate": 7.387716618802064e-06, + "loss": 0.73829441, + "memory(GiB)": 135.77, + "step": 30970, + "train_speed(iter/s)": 0.20223 + }, + { + "acc": 0.76388206, + "epoch": 0.7227868589509908, + "grad_norm": 3.796875, + "learning_rate": 7.386056659437792e-06, + "loss": 0.84366646, + "memory(GiB)": 135.77, + "step": 30980, + "train_speed(iter/s)": 0.202265 + }, + { + "acc": 0.77729826, + "epoch": 0.7230201665232797, + "grad_norm": 5.21875, + "learning_rate": 7.384396359464623e-06, + "loss": 0.80204477, + "memory(GiB)": 135.77, + "step": 30990, + "train_speed(iter/s)": 0.202296 + }, + { + "acc": 0.77947817, + "epoch": 0.7232534740955686, + "grad_norm": 4.5, + "learning_rate": 7.382735719119568e-06, + "loss": 0.77528582, + "memory(GiB)": 135.77, + "step": 31000, + "train_speed(iter/s)": 0.20233 + }, + { + "epoch": 0.7232534740955686, + "eval_acc": 0.7419815382526118, + "eval_loss": 0.8143436312675476, + "eval_runtime": 1269.6984, + "eval_samples_per_second": 28.346, + "eval_steps_per_second": 14.173, + "step": 31000 + }, + { + "acc": 0.77952061, + "epoch": 0.7234867816678575, + "grad_norm": 5.15625, + "learning_rate": 7.38107473863968e-06, + "loss": 0.80758448, + "memory(GiB)": 135.77, + "step": 31010, + "train_speed(iter/s)": 0.200679 + }, + { + "acc": 0.76929088, + "epoch": 0.7237200892401464, + "grad_norm": 4.75, + "learning_rate": 7.3794134182620646e-06, + "loss": 0.84603424, + "memory(GiB)": 135.77, + "step": 31020, + "train_speed(iter/s)": 0.200712 + }, + { + "acc": 0.78471966, + "epoch": 0.7239533968124353, + "grad_norm": 5.625, + "learning_rate": 7.377751758223876e-06, + "loss": 0.77363634, + "memory(GiB)": 135.77, + "step": 31030, + "train_speed(iter/s)": 0.200746 + }, + { + "acc": 0.75998449, + "epoch": 0.7241867043847242, + "grad_norm": 5.96875, + "learning_rate": 7.376089758762315e-06, + "loss": 0.86344814, + "memory(GiB)": 135.77, + "step": 31040, + "train_speed(iter/s)": 0.20078 + }, + { + "acc": 0.79045277, + "epoch": 0.7244200119570131, + "grad_norm": 7.46875, + "learning_rate": 7.374427420114629e-06, + "loss": 0.74859829, + "memory(GiB)": 135.77, + "step": 31050, + "train_speed(iter/s)": 0.20081 + }, + { + "acc": 0.78292723, + "epoch": 0.724653319529302, + "grad_norm": 4.34375, + "learning_rate": 7.37276474251812e-06, + "loss": 0.78144426, + "memory(GiB)": 135.77, + "step": 31060, + "train_speed(iter/s)": 0.200841 + }, + { + "acc": 0.77971792, + "epoch": 0.7248866271015909, + "grad_norm": 4.3125, + "learning_rate": 7.371101726210135e-06, + "loss": 0.79532051, + "memory(GiB)": 135.77, + "step": 31070, + "train_speed(iter/s)": 0.200875 + }, + { + "acc": 0.7901329, + "epoch": 0.7251199346738798, + "grad_norm": 7.375, + "learning_rate": 7.369438371428065e-06, + "loss": 0.75919361, + "memory(GiB)": 135.77, + "step": 31080, + "train_speed(iter/s)": 0.200907 + }, + { + "acc": 0.77284174, + "epoch": 0.7253532422461687, + "grad_norm": 7.5625, + "learning_rate": 7.367774678409357e-06, + "loss": 0.81404877, + "memory(GiB)": 135.77, + "step": 31090, + "train_speed(iter/s)": 0.200941 + }, + { + "acc": 0.78015275, + "epoch": 0.7255865498184575, + "grad_norm": 4.8125, + "learning_rate": 7.366110647391501e-06, + "loss": 0.78611984, + "memory(GiB)": 135.77, + "step": 31100, + "train_speed(iter/s)": 0.200974 + }, + { + "acc": 0.79383612, + "epoch": 0.7258198573907464, + "grad_norm": 4.65625, + "learning_rate": 7.364446278612036e-06, + "loss": 0.7279952, + "memory(GiB)": 135.77, + "step": 31110, + "train_speed(iter/s)": 0.201005 + }, + { + "acc": 0.7822289, + "epoch": 0.7260531649630353, + "grad_norm": 4.59375, + "learning_rate": 7.3627815723085535e-06, + "loss": 0.77796512, + "memory(GiB)": 135.77, + "step": 31120, + "train_speed(iter/s)": 0.201037 + }, + { + "acc": 0.77322626, + "epoch": 0.7262864725353242, + "grad_norm": 8.75, + "learning_rate": 7.361116528718688e-06, + "loss": 0.80861044, + "memory(GiB)": 135.77, + "step": 31130, + "train_speed(iter/s)": 0.201071 + }, + { + "acc": 0.77036018, + "epoch": 0.7265197801076131, + "grad_norm": 8.0, + "learning_rate": 7.359451148080123e-06, + "loss": 0.83087883, + "memory(GiB)": 135.77, + "step": 31140, + "train_speed(iter/s)": 0.201104 + }, + { + "acc": 0.77294779, + "epoch": 0.726753087679902, + "grad_norm": 5.96875, + "learning_rate": 7.357785430630593e-06, + "loss": 0.81432848, + "memory(GiB)": 135.77, + "step": 31150, + "train_speed(iter/s)": 0.201138 + }, + { + "acc": 0.78335571, + "epoch": 0.7269863952521909, + "grad_norm": 4.0625, + "learning_rate": 7.356119376607877e-06, + "loss": 0.78180637, + "memory(GiB)": 135.77, + "step": 31160, + "train_speed(iter/s)": 0.201171 + }, + { + "acc": 0.75630546, + "epoch": 0.7272197028244798, + "grad_norm": 6.03125, + "learning_rate": 7.354452986249805e-06, + "loss": 0.88844528, + "memory(GiB)": 135.77, + "step": 31170, + "train_speed(iter/s)": 0.201206 + }, + { + "acc": 0.76719637, + "epoch": 0.7274530103967687, + "grad_norm": 9.9375, + "learning_rate": 7.352786259794252e-06, + "loss": 0.84311905, + "memory(GiB)": 135.77, + "step": 31180, + "train_speed(iter/s)": 0.201239 + }, + { + "acc": 0.78604798, + "epoch": 0.7276863179690576, + "grad_norm": 6.09375, + "learning_rate": 7.351119197479144e-06, + "loss": 0.7596261, + "memory(GiB)": 135.77, + "step": 31190, + "train_speed(iter/s)": 0.201273 + }, + { + "acc": 0.78300457, + "epoch": 0.7279196255413465, + "grad_norm": 6.0, + "learning_rate": 7.349451799542455e-06, + "loss": 0.75782876, + "memory(GiB)": 135.77, + "step": 31200, + "train_speed(iter/s)": 0.201308 + }, + { + "acc": 0.79482107, + "epoch": 0.7281529331136354, + "grad_norm": 6.25, + "learning_rate": 7.3477840662222045e-06, + "loss": 0.73927917, + "memory(GiB)": 135.77, + "step": 31210, + "train_speed(iter/s)": 0.201342 + }, + { + "acc": 0.7678937, + "epoch": 0.7283862406859243, + "grad_norm": 4.4375, + "learning_rate": 7.346115997756459e-06, + "loss": 0.82241888, + "memory(GiB)": 135.77, + "step": 31220, + "train_speed(iter/s)": 0.201377 + }, + { + "acc": 0.78195829, + "epoch": 0.7286195482582132, + "grad_norm": 4.65625, + "learning_rate": 7.3444475943833375e-06, + "loss": 0.78644238, + "memory(GiB)": 135.77, + "step": 31230, + "train_speed(iter/s)": 0.201413 + }, + { + "acc": 0.78667397, + "epoch": 0.7288528558305021, + "grad_norm": 6.96875, + "learning_rate": 7.342778856341002e-06, + "loss": 0.78144002, + "memory(GiB)": 135.77, + "step": 31240, + "train_speed(iter/s)": 0.201444 + }, + { + "acc": 0.75145693, + "epoch": 0.729086163402791, + "grad_norm": 4.65625, + "learning_rate": 7.3411097838676645e-06, + "loss": 0.92597151, + "memory(GiB)": 135.77, + "step": 31250, + "train_speed(iter/s)": 0.201478 + }, + { + "acc": 0.78834095, + "epoch": 0.7293194709750799, + "grad_norm": 6.09375, + "learning_rate": 7.339440377201588e-06, + "loss": 0.76001892, + "memory(GiB)": 135.77, + "step": 31260, + "train_speed(iter/s)": 0.201512 + }, + { + "acc": 0.76082201, + "epoch": 0.7295527785473688, + "grad_norm": 7.3125, + "learning_rate": 7.337770636581075e-06, + "loss": 0.87978334, + "memory(GiB)": 135.77, + "step": 31270, + "train_speed(iter/s)": 0.201545 + }, + { + "acc": 0.78952765, + "epoch": 0.7297860861196577, + "grad_norm": 4.5, + "learning_rate": 7.3361005622444834e-06, + "loss": 0.76534405, + "memory(GiB)": 135.77, + "step": 31280, + "train_speed(iter/s)": 0.201581 + }, + { + "acc": 0.78288898, + "epoch": 0.7300193936919465, + "grad_norm": 6.5, + "learning_rate": 7.334430154430217e-06, + "loss": 0.79545956, + "memory(GiB)": 135.77, + "step": 31290, + "train_speed(iter/s)": 0.201612 + }, + { + "acc": 0.76784639, + "epoch": 0.7302527012642354, + "grad_norm": 8.0625, + "learning_rate": 7.332759413376721e-06, + "loss": 0.83361969, + "memory(GiB)": 135.77, + "step": 31300, + "train_speed(iter/s)": 0.201644 + }, + { + "acc": 0.78950572, + "epoch": 0.7304860088365243, + "grad_norm": 4.5, + "learning_rate": 7.331088339322499e-06, + "loss": 0.7704586, + "memory(GiB)": 135.77, + "step": 31310, + "train_speed(iter/s)": 0.201676 + }, + { + "acc": 0.77479763, + "epoch": 0.7307193164088132, + "grad_norm": 5.15625, + "learning_rate": 7.3294169325060925e-06, + "loss": 0.81133862, + "memory(GiB)": 135.77, + "step": 31320, + "train_speed(iter/s)": 0.20171 + }, + { + "acc": 0.76270666, + "epoch": 0.7309526239811021, + "grad_norm": 5.875, + "learning_rate": 7.327745193166096e-06, + "loss": 0.86329832, + "memory(GiB)": 135.77, + "step": 31330, + "train_speed(iter/s)": 0.201744 + }, + { + "acc": 0.77042761, + "epoch": 0.731185931553391, + "grad_norm": 8.1875, + "learning_rate": 7.3260731215411484e-06, + "loss": 0.84471989, + "memory(GiB)": 135.77, + "step": 31340, + "train_speed(iter/s)": 0.201777 + }, + { + "acc": 0.76804247, + "epoch": 0.7314192391256799, + "grad_norm": 7.75, + "learning_rate": 7.32440071786994e-06, + "loss": 0.84253273, + "memory(GiB)": 135.77, + "step": 31350, + "train_speed(iter/s)": 0.20181 + }, + { + "acc": 0.77282157, + "epoch": 0.7316525466979688, + "grad_norm": 6.4375, + "learning_rate": 7.322727982391203e-06, + "loss": 0.82727652, + "memory(GiB)": 135.77, + "step": 31360, + "train_speed(iter/s)": 0.201842 + }, + { + "acc": 0.7756979, + "epoch": 0.7318858542702577, + "grad_norm": 6.59375, + "learning_rate": 7.321054915343722e-06, + "loss": 0.82158718, + "memory(GiB)": 135.77, + "step": 31370, + "train_speed(iter/s)": 0.201877 + }, + { + "acc": 0.75797024, + "epoch": 0.7321191618425466, + "grad_norm": 6.5625, + "learning_rate": 7.3193815169663266e-06, + "loss": 0.87084713, + "memory(GiB)": 135.77, + "step": 31380, + "train_speed(iter/s)": 0.201911 + }, + { + "acc": 0.7460907, + "epoch": 0.7323524694148354, + "grad_norm": 6.90625, + "learning_rate": 7.317707787497892e-06, + "loss": 0.90985165, + "memory(GiB)": 135.77, + "step": 31390, + "train_speed(iter/s)": 0.201946 + }, + { + "acc": 0.79898553, + "epoch": 0.7325857769871243, + "grad_norm": 6.5, + "learning_rate": 7.316033727177345e-06, + "loss": 0.72486515, + "memory(GiB)": 135.77, + "step": 31400, + "train_speed(iter/s)": 0.20198 + }, + { + "acc": 0.78949633, + "epoch": 0.7328190845594132, + "grad_norm": 4.9375, + "learning_rate": 7.314359336243656e-06, + "loss": 0.77411165, + "memory(GiB)": 135.77, + "step": 31410, + "train_speed(iter/s)": 0.202011 + }, + { + "acc": 0.77078295, + "epoch": 0.7330523921317021, + "grad_norm": 5.53125, + "learning_rate": 7.312684614935846e-06, + "loss": 0.80220585, + "memory(GiB)": 135.77, + "step": 31420, + "train_speed(iter/s)": 0.20204 + }, + { + "acc": 0.78742967, + "epoch": 0.733285699703991, + "grad_norm": 6.0625, + "learning_rate": 7.311009563492977e-06, + "loss": 0.77101741, + "memory(GiB)": 135.77, + "step": 31430, + "train_speed(iter/s)": 0.202071 + }, + { + "acc": 0.78280091, + "epoch": 0.7335190072762799, + "grad_norm": 5.125, + "learning_rate": 7.309334182154164e-06, + "loss": 0.78739457, + "memory(GiB)": 135.77, + "step": 31440, + "train_speed(iter/s)": 0.202105 + }, + { + "acc": 0.78376379, + "epoch": 0.7337523148485688, + "grad_norm": 8.9375, + "learning_rate": 7.307658471158567e-06, + "loss": 0.78450098, + "memory(GiB)": 135.77, + "step": 31450, + "train_speed(iter/s)": 0.202138 + }, + { + "acc": 0.79066029, + "epoch": 0.7339856224208577, + "grad_norm": 4.875, + "learning_rate": 7.305982430745395e-06, + "loss": 0.75622902, + "memory(GiB)": 135.77, + "step": 31460, + "train_speed(iter/s)": 0.202173 + }, + { + "acc": 0.769873, + "epoch": 0.7342189299931466, + "grad_norm": 3.984375, + "learning_rate": 7.3043060611538995e-06, + "loss": 0.82575455, + "memory(GiB)": 135.77, + "step": 31470, + "train_speed(iter/s)": 0.202204 + }, + { + "acc": 0.77303362, + "epoch": 0.7344522375654354, + "grad_norm": 6.5, + "learning_rate": 7.302629362623384e-06, + "loss": 0.81925964, + "memory(GiB)": 135.77, + "step": 31480, + "train_speed(iter/s)": 0.202236 + }, + { + "acc": 0.76517944, + "epoch": 0.7346855451377243, + "grad_norm": 5.125, + "learning_rate": 7.3009523353931966e-06, + "loss": 0.83144541, + "memory(GiB)": 135.77, + "step": 31490, + "train_speed(iter/s)": 0.202271 + }, + { + "acc": 0.78074522, + "epoch": 0.7349188527100132, + "grad_norm": 5.03125, + "learning_rate": 7.299274979702732e-06, + "loss": 0.79223261, + "memory(GiB)": 135.77, + "step": 31500, + "train_speed(iter/s)": 0.202305 + }, + { + "epoch": 0.7349188527100132, + "eval_acc": 0.741985381714187, + "eval_loss": 0.8141977787017822, + "eval_runtime": 1269.7118, + "eval_samples_per_second": 28.346, + "eval_steps_per_second": 14.173, + "step": 31500 + }, + { + "acc": 0.78632488, + "epoch": 0.7351521602823021, + "grad_norm": 3.75, + "learning_rate": 7.29759729579143e-06, + "loss": 0.78005762, + "memory(GiB)": 135.77, + "step": 31510, + "train_speed(iter/s)": 0.200679 + }, + { + "acc": 0.77185574, + "epoch": 0.735385467854591, + "grad_norm": 7.78125, + "learning_rate": 7.295919283898782e-06, + "loss": 0.82155762, + "memory(GiB)": 135.77, + "step": 31520, + "train_speed(iter/s)": 0.200713 + }, + { + "acc": 0.77478409, + "epoch": 0.7356187754268799, + "grad_norm": 3.828125, + "learning_rate": 7.294240944264323e-06, + "loss": 0.80740013, + "memory(GiB)": 135.77, + "step": 31530, + "train_speed(iter/s)": 0.200747 + }, + { + "acc": 0.79292769, + "epoch": 0.7358520829991688, + "grad_norm": 4.78125, + "learning_rate": 7.292562277127637e-06, + "loss": 0.7431262, + "memory(GiB)": 135.77, + "step": 31540, + "train_speed(iter/s)": 0.200781 + }, + { + "acc": 0.79306731, + "epoch": 0.7360853905714577, + "grad_norm": 4.6875, + "learning_rate": 7.290883282728352e-06, + "loss": 0.74995842, + "memory(GiB)": 135.77, + "step": 31550, + "train_speed(iter/s)": 0.200813 + }, + { + "acc": 0.77480698, + "epoch": 0.7363186981437466, + "grad_norm": 5.6875, + "learning_rate": 7.289203961306143e-06, + "loss": 0.82969999, + "memory(GiB)": 135.77, + "step": 31560, + "train_speed(iter/s)": 0.200845 + }, + { + "acc": 0.76988225, + "epoch": 0.7365520057160355, + "grad_norm": 6.90625, + "learning_rate": 7.287524313100735e-06, + "loss": 0.82661495, + "memory(GiB)": 135.77, + "step": 31570, + "train_speed(iter/s)": 0.20088 + }, + { + "acc": 0.75901461, + "epoch": 0.7367853132883244, + "grad_norm": 5.15625, + "learning_rate": 7.285844338351894e-06, + "loss": 0.87335548, + "memory(GiB)": 135.77, + "step": 31580, + "train_speed(iter/s)": 0.200914 + }, + { + "acc": 0.77605467, + "epoch": 0.7370186208606133, + "grad_norm": 7.65625, + "learning_rate": 7.284164037299438e-06, + "loss": 0.80913677, + "memory(GiB)": 135.77, + "step": 31590, + "train_speed(iter/s)": 0.200947 + }, + { + "acc": 0.79126759, + "epoch": 0.7372519284329022, + "grad_norm": 7.125, + "learning_rate": 7.28248341018323e-06, + "loss": 0.72892237, + "memory(GiB)": 135.77, + "step": 31600, + "train_speed(iter/s)": 0.20098 + }, + { + "acc": 0.78675766, + "epoch": 0.7374852360051911, + "grad_norm": 5.375, + "learning_rate": 7.280802457243178e-06, + "loss": 0.76203346, + "memory(GiB)": 135.77, + "step": 31610, + "train_speed(iter/s)": 0.201012 + }, + { + "acc": 0.77029533, + "epoch": 0.73771854357748, + "grad_norm": 5.84375, + "learning_rate": 7.2791211787192376e-06, + "loss": 0.8017519, + "memory(GiB)": 135.77, + "step": 31620, + "train_speed(iter/s)": 0.201045 + }, + { + "acc": 0.76799793, + "epoch": 0.7379518511497689, + "grad_norm": 5.09375, + "learning_rate": 7.27743957485141e-06, + "loss": 0.83303394, + "memory(GiB)": 135.77, + "step": 31630, + "train_speed(iter/s)": 0.20108 + }, + { + "acc": 0.77150707, + "epoch": 0.7381851587220578, + "grad_norm": 4.5, + "learning_rate": 7.2757576458797465e-06, + "loss": 0.83672428, + "memory(GiB)": 135.77, + "step": 31640, + "train_speed(iter/s)": 0.201114 + }, + { + "acc": 0.77037902, + "epoch": 0.7384184662943467, + "grad_norm": 6.65625, + "learning_rate": 7.27407539204434e-06, + "loss": 0.83148661, + "memory(GiB)": 135.77, + "step": 31650, + "train_speed(iter/s)": 0.201149 + }, + { + "acc": 0.76977768, + "epoch": 0.7386517738666356, + "grad_norm": 6.0, + "learning_rate": 7.272392813585332e-06, + "loss": 0.81001329, + "memory(GiB)": 135.77, + "step": 31660, + "train_speed(iter/s)": 0.201183 + }, + { + "acc": 0.80247803, + "epoch": 0.7388850814389245, + "grad_norm": 3.875, + "learning_rate": 7.270709910742908e-06, + "loss": 0.70846043, + "memory(GiB)": 135.77, + "step": 31670, + "train_speed(iter/s)": 0.201216 + }, + { + "acc": 0.78375702, + "epoch": 0.7391183890112133, + "grad_norm": 4.9375, + "learning_rate": 7.269026683757306e-06, + "loss": 0.78587623, + "memory(GiB)": 135.77, + "step": 31680, + "train_speed(iter/s)": 0.201247 + }, + { + "acc": 0.76821456, + "epoch": 0.7393516965835022, + "grad_norm": 4.84375, + "learning_rate": 7.267343132868803e-06, + "loss": 0.82991257, + "memory(GiB)": 135.77, + "step": 31690, + "train_speed(iter/s)": 0.201281 + }, + { + "acc": 0.78773546, + "epoch": 0.7395850041557911, + "grad_norm": 8.5, + "learning_rate": 7.265659258317725e-06, + "loss": 0.77818766, + "memory(GiB)": 135.77, + "step": 31700, + "train_speed(iter/s)": 0.201314 + }, + { + "acc": 0.77224665, + "epoch": 0.73981831172808, + "grad_norm": 5.375, + "learning_rate": 7.263975060344449e-06, + "loss": 0.81635828, + "memory(GiB)": 135.77, + "step": 31710, + "train_speed(iter/s)": 0.201345 + }, + { + "acc": 0.79697518, + "epoch": 0.7400516193003689, + "grad_norm": 5.46875, + "learning_rate": 7.26229053918939e-06, + "loss": 0.74089785, + "memory(GiB)": 135.77, + "step": 31720, + "train_speed(iter/s)": 0.201376 + }, + { + "acc": 0.79544964, + "epoch": 0.7402849268726578, + "grad_norm": 4.78125, + "learning_rate": 7.260605695093014e-06, + "loss": 0.69942369, + "memory(GiB)": 135.77, + "step": 31730, + "train_speed(iter/s)": 0.201411 + }, + { + "acc": 0.77880449, + "epoch": 0.7405182344449467, + "grad_norm": 5.40625, + "learning_rate": 7.25892052829583e-06, + "loss": 0.79678488, + "memory(GiB)": 135.77, + "step": 31740, + "train_speed(iter/s)": 0.201442 + }, + { + "acc": 0.78319736, + "epoch": 0.7407515420172356, + "grad_norm": 5.3125, + "learning_rate": 7.257235039038397e-06, + "loss": 0.76695404, + "memory(GiB)": 135.77, + "step": 31750, + "train_speed(iter/s)": 0.201476 + }, + { + "acc": 0.78057632, + "epoch": 0.7409848495895245, + "grad_norm": 6.59375, + "learning_rate": 7.25554922756132e-06, + "loss": 0.77434168, + "memory(GiB)": 135.77, + "step": 31760, + "train_speed(iter/s)": 0.20151 + }, + { + "acc": 0.78480177, + "epoch": 0.7412181571618134, + "grad_norm": 4.59375, + "learning_rate": 7.253863094105243e-06, + "loss": 0.76221313, + "memory(GiB)": 135.77, + "step": 31770, + "train_speed(iter/s)": 0.201541 + }, + { + "acc": 0.78093719, + "epoch": 0.7414514647341023, + "grad_norm": 5.46875, + "learning_rate": 7.252176638910867e-06, + "loss": 0.79842443, + "memory(GiB)": 135.77, + "step": 31780, + "train_speed(iter/s)": 0.201573 + }, + { + "acc": 0.78112631, + "epoch": 0.7416847723063912, + "grad_norm": 7.46875, + "learning_rate": 7.25048986221893e-06, + "loss": 0.79473867, + "memory(GiB)": 135.77, + "step": 31790, + "train_speed(iter/s)": 0.201606 + }, + { + "acc": 0.76295033, + "epoch": 0.7419180798786801, + "grad_norm": 5.5625, + "learning_rate": 7.248802764270217e-06, + "loss": 0.86861839, + "memory(GiB)": 135.77, + "step": 31800, + "train_speed(iter/s)": 0.201638 + }, + { + "acc": 0.7788949, + "epoch": 0.742151387450969, + "grad_norm": 5.34375, + "learning_rate": 7.247115345305564e-06, + "loss": 0.79273958, + "memory(GiB)": 135.77, + "step": 31810, + "train_speed(iter/s)": 0.201671 + }, + { + "acc": 0.78000889, + "epoch": 0.7423846950232579, + "grad_norm": 6.15625, + "learning_rate": 7.245427605565847e-06, + "loss": 0.77720737, + "memory(GiB)": 135.77, + "step": 31820, + "train_speed(iter/s)": 0.201705 + }, + { + "acc": 0.77433429, + "epoch": 0.7426180025955468, + "grad_norm": 5.8125, + "learning_rate": 7.243739545291994e-06, + "loss": 0.83012581, + "memory(GiB)": 135.77, + "step": 31830, + "train_speed(iter/s)": 0.201738 + }, + { + "acc": 0.77186079, + "epoch": 0.7428513101678357, + "grad_norm": 7.53125, + "learning_rate": 7.24205116472497e-06, + "loss": 0.81709404, + "memory(GiB)": 135.77, + "step": 31840, + "train_speed(iter/s)": 0.20177 + }, + { + "acc": 0.77884254, + "epoch": 0.7430846177401246, + "grad_norm": 8.1875, + "learning_rate": 7.240362464105795e-06, + "loss": 0.79129939, + "memory(GiB)": 135.77, + "step": 31850, + "train_speed(iter/s)": 0.201803 + }, + { + "acc": 0.77921705, + "epoch": 0.7433179253124135, + "grad_norm": 4.96875, + "learning_rate": 7.238673443675529e-06, + "loss": 0.78770342, + "memory(GiB)": 135.77, + "step": 31860, + "train_speed(iter/s)": 0.201836 + }, + { + "acc": 0.79026484, + "epoch": 0.7435512328847023, + "grad_norm": 5.09375, + "learning_rate": 7.236984103675278e-06, + "loss": 0.73446693, + "memory(GiB)": 135.77, + "step": 31870, + "train_speed(iter/s)": 0.20187 + }, + { + "acc": 0.76663461, + "epoch": 0.7437845404569912, + "grad_norm": 4.71875, + "learning_rate": 7.235294444346197e-06, + "loss": 0.85680876, + "memory(GiB)": 135.77, + "step": 31880, + "train_speed(iter/s)": 0.201904 + }, + { + "acc": 0.76660204, + "epoch": 0.7440178480292801, + "grad_norm": 7.84375, + "learning_rate": 7.233604465929485e-06, + "loss": 0.83537083, + "memory(GiB)": 135.77, + "step": 31890, + "train_speed(iter/s)": 0.201937 + }, + { + "acc": 0.78674612, + "epoch": 0.744251155601569, + "grad_norm": 4.59375, + "learning_rate": 7.231914168666382e-06, + "loss": 0.73860254, + "memory(GiB)": 135.77, + "step": 31900, + "train_speed(iter/s)": 0.201973 + }, + { + "acc": 0.76229892, + "epoch": 0.7444844631738579, + "grad_norm": 7.0, + "learning_rate": 7.23022355279818e-06, + "loss": 0.86733685, + "memory(GiB)": 135.77, + "step": 31910, + "train_speed(iter/s)": 0.202005 + }, + { + "acc": 0.76368475, + "epoch": 0.7447177707461468, + "grad_norm": 5.0625, + "learning_rate": 7.228532618566214e-06, + "loss": 0.8780262, + "memory(GiB)": 135.77, + "step": 31920, + "train_speed(iter/s)": 0.202034 + }, + { + "acc": 0.78525524, + "epoch": 0.7449510783184357, + "grad_norm": 6.34375, + "learning_rate": 7.226841366211865e-06, + "loss": 0.77032347, + "memory(GiB)": 135.77, + "step": 31930, + "train_speed(iter/s)": 0.202065 + }, + { + "acc": 0.77381964, + "epoch": 0.7451843858907246, + "grad_norm": 5.3125, + "learning_rate": 7.225149795976558e-06, + "loss": 0.79736581, + "memory(GiB)": 135.77, + "step": 31940, + "train_speed(iter/s)": 0.202097 + }, + { + "acc": 0.78271856, + "epoch": 0.7454176934630135, + "grad_norm": 6.1875, + "learning_rate": 7.223457908101763e-06, + "loss": 0.769701, + "memory(GiB)": 135.77, + "step": 31950, + "train_speed(iter/s)": 0.202131 + }, + { + "acc": 0.77590184, + "epoch": 0.7456510010353024, + "grad_norm": 7.28125, + "learning_rate": 7.2217657028289974e-06, + "loss": 0.81713409, + "memory(GiB)": 135.77, + "step": 31960, + "train_speed(iter/s)": 0.202164 + }, + { + "acc": 0.79009981, + "epoch": 0.7458843086075913, + "grad_norm": 5.46875, + "learning_rate": 7.220073180399824e-06, + "loss": 0.76196728, + "memory(GiB)": 135.77, + "step": 31970, + "train_speed(iter/s)": 0.202199 + }, + { + "acc": 0.79176197, + "epoch": 0.7461176161798801, + "grad_norm": 5.0, + "learning_rate": 7.218380341055848e-06, + "loss": 0.74932814, + "memory(GiB)": 135.77, + "step": 31980, + "train_speed(iter/s)": 0.202231 + }, + { + "acc": 0.75970087, + "epoch": 0.746350923752169, + "grad_norm": 4.625, + "learning_rate": 7.216687185038724e-06, + "loss": 0.87884674, + "memory(GiB)": 135.77, + "step": 31990, + "train_speed(iter/s)": 0.202261 + }, + { + "acc": 0.77163577, + "epoch": 0.746584231324458, + "grad_norm": 6.3125, + "learning_rate": 7.214993712590148e-06, + "loss": 0.85210857, + "memory(GiB)": 135.77, + "step": 32000, + "train_speed(iter/s)": 0.202296 + }, + { + "epoch": 0.746584231324458, + "eval_acc": 0.7420050794547601, + "eval_loss": 0.8138281106948853, + "eval_runtime": 1270.8326, + "eval_samples_per_second": 28.321, + "eval_steps_per_second": 14.161, + "step": 32000 + }, + { + "acc": 0.76468716, + "epoch": 0.7468175388967468, + "grad_norm": 100.5, + "learning_rate": 7.213299923951863e-06, + "loss": 0.88969822, + "memory(GiB)": 135.77, + "step": 32010, + "train_speed(iter/s)": 0.200694 + }, + { + "acc": 0.77527003, + "epoch": 0.7470508464690357, + "grad_norm": 5.59375, + "learning_rate": 7.211605819365657e-06, + "loss": 0.82085714, + "memory(GiB)": 135.77, + "step": 32020, + "train_speed(iter/s)": 0.200726 + }, + { + "acc": 0.77617159, + "epoch": 0.7472841540413246, + "grad_norm": 6.375, + "learning_rate": 7.209911399073361e-06, + "loss": 0.83005333, + "memory(GiB)": 135.77, + "step": 32030, + "train_speed(iter/s)": 0.200758 + }, + { + "acc": 0.77418909, + "epoch": 0.7475174616136135, + "grad_norm": 6.0, + "learning_rate": 7.208216663316856e-06, + "loss": 0.79054193, + "memory(GiB)": 135.77, + "step": 32040, + "train_speed(iter/s)": 0.200791 + }, + { + "acc": 0.78965578, + "epoch": 0.7477507691859024, + "grad_norm": 4.4375, + "learning_rate": 7.206521612338064e-06, + "loss": 0.74522982, + "memory(GiB)": 135.77, + "step": 32050, + "train_speed(iter/s)": 0.200824 + }, + { + "acc": 0.76647468, + "epoch": 0.7479840767581912, + "grad_norm": 6.28125, + "learning_rate": 7.204826246378953e-06, + "loss": 0.81988811, + "memory(GiB)": 135.77, + "step": 32060, + "train_speed(iter/s)": 0.200858 + }, + { + "acc": 0.77853847, + "epoch": 0.7482173843304801, + "grad_norm": 4.9375, + "learning_rate": 7.203130565681537e-06, + "loss": 0.79634762, + "memory(GiB)": 135.77, + "step": 32070, + "train_speed(iter/s)": 0.200889 + }, + { + "acc": 0.7814064, + "epoch": 0.748450691902769, + "grad_norm": 7.0, + "learning_rate": 7.201434570487871e-06, + "loss": 0.78645287, + "memory(GiB)": 135.77, + "step": 32080, + "train_speed(iter/s)": 0.200919 + }, + { + "acc": 0.77312088, + "epoch": 0.7486839994750579, + "grad_norm": 4.78125, + "learning_rate": 7.199738261040059e-06, + "loss": 0.83004112, + "memory(GiB)": 135.77, + "step": 32090, + "train_speed(iter/s)": 0.200951 + }, + { + "acc": 0.79272485, + "epoch": 0.7489173070473468, + "grad_norm": 5.9375, + "learning_rate": 7.1980416375802494e-06, + "loss": 0.7410665, + "memory(GiB)": 135.77, + "step": 32100, + "train_speed(iter/s)": 0.200982 + }, + { + "acc": 0.76613655, + "epoch": 0.7491506146196357, + "grad_norm": 5.46875, + "learning_rate": 7.196344700350635e-06, + "loss": 0.86439323, + "memory(GiB)": 135.77, + "step": 32110, + "train_speed(iter/s)": 0.201014 + }, + { + "acc": 0.78310966, + "epoch": 0.7493839221919246, + "grad_norm": 5.375, + "learning_rate": 7.1946474495934535e-06, + "loss": 0.765874, + "memory(GiB)": 135.77, + "step": 32120, + "train_speed(iter/s)": 0.201043 + }, + { + "acc": 0.79054403, + "epoch": 0.7496172297642135, + "grad_norm": 4.5, + "learning_rate": 7.192949885550986e-06, + "loss": 0.73549843, + "memory(GiB)": 135.77, + "step": 32130, + "train_speed(iter/s)": 0.201075 + }, + { + "acc": 0.77819924, + "epoch": 0.7498505373365024, + "grad_norm": 4.625, + "learning_rate": 7.1912520084655594e-06, + "loss": 0.79788404, + "memory(GiB)": 135.77, + "step": 32140, + "train_speed(iter/s)": 0.201108 + }, + { + "acc": 0.79777637, + "epoch": 0.7500838449087913, + "grad_norm": 6.8125, + "learning_rate": 7.189553818579545e-06, + "loss": 0.70898933, + "memory(GiB)": 135.77, + "step": 32150, + "train_speed(iter/s)": 0.201138 + }, + { + "acc": 0.77831688, + "epoch": 0.7503171524810802, + "grad_norm": 4.9375, + "learning_rate": 7.187855316135358e-06, + "loss": 0.79211369, + "memory(GiB)": 135.77, + "step": 32160, + "train_speed(iter/s)": 0.201171 + }, + { + "acc": 0.77343702, + "epoch": 0.7505504600533691, + "grad_norm": 5.1875, + "learning_rate": 7.1861565013754605e-06, + "loss": 0.81101351, + "memory(GiB)": 135.77, + "step": 32170, + "train_speed(iter/s)": 0.201199 + }, + { + "acc": 0.77341728, + "epoch": 0.750783767625658, + "grad_norm": 6.625, + "learning_rate": 7.18445737454236e-06, + "loss": 0.81564074, + "memory(GiB)": 135.77, + "step": 32180, + "train_speed(iter/s)": 0.201231 + }, + { + "acc": 0.78846521, + "epoch": 0.7510170751979469, + "grad_norm": 4.5625, + "learning_rate": 7.182757935878601e-06, + "loss": 0.75658441, + "memory(GiB)": 135.77, + "step": 32190, + "train_speed(iter/s)": 0.201264 + }, + { + "acc": 0.77531843, + "epoch": 0.7512503827702358, + "grad_norm": 4.90625, + "learning_rate": 7.1810581856267815e-06, + "loss": 0.82540045, + "memory(GiB)": 135.77, + "step": 32200, + "train_speed(iter/s)": 0.201295 + }, + { + "acc": 0.78034782, + "epoch": 0.7514836903425247, + "grad_norm": 14.75, + "learning_rate": 7.17935812402954e-06, + "loss": 0.78024654, + "memory(GiB)": 135.77, + "step": 32210, + "train_speed(iter/s)": 0.201326 + }, + { + "acc": 0.76428576, + "epoch": 0.7517169979148136, + "grad_norm": 5.0, + "learning_rate": 7.177657751329559e-06, + "loss": 0.84508133, + "memory(GiB)": 135.77, + "step": 32220, + "train_speed(iter/s)": 0.201356 + }, + { + "acc": 0.77687807, + "epoch": 0.7519503054871025, + "grad_norm": 4.6875, + "learning_rate": 7.1759570677695665e-06, + "loss": 0.80794582, + "memory(GiB)": 135.77, + "step": 32230, + "train_speed(iter/s)": 0.201387 + }, + { + "acc": 0.75994444, + "epoch": 0.7521836130593914, + "grad_norm": 6.78125, + "learning_rate": 7.174256073592335e-06, + "loss": 0.86417484, + "memory(GiB)": 135.77, + "step": 32240, + "train_speed(iter/s)": 0.201419 + }, + { + "acc": 0.79446583, + "epoch": 0.7524169206316802, + "grad_norm": 6.5625, + "learning_rate": 7.172554769040681e-06, + "loss": 0.72564263, + "memory(GiB)": 135.77, + "step": 32250, + "train_speed(iter/s)": 0.20145 + }, + { + "acc": 0.77692814, + "epoch": 0.7526502282039691, + "grad_norm": 5.1875, + "learning_rate": 7.1708531543574635e-06, + "loss": 0.79970336, + "memory(GiB)": 135.77, + "step": 32260, + "train_speed(iter/s)": 0.201483 + }, + { + "acc": 0.79846258, + "epoch": 0.752883535776258, + "grad_norm": 4.75, + "learning_rate": 7.169151229785589e-06, + "loss": 0.72652612, + "memory(GiB)": 135.77, + "step": 32270, + "train_speed(iter/s)": 0.201517 + }, + { + "acc": 0.79553976, + "epoch": 0.7531168433485469, + "grad_norm": 5.90625, + "learning_rate": 7.167448995568009e-06, + "loss": 0.73974028, + "memory(GiB)": 135.77, + "step": 32280, + "train_speed(iter/s)": 0.201549 + }, + { + "acc": 0.77979794, + "epoch": 0.7533501509208358, + "grad_norm": 9.8125, + "learning_rate": 7.165746451947713e-06, + "loss": 0.79384131, + "memory(GiB)": 135.77, + "step": 32290, + "train_speed(iter/s)": 0.201579 + }, + { + "acc": 0.76877642, + "epoch": 0.7535834584931247, + "grad_norm": 5.5625, + "learning_rate": 7.16404359916774e-06, + "loss": 0.86544285, + "memory(GiB)": 135.77, + "step": 32300, + "train_speed(iter/s)": 0.20161 + }, + { + "acc": 0.76505909, + "epoch": 0.7538167660654136, + "grad_norm": 10.9375, + "learning_rate": 7.1623404374711715e-06, + "loss": 0.85449657, + "memory(GiB)": 135.77, + "step": 32310, + "train_speed(iter/s)": 0.201643 + }, + { + "acc": 0.78930197, + "epoch": 0.7540500736377025, + "grad_norm": 4.125, + "learning_rate": 7.160636967101134e-06, + "loss": 0.75111027, + "memory(GiB)": 135.77, + "step": 32320, + "train_speed(iter/s)": 0.201676 + }, + { + "acc": 0.79103222, + "epoch": 0.7542833812099914, + "grad_norm": 5.9375, + "learning_rate": 7.1589331883007965e-06, + "loss": 0.75493755, + "memory(GiB)": 135.77, + "step": 32330, + "train_speed(iter/s)": 0.201708 + }, + { + "acc": 0.76673937, + "epoch": 0.7545166887822803, + "grad_norm": 5.3125, + "learning_rate": 7.1572291013133745e-06, + "loss": 0.858673, + "memory(GiB)": 135.77, + "step": 32340, + "train_speed(iter/s)": 0.201739 + }, + { + "acc": 0.77912583, + "epoch": 0.7547499963545692, + "grad_norm": 5.5625, + "learning_rate": 7.155524706382125e-06, + "loss": 0.79643412, + "memory(GiB)": 135.77, + "step": 32350, + "train_speed(iter/s)": 0.201771 + }, + { + "acc": 0.77822075, + "epoch": 0.7549833039268581, + "grad_norm": 7.15625, + "learning_rate": 7.15382000375035e-06, + "loss": 0.79455347, + "memory(GiB)": 135.77, + "step": 32360, + "train_speed(iter/s)": 0.201803 + }, + { + "acc": 0.79522676, + "epoch": 0.755216611499147, + "grad_norm": 4.0625, + "learning_rate": 7.152114993661394e-06, + "loss": 0.71623573, + "memory(GiB)": 135.77, + "step": 32370, + "train_speed(iter/s)": 0.201835 + }, + { + "acc": 0.77095437, + "epoch": 0.7554499190714359, + "grad_norm": 4.03125, + "learning_rate": 7.150409676358649e-06, + "loss": 0.83390207, + "memory(GiB)": 135.77, + "step": 32380, + "train_speed(iter/s)": 0.201866 + }, + { + "acc": 0.76215563, + "epoch": 0.7556832266437248, + "grad_norm": 5.28125, + "learning_rate": 7.148704052085547e-06, + "loss": 0.87032471, + "memory(GiB)": 135.77, + "step": 32390, + "train_speed(iter/s)": 0.201899 + }, + { + "acc": 0.77448578, + "epoch": 0.7559165342160137, + "grad_norm": 15.5, + "learning_rate": 7.146998121085566e-06, + "loss": 0.80818844, + "memory(GiB)": 135.77, + "step": 32400, + "train_speed(iter/s)": 0.20193 + }, + { + "acc": 0.78134699, + "epoch": 0.7561498417883026, + "grad_norm": 4.5, + "learning_rate": 7.145291883602226e-06, + "loss": 0.76230822, + "memory(GiB)": 135.77, + "step": 32410, + "train_speed(iter/s)": 0.201961 + }, + { + "acc": 0.7758811, + "epoch": 0.7563831493605915, + "grad_norm": 5.65625, + "learning_rate": 7.143585339879093e-06, + "loss": 0.81609783, + "memory(GiB)": 135.77, + "step": 32420, + "train_speed(iter/s)": 0.201993 + }, + { + "acc": 0.76103873, + "epoch": 0.7566164569328804, + "grad_norm": 7.3125, + "learning_rate": 7.141878490159777e-06, + "loss": 0.88947668, + "memory(GiB)": 135.77, + "step": 32430, + "train_speed(iter/s)": 0.202027 + }, + { + "acc": 0.79736533, + "epoch": 0.7568497645051693, + "grad_norm": 6.03125, + "learning_rate": 7.140171334687927e-06, + "loss": 0.73429241, + "memory(GiB)": 135.77, + "step": 32440, + "train_speed(iter/s)": 0.20206 + }, + { + "acc": 0.76088171, + "epoch": 0.7570830720774581, + "grad_norm": 7.0, + "learning_rate": 7.138463873707242e-06, + "loss": 0.87091007, + "memory(GiB)": 135.77, + "step": 32450, + "train_speed(iter/s)": 0.202093 + }, + { + "acc": 0.77715158, + "epoch": 0.757316379649747, + "grad_norm": 4.21875, + "learning_rate": 7.13675610746146e-06, + "loss": 0.80676775, + "memory(GiB)": 135.77, + "step": 32460, + "train_speed(iter/s)": 0.202125 + }, + { + "acc": 0.77457027, + "epoch": 0.7575496872220359, + "grad_norm": 4.65625, + "learning_rate": 7.135048036194364e-06, + "loss": 0.81614847, + "memory(GiB)": 135.77, + "step": 32470, + "train_speed(iter/s)": 0.202157 + }, + { + "acc": 0.76460385, + "epoch": 0.7577829947943248, + "grad_norm": 6.46875, + "learning_rate": 7.13333966014978e-06, + "loss": 0.84198284, + "memory(GiB)": 135.77, + "step": 32480, + "train_speed(iter/s)": 0.20219 + }, + { + "acc": 0.77977839, + "epoch": 0.7580163023666137, + "grad_norm": 6.625, + "learning_rate": 7.131630979571581e-06, + "loss": 0.79072237, + "memory(GiB)": 135.77, + "step": 32490, + "train_speed(iter/s)": 0.20222 + }, + { + "acc": 0.77567453, + "epoch": 0.7582496099389026, + "grad_norm": 4.96875, + "learning_rate": 7.1299219947036795e-06, + "loss": 0.81002426, + "memory(GiB)": 135.77, + "step": 32500, + "train_speed(iter/s)": 0.202251 + }, + { + "epoch": 0.7582496099389026, + "eval_acc": 0.742288214457469, + "eval_loss": 0.8132081031799316, + "eval_runtime": 1270.5334, + "eval_samples_per_second": 28.327, + "eval_steps_per_second": 14.164, + "step": 32500 + }, + { + "acc": 0.77000389, + "epoch": 0.7584829175111915, + "grad_norm": 5.34375, + "learning_rate": 7.12821270579003e-06, + "loss": 0.80007992, + "memory(GiB)": 135.77, + "step": 32510, + "train_speed(iter/s)": 0.200677 + }, + { + "acc": 0.78935204, + "epoch": 0.7587162250834804, + "grad_norm": 5.75, + "learning_rate": 7.126503113074636e-06, + "loss": 0.74440832, + "memory(GiB)": 135.77, + "step": 32520, + "train_speed(iter/s)": 0.200709 + }, + { + "acc": 0.77540474, + "epoch": 0.7589495326557693, + "grad_norm": 5.15625, + "learning_rate": 7.1247932168015396e-06, + "loss": 0.81684599, + "memory(GiB)": 135.77, + "step": 32530, + "train_speed(iter/s)": 0.20074 + }, + { + "acc": 0.77324686, + "epoch": 0.7591828402280582, + "grad_norm": 5.8125, + "learning_rate": 7.123083017214829e-06, + "loss": 0.79553862, + "memory(GiB)": 135.77, + "step": 32540, + "train_speed(iter/s)": 0.200771 + }, + { + "acc": 0.77071304, + "epoch": 0.759416147800347, + "grad_norm": 8.5, + "learning_rate": 7.121372514558635e-06, + "loss": 0.83940582, + "memory(GiB)": 135.77, + "step": 32550, + "train_speed(iter/s)": 0.200803 + }, + { + "acc": 0.76741009, + "epoch": 0.759649455372636, + "grad_norm": 5.28125, + "learning_rate": 7.1196617090771305e-06, + "loss": 0.83808603, + "memory(GiB)": 135.77, + "step": 32560, + "train_speed(iter/s)": 0.200835 + }, + { + "acc": 0.79151773, + "epoch": 0.7598827629449248, + "grad_norm": 4.40625, + "learning_rate": 7.1179506010145335e-06, + "loss": 0.75109611, + "memory(GiB)": 135.77, + "step": 32570, + "train_speed(iter/s)": 0.200869 + }, + { + "acc": 0.76236095, + "epoch": 0.7601160705172137, + "grad_norm": 3.59375, + "learning_rate": 7.116239190615104e-06, + "loss": 0.87826719, + "memory(GiB)": 135.77, + "step": 32580, + "train_speed(iter/s)": 0.200901 + }, + { + "acc": 0.76941271, + "epoch": 0.7603493780895026, + "grad_norm": 5.15625, + "learning_rate": 7.1145274781231435e-06, + "loss": 0.84898701, + "memory(GiB)": 135.77, + "step": 32590, + "train_speed(iter/s)": 0.200932 + }, + { + "acc": 0.7833683, + "epoch": 0.7605826856617915, + "grad_norm": 7.25, + "learning_rate": 7.112815463782998e-06, + "loss": 0.79329114, + "memory(GiB)": 135.77, + "step": 32600, + "train_speed(iter/s)": 0.200964 + }, + { + "acc": 0.77949257, + "epoch": 0.7608159932340804, + "grad_norm": 5.6875, + "learning_rate": 7.111103147839062e-06, + "loss": 0.78569312, + "memory(GiB)": 135.77, + "step": 32610, + "train_speed(iter/s)": 0.200995 + }, + { + "acc": 0.79783745, + "epoch": 0.7610493008063693, + "grad_norm": 5.46875, + "learning_rate": 7.109390530535762e-06, + "loss": 0.698351, + "memory(GiB)": 135.77, + "step": 32620, + "train_speed(iter/s)": 0.201026 + }, + { + "acc": 0.77755065, + "epoch": 0.7612826083786582, + "grad_norm": 6.71875, + "learning_rate": 7.1076776121175794e-06, + "loss": 0.82539043, + "memory(GiB)": 135.77, + "step": 32630, + "train_speed(iter/s)": 0.201059 + }, + { + "acc": 0.78371029, + "epoch": 0.761515915950947, + "grad_norm": 5.25, + "learning_rate": 7.105964392829029e-06, + "loss": 0.77058702, + "memory(GiB)": 135.77, + "step": 32640, + "train_speed(iter/s)": 0.201091 + }, + { + "acc": 0.76712589, + "epoch": 0.7617492235232359, + "grad_norm": 5.125, + "learning_rate": 7.104250872914673e-06, + "loss": 0.83793221, + "memory(GiB)": 135.77, + "step": 32650, + "train_speed(iter/s)": 0.201124 + }, + { + "acc": 0.78511324, + "epoch": 0.7619825310955248, + "grad_norm": 17.625, + "learning_rate": 7.102537052619116e-06, + "loss": 0.7639678, + "memory(GiB)": 135.77, + "step": 32660, + "train_speed(iter/s)": 0.201156 + }, + { + "acc": 0.78138227, + "epoch": 0.7622158386678137, + "grad_norm": 5.1875, + "learning_rate": 7.100822932187006e-06, + "loss": 0.78408322, + "memory(GiB)": 135.77, + "step": 32670, + "train_speed(iter/s)": 0.201185 + }, + { + "acc": 0.78767548, + "epoch": 0.7624491462401026, + "grad_norm": 5.5, + "learning_rate": 7.099108511863032e-06, + "loss": 0.75953522, + "memory(GiB)": 135.77, + "step": 32680, + "train_speed(iter/s)": 0.201217 + }, + { + "acc": 0.77689095, + "epoch": 0.7626824538123915, + "grad_norm": 4.9375, + "learning_rate": 7.097393791891929e-06, + "loss": 0.8115571, + "memory(GiB)": 135.77, + "step": 32690, + "train_speed(iter/s)": 0.201248 + }, + { + "acc": 0.79620209, + "epoch": 0.7629157613846804, + "grad_norm": 6.0, + "learning_rate": 7.095678772518471e-06, + "loss": 0.70747728, + "memory(GiB)": 135.77, + "step": 32700, + "train_speed(iter/s)": 0.201281 + }, + { + "acc": 0.77352734, + "epoch": 0.7631490689569693, + "grad_norm": 10.5625, + "learning_rate": 7.093963453987476e-06, + "loss": 0.81851959, + "memory(GiB)": 135.77, + "step": 32710, + "train_speed(iter/s)": 0.201315 + }, + { + "acc": 0.7703311, + "epoch": 0.7633823765292582, + "grad_norm": 6.03125, + "learning_rate": 7.092247836543808e-06, + "loss": 0.84133596, + "memory(GiB)": 135.77, + "step": 32720, + "train_speed(iter/s)": 0.201343 + }, + { + "acc": 0.79012804, + "epoch": 0.7636156841015471, + "grad_norm": 5.46875, + "learning_rate": 7.090531920432368e-06, + "loss": 0.7244441, + "memory(GiB)": 135.77, + "step": 32730, + "train_speed(iter/s)": 0.201373 + }, + { + "acc": 0.78044481, + "epoch": 0.763848991673836, + "grad_norm": 7.59375, + "learning_rate": 7.088815705898103e-06, + "loss": 0.80008049, + "memory(GiB)": 135.77, + "step": 32740, + "train_speed(iter/s)": 0.201404 + }, + { + "acc": 0.75984888, + "epoch": 0.7640822992461249, + "grad_norm": 4.46875, + "learning_rate": 7.0870991931860044e-06, + "loss": 0.85378962, + "memory(GiB)": 135.77, + "step": 32750, + "train_speed(iter/s)": 0.201435 + }, + { + "acc": 0.80223875, + "epoch": 0.7643156068184138, + "grad_norm": 5.40625, + "learning_rate": 7.0853823825411005e-06, + "loss": 0.69828153, + "memory(GiB)": 135.77, + "step": 32760, + "train_speed(iter/s)": 0.201466 + }, + { + "acc": 0.78329859, + "epoch": 0.7645489143907027, + "grad_norm": 8.0, + "learning_rate": 7.083665274208469e-06, + "loss": 0.75291567, + "memory(GiB)": 135.77, + "step": 32770, + "train_speed(iter/s)": 0.201495 + }, + { + "acc": 0.79095221, + "epoch": 0.7647822219629916, + "grad_norm": 5.1875, + "learning_rate": 7.081947868433223e-06, + "loss": 0.75146961, + "memory(GiB)": 135.77, + "step": 32780, + "train_speed(iter/s)": 0.201528 + }, + { + "acc": 0.75557308, + "epoch": 0.7650155295352805, + "grad_norm": 6.875, + "learning_rate": 7.0802301654605255e-06, + "loss": 0.88731985, + "memory(GiB)": 135.77, + "step": 32790, + "train_speed(iter/s)": 0.201561 + }, + { + "acc": 0.7902915, + "epoch": 0.7652488371075694, + "grad_norm": 5.375, + "learning_rate": 7.078512165535576e-06, + "loss": 0.74065533, + "memory(GiB)": 135.77, + "step": 32800, + "train_speed(iter/s)": 0.201593 + }, + { + "acc": 0.78259544, + "epoch": 0.7654821446798583, + "grad_norm": 5.75, + "learning_rate": 7.076793868903617e-06, + "loss": 0.77633686, + "memory(GiB)": 135.77, + "step": 32810, + "train_speed(iter/s)": 0.201624 + }, + { + "acc": 0.77786484, + "epoch": 0.7657154522521472, + "grad_norm": 3.9375, + "learning_rate": 7.0750752758099384e-06, + "loss": 0.81483288, + "memory(GiB)": 135.77, + "step": 32820, + "train_speed(iter/s)": 0.201655 + }, + { + "acc": 0.78939443, + "epoch": 0.765948759824436, + "grad_norm": 3.96875, + "learning_rate": 7.073356386499865e-06, + "loss": 0.76333141, + "memory(GiB)": 135.77, + "step": 32830, + "train_speed(iter/s)": 0.201684 + }, + { + "acc": 0.76923409, + "epoch": 0.7661820673967249, + "grad_norm": 5.21875, + "learning_rate": 7.071637201218772e-06, + "loss": 0.83631258, + "memory(GiB)": 135.77, + "step": 32840, + "train_speed(iter/s)": 0.201716 + }, + { + "acc": 0.78508043, + "epoch": 0.7664153749690138, + "grad_norm": 5.5, + "learning_rate": 7.06991772021207e-06, + "loss": 0.77355008, + "memory(GiB)": 135.77, + "step": 32850, + "train_speed(iter/s)": 0.201748 + }, + { + "acc": 0.78850126, + "epoch": 0.7666486825413027, + "grad_norm": 4.875, + "learning_rate": 7.068197943725214e-06, + "loss": 0.77419615, + "memory(GiB)": 135.77, + "step": 32860, + "train_speed(iter/s)": 0.201779 + }, + { + "acc": 0.77871504, + "epoch": 0.7668819901135916, + "grad_norm": 11.375, + "learning_rate": 7.0664778720037034e-06, + "loss": 0.78229799, + "memory(GiB)": 135.77, + "step": 32870, + "train_speed(iter/s)": 0.201812 + }, + { + "acc": 0.78440037, + "epoch": 0.7671152976858805, + "grad_norm": 7.875, + "learning_rate": 7.064757505293075e-06, + "loss": 0.77941685, + "memory(GiB)": 135.77, + "step": 32880, + "train_speed(iter/s)": 0.201844 + }, + { + "acc": 0.75921545, + "epoch": 0.7673486052581694, + "grad_norm": 6.0625, + "learning_rate": 7.063036843838913e-06, + "loss": 0.85018616, + "memory(GiB)": 135.77, + "step": 32890, + "train_speed(iter/s)": 0.201875 + }, + { + "acc": 0.77700405, + "epoch": 0.7675819128304583, + "grad_norm": 5.21875, + "learning_rate": 7.061315887886841e-06, + "loss": 0.80813074, + "memory(GiB)": 135.77, + "step": 32900, + "train_speed(iter/s)": 0.201908 + }, + { + "acc": 0.78043146, + "epoch": 0.7678152204027472, + "grad_norm": 6.46875, + "learning_rate": 7.059594637682526e-06, + "loss": 0.7803668, + "memory(GiB)": 135.77, + "step": 32910, + "train_speed(iter/s)": 0.201936 + }, + { + "acc": 0.76210136, + "epoch": 0.7680485279750361, + "grad_norm": 6.25, + "learning_rate": 7.057873093471673e-06, + "loss": 0.85728846, + "memory(GiB)": 135.77, + "step": 32920, + "train_speed(iter/s)": 0.201968 + }, + { + "acc": 0.78007121, + "epoch": 0.768281835547325, + "grad_norm": 4.8125, + "learning_rate": 7.056151255500036e-06, + "loss": 0.77832518, + "memory(GiB)": 135.77, + "step": 32930, + "train_speed(iter/s)": 0.202 + }, + { + "acc": 0.75704546, + "epoch": 0.7685151431196139, + "grad_norm": 5.59375, + "learning_rate": 7.0544291240134025e-06, + "loss": 0.88629284, + "memory(GiB)": 135.77, + "step": 32940, + "train_speed(iter/s)": 0.202032 + }, + { + "acc": 0.7763031, + "epoch": 0.7687484506919028, + "grad_norm": 16.625, + "learning_rate": 7.052706699257609e-06, + "loss": 0.79555397, + "memory(GiB)": 135.77, + "step": 32950, + "train_speed(iter/s)": 0.202064 + }, + { + "acc": 0.77789965, + "epoch": 0.7689817582641917, + "grad_norm": 7.25, + "learning_rate": 7.05098398147853e-06, + "loss": 0.80233002, + "memory(GiB)": 135.77, + "step": 32960, + "train_speed(iter/s)": 0.202096 + }, + { + "acc": 0.77640162, + "epoch": 0.7692150658364806, + "grad_norm": 6.3125, + "learning_rate": 7.0492609709220835e-06, + "loss": 0.81024342, + "memory(GiB)": 135.77, + "step": 32970, + "train_speed(iter/s)": 0.202129 + }, + { + "acc": 0.79174838, + "epoch": 0.7694483734087695, + "grad_norm": 4.5625, + "learning_rate": 7.04753766783423e-06, + "loss": 0.75231781, + "memory(GiB)": 135.77, + "step": 32980, + "train_speed(iter/s)": 0.20216 + }, + { + "acc": 0.77766156, + "epoch": 0.7696816809810584, + "grad_norm": 6.46875, + "learning_rate": 7.045814072460968e-06, + "loss": 0.78604116, + "memory(GiB)": 135.77, + "step": 32990, + "train_speed(iter/s)": 0.202192 + }, + { + "acc": 0.76858015, + "epoch": 0.7699149885533473, + "grad_norm": 4.96875, + "learning_rate": 7.044090185048343e-06, + "loss": 0.86624451, + "memory(GiB)": 135.77, + "step": 33000, + "train_speed(iter/s)": 0.202225 + }, + { + "epoch": 0.7699149885533473, + "eval_acc": 0.742469978161131, + "eval_loss": 0.8125158548355103, + "eval_runtime": 1269.1292, + "eval_samples_per_second": 28.359, + "eval_steps_per_second": 14.18, + "step": 33000 + }, + { + "acc": 0.76679192, + "epoch": 0.7701482961256362, + "grad_norm": 4.75, + "learning_rate": 7.042366005842437e-06, + "loss": 0.83562012, + "memory(GiB)": 135.77, + "step": 33010, + "train_speed(iter/s)": 0.200677 + }, + { + "acc": 0.78599691, + "epoch": 0.770381603697925, + "grad_norm": 6.6875, + "learning_rate": 7.040641535089377e-06, + "loss": 0.82067356, + "memory(GiB)": 135.77, + "step": 33020, + "train_speed(iter/s)": 0.200708 + }, + { + "acc": 0.77557378, + "epoch": 0.7706149112702139, + "grad_norm": 6.3125, + "learning_rate": 7.038916773035332e-06, + "loss": 0.80831299, + "memory(GiB)": 135.77, + "step": 33030, + "train_speed(iter/s)": 0.200739 + }, + { + "acc": 0.78095722, + "epoch": 0.7708482188425028, + "grad_norm": 6.09375, + "learning_rate": 7.037191719926507e-06, + "loss": 0.77209196, + "memory(GiB)": 135.77, + "step": 33040, + "train_speed(iter/s)": 0.200771 + }, + { + "acc": 0.78821402, + "epoch": 0.7710815264147917, + "grad_norm": 8.0, + "learning_rate": 7.035466376009157e-06, + "loss": 0.75552502, + "memory(GiB)": 135.77, + "step": 33050, + "train_speed(iter/s)": 0.200803 + }, + { + "acc": 0.7637373, + "epoch": 0.7713148339870806, + "grad_norm": 4.25, + "learning_rate": 7.033740741529573e-06, + "loss": 0.84696598, + "memory(GiB)": 135.77, + "step": 33060, + "train_speed(iter/s)": 0.200834 + }, + { + "acc": 0.79854646, + "epoch": 0.7715481415593695, + "grad_norm": 4.65625, + "learning_rate": 7.03201481673409e-06, + "loss": 0.69255486, + "memory(GiB)": 135.77, + "step": 33070, + "train_speed(iter/s)": 0.200865 + }, + { + "acc": 0.79627194, + "epoch": 0.7717814491316584, + "grad_norm": 5.09375, + "learning_rate": 7.030288601869082e-06, + "loss": 0.74422112, + "memory(GiB)": 135.77, + "step": 33080, + "train_speed(iter/s)": 0.200896 + }, + { + "acc": 0.75897083, + "epoch": 0.7720147567039473, + "grad_norm": 5.0625, + "learning_rate": 7.028562097180965e-06, + "loss": 0.87837725, + "memory(GiB)": 135.77, + "step": 33090, + "train_speed(iter/s)": 0.200927 + }, + { + "acc": 0.7633698, + "epoch": 0.7722480642762362, + "grad_norm": 6.46875, + "learning_rate": 7.026835302916198e-06, + "loss": 0.87001724, + "memory(GiB)": 135.77, + "step": 33100, + "train_speed(iter/s)": 0.200958 + }, + { + "acc": 0.77343187, + "epoch": 0.7724813718485251, + "grad_norm": 5.8125, + "learning_rate": 7.025108219321281e-06, + "loss": 0.82100897, + "memory(GiB)": 135.77, + "step": 33110, + "train_speed(iter/s)": 0.200991 + }, + { + "acc": 0.78877459, + "epoch": 0.772714679420814, + "grad_norm": 4.28125, + "learning_rate": 7.023380846642754e-06, + "loss": 0.76161418, + "memory(GiB)": 135.77, + "step": 33120, + "train_speed(iter/s)": 0.201023 + }, + { + "acc": 0.77717133, + "epoch": 0.7729479869931029, + "grad_norm": 5.15625, + "learning_rate": 7.021653185127197e-06, + "loss": 0.79486446, + "memory(GiB)": 135.77, + "step": 33130, + "train_speed(iter/s)": 0.201053 + }, + { + "acc": 0.77787819, + "epoch": 0.7731812945653918, + "grad_norm": 7.53125, + "learning_rate": 7.019925235021237e-06, + "loss": 0.8035284, + "memory(GiB)": 135.77, + "step": 33140, + "train_speed(iter/s)": 0.201083 + }, + { + "acc": 0.77615976, + "epoch": 0.7734146021376807, + "grad_norm": 5.125, + "learning_rate": 7.018196996571538e-06, + "loss": 0.81237202, + "memory(GiB)": 135.77, + "step": 33150, + "train_speed(iter/s)": 0.201114 + }, + { + "acc": 0.78682756, + "epoch": 0.7736479097099696, + "grad_norm": 5.4375, + "learning_rate": 7.016468470024802e-06, + "loss": 0.74849882, + "memory(GiB)": 135.77, + "step": 33160, + "train_speed(iter/s)": 0.201146 + }, + { + "acc": 0.77469473, + "epoch": 0.7738812172822584, + "grad_norm": 5.28125, + "learning_rate": 7.014739655627778e-06, + "loss": 0.79529195, + "memory(GiB)": 135.77, + "step": 33170, + "train_speed(iter/s)": 0.201177 + }, + { + "acc": 0.7825881, + "epoch": 0.7741145248545473, + "grad_norm": 4.5625, + "learning_rate": 7.013010553627253e-06, + "loss": 0.76973028, + "memory(GiB)": 135.77, + "step": 33180, + "train_speed(iter/s)": 0.201207 + }, + { + "acc": 0.79780211, + "epoch": 0.7743478324268362, + "grad_norm": 5.59375, + "learning_rate": 7.011281164270056e-06, + "loss": 0.72503815, + "memory(GiB)": 135.77, + "step": 33190, + "train_speed(iter/s)": 0.201239 + }, + { + "acc": 0.78128815, + "epoch": 0.7745811399991251, + "grad_norm": 5.71875, + "learning_rate": 7.009551487803058e-06, + "loss": 0.7804842, + "memory(GiB)": 135.77, + "step": 33200, + "train_speed(iter/s)": 0.201271 + }, + { + "acc": 0.78946109, + "epoch": 0.774814447571414, + "grad_norm": 6.375, + "learning_rate": 7.0078215244731685e-06, + "loss": 0.74558954, + "memory(GiB)": 135.77, + "step": 33210, + "train_speed(iter/s)": 0.201302 + }, + { + "acc": 0.77743354, + "epoch": 0.7750477551437028, + "grad_norm": 5.875, + "learning_rate": 7.00609127452734e-06, + "loss": 0.80112734, + "memory(GiB)": 135.77, + "step": 33220, + "train_speed(iter/s)": 0.201333 + }, + { + "acc": 0.78358135, + "epoch": 0.7752810627159917, + "grad_norm": 5.625, + "learning_rate": 7.0043607382125645e-06, + "loss": 0.77119946, + "memory(GiB)": 135.77, + "step": 33230, + "train_speed(iter/s)": 0.201365 + }, + { + "acc": 0.78574867, + "epoch": 0.7755143702882806, + "grad_norm": 4.78125, + "learning_rate": 7.002629915775876e-06, + "loss": 0.77710333, + "memory(GiB)": 135.77, + "step": 33240, + "train_speed(iter/s)": 0.201395 + }, + { + "acc": 0.77628012, + "epoch": 0.7757476778605695, + "grad_norm": 6.6875, + "learning_rate": 7.000898807464349e-06, + "loss": 0.84035797, + "memory(GiB)": 135.77, + "step": 33250, + "train_speed(iter/s)": 0.201424 + }, + { + "acc": 0.78520012, + "epoch": 0.7759809854328584, + "grad_norm": 4.40625, + "learning_rate": 6.999167413525099e-06, + "loss": 0.76161404, + "memory(GiB)": 135.77, + "step": 33260, + "train_speed(iter/s)": 0.201455 + }, + { + "acc": 0.78030691, + "epoch": 0.7762142930051473, + "grad_norm": 6.15625, + "learning_rate": 6.9974357342052805e-06, + "loss": 0.79453855, + "memory(GiB)": 135.77, + "step": 33270, + "train_speed(iter/s)": 0.201487 + }, + { + "acc": 0.79090891, + "epoch": 0.7764476005774362, + "grad_norm": 4.65625, + "learning_rate": 6.995703769752091e-06, + "loss": 0.73884726, + "memory(GiB)": 135.77, + "step": 33280, + "train_speed(iter/s)": 0.201521 + }, + { + "acc": 0.804249, + "epoch": 0.7766809081497251, + "grad_norm": 4.875, + "learning_rate": 6.993971520412769e-06, + "loss": 0.70278206, + "memory(GiB)": 135.77, + "step": 33290, + "train_speed(iter/s)": 0.201554 + }, + { + "acc": 0.78132133, + "epoch": 0.776914215722014, + "grad_norm": 6.875, + "learning_rate": 6.992238986434591e-06, + "loss": 0.79448328, + "memory(GiB)": 135.77, + "step": 33300, + "train_speed(iter/s)": 0.201587 + }, + { + "acc": 0.77671242, + "epoch": 0.7771475232943029, + "grad_norm": 4.75, + "learning_rate": 6.9905061680648765e-06, + "loss": 0.80529804, + "memory(GiB)": 135.77, + "step": 33310, + "train_speed(iter/s)": 0.201617 + }, + { + "acc": 0.78557901, + "epoch": 0.7773808308665918, + "grad_norm": 5.03125, + "learning_rate": 6.9887730655509855e-06, + "loss": 0.76596785, + "memory(GiB)": 135.77, + "step": 33320, + "train_speed(iter/s)": 0.201648 + }, + { + "acc": 0.7802887, + "epoch": 0.7776141384388807, + "grad_norm": 8.375, + "learning_rate": 6.987039679140316e-06, + "loss": 0.79592075, + "memory(GiB)": 135.77, + "step": 33330, + "train_speed(iter/s)": 0.20168 + }, + { + "acc": 0.77079372, + "epoch": 0.7778474460111696, + "grad_norm": 6.1875, + "learning_rate": 6.9853060090803105e-06, + "loss": 0.8273881, + "memory(GiB)": 135.77, + "step": 33340, + "train_speed(iter/s)": 0.201712 + }, + { + "acc": 0.76755457, + "epoch": 0.7780807535834585, + "grad_norm": 6.09375, + "learning_rate": 6.983572055618449e-06, + "loss": 0.84588947, + "memory(GiB)": 135.77, + "step": 33350, + "train_speed(iter/s)": 0.201741 + }, + { + "acc": 0.7820858, + "epoch": 0.7783140611557474, + "grad_norm": 3.640625, + "learning_rate": 6.981837819002252e-06, + "loss": 0.78927259, + "memory(GiB)": 135.77, + "step": 33360, + "train_speed(iter/s)": 0.201773 + }, + { + "acc": 0.78572111, + "epoch": 0.7785473687280363, + "grad_norm": 4.625, + "learning_rate": 6.980103299479281e-06, + "loss": 0.76833, + "memory(GiB)": 135.77, + "step": 33370, + "train_speed(iter/s)": 0.201804 + }, + { + "acc": 0.78051925, + "epoch": 0.7787806763003252, + "grad_norm": 5.65625, + "learning_rate": 6.978368497297143e-06, + "loss": 0.79444427, + "memory(GiB)": 135.77, + "step": 33380, + "train_speed(iter/s)": 0.201834 + }, + { + "acc": 0.76521983, + "epoch": 0.7790139838726141, + "grad_norm": 4.84375, + "learning_rate": 6.976633412703474e-06, + "loss": 0.85772009, + "memory(GiB)": 135.77, + "step": 33390, + "train_speed(iter/s)": 0.201864 + }, + { + "acc": 0.79295754, + "epoch": 0.779247291444903, + "grad_norm": 5.28125, + "learning_rate": 6.974898045945959e-06, + "loss": 0.74175, + "memory(GiB)": 135.77, + "step": 33400, + "train_speed(iter/s)": 0.201895 + }, + { + "acc": 0.78126574, + "epoch": 0.7794805990171918, + "grad_norm": 6.03125, + "learning_rate": 6.973162397272323e-06, + "loss": 0.79368715, + "memory(GiB)": 135.77, + "step": 33410, + "train_speed(iter/s)": 0.201925 + }, + { + "acc": 0.79608712, + "epoch": 0.7797139065894807, + "grad_norm": 6.59375, + "learning_rate": 6.971426466930327e-06, + "loss": 0.74015937, + "memory(GiB)": 135.77, + "step": 33420, + "train_speed(iter/s)": 0.201954 + }, + { + "acc": 0.78107915, + "epoch": 0.7799472141617696, + "grad_norm": 4.8125, + "learning_rate": 6.969690255167777e-06, + "loss": 0.77917404, + "memory(GiB)": 135.77, + "step": 33430, + "train_speed(iter/s)": 0.201986 + }, + { + "acc": 0.79521189, + "epoch": 0.7801805217340585, + "grad_norm": 6.90625, + "learning_rate": 6.9679537622325154e-06, + "loss": 0.73068018, + "memory(GiB)": 135.77, + "step": 33440, + "train_speed(iter/s)": 0.202018 + }, + { + "acc": 0.80074768, + "epoch": 0.7804138293063474, + "grad_norm": 4.8125, + "learning_rate": 6.966216988372424e-06, + "loss": 0.72729893, + "memory(GiB)": 135.77, + "step": 33450, + "train_speed(iter/s)": 0.202049 + }, + { + "acc": 0.78108997, + "epoch": 0.7806471368786363, + "grad_norm": 4.21875, + "learning_rate": 6.964479933835429e-06, + "loss": 0.78009777, + "memory(GiB)": 135.77, + "step": 33460, + "train_speed(iter/s)": 0.202079 + }, + { + "acc": 0.76426306, + "epoch": 0.7808804444509252, + "grad_norm": 6.15625, + "learning_rate": 6.962742598869495e-06, + "loss": 0.87753067, + "memory(GiB)": 135.77, + "step": 33470, + "train_speed(iter/s)": 0.202111 + }, + { + "acc": 0.77631526, + "epoch": 0.7811137520232141, + "grad_norm": 5.46875, + "learning_rate": 6.961004983722625e-06, + "loss": 0.80698681, + "memory(GiB)": 135.77, + "step": 33480, + "train_speed(iter/s)": 0.20214 + }, + { + "acc": 0.75766716, + "epoch": 0.781347059595503, + "grad_norm": 5.5, + "learning_rate": 6.959267088642864e-06, + "loss": 0.89244823, + "memory(GiB)": 135.77, + "step": 33490, + "train_speed(iter/s)": 0.202169 + }, + { + "acc": 0.77791281, + "epoch": 0.7815803671677919, + "grad_norm": 6.0625, + "learning_rate": 6.9575289138782944e-06, + "loss": 0.78410869, + "memory(GiB)": 135.77, + "step": 33500, + "train_speed(iter/s)": 0.202201 + }, + { + "epoch": 0.7815803671677919, + "eval_acc": 0.7424501202763256, + "eval_loss": 0.8124200105667114, + "eval_runtime": 1269.5877, + "eval_samples_per_second": 28.349, + "eval_steps_per_second": 14.175, + "step": 33500 + }, + { + "acc": 0.79002161, + "epoch": 0.7818136747400808, + "grad_norm": 5.15625, + "learning_rate": 6.955790459677041e-06, + "loss": 0.74639416, + "memory(GiB)": 135.77, + "step": 33510, + "train_speed(iter/s)": 0.200672 + }, + { + "acc": 0.7705555, + "epoch": 0.7820469823123697, + "grad_norm": 4.90625, + "learning_rate": 6.9540517262872675e-06, + "loss": 0.82598076, + "memory(GiB)": 135.77, + "step": 33520, + "train_speed(iter/s)": 0.200704 + }, + { + "acc": 0.76916981, + "epoch": 0.7822802898846586, + "grad_norm": 6.28125, + "learning_rate": 6.952312713957179e-06, + "loss": 0.82480154, + "memory(GiB)": 135.77, + "step": 33530, + "train_speed(iter/s)": 0.200733 + }, + { + "acc": 0.77232513, + "epoch": 0.7825135974569475, + "grad_norm": 4.875, + "learning_rate": 6.9505734229350155e-06, + "loss": 0.8362215, + "memory(GiB)": 135.77, + "step": 33540, + "train_speed(iter/s)": 0.200762 + }, + { + "acc": 0.78856249, + "epoch": 0.7827469050292364, + "grad_norm": 5.0625, + "learning_rate": 6.948833853469065e-06, + "loss": 0.76099873, + "memory(GiB)": 135.77, + "step": 33550, + "train_speed(iter/s)": 0.200795 + }, + { + "acc": 0.78895731, + "epoch": 0.7829802126015253, + "grad_norm": 4.25, + "learning_rate": 6.947094005807646e-06, + "loss": 0.76939201, + "memory(GiB)": 135.77, + "step": 33560, + "train_speed(iter/s)": 0.200828 + }, + { + "acc": 0.76670189, + "epoch": 0.7832135201738142, + "grad_norm": 4.59375, + "learning_rate": 6.945353880199124e-06, + "loss": 0.84993162, + "memory(GiB)": 135.77, + "step": 33570, + "train_speed(iter/s)": 0.20086 + }, + { + "acc": 0.76875811, + "epoch": 0.7834468277461031, + "grad_norm": 5.25, + "learning_rate": 6.943613476891902e-06, + "loss": 0.84574566, + "memory(GiB)": 135.77, + "step": 33580, + "train_speed(iter/s)": 0.200891 + }, + { + "acc": 0.77109642, + "epoch": 0.783680135318392, + "grad_norm": 4.3125, + "learning_rate": 6.941872796134419e-06, + "loss": 0.82063408, + "memory(GiB)": 135.77, + "step": 33590, + "train_speed(iter/s)": 0.200925 + }, + { + "acc": 0.76629744, + "epoch": 0.7839134428906808, + "grad_norm": 4.1875, + "learning_rate": 6.940131838175159e-06, + "loss": 0.85809727, + "memory(GiB)": 135.77, + "step": 33600, + "train_speed(iter/s)": 0.200955 + }, + { + "acc": 0.77465167, + "epoch": 0.7841467504629697, + "grad_norm": 5.3125, + "learning_rate": 6.938390603262644e-06, + "loss": 0.80659037, + "memory(GiB)": 135.77, + "step": 33610, + "train_speed(iter/s)": 0.200986 + }, + { + "acc": 0.77369032, + "epoch": 0.7843800580352586, + "grad_norm": 5.09375, + "learning_rate": 6.936649091645431e-06, + "loss": 0.82941837, + "memory(GiB)": 135.77, + "step": 33620, + "train_speed(iter/s)": 0.201015 + }, + { + "acc": 0.79202423, + "epoch": 0.7846133656075475, + "grad_norm": 4.5, + "learning_rate": 6.9349073035721235e-06, + "loss": 0.7462399, + "memory(GiB)": 135.77, + "step": 33630, + "train_speed(iter/s)": 0.201046 + }, + { + "acc": 0.770962, + "epoch": 0.7848466731798364, + "grad_norm": 5.6875, + "learning_rate": 6.933165239291362e-06, + "loss": 0.82215929, + "memory(GiB)": 135.77, + "step": 33640, + "train_speed(iter/s)": 0.201077 + }, + { + "acc": 0.78424397, + "epoch": 0.7850799807521253, + "grad_norm": 4.90625, + "learning_rate": 6.931422899051823e-06, + "loss": 0.76600466, + "memory(GiB)": 135.77, + "step": 33650, + "train_speed(iter/s)": 0.201109 + }, + { + "acc": 0.76943855, + "epoch": 0.7853132883244142, + "grad_norm": 5.09375, + "learning_rate": 6.929680283102227e-06, + "loss": 0.83709068, + "memory(GiB)": 135.77, + "step": 33660, + "train_speed(iter/s)": 0.20114 + }, + { + "acc": 0.78439827, + "epoch": 0.7855465958967031, + "grad_norm": 5.1875, + "learning_rate": 6.9279373916913305e-06, + "loss": 0.78921213, + "memory(GiB)": 135.77, + "step": 33670, + "train_speed(iter/s)": 0.20117 + }, + { + "acc": 0.78686228, + "epoch": 0.785779903468992, + "grad_norm": 4.53125, + "learning_rate": 6.926194225067932e-06, + "loss": 0.77492542, + "memory(GiB)": 135.77, + "step": 33680, + "train_speed(iter/s)": 0.201202 + }, + { + "acc": 0.79153872, + "epoch": 0.7860132110412809, + "grad_norm": 5.78125, + "learning_rate": 6.924450783480866e-06, + "loss": 0.74138288, + "memory(GiB)": 135.77, + "step": 33690, + "train_speed(iter/s)": 0.201232 + }, + { + "acc": 0.78364296, + "epoch": 0.7862465186135698, + "grad_norm": 4.46875, + "learning_rate": 6.922707067179011e-06, + "loss": 0.78220735, + "memory(GiB)": 135.77, + "step": 33700, + "train_speed(iter/s)": 0.201263 + }, + { + "acc": 0.76618633, + "epoch": 0.7864798261858587, + "grad_norm": 5.34375, + "learning_rate": 6.92096307641128e-06, + "loss": 0.84831276, + "memory(GiB)": 135.77, + "step": 33710, + "train_speed(iter/s)": 0.201293 + }, + { + "acc": 0.78253098, + "epoch": 0.7867131337581476, + "grad_norm": 4.84375, + "learning_rate": 6.919218811426629e-06, + "loss": 0.78605328, + "memory(GiB)": 135.77, + "step": 33720, + "train_speed(iter/s)": 0.201324 + }, + { + "acc": 0.79246273, + "epoch": 0.7869464413304365, + "grad_norm": 5.5625, + "learning_rate": 6.91747427247405e-06, + "loss": 0.72274895, + "memory(GiB)": 135.77, + "step": 33730, + "train_speed(iter/s)": 0.201356 + }, + { + "acc": 0.7848577, + "epoch": 0.7871797489027254, + "grad_norm": 11.9375, + "learning_rate": 6.915729459802575e-06, + "loss": 0.76760449, + "memory(GiB)": 135.77, + "step": 33740, + "train_speed(iter/s)": 0.201388 + }, + { + "acc": 0.80589046, + "epoch": 0.7874130564750143, + "grad_norm": 5.25, + "learning_rate": 6.913984373661275e-06, + "loss": 0.68049059, + "memory(GiB)": 135.77, + "step": 33750, + "train_speed(iter/s)": 0.20142 + }, + { + "acc": 0.7855567, + "epoch": 0.7876463640473031, + "grad_norm": 5.09375, + "learning_rate": 6.9122390142992634e-06, + "loss": 0.77877674, + "memory(GiB)": 135.77, + "step": 33760, + "train_speed(iter/s)": 0.201453 + }, + { + "acc": 0.76452026, + "epoch": 0.787879671619592, + "grad_norm": 6.875, + "learning_rate": 6.910493381965687e-06, + "loss": 0.85262928, + "memory(GiB)": 135.77, + "step": 33770, + "train_speed(iter/s)": 0.201485 + }, + { + "acc": 0.77853546, + "epoch": 0.788112979191881, + "grad_norm": 6.59375, + "learning_rate": 6.9087474769097366e-06, + "loss": 0.7785665, + "memory(GiB)": 135.77, + "step": 33780, + "train_speed(iter/s)": 0.201516 + }, + { + "acc": 0.78647032, + "epoch": 0.7883462867641697, + "grad_norm": 6.65625, + "learning_rate": 6.907001299380639e-06, + "loss": 0.77372932, + "memory(GiB)": 135.77, + "step": 33790, + "train_speed(iter/s)": 0.201546 + }, + { + "acc": 0.78218312, + "epoch": 0.7885795943364586, + "grad_norm": 5.375, + "learning_rate": 6.905254849627658e-06, + "loss": 0.78796425, + "memory(GiB)": 135.77, + "step": 33800, + "train_speed(iter/s)": 0.201576 + }, + { + "acc": 0.76979618, + "epoch": 0.7888129019087475, + "grad_norm": 6.5, + "learning_rate": 6.9035081279001e-06, + "loss": 0.82921257, + "memory(GiB)": 135.77, + "step": 33810, + "train_speed(iter/s)": 0.201608 + }, + { + "acc": 0.79161835, + "epoch": 0.7890462094810364, + "grad_norm": 4.09375, + "learning_rate": 6.901761134447311e-06, + "loss": 0.76767821, + "memory(GiB)": 135.77, + "step": 33820, + "train_speed(iter/s)": 0.201636 + }, + { + "acc": 0.79137611, + "epoch": 0.7892795170533253, + "grad_norm": 6.375, + "learning_rate": 6.900013869518673e-06, + "loss": 0.74047461, + "memory(GiB)": 135.77, + "step": 33830, + "train_speed(iter/s)": 0.201666 + }, + { + "acc": 0.77348399, + "epoch": 0.7895128246256142, + "grad_norm": 7.4375, + "learning_rate": 6.898266333363607e-06, + "loss": 0.81377163, + "memory(GiB)": 135.77, + "step": 33840, + "train_speed(iter/s)": 0.201698 + }, + { + "acc": 0.77934666, + "epoch": 0.7897461321979031, + "grad_norm": 7.25, + "learning_rate": 6.8965185262315725e-06, + "loss": 0.79142418, + "memory(GiB)": 135.77, + "step": 33850, + "train_speed(iter/s)": 0.201728 + }, + { + "acc": 0.78089814, + "epoch": 0.789979439770192, + "grad_norm": 4.875, + "learning_rate": 6.89477044837207e-06, + "loss": 0.77429733, + "memory(GiB)": 135.77, + "step": 33860, + "train_speed(iter/s)": 0.201757 + }, + { + "acc": 0.7920969, + "epoch": 0.7902127473424809, + "grad_norm": 4.8125, + "learning_rate": 6.893022100034636e-06, + "loss": 0.74223742, + "memory(GiB)": 135.77, + "step": 33870, + "train_speed(iter/s)": 0.201785 + }, + { + "acc": 0.78182831, + "epoch": 0.7904460549147698, + "grad_norm": 5.28125, + "learning_rate": 6.891273481468847e-06, + "loss": 0.79486198, + "memory(GiB)": 135.77, + "step": 33880, + "train_speed(iter/s)": 0.201814 + }, + { + "acc": 0.78645382, + "epoch": 0.7906793624870587, + "grad_norm": 4.40625, + "learning_rate": 6.889524592924319e-06, + "loss": 0.75907497, + "memory(GiB)": 135.77, + "step": 33890, + "train_speed(iter/s)": 0.201846 + }, + { + "acc": 0.78487177, + "epoch": 0.7909126700593476, + "grad_norm": 8.5, + "learning_rate": 6.887775434650704e-06, + "loss": 0.76534662, + "memory(GiB)": 135.77, + "step": 33900, + "train_speed(iter/s)": 0.201877 + }, + { + "acc": 0.81099415, + "epoch": 0.7911459776316365, + "grad_norm": 5.65625, + "learning_rate": 6.8860260068976935e-06, + "loss": 0.66781607, + "memory(GiB)": 135.77, + "step": 33910, + "train_speed(iter/s)": 0.201908 + }, + { + "acc": 0.7786212, + "epoch": 0.7913792852039254, + "grad_norm": 10.1875, + "learning_rate": 6.884276309915018e-06, + "loss": 0.77360516, + "memory(GiB)": 135.77, + "step": 33920, + "train_speed(iter/s)": 0.20194 + }, + { + "acc": 0.77263908, + "epoch": 0.7916125927762143, + "grad_norm": 8.3125, + "learning_rate": 6.882526343952448e-06, + "loss": 0.81154432, + "memory(GiB)": 135.77, + "step": 33930, + "train_speed(iter/s)": 0.201971 + }, + { + "acc": 0.7702785, + "epoch": 0.7918459003485032, + "grad_norm": 5.375, + "learning_rate": 6.880776109259788e-06, + "loss": 0.81274462, + "memory(GiB)": 135.77, + "step": 33940, + "train_speed(iter/s)": 0.202002 + }, + { + "acc": 0.77342148, + "epoch": 0.7920792079207921, + "grad_norm": 4.6875, + "learning_rate": 6.8790256060868866e-06, + "loss": 0.80101671, + "memory(GiB)": 135.77, + "step": 33950, + "train_speed(iter/s)": 0.202032 + }, + { + "acc": 0.78165035, + "epoch": 0.792312515493081, + "grad_norm": 6.03125, + "learning_rate": 6.8772748346836235e-06, + "loss": 0.80019588, + "memory(GiB)": 135.77, + "step": 33960, + "train_speed(iter/s)": 0.202064 + }, + { + "acc": 0.7831409, + "epoch": 0.7925458230653699, + "grad_norm": 5.75, + "learning_rate": 6.875523795299925e-06, + "loss": 0.77129364, + "memory(GiB)": 135.77, + "step": 33970, + "train_speed(iter/s)": 0.202096 + }, + { + "acc": 0.77243023, + "epoch": 0.7927791306376588, + "grad_norm": 5.46875, + "learning_rate": 6.873772488185747e-06, + "loss": 0.83966198, + "memory(GiB)": 135.77, + "step": 33980, + "train_speed(iter/s)": 0.202126 + }, + { + "acc": 0.79374552, + "epoch": 0.7930124382099476, + "grad_norm": 5.71875, + "learning_rate": 6.872020913591092e-06, + "loss": 0.73950348, + "memory(GiB)": 135.77, + "step": 33990, + "train_speed(iter/s)": 0.202158 + }, + { + "acc": 0.78279028, + "epoch": 0.7932457457822365, + "grad_norm": 5.78125, + "learning_rate": 6.870269071765997e-06, + "loss": 0.79019928, + "memory(GiB)": 135.77, + "step": 34000, + "train_speed(iter/s)": 0.202189 + }, + { + "epoch": 0.7932457457822365, + "eval_acc": 0.7426201934510297, + "eval_loss": 0.8120821714401245, + "eval_runtime": 1267.8986, + "eval_samples_per_second": 28.386, + "eval_steps_per_second": 14.194, + "step": 34000 + }, + { + "acc": 0.78353124, + "epoch": 0.7934790533545254, + "grad_norm": 6.4375, + "learning_rate": 6.868516962960534e-06, + "loss": 0.75921488, + "memory(GiB)": 135.77, + "step": 34010, + "train_speed(iter/s)": 0.200686 + }, + { + "acc": 0.78991613, + "epoch": 0.7937123609268143, + "grad_norm": 4.875, + "learning_rate": 6.866764587424818e-06, + "loss": 0.75100489, + "memory(GiB)": 135.77, + "step": 34020, + "train_speed(iter/s)": 0.200716 + }, + { + "acc": 0.78309031, + "epoch": 0.7939456684991032, + "grad_norm": 5.625, + "learning_rate": 6.865011945408998e-06, + "loss": 0.78480835, + "memory(GiB)": 135.77, + "step": 34030, + "train_speed(iter/s)": 0.200748 + }, + { + "acc": 0.7720583, + "epoch": 0.7941789760713921, + "grad_norm": 6.78125, + "learning_rate": 6.863259037163266e-06, + "loss": 0.81258068, + "memory(GiB)": 135.77, + "step": 34040, + "train_speed(iter/s)": 0.200777 + }, + { + "acc": 0.77889328, + "epoch": 0.794412283643681, + "grad_norm": 5.625, + "learning_rate": 6.8615058629378465e-06, + "loss": 0.79917364, + "memory(GiB)": 135.77, + "step": 34050, + "train_speed(iter/s)": 0.200807 + }, + { + "acc": 0.79887199, + "epoch": 0.7946455912159699, + "grad_norm": 5.375, + "learning_rate": 6.859752422983006e-06, + "loss": 0.72141008, + "memory(GiB)": 135.77, + "step": 34060, + "train_speed(iter/s)": 0.200838 + }, + { + "acc": 0.78431706, + "epoch": 0.7948788987882588, + "grad_norm": 7.625, + "learning_rate": 6.857998717549048e-06, + "loss": 0.7462605, + "memory(GiB)": 135.77, + "step": 34070, + "train_speed(iter/s)": 0.200869 + }, + { + "acc": 0.78418608, + "epoch": 0.7951122063605477, + "grad_norm": 5.5, + "learning_rate": 6.856244746886313e-06, + "loss": 0.77865896, + "memory(GiB)": 135.77, + "step": 34080, + "train_speed(iter/s)": 0.200901 + }, + { + "acc": 0.75828471, + "epoch": 0.7953455139328366, + "grad_norm": 4.46875, + "learning_rate": 6.85449051124518e-06, + "loss": 0.85784569, + "memory(GiB)": 135.77, + "step": 34090, + "train_speed(iter/s)": 0.200935 + }, + { + "acc": 0.78311863, + "epoch": 0.7955788215051255, + "grad_norm": 7.40625, + "learning_rate": 6.852736010876063e-06, + "loss": 0.76896353, + "memory(GiB)": 135.77, + "step": 34100, + "train_speed(iter/s)": 0.200966 + }, + { + "acc": 0.78136911, + "epoch": 0.7958121290774144, + "grad_norm": 4.96875, + "learning_rate": 6.85098124602942e-06, + "loss": 0.78809237, + "memory(GiB)": 135.77, + "step": 34110, + "train_speed(iter/s)": 0.200996 + }, + { + "acc": 0.78460474, + "epoch": 0.7960454366497033, + "grad_norm": 6.125, + "learning_rate": 6.8492262169557435e-06, + "loss": 0.74483161, + "memory(GiB)": 135.77, + "step": 34120, + "train_speed(iter/s)": 0.201024 + }, + { + "acc": 0.80227928, + "epoch": 0.7962787442219922, + "grad_norm": 4.375, + "learning_rate": 6.847470923905559e-06, + "loss": 0.71104026, + "memory(GiB)": 135.77, + "step": 34130, + "train_speed(iter/s)": 0.201054 + }, + { + "acc": 0.78113799, + "epoch": 0.7965120517942811, + "grad_norm": 7.125, + "learning_rate": 6.845715367129438e-06, + "loss": 0.79373302, + "memory(GiB)": 135.77, + "step": 34140, + "train_speed(iter/s)": 0.201083 + }, + { + "acc": 0.78072958, + "epoch": 0.79674535936657, + "grad_norm": 5.5, + "learning_rate": 6.843959546877985e-06, + "loss": 0.77883039, + "memory(GiB)": 135.77, + "step": 34150, + "train_speed(iter/s)": 0.201115 + }, + { + "acc": 0.78991518, + "epoch": 0.7969786669388589, + "grad_norm": 4.78125, + "learning_rate": 6.842203463401842e-06, + "loss": 0.74006615, + "memory(GiB)": 135.77, + "step": 34160, + "train_speed(iter/s)": 0.201145 + }, + { + "acc": 0.76768169, + "epoch": 0.7972119745111478, + "grad_norm": 6.8125, + "learning_rate": 6.84044711695169e-06, + "loss": 0.8356452, + "memory(GiB)": 135.77, + "step": 34170, + "train_speed(iter/s)": 0.201176 + }, + { + "acc": 0.78966255, + "epoch": 0.7974452820834366, + "grad_norm": 5.21875, + "learning_rate": 6.838690507778247e-06, + "loss": 0.75411134, + "memory(GiB)": 135.77, + "step": 34180, + "train_speed(iter/s)": 0.201206 + }, + { + "acc": 0.78948689, + "epoch": 0.7976785896557255, + "grad_norm": 7.0, + "learning_rate": 6.836933636132267e-06, + "loss": 0.75135803, + "memory(GiB)": 135.77, + "step": 34190, + "train_speed(iter/s)": 0.201235 + }, + { + "acc": 0.77856131, + "epoch": 0.7979118972280144, + "grad_norm": 8.9375, + "learning_rate": 6.835176502264544e-06, + "loss": 0.81702185, + "memory(GiB)": 135.77, + "step": 34200, + "train_speed(iter/s)": 0.201263 + }, + { + "acc": 0.76752882, + "epoch": 0.7981452048003033, + "grad_norm": 7.84375, + "learning_rate": 6.8334191064259095e-06, + "loss": 0.85097122, + "memory(GiB)": 135.77, + "step": 34210, + "train_speed(iter/s)": 0.201294 + }, + { + "acc": 0.76000738, + "epoch": 0.7983785123725922, + "grad_norm": 4.375, + "learning_rate": 6.8316614488672305e-06, + "loss": 0.88621998, + "memory(GiB)": 135.77, + "step": 34220, + "train_speed(iter/s)": 0.201324 + }, + { + "acc": 0.77623281, + "epoch": 0.7986118199448811, + "grad_norm": 5.875, + "learning_rate": 6.829903529839411e-06, + "loss": 0.79441414, + "memory(GiB)": 135.77, + "step": 34230, + "train_speed(iter/s)": 0.201353 + }, + { + "acc": 0.77455211, + "epoch": 0.79884512751717, + "grad_norm": 6.8125, + "learning_rate": 6.828145349593395e-06, + "loss": 0.80323877, + "memory(GiB)": 135.77, + "step": 34240, + "train_speed(iter/s)": 0.201383 + }, + { + "acc": 0.77257781, + "epoch": 0.7990784350894589, + "grad_norm": 5.0625, + "learning_rate": 6.82638690838016e-06, + "loss": 0.83061676, + "memory(GiB)": 135.77, + "step": 34250, + "train_speed(iter/s)": 0.201413 + }, + { + "acc": 0.76844459, + "epoch": 0.7993117426617478, + "grad_norm": 4.65625, + "learning_rate": 6.824628206450724e-06, + "loss": 0.84785528, + "memory(GiB)": 135.77, + "step": 34260, + "train_speed(iter/s)": 0.201443 + }, + { + "acc": 0.78014994, + "epoch": 0.7995450502340367, + "grad_norm": 6.21875, + "learning_rate": 6.822869244056143e-06, + "loss": 0.79277802, + "memory(GiB)": 135.77, + "step": 34270, + "train_speed(iter/s)": 0.201472 + }, + { + "acc": 0.78358684, + "epoch": 0.7997783578063256, + "grad_norm": 5.46875, + "learning_rate": 6.821110021447506e-06, + "loss": 0.79087338, + "memory(GiB)": 135.77, + "step": 34280, + "train_speed(iter/s)": 0.201502 + }, + { + "acc": 0.76477342, + "epoch": 0.8000116653786145, + "grad_norm": 5.9375, + "learning_rate": 6.819350538875944e-06, + "loss": 0.86270304, + "memory(GiB)": 135.77, + "step": 34290, + "train_speed(iter/s)": 0.201534 + }, + { + "acc": 0.77901411, + "epoch": 0.8002449729509034, + "grad_norm": 4.78125, + "learning_rate": 6.817590796592621e-06, + "loss": 0.7760778, + "memory(GiB)": 135.77, + "step": 34300, + "train_speed(iter/s)": 0.201563 + }, + { + "acc": 0.786831, + "epoch": 0.8004782805231923, + "grad_norm": 5.125, + "learning_rate": 6.815830794848739e-06, + "loss": 0.75008812, + "memory(GiB)": 135.77, + "step": 34310, + "train_speed(iter/s)": 0.201594 + }, + { + "acc": 0.77648001, + "epoch": 0.8007115880954812, + "grad_norm": 4.46875, + "learning_rate": 6.8140705338955386e-06, + "loss": 0.80113869, + "memory(GiB)": 135.77, + "step": 34320, + "train_speed(iter/s)": 0.201625 + }, + { + "acc": 0.76347504, + "epoch": 0.80094489566777, + "grad_norm": 7.34375, + "learning_rate": 6.812310013984296e-06, + "loss": 0.85464201, + "memory(GiB)": 135.77, + "step": 34330, + "train_speed(iter/s)": 0.201655 + }, + { + "acc": 0.77026582, + "epoch": 0.801178203240059, + "grad_norm": 5.34375, + "learning_rate": 6.810549235366325e-06, + "loss": 0.8082571, + "memory(GiB)": 135.77, + "step": 34340, + "train_speed(iter/s)": 0.201686 + }, + { + "acc": 0.77789001, + "epoch": 0.8014115108123478, + "grad_norm": 6.1875, + "learning_rate": 6.808788198292977e-06, + "loss": 0.78789454, + "memory(GiB)": 135.77, + "step": 34350, + "train_speed(iter/s)": 0.201712 + }, + { + "acc": 0.76755538, + "epoch": 0.8016448183846367, + "grad_norm": 6.25, + "learning_rate": 6.80702690301564e-06, + "loss": 0.83963947, + "memory(GiB)": 135.77, + "step": 34360, + "train_speed(iter/s)": 0.201742 + }, + { + "acc": 0.78171759, + "epoch": 0.8018781259569255, + "grad_norm": 6.5, + "learning_rate": 6.805265349785738e-06, + "loss": 0.78367205, + "memory(GiB)": 135.77, + "step": 34370, + "train_speed(iter/s)": 0.201774 + }, + { + "acc": 0.77745686, + "epoch": 0.8021114335292144, + "grad_norm": 5.90625, + "learning_rate": 6.80350353885473e-06, + "loss": 0.79816732, + "memory(GiB)": 135.77, + "step": 34380, + "train_speed(iter/s)": 0.201804 + }, + { + "acc": 0.76283531, + "epoch": 0.8023447411015033, + "grad_norm": 6.9375, + "learning_rate": 6.801741470474117e-06, + "loss": 0.86973667, + "memory(GiB)": 135.77, + "step": 34390, + "train_speed(iter/s)": 0.201837 + }, + { + "acc": 0.7806407, + "epoch": 0.8025780486737922, + "grad_norm": 5.0625, + "learning_rate": 6.799979144895432e-06, + "loss": 0.79195518, + "memory(GiB)": 135.77, + "step": 34400, + "train_speed(iter/s)": 0.201868 + }, + { + "acc": 0.75355606, + "epoch": 0.8028113562460811, + "grad_norm": 5.40625, + "learning_rate": 6.798216562370247e-06, + "loss": 0.90384073, + "memory(GiB)": 135.77, + "step": 34410, + "train_speed(iter/s)": 0.201899 + }, + { + "acc": 0.77500319, + "epoch": 0.80304466381837, + "grad_norm": 7.1875, + "learning_rate": 6.79645372315017e-06, + "loss": 0.82036419, + "memory(GiB)": 135.77, + "step": 34420, + "train_speed(iter/s)": 0.201929 + }, + { + "acc": 0.7839057, + "epoch": 0.8032779713906589, + "grad_norm": 4.09375, + "learning_rate": 6.794690627486846e-06, + "loss": 0.76988935, + "memory(GiB)": 135.77, + "step": 34430, + "train_speed(iter/s)": 0.201958 + }, + { + "acc": 0.78427267, + "epoch": 0.8035112789629478, + "grad_norm": 5.625, + "learning_rate": 6.792927275631957e-06, + "loss": 0.77875066, + "memory(GiB)": 135.77, + "step": 34440, + "train_speed(iter/s)": 0.201985 + }, + { + "acc": 0.78074837, + "epoch": 0.8037445865352367, + "grad_norm": 5.59375, + "learning_rate": 6.791163667837219e-06, + "loss": 0.77121277, + "memory(GiB)": 135.77, + "step": 34450, + "train_speed(iter/s)": 0.202014 + }, + { + "acc": 0.77221909, + "epoch": 0.8039778941075256, + "grad_norm": 6.21875, + "learning_rate": 6.789399804354389e-06, + "loss": 0.82327061, + "memory(GiB)": 135.77, + "step": 34460, + "train_speed(iter/s)": 0.202044 + }, + { + "acc": 0.78617983, + "epoch": 0.8042112016798145, + "grad_norm": 4.90625, + "learning_rate": 6.787635685435255e-06, + "loss": 0.78236666, + "memory(GiB)": 135.77, + "step": 34470, + "train_speed(iter/s)": 0.202072 + }, + { + "acc": 0.77123075, + "epoch": 0.8044445092521034, + "grad_norm": 6.90625, + "learning_rate": 6.785871311331648e-06, + "loss": 0.83095236, + "memory(GiB)": 135.77, + "step": 34480, + "train_speed(iter/s)": 0.202104 + }, + { + "acc": 0.7779706, + "epoch": 0.8046778168243923, + "grad_norm": 4.1875, + "learning_rate": 6.7841066822954284e-06, + "loss": 0.80731964, + "memory(GiB)": 135.77, + "step": 34490, + "train_speed(iter/s)": 0.202133 + }, + { + "acc": 0.78801842, + "epoch": 0.8049111243966812, + "grad_norm": 7.28125, + "learning_rate": 6.7823417985784986e-06, + "loss": 0.74819498, + "memory(GiB)": 135.77, + "step": 34500, + "train_speed(iter/s)": 0.202161 + }, + { + "epoch": 0.8049111243966812, + "eval_acc": 0.7425651038351181, + "eval_loss": 0.8118574619293213, + "eval_runtime": 1269.6887, + "eval_samples_per_second": 28.346, + "eval_steps_per_second": 14.174, + "step": 34500 + }, + { + "acc": 0.77680259, + "epoch": 0.8051444319689701, + "grad_norm": 5.75, + "learning_rate": 6.780576660432797e-06, + "loss": 0.81476183, + "memory(GiB)": 135.77, + "step": 34510, + "train_speed(iter/s)": 0.20068 + }, + { + "acc": 0.75909719, + "epoch": 0.805377739541259, + "grad_norm": 5.90625, + "learning_rate": 6.778811268110294e-06, + "loss": 0.86525078, + "memory(GiB)": 135.77, + "step": 34520, + "train_speed(iter/s)": 0.20071 + }, + { + "acc": 0.76516881, + "epoch": 0.8056110471135479, + "grad_norm": 6.4375, + "learning_rate": 6.777045621862997e-06, + "loss": 0.83675842, + "memory(GiB)": 135.77, + "step": 34530, + "train_speed(iter/s)": 0.200741 + }, + { + "acc": 0.78065295, + "epoch": 0.8058443546858368, + "grad_norm": 7.59375, + "learning_rate": 6.775279721942954e-06, + "loss": 0.77787828, + "memory(GiB)": 135.77, + "step": 34540, + "train_speed(iter/s)": 0.200771 + }, + { + "acc": 0.78033791, + "epoch": 0.8060776622581257, + "grad_norm": 4.53125, + "learning_rate": 6.773513568602248e-06, + "loss": 0.78399673, + "memory(GiB)": 135.77, + "step": 34550, + "train_speed(iter/s)": 0.2008 + }, + { + "acc": 0.78510847, + "epoch": 0.8063109698304145, + "grad_norm": 4.90625, + "learning_rate": 6.771747162092993e-06, + "loss": 0.75295162, + "memory(GiB)": 135.77, + "step": 34560, + "train_speed(iter/s)": 0.20083 + }, + { + "acc": 0.75793881, + "epoch": 0.8065442774027034, + "grad_norm": 6.5625, + "learning_rate": 6.769980502667348e-06, + "loss": 0.85168362, + "memory(GiB)": 135.77, + "step": 34570, + "train_speed(iter/s)": 0.20086 + }, + { + "acc": 0.77159257, + "epoch": 0.8067775849749923, + "grad_norm": 5.8125, + "learning_rate": 6.7682135905775e-06, + "loss": 0.7980968, + "memory(GiB)": 135.77, + "step": 34580, + "train_speed(iter/s)": 0.200891 + }, + { + "acc": 0.77705431, + "epoch": 0.8070108925472812, + "grad_norm": 5.90625, + "learning_rate": 6.7664464260756745e-06, + "loss": 0.81086655, + "memory(GiB)": 135.77, + "step": 34590, + "train_speed(iter/s)": 0.200921 + }, + { + "acc": 0.77288313, + "epoch": 0.8072442001195701, + "grad_norm": 5.0, + "learning_rate": 6.764679009414135e-06, + "loss": 0.82601871, + "memory(GiB)": 135.77, + "step": 34600, + "train_speed(iter/s)": 0.20095 + }, + { + "acc": 0.77113543, + "epoch": 0.807477507691859, + "grad_norm": 6.3125, + "learning_rate": 6.76291134084518e-06, + "loss": 0.81248341, + "memory(GiB)": 135.77, + "step": 34610, + "train_speed(iter/s)": 0.200982 + }, + { + "acc": 0.77865152, + "epoch": 0.8077108152641479, + "grad_norm": 5.53125, + "learning_rate": 6.761143420621141e-06, + "loss": 0.81596804, + "memory(GiB)": 135.77, + "step": 34620, + "train_speed(iter/s)": 0.201012 + }, + { + "acc": 0.75378828, + "epoch": 0.8079441228364368, + "grad_norm": 7.0, + "learning_rate": 6.759375248994393e-06, + "loss": 0.90001888, + "memory(GiB)": 135.77, + "step": 34630, + "train_speed(iter/s)": 0.201042 + }, + { + "acc": 0.79571762, + "epoch": 0.8081774304087257, + "grad_norm": 3.953125, + "learning_rate": 6.757606826217339e-06, + "loss": 0.72643576, + "memory(GiB)": 135.77, + "step": 34640, + "train_speed(iter/s)": 0.201072 + }, + { + "acc": 0.77722073, + "epoch": 0.8084107379810146, + "grad_norm": 6.75, + "learning_rate": 6.755838152542421e-06, + "loss": 0.81900959, + "memory(GiB)": 135.77, + "step": 34650, + "train_speed(iter/s)": 0.201102 + }, + { + "acc": 0.76127481, + "epoch": 0.8086440455533035, + "grad_norm": 4.53125, + "learning_rate": 6.754069228222117e-06, + "loss": 0.85369015, + "memory(GiB)": 135.77, + "step": 34660, + "train_speed(iter/s)": 0.201132 + }, + { + "acc": 0.77648983, + "epoch": 0.8088773531255924, + "grad_norm": 4.8125, + "learning_rate": 6.752300053508939e-06, + "loss": 0.80511589, + "memory(GiB)": 135.77, + "step": 34670, + "train_speed(iter/s)": 0.201162 + }, + { + "acc": 0.7765605, + "epoch": 0.8091106606978813, + "grad_norm": 5.71875, + "learning_rate": 6.750530628655437e-06, + "loss": 0.80522938, + "memory(GiB)": 135.77, + "step": 34680, + "train_speed(iter/s)": 0.201192 + }, + { + "acc": 0.77244101, + "epoch": 0.8093439682701702, + "grad_norm": 5.0, + "learning_rate": 6.748760953914198e-06, + "loss": 0.81097126, + "memory(GiB)": 135.77, + "step": 34690, + "train_speed(iter/s)": 0.201221 + }, + { + "acc": 0.76321468, + "epoch": 0.8095772758424591, + "grad_norm": 24.375, + "learning_rate": 6.746991029537841e-06, + "loss": 0.84920597, + "memory(GiB)": 135.77, + "step": 34700, + "train_speed(iter/s)": 0.201251 + }, + { + "acc": 0.76385412, + "epoch": 0.809810583414748, + "grad_norm": 6.0, + "learning_rate": 6.74522085577902e-06, + "loss": 0.84149389, + "memory(GiB)": 135.77, + "step": 34710, + "train_speed(iter/s)": 0.201281 + }, + { + "acc": 0.77857213, + "epoch": 0.8100438909870369, + "grad_norm": 7.21875, + "learning_rate": 6.743450432890431e-06, + "loss": 0.77943163, + "memory(GiB)": 135.77, + "step": 34720, + "train_speed(iter/s)": 0.201312 + }, + { + "acc": 0.76652179, + "epoch": 0.8102771985593258, + "grad_norm": 5.84375, + "learning_rate": 6.741679761124798e-06, + "loss": 0.81627655, + "memory(GiB)": 135.77, + "step": 34730, + "train_speed(iter/s)": 0.201342 + }, + { + "acc": 0.78065634, + "epoch": 0.8105105061316147, + "grad_norm": 6.375, + "learning_rate": 6.739908840734885e-06, + "loss": 0.7564887, + "memory(GiB)": 135.77, + "step": 34740, + "train_speed(iter/s)": 0.201371 + }, + { + "acc": 0.78042436, + "epoch": 0.8107438137039036, + "grad_norm": 5.375, + "learning_rate": 6.738137671973492e-06, + "loss": 0.78124051, + "memory(GiB)": 135.77, + "step": 34750, + "train_speed(iter/s)": 0.201399 + }, + { + "acc": 0.78339186, + "epoch": 0.8109771212761924, + "grad_norm": 7.21875, + "learning_rate": 6.736366255093449e-06, + "loss": 0.75932827, + "memory(GiB)": 135.77, + "step": 34760, + "train_speed(iter/s)": 0.201431 + }, + { + "acc": 0.77975254, + "epoch": 0.8112104288484813, + "grad_norm": 5.4375, + "learning_rate": 6.73459459034763e-06, + "loss": 0.78994699, + "memory(GiB)": 135.77, + "step": 34770, + "train_speed(iter/s)": 0.20146 + }, + { + "acc": 0.77287989, + "epoch": 0.8114437364207702, + "grad_norm": 6.65625, + "learning_rate": 6.732822677988935e-06, + "loss": 0.81505728, + "memory(GiB)": 135.77, + "step": 34780, + "train_speed(iter/s)": 0.20149 + }, + { + "acc": 0.77970748, + "epoch": 0.8116770439930591, + "grad_norm": 7.3125, + "learning_rate": 6.731050518270307e-06, + "loss": 0.79578314, + "memory(GiB)": 135.77, + "step": 34790, + "train_speed(iter/s)": 0.20152 + }, + { + "acc": 0.77548347, + "epoch": 0.811910351565348, + "grad_norm": 5.75, + "learning_rate": 6.729278111444721e-06, + "loss": 0.80311823, + "memory(GiB)": 135.77, + "step": 34800, + "train_speed(iter/s)": 0.201549 + }, + { + "acc": 0.77532001, + "epoch": 0.8121436591376369, + "grad_norm": 4.6875, + "learning_rate": 6.727505457765185e-06, + "loss": 0.81310711, + "memory(GiB)": 135.77, + "step": 34810, + "train_speed(iter/s)": 0.201581 + }, + { + "acc": 0.77574654, + "epoch": 0.8123769667099258, + "grad_norm": 6.09375, + "learning_rate": 6.725732557484748e-06, + "loss": 0.79087381, + "memory(GiB)": 135.77, + "step": 34820, + "train_speed(iter/s)": 0.201612 + }, + { + "acc": 0.79127479, + "epoch": 0.8126102742822147, + "grad_norm": 5.96875, + "learning_rate": 6.723959410856489e-06, + "loss": 0.74439774, + "memory(GiB)": 135.77, + "step": 34830, + "train_speed(iter/s)": 0.201639 + }, + { + "acc": 0.78656416, + "epoch": 0.8128435818545036, + "grad_norm": 6.65625, + "learning_rate": 6.722186018133525e-06, + "loss": 0.76410608, + "memory(GiB)": 135.77, + "step": 34840, + "train_speed(iter/s)": 0.20167 + }, + { + "acc": 0.7781281, + "epoch": 0.8130768894267925, + "grad_norm": 7.25, + "learning_rate": 6.720412379569008e-06, + "loss": 0.79291306, + "memory(GiB)": 135.77, + "step": 34850, + "train_speed(iter/s)": 0.201701 + }, + { + "acc": 0.80160904, + "epoch": 0.8133101969990814, + "grad_norm": 4.8125, + "learning_rate": 6.718638495416124e-06, + "loss": 0.7177484, + "memory(GiB)": 135.77, + "step": 34860, + "train_speed(iter/s)": 0.20173 + }, + { + "acc": 0.77137651, + "epoch": 0.8135435045713703, + "grad_norm": 6.21875, + "learning_rate": 6.716864365928094e-06, + "loss": 0.82935047, + "memory(GiB)": 135.77, + "step": 34870, + "train_speed(iter/s)": 0.201759 + }, + { + "acc": 0.76962833, + "epoch": 0.8137768121436592, + "grad_norm": 5.34375, + "learning_rate": 6.715089991358174e-06, + "loss": 0.83387051, + "memory(GiB)": 135.77, + "step": 34880, + "train_speed(iter/s)": 0.201789 + }, + { + "acc": 0.74992967, + "epoch": 0.814010119715948, + "grad_norm": 5.625, + "learning_rate": 6.713315371959656e-06, + "loss": 0.89630518, + "memory(GiB)": 135.77, + "step": 34890, + "train_speed(iter/s)": 0.201819 + }, + { + "acc": 0.78066454, + "epoch": 0.814243427288237, + "grad_norm": 6.53125, + "learning_rate": 6.7115405079858656e-06, + "loss": 0.77816544, + "memory(GiB)": 135.77, + "step": 34900, + "train_speed(iter/s)": 0.201848 + }, + { + "acc": 0.77989941, + "epoch": 0.8144767348605259, + "grad_norm": 5.40625, + "learning_rate": 6.709765399690164e-06, + "loss": 0.77061434, + "memory(GiB)": 135.77, + "step": 34910, + "train_speed(iter/s)": 0.20188 + }, + { + "acc": 0.77812452, + "epoch": 0.8147100424328148, + "grad_norm": 4.375, + "learning_rate": 6.707990047325952e-06, + "loss": 0.79744072, + "memory(GiB)": 135.77, + "step": 34920, + "train_speed(iter/s)": 0.201909 + }, + { + "acc": 0.7687603, + "epoch": 0.8149433500051037, + "grad_norm": 5.875, + "learning_rate": 6.706214451146654e-06, + "loss": 0.82320156, + "memory(GiB)": 135.77, + "step": 34930, + "train_speed(iter/s)": 0.201939 + }, + { + "acc": 0.7765502, + "epoch": 0.8151766575773925, + "grad_norm": 4.96875, + "learning_rate": 6.70443861140574e-06, + "loss": 0.78917093, + "memory(GiB)": 135.77, + "step": 34940, + "train_speed(iter/s)": 0.201968 + }, + { + "acc": 0.7818922, + "epoch": 0.8154099651496813, + "grad_norm": 5.875, + "learning_rate": 6.702662528356709e-06, + "loss": 0.77447877, + "memory(GiB)": 135.77, + "step": 34950, + "train_speed(iter/s)": 0.201998 + }, + { + "acc": 0.77006559, + "epoch": 0.8156432727219702, + "grad_norm": 5.53125, + "learning_rate": 6.700886202253096e-06, + "loss": 0.83356829, + "memory(GiB)": 135.77, + "step": 34960, + "train_speed(iter/s)": 0.202028 + }, + { + "acc": 0.76896172, + "epoch": 0.8158765802942591, + "grad_norm": 6.0625, + "learning_rate": 6.699109633348473e-06, + "loss": 0.853722, + "memory(GiB)": 135.77, + "step": 34970, + "train_speed(iter/s)": 0.202058 + }, + { + "acc": 0.78813248, + "epoch": 0.816109887866548, + "grad_norm": 8.75, + "learning_rate": 6.697332821896443e-06, + "loss": 0.78391209, + "memory(GiB)": 135.77, + "step": 34980, + "train_speed(iter/s)": 0.202086 + }, + { + "acc": 0.77606354, + "epoch": 0.8163431954388369, + "grad_norm": 4.4375, + "learning_rate": 6.695555768150644e-06, + "loss": 0.78519859, + "memory(GiB)": 135.77, + "step": 34990, + "train_speed(iter/s)": 0.202116 + }, + { + "acc": 0.75625358, + "epoch": 0.8165765030111258, + "grad_norm": 5.71875, + "learning_rate": 6.693778472364754e-06, + "loss": 0.89699631, + "memory(GiB)": 135.77, + "step": 35000, + "train_speed(iter/s)": 0.202146 + }, + { + "epoch": 0.8165765030111258, + "eval_acc": 0.7427814586929572, + "eval_loss": 0.8112027049064636, + "eval_runtime": 1268.9891, + "eval_samples_per_second": 28.362, + "eval_steps_per_second": 14.181, + "step": 35000 + }, + { + "acc": 0.76805382, + "epoch": 0.8168098105834147, + "grad_norm": 5.4375, + "learning_rate": 6.692000934792479e-06, + "loss": 0.81355896, + "memory(GiB)": 135.77, + "step": 35010, + "train_speed(iter/s)": 0.200683 + }, + { + "acc": 0.78000903, + "epoch": 0.8170431181557036, + "grad_norm": 6.96875, + "learning_rate": 6.6902231556875605e-06, + "loss": 0.79382019, + "memory(GiB)": 135.77, + "step": 35020, + "train_speed(iter/s)": 0.200712 + }, + { + "acc": 0.77730827, + "epoch": 0.8172764257279925, + "grad_norm": 4.59375, + "learning_rate": 6.688445135303779e-06, + "loss": 0.78697395, + "memory(GiB)": 135.77, + "step": 35030, + "train_speed(iter/s)": 0.200742 + }, + { + "acc": 0.80807705, + "epoch": 0.8175097333002814, + "grad_norm": 5.84375, + "learning_rate": 6.686666873894945e-06, + "loss": 0.6698247, + "memory(GiB)": 135.77, + "step": 35040, + "train_speed(iter/s)": 0.200771 + }, + { + "acc": 0.78960042, + "epoch": 0.8177430408725703, + "grad_norm": 5.3125, + "learning_rate": 6.684888371714903e-06, + "loss": 0.74742336, + "memory(GiB)": 135.77, + "step": 35050, + "train_speed(iter/s)": 0.200798 + }, + { + "acc": 0.77717094, + "epoch": 0.8179763484448592, + "grad_norm": 5.0625, + "learning_rate": 6.683109629017536e-06, + "loss": 0.79351301, + "memory(GiB)": 135.77, + "step": 35060, + "train_speed(iter/s)": 0.200825 + }, + { + "acc": 0.77711973, + "epoch": 0.8182096560171481, + "grad_norm": 10.8125, + "learning_rate": 6.681330646056758e-06, + "loss": 0.80530987, + "memory(GiB)": 135.77, + "step": 35070, + "train_speed(iter/s)": 0.200854 + }, + { + "acc": 0.78023176, + "epoch": 0.818442963589437, + "grad_norm": 5.53125, + "learning_rate": 6.679551423086521e-06, + "loss": 0.80717087, + "memory(GiB)": 135.77, + "step": 35080, + "train_speed(iter/s)": 0.200883 + }, + { + "acc": 0.76191292, + "epoch": 0.8186762711617259, + "grad_norm": 4.9375, + "learning_rate": 6.677771960360806e-06, + "loss": 0.8590991, + "memory(GiB)": 135.77, + "step": 35090, + "train_speed(iter/s)": 0.200912 + }, + { + "acc": 0.76519403, + "epoch": 0.8189095787340148, + "grad_norm": 4.875, + "learning_rate": 6.6759922581336285e-06, + "loss": 0.83596706, + "memory(GiB)": 135.77, + "step": 35100, + "train_speed(iter/s)": 0.200943 + }, + { + "acc": 0.77768407, + "epoch": 0.8191428863063037, + "grad_norm": 7.1875, + "learning_rate": 6.674212316659045e-06, + "loss": 0.79807773, + "memory(GiB)": 135.77, + "step": 35110, + "train_speed(iter/s)": 0.200972 + }, + { + "acc": 0.76411262, + "epoch": 0.8193761938785926, + "grad_norm": 5.09375, + "learning_rate": 6.6724321361911384e-06, + "loss": 0.84451618, + "memory(GiB)": 135.77, + "step": 35120, + "train_speed(iter/s)": 0.201002 + }, + { + "acc": 0.76351461, + "epoch": 0.8196095014508815, + "grad_norm": 4.875, + "learning_rate": 6.6706517169840305e-06, + "loss": 0.85793591, + "memory(GiB)": 135.77, + "step": 35130, + "train_speed(iter/s)": 0.201032 + }, + { + "acc": 0.77693996, + "epoch": 0.8198428090231703, + "grad_norm": 8.75, + "learning_rate": 6.668871059291875e-06, + "loss": 0.78128543, + "memory(GiB)": 135.77, + "step": 35140, + "train_speed(iter/s)": 0.201062 + }, + { + "acc": 0.78849602, + "epoch": 0.8200761165954592, + "grad_norm": 4.5, + "learning_rate": 6.667090163368863e-06, + "loss": 0.7485117, + "memory(GiB)": 135.77, + "step": 35150, + "train_speed(iter/s)": 0.201092 + }, + { + "acc": 0.77250385, + "epoch": 0.8203094241677481, + "grad_norm": 4.5625, + "learning_rate": 6.665309029469214e-06, + "loss": 0.81979828, + "memory(GiB)": 135.77, + "step": 35160, + "train_speed(iter/s)": 0.201123 + }, + { + "acc": 0.77600441, + "epoch": 0.820542731740037, + "grad_norm": 4.71875, + "learning_rate": 6.663527657847182e-06, + "loss": 0.81544447, + "memory(GiB)": 135.77, + "step": 35170, + "train_speed(iter/s)": 0.201153 + }, + { + "acc": 0.789469, + "epoch": 0.8207760393123259, + "grad_norm": 4.40625, + "learning_rate": 6.661746048757061e-06, + "loss": 0.75934238, + "memory(GiB)": 135.77, + "step": 35180, + "train_speed(iter/s)": 0.201184 + }, + { + "acc": 0.75972357, + "epoch": 0.8210093468846148, + "grad_norm": 4.21875, + "learning_rate": 6.6599642024531755e-06, + "loss": 0.87705612, + "memory(GiB)": 135.77, + "step": 35190, + "train_speed(iter/s)": 0.201214 + }, + { + "acc": 0.80127926, + "epoch": 0.8212426544569037, + "grad_norm": 5.125, + "learning_rate": 6.658182119189882e-06, + "loss": 0.71490326, + "memory(GiB)": 135.77, + "step": 35200, + "train_speed(iter/s)": 0.201242 + }, + { + "acc": 0.77492847, + "epoch": 0.8214759620291926, + "grad_norm": 5.125, + "learning_rate": 6.656399799221572e-06, + "loss": 0.79777231, + "memory(GiB)": 135.77, + "step": 35210, + "train_speed(iter/s)": 0.201272 + }, + { + "acc": 0.78476725, + "epoch": 0.8217092696014815, + "grad_norm": 6.0625, + "learning_rate": 6.654617242802672e-06, + "loss": 0.78095455, + "memory(GiB)": 135.77, + "step": 35220, + "train_speed(iter/s)": 0.201303 + }, + { + "acc": 0.78755674, + "epoch": 0.8219425771737704, + "grad_norm": 5.5625, + "learning_rate": 6.652834450187643e-06, + "loss": 0.77176509, + "memory(GiB)": 135.77, + "step": 35230, + "train_speed(iter/s)": 0.201333 + }, + { + "acc": 0.78302078, + "epoch": 0.8221758847460593, + "grad_norm": 4.84375, + "learning_rate": 6.651051421630974e-06, + "loss": 0.77143784, + "memory(GiB)": 135.77, + "step": 35240, + "train_speed(iter/s)": 0.201363 + }, + { + "acc": 0.80360794, + "epoch": 0.8224091923183482, + "grad_norm": 5.5, + "learning_rate": 6.649268157387195e-06, + "loss": 0.69977317, + "memory(GiB)": 135.77, + "step": 35250, + "train_speed(iter/s)": 0.201395 + }, + { + "acc": 0.7762537, + "epoch": 0.8226424998906371, + "grad_norm": 5.65625, + "learning_rate": 6.647484657710867e-06, + "loss": 0.80590124, + "memory(GiB)": 135.77, + "step": 35260, + "train_speed(iter/s)": 0.201424 + }, + { + "acc": 0.79981756, + "epoch": 0.822875807462926, + "grad_norm": 4.875, + "learning_rate": 6.645700922856582e-06, + "loss": 0.71729341, + "memory(GiB)": 135.77, + "step": 35270, + "train_speed(iter/s)": 0.201452 + }, + { + "acc": 0.77900333, + "epoch": 0.8231091150352149, + "grad_norm": 4.25, + "learning_rate": 6.643916953078966e-06, + "loss": 0.79960418, + "memory(GiB)": 135.77, + "step": 35280, + "train_speed(iter/s)": 0.201482 + }, + { + "acc": 0.783249, + "epoch": 0.8233424226075038, + "grad_norm": 4.875, + "learning_rate": 6.642132748632685e-06, + "loss": 0.79931679, + "memory(GiB)": 135.77, + "step": 35290, + "train_speed(iter/s)": 0.201512 + }, + { + "acc": 0.77131052, + "epoch": 0.8235757301797927, + "grad_norm": 7.09375, + "learning_rate": 6.640348309772431e-06, + "loss": 0.82600031, + "memory(GiB)": 135.77, + "step": 35300, + "train_speed(iter/s)": 0.201543 + }, + { + "acc": 0.77425413, + "epoch": 0.8238090377520816, + "grad_norm": 5.8125, + "learning_rate": 6.638563636752932e-06, + "loss": 0.81250858, + "memory(GiB)": 135.77, + "step": 35310, + "train_speed(iter/s)": 0.201574 + }, + { + "acc": 0.7789402, + "epoch": 0.8240423453243705, + "grad_norm": 6.59375, + "learning_rate": 6.63677872982895e-06, + "loss": 0.77923145, + "memory(GiB)": 135.77, + "step": 35320, + "train_speed(iter/s)": 0.201602 + }, + { + "acc": 0.79925971, + "epoch": 0.8242756528966593, + "grad_norm": 5.125, + "learning_rate": 6.634993589255278e-06, + "loss": 0.71169586, + "memory(GiB)": 135.77, + "step": 35330, + "train_speed(iter/s)": 0.201631 + }, + { + "acc": 0.79988279, + "epoch": 0.8245089604689482, + "grad_norm": 6.03125, + "learning_rate": 6.633208215286748e-06, + "loss": 0.70508504, + "memory(GiB)": 135.77, + "step": 35340, + "train_speed(iter/s)": 0.201658 + }, + { + "acc": 0.77253027, + "epoch": 0.8247422680412371, + "grad_norm": 5.15625, + "learning_rate": 6.6314226081782195e-06, + "loss": 0.80938549, + "memory(GiB)": 135.77, + "step": 35350, + "train_speed(iter/s)": 0.201685 + }, + { + "acc": 0.78187051, + "epoch": 0.824975575613526, + "grad_norm": 5.3125, + "learning_rate": 6.6296367681845875e-06, + "loss": 0.76466885, + "memory(GiB)": 135.77, + "step": 35360, + "train_speed(iter/s)": 0.201716 + }, + { + "acc": 0.76568704, + "epoch": 0.8252088831858149, + "grad_norm": 5.5, + "learning_rate": 6.62785069556078e-06, + "loss": 0.86667337, + "memory(GiB)": 135.77, + "step": 35370, + "train_speed(iter/s)": 0.201747 + }, + { + "acc": 0.78236561, + "epoch": 0.8254421907581038, + "grad_norm": 5.21875, + "learning_rate": 6.6260643905617605e-06, + "loss": 0.76878605, + "memory(GiB)": 135.77, + "step": 35380, + "train_speed(iter/s)": 0.201776 + }, + { + "acc": 0.80533562, + "epoch": 0.8256754983303927, + "grad_norm": 7.34375, + "learning_rate": 6.624277853442519e-06, + "loss": 0.69753013, + "memory(GiB)": 135.77, + "step": 35390, + "train_speed(iter/s)": 0.201806 + }, + { + "acc": 0.78438272, + "epoch": 0.8259088059026816, + "grad_norm": 7.625, + "learning_rate": 6.622491084458087e-06, + "loss": 0.75821857, + "memory(GiB)": 135.77, + "step": 35400, + "train_speed(iter/s)": 0.201837 + }, + { + "acc": 0.77371559, + "epoch": 0.8261421134749705, + "grad_norm": 5.3125, + "learning_rate": 6.620704083863523e-06, + "loss": 0.82229176, + "memory(GiB)": 135.77, + "step": 35410, + "train_speed(iter/s)": 0.201866 + }, + { + "acc": 0.77689691, + "epoch": 0.8263754210472594, + "grad_norm": 5.15625, + "learning_rate": 6.618916851913923e-06, + "loss": 0.79822955, + "memory(GiB)": 135.77, + "step": 35420, + "train_speed(iter/s)": 0.201896 + }, + { + "acc": 0.76417141, + "epoch": 0.8266087286195483, + "grad_norm": 10.0, + "learning_rate": 6.617129388864412e-06, + "loss": 0.86350403, + "memory(GiB)": 135.77, + "step": 35430, + "train_speed(iter/s)": 0.201926 + }, + { + "acc": 0.7825284, + "epoch": 0.8268420361918372, + "grad_norm": 11.75, + "learning_rate": 6.615341694970151e-06, + "loss": 0.76954165, + "memory(GiB)": 135.77, + "step": 35440, + "train_speed(iter/s)": 0.201953 + }, + { + "acc": 0.78282237, + "epoch": 0.8270753437641261, + "grad_norm": 7.4375, + "learning_rate": 6.613553770486333e-06, + "loss": 0.77155132, + "memory(GiB)": 135.77, + "step": 35450, + "train_speed(iter/s)": 0.201981 + }, + { + "acc": 0.76642599, + "epoch": 0.827308651336415, + "grad_norm": 4.28125, + "learning_rate": 6.611765615668182e-06, + "loss": 0.8688242, + "memory(GiB)": 135.77, + "step": 35460, + "train_speed(iter/s)": 0.202011 + }, + { + "acc": 0.79817481, + "epoch": 0.8275419589087039, + "grad_norm": 4.9375, + "learning_rate": 6.609977230770957e-06, + "loss": 0.71652417, + "memory(GiB)": 135.77, + "step": 35470, + "train_speed(iter/s)": 0.20204 + }, + { + "acc": 0.76829128, + "epoch": 0.8277752664809928, + "grad_norm": 5.5625, + "learning_rate": 6.608188616049951e-06, + "loss": 0.83743439, + "memory(GiB)": 135.77, + "step": 35480, + "train_speed(iter/s)": 0.20207 + }, + { + "acc": 0.78781743, + "epoch": 0.8280085740532817, + "grad_norm": 7.53125, + "learning_rate": 6.606399771760487e-06, + "loss": 0.76809006, + "memory(GiB)": 135.77, + "step": 35490, + "train_speed(iter/s)": 0.2021 + }, + { + "acc": 0.78514261, + "epoch": 0.8282418816255706, + "grad_norm": 6.03125, + "learning_rate": 6.6046106981579216e-06, + "loss": 0.76895576, + "memory(GiB)": 135.77, + "step": 35500, + "train_speed(iter/s)": 0.202131 + }, + { + "epoch": 0.8282418816255706, + "eval_acc": 0.7427417429233465, + "eval_loss": 0.8110933899879456, + "eval_runtime": 1270.2164, + "eval_samples_per_second": 28.335, + "eval_steps_per_second": 14.168, + "step": 35500 + }, + { + "acc": 0.7827517, + "epoch": 0.8284751891978595, + "grad_norm": 5.8125, + "learning_rate": 6.6028213954976474e-06, + "loss": 0.79220624, + "memory(GiB)": 135.77, + "step": 35510, + "train_speed(iter/s)": 0.200689 + }, + { + "acc": 0.77409925, + "epoch": 0.8287084967701484, + "grad_norm": 6.15625, + "learning_rate": 6.601031864035082e-06, + "loss": 0.80091858, + "memory(GiB)": 135.77, + "step": 35520, + "train_speed(iter/s)": 0.200717 + }, + { + "acc": 0.78355837, + "epoch": 0.8289418043424371, + "grad_norm": 5.25, + "learning_rate": 6.5992421040256834e-06, + "loss": 0.75921507, + "memory(GiB)": 135.77, + "step": 35530, + "train_speed(iter/s)": 0.200747 + }, + { + "acc": 0.76949062, + "epoch": 0.829175111914726, + "grad_norm": 5.375, + "learning_rate": 6.597452115724939e-06, + "loss": 0.81914463, + "memory(GiB)": 135.77, + "step": 35540, + "train_speed(iter/s)": 0.200775 + }, + { + "acc": 0.77408943, + "epoch": 0.8294084194870149, + "grad_norm": 5.59375, + "learning_rate": 6.5956618993883716e-06, + "loss": 0.80963411, + "memory(GiB)": 135.77, + "step": 35550, + "train_speed(iter/s)": 0.200806 + }, + { + "acc": 0.80136003, + "epoch": 0.8296417270593038, + "grad_norm": 4.53125, + "learning_rate": 6.59387145527153e-06, + "loss": 0.71679068, + "memory(GiB)": 135.77, + "step": 35560, + "train_speed(iter/s)": 0.200835 + }, + { + "acc": 0.76815443, + "epoch": 0.8298750346315927, + "grad_norm": 5.4375, + "learning_rate": 6.59208078363e-06, + "loss": 0.8367054, + "memory(GiB)": 135.77, + "step": 35570, + "train_speed(iter/s)": 0.200866 + }, + { + "acc": 0.7694231, + "epoch": 0.8301083422038816, + "grad_norm": 5.34375, + "learning_rate": 6.590289884719403e-06, + "loss": 0.85313902, + "memory(GiB)": 135.77, + "step": 35580, + "train_speed(iter/s)": 0.200898 + }, + { + "acc": 0.79432936, + "epoch": 0.8303416497761705, + "grad_norm": 5.09375, + "learning_rate": 6.588498758795386e-06, + "loss": 0.75772634, + "memory(GiB)": 135.77, + "step": 35590, + "train_speed(iter/s)": 0.200927 + }, + { + "acc": 0.77298503, + "epoch": 0.8305749573484594, + "grad_norm": 5.625, + "learning_rate": 6.586707406113632e-06, + "loss": 0.81344709, + "memory(GiB)": 135.77, + "step": 35600, + "train_speed(iter/s)": 0.200956 + }, + { + "acc": 0.78691454, + "epoch": 0.8308082649207483, + "grad_norm": 5.5, + "learning_rate": 6.5849158269298565e-06, + "loss": 0.76828012, + "memory(GiB)": 135.77, + "step": 35610, + "train_speed(iter/s)": 0.200984 + }, + { + "acc": 0.76740513, + "epoch": 0.8310415724930372, + "grad_norm": 6.34375, + "learning_rate": 6.583124021499807e-06, + "loss": 0.83374586, + "memory(GiB)": 135.77, + "step": 35620, + "train_speed(iter/s)": 0.201013 + }, + { + "acc": 0.76556411, + "epoch": 0.8312748800653261, + "grad_norm": 6.0625, + "learning_rate": 6.581331990079264e-06, + "loss": 0.84218559, + "memory(GiB)": 135.77, + "step": 35630, + "train_speed(iter/s)": 0.201041 + }, + { + "acc": 0.77049723, + "epoch": 0.831508187637615, + "grad_norm": 5.75, + "learning_rate": 6.579539732924038e-06, + "loss": 0.83664169, + "memory(GiB)": 135.77, + "step": 35640, + "train_speed(iter/s)": 0.201069 + }, + { + "acc": 0.76034393, + "epoch": 0.8317414952099039, + "grad_norm": 7.5625, + "learning_rate": 6.5777472502899765e-06, + "loss": 0.87293215, + "memory(GiB)": 135.77, + "step": 35650, + "train_speed(iter/s)": 0.201097 + }, + { + "acc": 0.78872604, + "epoch": 0.8319748027821928, + "grad_norm": 5.03125, + "learning_rate": 6.5759545424329514e-06, + "loss": 0.7763586, + "memory(GiB)": 135.77, + "step": 35660, + "train_speed(iter/s)": 0.201127 + }, + { + "acc": 0.76089096, + "epoch": 0.8322081103544817, + "grad_norm": 5.90625, + "learning_rate": 6.574161609608873e-06, + "loss": 0.84671097, + "memory(GiB)": 135.77, + "step": 35670, + "train_speed(iter/s)": 0.201157 + }, + { + "acc": 0.78287115, + "epoch": 0.8324414179267706, + "grad_norm": 7.78125, + "learning_rate": 6.572368452073683e-06, + "loss": 0.79170933, + "memory(GiB)": 135.77, + "step": 35680, + "train_speed(iter/s)": 0.201186 + }, + { + "acc": 0.79201784, + "epoch": 0.8326747254990595, + "grad_norm": 5.46875, + "learning_rate": 6.570575070083351e-06, + "loss": 0.72850819, + "memory(GiB)": 135.77, + "step": 35690, + "train_speed(iter/s)": 0.201217 + }, + { + "acc": 0.76989765, + "epoch": 0.8329080330713484, + "grad_norm": 4.3125, + "learning_rate": 6.5687814638938865e-06, + "loss": 0.80628548, + "memory(GiB)": 135.77, + "step": 35700, + "train_speed(iter/s)": 0.201247 + }, + { + "acc": 0.77745337, + "epoch": 0.8331413406436373, + "grad_norm": 4.78125, + "learning_rate": 6.566987633761323e-06, + "loss": 0.80288849, + "memory(GiB)": 135.77, + "step": 35710, + "train_speed(iter/s)": 0.201276 + }, + { + "acc": 0.77031932, + "epoch": 0.8333746482159261, + "grad_norm": 5.25, + "learning_rate": 6.5651935799417295e-06, + "loss": 0.84479132, + "memory(GiB)": 135.77, + "step": 35720, + "train_speed(iter/s)": 0.201306 + }, + { + "acc": 0.7754509, + "epoch": 0.833607955788215, + "grad_norm": 5.53125, + "learning_rate": 6.563399302691209e-06, + "loss": 0.78503389, + "memory(GiB)": 135.77, + "step": 35730, + "train_speed(iter/s)": 0.201336 + }, + { + "acc": 0.77183795, + "epoch": 0.8338412633605039, + "grad_norm": 5.65625, + "learning_rate": 6.561604802265891e-06, + "loss": 0.82641964, + "memory(GiB)": 135.77, + "step": 35740, + "train_speed(iter/s)": 0.201364 + }, + { + "acc": 0.78574123, + "epoch": 0.8340745709327928, + "grad_norm": 4.21875, + "learning_rate": 6.55981007892194e-06, + "loss": 0.76580262, + "memory(GiB)": 135.77, + "step": 35750, + "train_speed(iter/s)": 0.201393 + }, + { + "acc": 0.79073353, + "epoch": 0.8343078785050817, + "grad_norm": 7.90625, + "learning_rate": 6.558015132915554e-06, + "loss": 0.7411293, + "memory(GiB)": 135.77, + "step": 35760, + "train_speed(iter/s)": 0.201422 + }, + { + "acc": 0.77999744, + "epoch": 0.8345411860773706, + "grad_norm": 5.5, + "learning_rate": 6.556219964502961e-06, + "loss": 0.77981787, + "memory(GiB)": 135.77, + "step": 35770, + "train_speed(iter/s)": 0.201452 + }, + { + "acc": 0.77890453, + "epoch": 0.8347744936496595, + "grad_norm": 6.5, + "learning_rate": 6.5544245739404196e-06, + "loss": 0.76951075, + "memory(GiB)": 135.77, + "step": 35780, + "train_speed(iter/s)": 0.201478 + }, + { + "acc": 0.78939867, + "epoch": 0.8350078012219484, + "grad_norm": 5.4375, + "learning_rate": 6.552628961484222e-06, + "loss": 0.76422501, + "memory(GiB)": 135.77, + "step": 35790, + "train_speed(iter/s)": 0.201507 + }, + { + "acc": 0.78203793, + "epoch": 0.8352411087942373, + "grad_norm": 8.1875, + "learning_rate": 6.550833127390692e-06, + "loss": 0.7660037, + "memory(GiB)": 135.77, + "step": 35800, + "train_speed(iter/s)": 0.201535 + }, + { + "acc": 0.7650146, + "epoch": 0.8354744163665262, + "grad_norm": 4.96875, + "learning_rate": 6.549037071916184e-06, + "loss": 0.87431145, + "memory(GiB)": 135.77, + "step": 35810, + "train_speed(iter/s)": 0.201565 + }, + { + "acc": 0.75421095, + "epoch": 0.8357077239388151, + "grad_norm": 4.21875, + "learning_rate": 6.547240795317081e-06, + "loss": 0.86133585, + "memory(GiB)": 135.77, + "step": 35820, + "train_speed(iter/s)": 0.201593 + }, + { + "acc": 0.76499453, + "epoch": 0.835941031511104, + "grad_norm": 8.5625, + "learning_rate": 6.545444297849808e-06, + "loss": 0.85870771, + "memory(GiB)": 135.77, + "step": 35830, + "train_speed(iter/s)": 0.201622 + }, + { + "acc": 0.77958255, + "epoch": 0.8361743390833929, + "grad_norm": 6.0625, + "learning_rate": 6.543647579770806e-06, + "loss": 0.78675489, + "memory(GiB)": 135.77, + "step": 35840, + "train_speed(iter/s)": 0.20165 + }, + { + "acc": 0.77734413, + "epoch": 0.8364076466556818, + "grad_norm": 5.65625, + "learning_rate": 6.5418506413365634e-06, + "loss": 0.78364563, + "memory(GiB)": 135.77, + "step": 35850, + "train_speed(iter/s)": 0.201679 + }, + { + "acc": 0.78868957, + "epoch": 0.8366409542279707, + "grad_norm": 5.59375, + "learning_rate": 6.5400534828035885e-06, + "loss": 0.74761543, + "memory(GiB)": 135.77, + "step": 35860, + "train_speed(iter/s)": 0.201708 + }, + { + "acc": 0.76377254, + "epoch": 0.8368742618002596, + "grad_norm": 5.90625, + "learning_rate": 6.538256104428427e-06, + "loss": 0.85419159, + "memory(GiB)": 135.77, + "step": 35870, + "train_speed(iter/s)": 0.201738 + }, + { + "acc": 0.77922173, + "epoch": 0.8371075693725485, + "grad_norm": 6.5, + "learning_rate": 6.536458506467654e-06, + "loss": 0.78688426, + "memory(GiB)": 135.77, + "step": 35880, + "train_speed(iter/s)": 0.201765 + }, + { + "acc": 0.76489186, + "epoch": 0.8373408769448374, + "grad_norm": 4.875, + "learning_rate": 6.5346606891778755e-06, + "loss": 0.84113293, + "memory(GiB)": 135.77, + "step": 35890, + "train_speed(iter/s)": 0.201795 + }, + { + "acc": 0.77273259, + "epoch": 0.8375741845171263, + "grad_norm": 7.375, + "learning_rate": 6.532862652815728e-06, + "loss": 0.8084506, + "memory(GiB)": 135.77, + "step": 35900, + "train_speed(iter/s)": 0.201825 + }, + { + "acc": 0.74848862, + "epoch": 0.8378074920894151, + "grad_norm": 7.0625, + "learning_rate": 6.531064397637883e-06, + "loss": 0.90964317, + "memory(GiB)": 135.77, + "step": 35910, + "train_speed(iter/s)": 0.201853 + }, + { + "acc": 0.78161101, + "epoch": 0.838040799661704, + "grad_norm": 5.8125, + "learning_rate": 6.529265923901039e-06, + "loss": 0.78454123, + "memory(GiB)": 135.77, + "step": 35920, + "train_speed(iter/s)": 0.201882 + }, + { + "acc": 0.78447218, + "epoch": 0.8382741072339929, + "grad_norm": 5.25, + "learning_rate": 6.527467231861929e-06, + "loss": 0.76308413, + "memory(GiB)": 135.77, + "step": 35930, + "train_speed(iter/s)": 0.201912 + }, + { + "acc": 0.78148451, + "epoch": 0.8385074148062818, + "grad_norm": 97.5, + "learning_rate": 6.525668321777317e-06, + "loss": 0.78811293, + "memory(GiB)": 135.77, + "step": 35940, + "train_speed(iter/s)": 0.201942 + }, + { + "acc": 0.76809978, + "epoch": 0.8387407223785707, + "grad_norm": 5.09375, + "learning_rate": 6.523869193903994e-06, + "loss": 0.84421577, + "memory(GiB)": 135.77, + "step": 35950, + "train_speed(iter/s)": 0.201973 + }, + { + "acc": 0.76690893, + "epoch": 0.8389740299508596, + "grad_norm": 4.28125, + "learning_rate": 6.522069848498787e-06, + "loss": 0.8514679, + "memory(GiB)": 135.77, + "step": 35960, + "train_speed(iter/s)": 0.202001 + }, + { + "acc": 0.78587494, + "epoch": 0.8392073375231485, + "grad_norm": 7.875, + "learning_rate": 6.5202702858185495e-06, + "loss": 0.75838385, + "memory(GiB)": 135.77, + "step": 35970, + "train_speed(iter/s)": 0.20203 + }, + { + "acc": 0.77815952, + "epoch": 0.8394406450954374, + "grad_norm": 5.15625, + "learning_rate": 6.518470506120171e-06, + "loss": 0.7992382, + "memory(GiB)": 135.77, + "step": 35980, + "train_speed(iter/s)": 0.202058 + }, + { + "acc": 0.77503004, + "epoch": 0.8396739526677263, + "grad_norm": 6.75, + "learning_rate": 6.51667050966057e-06, + "loss": 0.79922199, + "memory(GiB)": 135.77, + "step": 35990, + "train_speed(iter/s)": 0.202088 + }, + { + "acc": 0.76703901, + "epoch": 0.8399072602400152, + "grad_norm": 5.875, + "learning_rate": 6.514870296696694e-06, + "loss": 0.84329872, + "memory(GiB)": 135.77, + "step": 36000, + "train_speed(iter/s)": 0.202117 + }, + { + "epoch": 0.8399072602400152, + "eval_acc": 0.7429556956177011, + "eval_loss": 0.8103926777839661, + "eval_runtime": 1268.5973, + "eval_samples_per_second": 28.371, + "eval_steps_per_second": 14.186, + "step": 36000 + }, + { + "acc": 0.76743212, + "epoch": 0.8401405678123041, + "grad_norm": 5.5, + "learning_rate": 6.513069867485523e-06, + "loss": 0.8609807, + "memory(GiB)": 135.77, + "step": 36010, + "train_speed(iter/s)": 0.2007 + }, + { + "acc": 0.75457373, + "epoch": 0.840373875384593, + "grad_norm": 6.21875, + "learning_rate": 6.511269222284069e-06, + "loss": 0.86449337, + "memory(GiB)": 135.77, + "step": 36020, + "train_speed(iter/s)": 0.200728 + }, + { + "acc": 0.76079054, + "epoch": 0.8406071829568819, + "grad_norm": 4.71875, + "learning_rate": 6.509468361349371e-06, + "loss": 0.87414017, + "memory(GiB)": 135.77, + "step": 36030, + "train_speed(iter/s)": 0.200758 + }, + { + "acc": 0.79017248, + "epoch": 0.8408404905291708, + "grad_norm": 4.625, + "learning_rate": 6.507667284938502e-06, + "loss": 0.75717969, + "memory(GiB)": 135.77, + "step": 36040, + "train_speed(iter/s)": 0.200785 + }, + { + "acc": 0.77153301, + "epoch": 0.8410737981014597, + "grad_norm": 5.5, + "learning_rate": 6.505865993308568e-06, + "loss": 0.84561863, + "memory(GiB)": 135.77, + "step": 36050, + "train_speed(iter/s)": 0.200815 + }, + { + "acc": 0.7702539, + "epoch": 0.8413071056737486, + "grad_norm": 10.75, + "learning_rate": 6.5040644867167e-06, + "loss": 0.83923016, + "memory(GiB)": 135.77, + "step": 36060, + "train_speed(iter/s)": 0.200843 + }, + { + "acc": 0.78410187, + "epoch": 0.8415404132460375, + "grad_norm": 7.4375, + "learning_rate": 6.502262765420064e-06, + "loss": 0.78843861, + "memory(GiB)": 135.77, + "step": 36070, + "train_speed(iter/s)": 0.200871 + }, + { + "acc": 0.76989207, + "epoch": 0.8417737208183264, + "grad_norm": 5.03125, + "learning_rate": 6.500460829675854e-06, + "loss": 0.81716394, + "memory(GiB)": 135.77, + "step": 36080, + "train_speed(iter/s)": 0.2009 + }, + { + "acc": 0.75923405, + "epoch": 0.8420070283906153, + "grad_norm": 6.4375, + "learning_rate": 6.498658679741298e-06, + "loss": 0.86520176, + "memory(GiB)": 135.77, + "step": 36090, + "train_speed(iter/s)": 0.200925 + }, + { + "acc": 0.79458346, + "epoch": 0.842240335962904, + "grad_norm": 6.75, + "learning_rate": 6.49685631587365e-06, + "loss": 0.74665785, + "memory(GiB)": 135.77, + "step": 36100, + "train_speed(iter/s)": 0.200953 + }, + { + "acc": 0.7725565, + "epoch": 0.8424736435351929, + "grad_norm": 6.46875, + "learning_rate": 6.495053738330196e-06, + "loss": 0.80447502, + "memory(GiB)": 135.77, + "step": 36110, + "train_speed(iter/s)": 0.200982 + }, + { + "acc": 0.78278484, + "epoch": 0.8427069511074818, + "grad_norm": 5.5625, + "learning_rate": 6.493250947368257e-06, + "loss": 0.77641745, + "memory(GiB)": 135.77, + "step": 36120, + "train_speed(iter/s)": 0.20101 + }, + { + "acc": 0.79837875, + "epoch": 0.8429402586797707, + "grad_norm": 5.0625, + "learning_rate": 6.491447943245179e-06, + "loss": 0.71321259, + "memory(GiB)": 135.77, + "step": 36130, + "train_speed(iter/s)": 0.20104 + }, + { + "acc": 0.79435396, + "epoch": 0.8431735662520596, + "grad_norm": 5.125, + "learning_rate": 6.489644726218339e-06, + "loss": 0.7343874, + "memory(GiB)": 135.77, + "step": 36140, + "train_speed(iter/s)": 0.201069 + }, + { + "acc": 0.78385534, + "epoch": 0.8434068738243485, + "grad_norm": 6.59375, + "learning_rate": 6.4878412965451485e-06, + "loss": 0.79530344, + "memory(GiB)": 135.77, + "step": 36150, + "train_speed(iter/s)": 0.201099 + }, + { + "acc": 0.78743668, + "epoch": 0.8436401813966374, + "grad_norm": 7.125, + "learning_rate": 6.486037654483046e-06, + "loss": 0.74157157, + "memory(GiB)": 135.77, + "step": 36160, + "train_speed(iter/s)": 0.201127 + }, + { + "acc": 0.80141068, + "epoch": 0.8438734889689263, + "grad_norm": 4.8125, + "learning_rate": 6.484233800289499e-06, + "loss": 0.70053635, + "memory(GiB)": 135.77, + "step": 36170, + "train_speed(iter/s)": 0.201156 + }, + { + "acc": 0.76834717, + "epoch": 0.8441067965412152, + "grad_norm": 6.09375, + "learning_rate": 6.482429734222008e-06, + "loss": 0.85586014, + "memory(GiB)": 135.77, + "step": 36180, + "train_speed(iter/s)": 0.201183 + }, + { + "acc": 0.77482719, + "epoch": 0.8443401041135041, + "grad_norm": 6.0625, + "learning_rate": 6.4806254565381025e-06, + "loss": 0.81930189, + "memory(GiB)": 135.77, + "step": 36190, + "train_speed(iter/s)": 0.20121 + }, + { + "acc": 0.81716328, + "epoch": 0.844573411685793, + "grad_norm": 5.375, + "learning_rate": 6.478820967495343e-06, + "loss": 0.6350563, + "memory(GiB)": 135.77, + "step": 36200, + "train_speed(iter/s)": 0.201239 + }, + { + "acc": 0.77658648, + "epoch": 0.8448067192580819, + "grad_norm": 4.34375, + "learning_rate": 6.47701626735132e-06, + "loss": 0.78943858, + "memory(GiB)": 135.77, + "step": 36210, + "train_speed(iter/s)": 0.201265 + }, + { + "acc": 0.78699703, + "epoch": 0.8450400268303708, + "grad_norm": 5.96875, + "learning_rate": 6.475211356363655e-06, + "loss": 0.75699453, + "memory(GiB)": 135.77, + "step": 36220, + "train_speed(iter/s)": 0.201292 + }, + { + "acc": 0.80862513, + "epoch": 0.8452733344026597, + "grad_norm": 4.625, + "learning_rate": 6.473406234789998e-06, + "loss": 0.67446146, + "memory(GiB)": 135.77, + "step": 36230, + "train_speed(iter/s)": 0.20132 + }, + { + "acc": 0.77075014, + "epoch": 0.8455066419749486, + "grad_norm": 6.25, + "learning_rate": 6.471600902888029e-06, + "loss": 0.81247129, + "memory(GiB)": 135.77, + "step": 36240, + "train_speed(iter/s)": 0.20135 + }, + { + "acc": 0.77357235, + "epoch": 0.8457399495472375, + "grad_norm": 6.21875, + "learning_rate": 6.4697953609154575e-06, + "loss": 0.81807146, + "memory(GiB)": 135.77, + "step": 36250, + "train_speed(iter/s)": 0.201378 + }, + { + "acc": 0.78533292, + "epoch": 0.8459732571195264, + "grad_norm": 5.34375, + "learning_rate": 6.467989609130024e-06, + "loss": 0.78466349, + "memory(GiB)": 135.77, + "step": 36260, + "train_speed(iter/s)": 0.201406 + }, + { + "acc": 0.77039223, + "epoch": 0.8462065646918153, + "grad_norm": 5.34375, + "learning_rate": 6.466183647789502e-06, + "loss": 0.83654995, + "memory(GiB)": 135.77, + "step": 36270, + "train_speed(iter/s)": 0.201434 + }, + { + "acc": 0.7773541, + "epoch": 0.8464398722641042, + "grad_norm": 4.40625, + "learning_rate": 6.46437747715169e-06, + "loss": 0.79755011, + "memory(GiB)": 135.77, + "step": 36280, + "train_speed(iter/s)": 0.201462 + }, + { + "acc": 0.77812119, + "epoch": 0.8466731798363931, + "grad_norm": 5.46875, + "learning_rate": 6.462571097474419e-06, + "loss": 0.78683367, + "memory(GiB)": 135.77, + "step": 36290, + "train_speed(iter/s)": 0.201491 + }, + { + "acc": 0.78762054, + "epoch": 0.8469064874086819, + "grad_norm": 4.28125, + "learning_rate": 6.460764509015547e-06, + "loss": 0.76013975, + "memory(GiB)": 135.77, + "step": 36300, + "train_speed(iter/s)": 0.201516 + }, + { + "acc": 0.78863735, + "epoch": 0.8471397949809708, + "grad_norm": 6.125, + "learning_rate": 6.4589577120329685e-06, + "loss": 0.7554966, + "memory(GiB)": 135.77, + "step": 36310, + "train_speed(iter/s)": 0.201544 + }, + { + "acc": 0.78832941, + "epoch": 0.8473731025532597, + "grad_norm": 6.375, + "learning_rate": 6.4571507067845985e-06, + "loss": 0.78529215, + "memory(GiB)": 135.77, + "step": 36320, + "train_speed(iter/s)": 0.201569 + }, + { + "acc": 0.78547802, + "epoch": 0.8476064101255486, + "grad_norm": 4.8125, + "learning_rate": 6.455343493528388e-06, + "loss": 0.77054935, + "memory(GiB)": 135.77, + "step": 36330, + "train_speed(iter/s)": 0.201598 + }, + { + "acc": 0.7642107, + "epoch": 0.8478397176978375, + "grad_norm": 5.96875, + "learning_rate": 6.4535360725223175e-06, + "loss": 0.85638161, + "memory(GiB)": 135.77, + "step": 36340, + "train_speed(iter/s)": 0.201628 + }, + { + "acc": 0.77951527, + "epoch": 0.8480730252701264, + "grad_norm": 4.15625, + "learning_rate": 6.451728444024394e-06, + "loss": 0.77187543, + "memory(GiB)": 135.77, + "step": 36350, + "train_speed(iter/s)": 0.201656 + }, + { + "acc": 0.78816013, + "epoch": 0.8483063328424153, + "grad_norm": 4.84375, + "learning_rate": 6.449920608292658e-06, + "loss": 0.75378218, + "memory(GiB)": 135.77, + "step": 36360, + "train_speed(iter/s)": 0.201683 + }, + { + "acc": 0.7518692, + "epoch": 0.8485396404147042, + "grad_norm": 5.65625, + "learning_rate": 6.448112565585176e-06, + "loss": 0.89817324, + "memory(GiB)": 135.77, + "step": 36370, + "train_speed(iter/s)": 0.201712 + }, + { + "acc": 0.77250676, + "epoch": 0.8487729479869931, + "grad_norm": 4.34375, + "learning_rate": 6.446304316160046e-06, + "loss": 0.80824423, + "memory(GiB)": 135.77, + "step": 36380, + "train_speed(iter/s)": 0.20174 + }, + { + "acc": 0.78365912, + "epoch": 0.849006255559282, + "grad_norm": 4.40625, + "learning_rate": 6.444495860275395e-06, + "loss": 0.76890135, + "memory(GiB)": 135.77, + "step": 36390, + "train_speed(iter/s)": 0.201768 + }, + { + "acc": 0.75655127, + "epoch": 0.8492395631315709, + "grad_norm": 5.625, + "learning_rate": 6.442687198189379e-06, + "loss": 0.86654816, + "memory(GiB)": 135.77, + "step": 36400, + "train_speed(iter/s)": 0.201797 + }, + { + "acc": 0.77825761, + "epoch": 0.8494728707038598, + "grad_norm": 5.3125, + "learning_rate": 6.440878330160185e-06, + "loss": 0.79696665, + "memory(GiB)": 135.77, + "step": 36410, + "train_speed(iter/s)": 0.201829 + }, + { + "acc": 0.78085461, + "epoch": 0.8497061782761487, + "grad_norm": 4.46875, + "learning_rate": 6.439069256446027e-06, + "loss": 0.78490944, + "memory(GiB)": 135.77, + "step": 36420, + "train_speed(iter/s)": 0.201857 + }, + { + "acc": 0.78769636, + "epoch": 0.8499394858484376, + "grad_norm": 7.34375, + "learning_rate": 6.437259977305152e-06, + "loss": 0.77664795, + "memory(GiB)": 135.77, + "step": 36430, + "train_speed(iter/s)": 0.201887 + }, + { + "acc": 0.77704792, + "epoch": 0.8501727934207265, + "grad_norm": 4.875, + "learning_rate": 6.435450492995833e-06, + "loss": 0.80361004, + "memory(GiB)": 135.77, + "step": 36440, + "train_speed(iter/s)": 0.201916 + }, + { + "acc": 0.79341078, + "epoch": 0.8504061009930154, + "grad_norm": 5.21875, + "learning_rate": 6.433640803776372e-06, + "loss": 0.7458343, + "memory(GiB)": 135.77, + "step": 36450, + "train_speed(iter/s)": 0.201944 + }, + { + "acc": 0.78444099, + "epoch": 0.8506394085653043, + "grad_norm": 6.5, + "learning_rate": 6.431830909905105e-06, + "loss": 0.77440691, + "memory(GiB)": 135.77, + "step": 36460, + "train_speed(iter/s)": 0.20197 + }, + { + "acc": 0.78535118, + "epoch": 0.8508727161375932, + "grad_norm": 4.28125, + "learning_rate": 6.43002081164039e-06, + "loss": 0.76964531, + "memory(GiB)": 135.77, + "step": 36470, + "train_speed(iter/s)": 0.201997 + }, + { + "acc": 0.7824791, + "epoch": 0.8511060237098821, + "grad_norm": 6.15625, + "learning_rate": 6.428210509240618e-06, + "loss": 0.80835476, + "memory(GiB)": 135.77, + "step": 36480, + "train_speed(iter/s)": 0.202024 + }, + { + "acc": 0.79336419, + "epoch": 0.8513393312821709, + "grad_norm": 4.90625, + "learning_rate": 6.426400002964211e-06, + "loss": 0.73920593, + "memory(GiB)": 135.77, + "step": 36490, + "train_speed(iter/s)": 0.202054 + }, + { + "acc": 0.77750196, + "epoch": 0.8515726388544598, + "grad_norm": 4.84375, + "learning_rate": 6.42458929306962e-06, + "loss": 0.81381168, + "memory(GiB)": 135.77, + "step": 36500, + "train_speed(iter/s)": 0.202082 + }, + { + "epoch": 0.8515726388544598, + "eval_acc": 0.7429995751373517, + "eval_loss": 0.8101937174797058, + "eval_runtime": 1268.6867, + "eval_samples_per_second": 28.369, + "eval_steps_per_second": 14.185, + "step": 36500 + }, + { + "acc": 0.78311806, + "epoch": 0.8518059464267487, + "grad_norm": 5.34375, + "learning_rate": 6.42277837981532e-06, + "loss": 0.75560861, + "memory(GiB)": 135.77, + "step": 36510, + "train_speed(iter/s)": 0.200682 + }, + { + "acc": 0.76936331, + "epoch": 0.8520392539990376, + "grad_norm": 7.1875, + "learning_rate": 6.420967263459821e-06, + "loss": 0.82802315, + "memory(GiB)": 135.77, + "step": 36520, + "train_speed(iter/s)": 0.200712 + }, + { + "acc": 0.76965437, + "epoch": 0.8522725615713265, + "grad_norm": 5.3125, + "learning_rate": 6.419155944261657e-06, + "loss": 0.82444363, + "memory(GiB)": 135.77, + "step": 36530, + "train_speed(iter/s)": 0.200741 + }, + { + "acc": 0.77062273, + "epoch": 0.8525058691436154, + "grad_norm": 13.5, + "learning_rate": 6.4173444224793935e-06, + "loss": 0.82105751, + "memory(GiB)": 135.77, + "step": 36540, + "train_speed(iter/s)": 0.200772 + }, + { + "acc": 0.77684073, + "epoch": 0.8527391767159043, + "grad_norm": 5.96875, + "learning_rate": 6.415532698371625e-06, + "loss": 0.80583134, + "memory(GiB)": 135.77, + "step": 36550, + "train_speed(iter/s)": 0.200801 + }, + { + "acc": 0.78810062, + "epoch": 0.8529724842881932, + "grad_norm": 4.25, + "learning_rate": 6.413720772196976e-06, + "loss": 0.76445894, + "memory(GiB)": 135.77, + "step": 36560, + "train_speed(iter/s)": 0.200832 + }, + { + "acc": 0.7537466, + "epoch": 0.8532057918604821, + "grad_norm": 6.34375, + "learning_rate": 6.411908644214098e-06, + "loss": 0.89372339, + "memory(GiB)": 135.77, + "step": 36570, + "train_speed(iter/s)": 0.200861 + }, + { + "acc": 0.77111721, + "epoch": 0.853439099432771, + "grad_norm": 5.53125, + "learning_rate": 6.410096314681671e-06, + "loss": 0.82660351, + "memory(GiB)": 135.77, + "step": 36580, + "train_speed(iter/s)": 0.200892 + }, + { + "acc": 0.76293507, + "epoch": 0.8536724070050599, + "grad_norm": 5.34375, + "learning_rate": 6.408283783858405e-06, + "loss": 0.87582846, + "memory(GiB)": 135.77, + "step": 36590, + "train_speed(iter/s)": 0.20092 + }, + { + "acc": 0.77199388, + "epoch": 0.8539057145773488, + "grad_norm": 5.84375, + "learning_rate": 6.406471052003036e-06, + "loss": 0.82917595, + "memory(GiB)": 135.77, + "step": 36600, + "train_speed(iter/s)": 0.200949 + }, + { + "acc": 0.78203735, + "epoch": 0.8541390221496377, + "grad_norm": 5.28125, + "learning_rate": 6.4046581193743344e-06, + "loss": 0.79196444, + "memory(GiB)": 135.77, + "step": 36610, + "train_speed(iter/s)": 0.200978 + }, + { + "acc": 0.7757906, + "epoch": 0.8543723297219266, + "grad_norm": 5.6875, + "learning_rate": 6.402844986231094e-06, + "loss": 0.81577177, + "memory(GiB)": 135.77, + "step": 36620, + "train_speed(iter/s)": 0.201007 + }, + { + "acc": 0.77216077, + "epoch": 0.8546056372942155, + "grad_norm": 5.1875, + "learning_rate": 6.401031652832141e-06, + "loss": 0.79967194, + "memory(GiB)": 135.77, + "step": 36630, + "train_speed(iter/s)": 0.201038 + }, + { + "acc": 0.78772125, + "epoch": 0.8548389448665044, + "grad_norm": 4.875, + "learning_rate": 6.3992181194363234e-06, + "loss": 0.75887017, + "memory(GiB)": 135.77, + "step": 36640, + "train_speed(iter/s)": 0.201067 + }, + { + "acc": 0.76235476, + "epoch": 0.8550722524387933, + "grad_norm": 6.28125, + "learning_rate": 6.397404386302528e-06, + "loss": 0.84511347, + "memory(GiB)": 135.77, + "step": 36650, + "train_speed(iter/s)": 0.201097 + }, + { + "acc": 0.7826313, + "epoch": 0.8553055600110822, + "grad_norm": 4.21875, + "learning_rate": 6.395590453689662e-06, + "loss": 0.8007947, + "memory(GiB)": 135.77, + "step": 36660, + "train_speed(iter/s)": 0.201124 + }, + { + "acc": 0.77450428, + "epoch": 0.855538867583371, + "grad_norm": 6.53125, + "learning_rate": 6.393776321856664e-06, + "loss": 0.80132704, + "memory(GiB)": 135.77, + "step": 36670, + "train_speed(iter/s)": 0.201152 + }, + { + "acc": 0.75675406, + "epoch": 0.8557721751556598, + "grad_norm": 6.75, + "learning_rate": 6.391961991062501e-06, + "loss": 0.86429386, + "memory(GiB)": 135.77, + "step": 36680, + "train_speed(iter/s)": 0.201179 + }, + { + "acc": 0.75716515, + "epoch": 0.8560054827279487, + "grad_norm": 11.5, + "learning_rate": 6.390147461566167e-06, + "loss": 0.86847095, + "memory(GiB)": 135.77, + "step": 36690, + "train_speed(iter/s)": 0.201205 + }, + { + "acc": 0.78775959, + "epoch": 0.8562387903002376, + "grad_norm": 4.875, + "learning_rate": 6.388332733626689e-06, + "loss": 0.75113401, + "memory(GiB)": 147.13, + "step": 36700, + "train_speed(iter/s)": 0.201232 + }, + { + "acc": 0.76984215, + "epoch": 0.8564720978725265, + "grad_norm": 6.0625, + "learning_rate": 6.386517807503114e-06, + "loss": 0.83430986, + "memory(GiB)": 147.13, + "step": 36710, + "train_speed(iter/s)": 0.201262 + }, + { + "acc": 0.77020178, + "epoch": 0.8567054054448154, + "grad_norm": 6.28125, + "learning_rate": 6.384702683454527e-06, + "loss": 0.82754822, + "memory(GiB)": 147.13, + "step": 36720, + "train_speed(iter/s)": 0.20129 + }, + { + "acc": 0.7816741, + "epoch": 0.8569387130171043, + "grad_norm": 6.125, + "learning_rate": 6.382887361740033e-06, + "loss": 0.78474622, + "memory(GiB)": 147.13, + "step": 36730, + "train_speed(iter/s)": 0.201319 + }, + { + "acc": 0.79666619, + "epoch": 0.8571720205893932, + "grad_norm": 5.0, + "learning_rate": 6.38107184261877e-06, + "loss": 0.72450123, + "memory(GiB)": 147.13, + "step": 36740, + "train_speed(iter/s)": 0.201344 + }, + { + "acc": 0.78614969, + "epoch": 0.8574053281616821, + "grad_norm": 9.125, + "learning_rate": 6.379256126349903e-06, + "loss": 0.76010709, + "memory(GiB)": 147.13, + "step": 36750, + "train_speed(iter/s)": 0.201372 + }, + { + "acc": 0.77783051, + "epoch": 0.857638635733971, + "grad_norm": 5.53125, + "learning_rate": 6.377440213192625e-06, + "loss": 0.80419178, + "memory(GiB)": 147.13, + "step": 36760, + "train_speed(iter/s)": 0.201399 + }, + { + "acc": 0.78635902, + "epoch": 0.8578719433062599, + "grad_norm": 5.03125, + "learning_rate": 6.375624103406155e-06, + "loss": 0.77373714, + "memory(GiB)": 147.13, + "step": 36770, + "train_speed(iter/s)": 0.201429 + }, + { + "acc": 0.79163675, + "epoch": 0.8581052508785488, + "grad_norm": 5.71875, + "learning_rate": 6.373807797249744e-06, + "loss": 0.74694929, + "memory(GiB)": 147.13, + "step": 36780, + "train_speed(iter/s)": 0.201456 + }, + { + "acc": 0.77907381, + "epoch": 0.8583385584508377, + "grad_norm": 5.875, + "learning_rate": 6.371991294982671e-06, + "loss": 0.79713793, + "memory(GiB)": 147.13, + "step": 36790, + "train_speed(iter/s)": 0.201484 + }, + { + "acc": 0.75770316, + "epoch": 0.8585718660231266, + "grad_norm": 6.75, + "learning_rate": 6.370174596864238e-06, + "loss": 0.88794937, + "memory(GiB)": 147.13, + "step": 36800, + "train_speed(iter/s)": 0.201513 + }, + { + "acc": 0.77909842, + "epoch": 0.8588051735954155, + "grad_norm": 4.46875, + "learning_rate": 6.368357703153782e-06, + "loss": 0.80933161, + "memory(GiB)": 147.13, + "step": 36810, + "train_speed(iter/s)": 0.201542 + }, + { + "acc": 0.78378386, + "epoch": 0.8590384811677044, + "grad_norm": 4.3125, + "learning_rate": 6.366540614110658e-06, + "loss": 0.78677444, + "memory(GiB)": 147.13, + "step": 36820, + "train_speed(iter/s)": 0.20157 + }, + { + "acc": 0.77209501, + "epoch": 0.8592717887399933, + "grad_norm": 5.9375, + "learning_rate": 6.364723329994259e-06, + "loss": 0.81071463, + "memory(GiB)": 147.13, + "step": 36830, + "train_speed(iter/s)": 0.201599 + }, + { + "acc": 0.77325363, + "epoch": 0.8595050963122822, + "grad_norm": 6.4375, + "learning_rate": 6.362905851064001e-06, + "loss": 0.81591139, + "memory(GiB)": 147.13, + "step": 36840, + "train_speed(iter/s)": 0.201627 + }, + { + "acc": 0.76463056, + "epoch": 0.8597384038845711, + "grad_norm": 4.6875, + "learning_rate": 6.361088177579329e-06, + "loss": 0.83884716, + "memory(GiB)": 147.13, + "step": 36850, + "train_speed(iter/s)": 0.201656 + }, + { + "acc": 0.77708492, + "epoch": 0.85997171145686, + "grad_norm": 6.1875, + "learning_rate": 6.359270309799715e-06, + "loss": 0.80096121, + "memory(GiB)": 147.13, + "step": 36860, + "train_speed(iter/s)": 0.201683 + }, + { + "acc": 0.78316288, + "epoch": 0.8602050190291488, + "grad_norm": 3.859375, + "learning_rate": 6.357452247984659e-06, + "loss": 0.79706192, + "memory(GiB)": 147.13, + "step": 36870, + "train_speed(iter/s)": 0.201711 + }, + { + "acc": 0.77961206, + "epoch": 0.8604383266014377, + "grad_norm": 5.40625, + "learning_rate": 6.35563399239369e-06, + "loss": 0.80441303, + "memory(GiB)": 147.13, + "step": 36880, + "train_speed(iter/s)": 0.201737 + }, + { + "acc": 0.78116093, + "epoch": 0.8606716341737266, + "grad_norm": 4.59375, + "learning_rate": 6.353815543286361e-06, + "loss": 0.78395686, + "memory(GiB)": 147.13, + "step": 36890, + "train_speed(iter/s)": 0.201765 + }, + { + "acc": 0.78719234, + "epoch": 0.8609049417460155, + "grad_norm": 4.6875, + "learning_rate": 6.351996900922257e-06, + "loss": 0.75968537, + "memory(GiB)": 147.13, + "step": 36900, + "train_speed(iter/s)": 0.201793 + }, + { + "acc": 0.78203468, + "epoch": 0.8611382493183044, + "grad_norm": 5.1875, + "learning_rate": 6.3501780655609875e-06, + "loss": 0.78212891, + "memory(GiB)": 147.13, + "step": 36910, + "train_speed(iter/s)": 0.201819 + }, + { + "acc": 0.77617979, + "epoch": 0.8613715568905933, + "grad_norm": 5.75, + "learning_rate": 6.348359037462194e-06, + "loss": 0.79318113, + "memory(GiB)": 147.13, + "step": 36920, + "train_speed(iter/s)": 0.201847 + }, + { + "acc": 0.79143863, + "epoch": 0.8616048644628822, + "grad_norm": 5.5, + "learning_rate": 6.346539816885537e-06, + "loss": 0.73529739, + "memory(GiB)": 147.13, + "step": 36930, + "train_speed(iter/s)": 0.201875 + }, + { + "acc": 0.76003532, + "epoch": 0.8618381720351711, + "grad_norm": 5.1875, + "learning_rate": 6.3447204040907125e-06, + "loss": 0.87924767, + "memory(GiB)": 147.13, + "step": 36940, + "train_speed(iter/s)": 0.201904 + }, + { + "acc": 0.78279285, + "epoch": 0.86207147960746, + "grad_norm": 4.75, + "learning_rate": 6.342900799337443e-06, + "loss": 0.77494459, + "memory(GiB)": 147.13, + "step": 36950, + "train_speed(iter/s)": 0.201933 + }, + { + "acc": 0.78726845, + "epoch": 0.8623047871797489, + "grad_norm": 6.28125, + "learning_rate": 6.341081002885472e-06, + "loss": 0.7636848, + "memory(GiB)": 147.13, + "step": 36960, + "train_speed(iter/s)": 0.201962 + }, + { + "acc": 0.78955112, + "epoch": 0.8625380947520378, + "grad_norm": 22.25, + "learning_rate": 6.33926101499458e-06, + "loss": 0.75000238, + "memory(GiB)": 147.13, + "step": 36970, + "train_speed(iter/s)": 0.201987 + }, + { + "acc": 0.7933239, + "epoch": 0.8627714023243267, + "grad_norm": 5.8125, + "learning_rate": 6.337440835924564e-06, + "loss": 0.75319366, + "memory(GiB)": 147.13, + "step": 36980, + "train_speed(iter/s)": 0.202016 + }, + { + "acc": 0.7852212, + "epoch": 0.8630047098966156, + "grad_norm": 5.53125, + "learning_rate": 6.335620465935259e-06, + "loss": 0.76989651, + "memory(GiB)": 147.13, + "step": 36990, + "train_speed(iter/s)": 0.202045 + }, + { + "acc": 0.78815894, + "epoch": 0.8632380174689045, + "grad_norm": 5.4375, + "learning_rate": 6.333799905286519e-06, + "loss": 0.76841455, + "memory(GiB)": 147.13, + "step": 37000, + "train_speed(iter/s)": 0.202073 + }, + { + "epoch": 0.8632380174689045, + "eval_acc": 0.7431547548984517, + "eval_loss": 0.8100025653839111, + "eval_runtime": 1269.5429, + "eval_samples_per_second": 28.35, + "eval_steps_per_second": 14.175, + "step": 37000 + }, + { + "acc": 0.76950207, + "epoch": 0.8634713250411934, + "grad_norm": 5.25, + "learning_rate": 6.331979154238232e-06, + "loss": 0.82681618, + "memory(GiB)": 147.13, + "step": 37010, + "train_speed(iter/s)": 0.200693 + }, + { + "acc": 0.77774849, + "epoch": 0.8637046326134823, + "grad_norm": 4.71875, + "learning_rate": 6.330158213050308e-06, + "loss": 0.81203289, + "memory(GiB)": 147.13, + "step": 37020, + "train_speed(iter/s)": 0.20072 + }, + { + "acc": 0.77672329, + "epoch": 0.8639379401857712, + "grad_norm": 9.375, + "learning_rate": 6.328337081982685e-06, + "loss": 0.78580074, + "memory(GiB)": 147.13, + "step": 37030, + "train_speed(iter/s)": 0.200751 + }, + { + "acc": 0.79651041, + "epoch": 0.8641712477580601, + "grad_norm": 5.125, + "learning_rate": 6.326515761295328e-06, + "loss": 0.72057328, + "memory(GiB)": 147.13, + "step": 37040, + "train_speed(iter/s)": 0.200778 + }, + { + "acc": 0.767904, + "epoch": 0.864404555330349, + "grad_norm": 6.1875, + "learning_rate": 6.3246942512482325e-06, + "loss": 0.84308643, + "memory(GiB)": 147.13, + "step": 37050, + "train_speed(iter/s)": 0.200806 + }, + { + "acc": 0.77300673, + "epoch": 0.8646378629026379, + "grad_norm": 4.9375, + "learning_rate": 6.3228725521014165e-06, + "loss": 0.8220705, + "memory(GiB)": 147.13, + "step": 37060, + "train_speed(iter/s)": 0.200834 + }, + { + "acc": 0.81248608, + "epoch": 0.8648711704749267, + "grad_norm": 8.0625, + "learning_rate": 6.32105066411493e-06, + "loss": 0.68309054, + "memory(GiB)": 147.13, + "step": 37070, + "train_speed(iter/s)": 0.200863 + }, + { + "acc": 0.76430616, + "epoch": 0.8651044780472156, + "grad_norm": 5.09375, + "learning_rate": 6.319228587548843e-06, + "loss": 0.85580292, + "memory(GiB)": 147.13, + "step": 37080, + "train_speed(iter/s)": 0.200888 + }, + { + "acc": 0.76082668, + "epoch": 0.8653377856195045, + "grad_norm": 5.0, + "learning_rate": 6.317406322663259e-06, + "loss": 0.86531963, + "memory(GiB)": 147.13, + "step": 37090, + "train_speed(iter/s)": 0.200915 + }, + { + "acc": 0.77870302, + "epoch": 0.8655710931917934, + "grad_norm": 4.40625, + "learning_rate": 6.315583869718306e-06, + "loss": 0.79484749, + "memory(GiB)": 147.13, + "step": 37100, + "train_speed(iter/s)": 0.20094 + }, + { + "acc": 0.7822587, + "epoch": 0.8658044007640823, + "grad_norm": 4.71875, + "learning_rate": 6.313761228974137e-06, + "loss": 0.80919828, + "memory(GiB)": 147.13, + "step": 37110, + "train_speed(iter/s)": 0.200966 + }, + { + "acc": 0.76600981, + "epoch": 0.8660377083363712, + "grad_norm": 6.15625, + "learning_rate": 6.311938400690933e-06, + "loss": 0.83051043, + "memory(GiB)": 147.13, + "step": 37120, + "train_speed(iter/s)": 0.200994 + }, + { + "acc": 0.78756142, + "epoch": 0.8662710159086601, + "grad_norm": 4.21875, + "learning_rate": 6.310115385128905e-06, + "loss": 0.73773327, + "memory(GiB)": 147.13, + "step": 37130, + "train_speed(iter/s)": 0.201021 + }, + { + "acc": 0.77740078, + "epoch": 0.866504323480949, + "grad_norm": 5.6875, + "learning_rate": 6.308292182548287e-06, + "loss": 0.79663916, + "memory(GiB)": 147.13, + "step": 37140, + "train_speed(iter/s)": 0.20105 + }, + { + "acc": 0.7775095, + "epoch": 0.8667376310532379, + "grad_norm": 4.3125, + "learning_rate": 6.3064687932093386e-06, + "loss": 0.79716334, + "memory(GiB)": 147.13, + "step": 37150, + "train_speed(iter/s)": 0.201077 + }, + { + "acc": 0.78595037, + "epoch": 0.8669709386255268, + "grad_norm": 4.0, + "learning_rate": 6.3046452173723495e-06, + "loss": 0.76871729, + "memory(GiB)": 147.13, + "step": 37160, + "train_speed(iter/s)": 0.201103 + }, + { + "acc": 0.76833076, + "epoch": 0.8672042461978157, + "grad_norm": 4.90625, + "learning_rate": 6.302821455297635e-06, + "loss": 0.83416252, + "memory(GiB)": 147.13, + "step": 37170, + "train_speed(iter/s)": 0.201131 + }, + { + "acc": 0.77176652, + "epoch": 0.8674375537701046, + "grad_norm": 4.84375, + "learning_rate": 6.300997507245537e-06, + "loss": 0.82635098, + "memory(GiB)": 147.13, + "step": 37180, + "train_speed(iter/s)": 0.201159 + }, + { + "acc": 0.78389416, + "epoch": 0.8676708613423935, + "grad_norm": 5.4375, + "learning_rate": 6.299173373476422e-06, + "loss": 0.77835617, + "memory(GiB)": 147.13, + "step": 37190, + "train_speed(iter/s)": 0.201185 + }, + { + "acc": 0.7786643, + "epoch": 0.8679041689146824, + "grad_norm": 4.65625, + "learning_rate": 6.2973490542506854e-06, + "loss": 0.81813984, + "memory(GiB)": 147.13, + "step": 37200, + "train_speed(iter/s)": 0.201214 + }, + { + "acc": 0.78079252, + "epoch": 0.8681374764869713, + "grad_norm": 5.625, + "learning_rate": 6.295524549828747e-06, + "loss": 0.76534152, + "memory(GiB)": 147.13, + "step": 37210, + "train_speed(iter/s)": 0.201242 + }, + { + "acc": 0.77490101, + "epoch": 0.8683707840592602, + "grad_norm": 5.5, + "learning_rate": 6.293699860471057e-06, + "loss": 0.77603149, + "memory(GiB)": 147.13, + "step": 37220, + "train_speed(iter/s)": 0.201269 + }, + { + "acc": 0.75890179, + "epoch": 0.8686040916315491, + "grad_norm": 6.375, + "learning_rate": 6.2918749864380875e-06, + "loss": 0.8679534, + "memory(GiB)": 147.13, + "step": 37230, + "train_speed(iter/s)": 0.201298 + }, + { + "acc": 0.7783083, + "epoch": 0.868837399203838, + "grad_norm": 4.3125, + "learning_rate": 6.290049927990339e-06, + "loss": 0.79737039, + "memory(GiB)": 147.13, + "step": 37240, + "train_speed(iter/s)": 0.201325 + }, + { + "acc": 0.77292824, + "epoch": 0.8690707067761269, + "grad_norm": 5.25, + "learning_rate": 6.288224685388337e-06, + "loss": 0.82085094, + "memory(GiB)": 147.13, + "step": 37250, + "train_speed(iter/s)": 0.201351 + }, + { + "acc": 0.78002787, + "epoch": 0.8693040143484156, + "grad_norm": 4.5625, + "learning_rate": 6.286399258892638e-06, + "loss": 0.80144997, + "memory(GiB)": 147.13, + "step": 37260, + "train_speed(iter/s)": 0.201378 + }, + { + "acc": 0.78442354, + "epoch": 0.8695373219207045, + "grad_norm": 3.90625, + "learning_rate": 6.284573648763816e-06, + "loss": 0.77326307, + "memory(GiB)": 147.13, + "step": 37270, + "train_speed(iter/s)": 0.201406 + }, + { + "acc": 0.78011799, + "epoch": 0.8697706294929934, + "grad_norm": 4.96875, + "learning_rate": 6.28274785526248e-06, + "loss": 0.82026291, + "memory(GiB)": 147.13, + "step": 37280, + "train_speed(iter/s)": 0.201434 + }, + { + "acc": 0.76693525, + "epoch": 0.8700039370652823, + "grad_norm": 12.875, + "learning_rate": 6.2809218786492595e-06, + "loss": 0.84187355, + "memory(GiB)": 147.13, + "step": 37290, + "train_speed(iter/s)": 0.201462 + }, + { + "acc": 0.79030437, + "epoch": 0.8702372446375712, + "grad_norm": 6.53125, + "learning_rate": 6.279095719184813e-06, + "loss": 0.7504972, + "memory(GiB)": 147.13, + "step": 37300, + "train_speed(iter/s)": 0.201489 + }, + { + "acc": 0.77714148, + "epoch": 0.8704705522098601, + "grad_norm": 4.71875, + "learning_rate": 6.277269377129826e-06, + "loss": 0.79407024, + "memory(GiB)": 147.13, + "step": 37310, + "train_speed(iter/s)": 0.201517 + }, + { + "acc": 0.77218485, + "epoch": 0.870703859782149, + "grad_norm": 6.09375, + "learning_rate": 6.275442852745005e-06, + "loss": 0.82651482, + "memory(GiB)": 147.13, + "step": 37320, + "train_speed(iter/s)": 0.201546 + }, + { + "acc": 0.77186918, + "epoch": 0.8709371673544379, + "grad_norm": 9.3125, + "learning_rate": 6.273616146291086e-06, + "loss": 0.81594334, + "memory(GiB)": 147.13, + "step": 37330, + "train_speed(iter/s)": 0.201575 + }, + { + "acc": 0.7657239, + "epoch": 0.8711704749267268, + "grad_norm": 5.59375, + "learning_rate": 6.2717892580288335e-06, + "loss": 0.83614559, + "memory(GiB)": 147.13, + "step": 37340, + "train_speed(iter/s)": 0.201603 + }, + { + "acc": 0.77476711, + "epoch": 0.8714037824990157, + "grad_norm": 4.75, + "learning_rate": 6.269962188219034e-06, + "loss": 0.80064745, + "memory(GiB)": 147.13, + "step": 37350, + "train_speed(iter/s)": 0.20163 + }, + { + "acc": 0.78318167, + "epoch": 0.8716370900713046, + "grad_norm": 5.9375, + "learning_rate": 6.2681349371225e-06, + "loss": 0.78883448, + "memory(GiB)": 147.13, + "step": 37360, + "train_speed(iter/s)": 0.201654 + }, + { + "acc": 0.79107747, + "epoch": 0.8718703976435935, + "grad_norm": 3.90625, + "learning_rate": 6.266307505000073e-06, + "loss": 0.72986178, + "memory(GiB)": 147.13, + "step": 37370, + "train_speed(iter/s)": 0.201681 + }, + { + "acc": 0.77610064, + "epoch": 0.8721037052158824, + "grad_norm": 6.8125, + "learning_rate": 6.264479892112619e-06, + "loss": 0.79103355, + "memory(GiB)": 147.13, + "step": 37380, + "train_speed(iter/s)": 0.201711 + }, + { + "acc": 0.77150335, + "epoch": 0.8723370127881713, + "grad_norm": 6.0625, + "learning_rate": 6.262652098721026e-06, + "loss": 0.82169552, + "memory(GiB)": 147.13, + "step": 37390, + "train_speed(iter/s)": 0.201738 + }, + { + "acc": 0.77216101, + "epoch": 0.8725703203604602, + "grad_norm": 5.65625, + "learning_rate": 6.260824125086212e-06, + "loss": 0.8190197, + "memory(GiB)": 147.13, + "step": 37400, + "train_speed(iter/s)": 0.201767 + }, + { + "acc": 0.77426405, + "epoch": 0.8728036279327491, + "grad_norm": 5.375, + "learning_rate": 6.258995971469122e-06, + "loss": 0.83238316, + "memory(GiB)": 147.13, + "step": 37410, + "train_speed(iter/s)": 0.201795 + }, + { + "acc": 0.78717422, + "epoch": 0.873036935505038, + "grad_norm": 6.0625, + "learning_rate": 6.2571676381307215e-06, + "loss": 0.79043889, + "memory(GiB)": 147.13, + "step": 37420, + "train_speed(iter/s)": 0.201823 + }, + { + "acc": 0.78104959, + "epoch": 0.8732702430773269, + "grad_norm": 5.0625, + "learning_rate": 6.255339125332007e-06, + "loss": 0.79519758, + "memory(GiB)": 147.13, + "step": 37430, + "train_speed(iter/s)": 0.201849 + }, + { + "acc": 0.7618619, + "epoch": 0.8735035506496158, + "grad_norm": 4.90625, + "learning_rate": 6.253510433333996e-06, + "loss": 0.86247349, + "memory(GiB)": 147.13, + "step": 37440, + "train_speed(iter/s)": 0.201877 + }, + { + "acc": 0.77435389, + "epoch": 0.8737368582219046, + "grad_norm": 5.84375, + "learning_rate": 6.251681562397736e-06, + "loss": 0.79898653, + "memory(GiB)": 147.13, + "step": 37450, + "train_speed(iter/s)": 0.201905 + }, + { + "acc": 0.78359423, + "epoch": 0.8739701657941935, + "grad_norm": 5.4375, + "learning_rate": 6.2498525127842955e-06, + "loss": 0.76345568, + "memory(GiB)": 147.13, + "step": 37460, + "train_speed(iter/s)": 0.201931 + }, + { + "acc": 0.7648387, + "epoch": 0.8742034733664824, + "grad_norm": 5.25, + "learning_rate": 6.248023284754772e-06, + "loss": 0.83345156, + "memory(GiB)": 147.13, + "step": 37470, + "train_speed(iter/s)": 0.20196 + }, + { + "acc": 0.78857408, + "epoch": 0.8744367809387713, + "grad_norm": 5.0625, + "learning_rate": 6.2461938785702866e-06, + "loss": 0.75806623, + "memory(GiB)": 147.13, + "step": 37480, + "train_speed(iter/s)": 0.201987 + }, + { + "acc": 0.76943989, + "epoch": 0.8746700885110602, + "grad_norm": 3.984375, + "learning_rate": 6.244364294491989e-06, + "loss": 0.82730265, + "memory(GiB)": 147.13, + "step": 37490, + "train_speed(iter/s)": 0.202015 + }, + { + "acc": 0.77271547, + "epoch": 0.8749033960833491, + "grad_norm": 7.25, + "learning_rate": 6.2425345327810485e-06, + "loss": 0.81668425, + "memory(GiB)": 147.13, + "step": 37500, + "train_speed(iter/s)": 0.202043 + }, + { + "epoch": 0.8749033960833491, + "eval_acc": 0.7432971231209676, + "eval_loss": 0.8092904686927795, + "eval_runtime": 1270.0898, + "eval_samples_per_second": 28.337, + "eval_steps_per_second": 14.169, + "step": 37500 + }, + { + "acc": 0.77455845, + "epoch": 0.875136703655638, + "grad_norm": 5.8125, + "learning_rate": 6.240704593698664e-06, + "loss": 0.79787617, + "memory(GiB)": 147.13, + "step": 37510, + "train_speed(iter/s)": 0.20068 + }, + { + "acc": 0.77178693, + "epoch": 0.8753700112279269, + "grad_norm": 6.84375, + "learning_rate": 6.238874477506061e-06, + "loss": 0.81941929, + "memory(GiB)": 147.13, + "step": 37520, + "train_speed(iter/s)": 0.200701 + }, + { + "acc": 0.75730181, + "epoch": 0.8756033188002158, + "grad_norm": 6.46875, + "learning_rate": 6.237044184464485e-06, + "loss": 0.89033489, + "memory(GiB)": 147.13, + "step": 37530, + "train_speed(iter/s)": 0.20073 + }, + { + "acc": 0.77331123, + "epoch": 0.8758366263725047, + "grad_norm": 4.3125, + "learning_rate": 6.235213714835211e-06, + "loss": 0.82823706, + "memory(GiB)": 147.13, + "step": 37540, + "train_speed(iter/s)": 0.200755 + }, + { + "acc": 0.78762941, + "epoch": 0.8760699339447936, + "grad_norm": 4.125, + "learning_rate": 6.233383068879538e-06, + "loss": 0.75171795, + "memory(GiB)": 147.13, + "step": 37550, + "train_speed(iter/s)": 0.200782 + }, + { + "acc": 0.7689115, + "epoch": 0.8763032415170825, + "grad_norm": 5.28125, + "learning_rate": 6.231552246858791e-06, + "loss": 0.83698616, + "memory(GiB)": 147.13, + "step": 37560, + "train_speed(iter/s)": 0.20081 + }, + { + "acc": 0.78390293, + "epoch": 0.8765365490893714, + "grad_norm": 5.78125, + "learning_rate": 6.229721249034318e-06, + "loss": 0.79848022, + "memory(GiB)": 147.13, + "step": 37570, + "train_speed(iter/s)": 0.200839 + }, + { + "acc": 0.78724899, + "epoch": 0.8767698566616603, + "grad_norm": 5.59375, + "learning_rate": 6.227890075667492e-06, + "loss": 0.77459116, + "memory(GiB)": 147.13, + "step": 37580, + "train_speed(iter/s)": 0.200868 + }, + { + "acc": 0.79053335, + "epoch": 0.8770031642339492, + "grad_norm": 5.65625, + "learning_rate": 6.226058727019717e-06, + "loss": 0.75305357, + "memory(GiB)": 147.13, + "step": 37590, + "train_speed(iter/s)": 0.200893 + }, + { + "acc": 0.75518756, + "epoch": 0.8772364718062381, + "grad_norm": 5.8125, + "learning_rate": 6.224227203352415e-06, + "loss": 0.87923584, + "memory(GiB)": 147.13, + "step": 37600, + "train_speed(iter/s)": 0.20092 + }, + { + "acc": 0.78576441, + "epoch": 0.877469779378527, + "grad_norm": 4.875, + "learning_rate": 6.222395504927035e-06, + "loss": 0.77100844, + "memory(GiB)": 147.13, + "step": 37610, + "train_speed(iter/s)": 0.200949 + }, + { + "acc": 0.7848784, + "epoch": 0.8777030869508159, + "grad_norm": 6.3125, + "learning_rate": 6.22056363200505e-06, + "loss": 0.7881711, + "memory(GiB)": 147.13, + "step": 37620, + "train_speed(iter/s)": 0.200976 + }, + { + "acc": 0.77304902, + "epoch": 0.8779363945231048, + "grad_norm": 10.875, + "learning_rate": 6.218731584847963e-06, + "loss": 0.81832428, + "memory(GiB)": 147.13, + "step": 37630, + "train_speed(iter/s)": 0.201004 + }, + { + "acc": 0.78921366, + "epoch": 0.8781697020953936, + "grad_norm": 6.65625, + "learning_rate": 6.216899363717295e-06, + "loss": 0.76330223, + "memory(GiB)": 147.13, + "step": 37640, + "train_speed(iter/s)": 0.201032 + }, + { + "acc": 0.77204285, + "epoch": 0.8784030096676825, + "grad_norm": 7.15625, + "learning_rate": 6.215066968874596e-06, + "loss": 0.83313923, + "memory(GiB)": 147.13, + "step": 37650, + "train_speed(iter/s)": 0.201059 + }, + { + "acc": 0.78158464, + "epoch": 0.8786363172399714, + "grad_norm": 5.75, + "learning_rate": 6.213234400581442e-06, + "loss": 0.7798768, + "memory(GiB)": 147.13, + "step": 37660, + "train_speed(iter/s)": 0.201087 + }, + { + "acc": 0.77431865, + "epoch": 0.8788696248122603, + "grad_norm": 13.3125, + "learning_rate": 6.2114016590994295e-06, + "loss": 0.82120934, + "memory(GiB)": 147.13, + "step": 37670, + "train_speed(iter/s)": 0.201113 + }, + { + "acc": 0.76597242, + "epoch": 0.8791029323845492, + "grad_norm": 6.1875, + "learning_rate": 6.209568744690181e-06, + "loss": 0.82842293, + "memory(GiB)": 147.13, + "step": 37680, + "train_speed(iter/s)": 0.20114 + }, + { + "acc": 0.7780242, + "epoch": 0.8793362399568381, + "grad_norm": 7.125, + "learning_rate": 6.207735657615346e-06, + "loss": 0.7849369, + "memory(GiB)": 147.13, + "step": 37690, + "train_speed(iter/s)": 0.201167 + }, + { + "acc": 0.79243302, + "epoch": 0.879569547529127, + "grad_norm": 4.15625, + "learning_rate": 6.2059023981365965e-06, + "loss": 0.75370507, + "memory(GiB)": 147.13, + "step": 37700, + "train_speed(iter/s)": 0.201196 + }, + { + "acc": 0.78596573, + "epoch": 0.8798028551014159, + "grad_norm": 5.46875, + "learning_rate": 6.20406896651563e-06, + "loss": 0.76070099, + "memory(GiB)": 147.13, + "step": 37710, + "train_speed(iter/s)": 0.201224 + }, + { + "acc": 0.77707858, + "epoch": 0.8800361626737048, + "grad_norm": 6.59375, + "learning_rate": 6.202235363014169e-06, + "loss": 0.82894115, + "memory(GiB)": 147.13, + "step": 37720, + "train_speed(iter/s)": 0.201248 + }, + { + "acc": 0.77520704, + "epoch": 0.8802694702459937, + "grad_norm": 6.8125, + "learning_rate": 6.2004015878939585e-06, + "loss": 0.81481781, + "memory(GiB)": 147.13, + "step": 37730, + "train_speed(iter/s)": 0.201275 + }, + { + "acc": 0.78262343, + "epoch": 0.8805027778182826, + "grad_norm": 6.28125, + "learning_rate": 6.198567641416772e-06, + "loss": 0.78242087, + "memory(GiB)": 147.13, + "step": 37740, + "train_speed(iter/s)": 0.201304 + }, + { + "acc": 0.77590933, + "epoch": 0.8807360853905715, + "grad_norm": 5.09375, + "learning_rate": 6.1967335238444004e-06, + "loss": 0.81266613, + "memory(GiB)": 147.13, + "step": 37750, + "train_speed(iter/s)": 0.201331 + }, + { + "acc": 0.7834343, + "epoch": 0.8809693929628604, + "grad_norm": 6.9375, + "learning_rate": 6.194899235438666e-06, + "loss": 0.75194969, + "memory(GiB)": 147.13, + "step": 37760, + "train_speed(iter/s)": 0.201358 + }, + { + "acc": 0.75540552, + "epoch": 0.8812027005351493, + "grad_norm": 4.34375, + "learning_rate": 6.193064776461415e-06, + "loss": 0.88943443, + "memory(GiB)": 147.13, + "step": 37770, + "train_speed(iter/s)": 0.201387 + }, + { + "acc": 0.78063102, + "epoch": 0.8814360081074382, + "grad_norm": 4.25, + "learning_rate": 6.191230147174512e-06, + "loss": 0.80147295, + "memory(GiB)": 147.13, + "step": 37780, + "train_speed(iter/s)": 0.201411 + }, + { + "acc": 0.77797747, + "epoch": 0.8816693156797271, + "grad_norm": 5.5625, + "learning_rate": 6.1893953478398515e-06, + "loss": 0.81042271, + "memory(GiB)": 147.13, + "step": 37790, + "train_speed(iter/s)": 0.201438 + }, + { + "acc": 0.78152409, + "epoch": 0.881902623252016, + "grad_norm": 5.96875, + "learning_rate": 6.18756037871935e-06, + "loss": 0.77627401, + "memory(GiB)": 147.13, + "step": 37800, + "train_speed(iter/s)": 0.201464 + }, + { + "acc": 0.76364655, + "epoch": 0.8821359308243049, + "grad_norm": 4.125, + "learning_rate": 6.185725240074951e-06, + "loss": 0.85897408, + "memory(GiB)": 147.13, + "step": 37810, + "train_speed(iter/s)": 0.201494 + }, + { + "acc": 0.78259721, + "epoch": 0.8823692383965938, + "grad_norm": 5.28125, + "learning_rate": 6.1838899321686185e-06, + "loss": 0.78828173, + "memory(GiB)": 147.13, + "step": 37820, + "train_speed(iter/s)": 0.201523 + }, + { + "acc": 0.77779245, + "epoch": 0.8826025459688827, + "grad_norm": 4.625, + "learning_rate": 6.1820544552623415e-06, + "loss": 0.80315275, + "memory(GiB)": 147.13, + "step": 37830, + "train_speed(iter/s)": 0.201551 + }, + { + "acc": 0.77334061, + "epoch": 0.8828358535411714, + "grad_norm": 5.78125, + "learning_rate": 6.180218809618135e-06, + "loss": 0.81666965, + "memory(GiB)": 147.13, + "step": 37840, + "train_speed(iter/s)": 0.201578 + }, + { + "acc": 0.76984267, + "epoch": 0.8830691611134603, + "grad_norm": 6.96875, + "learning_rate": 6.1783829954980345e-06, + "loss": 0.83126059, + "memory(GiB)": 147.13, + "step": 37850, + "train_speed(iter/s)": 0.201607 + }, + { + "acc": 0.77715025, + "epoch": 0.8833024686857492, + "grad_norm": 6.84375, + "learning_rate": 6.176547013164104e-06, + "loss": 0.79118605, + "memory(GiB)": 147.13, + "step": 37860, + "train_speed(iter/s)": 0.201634 + }, + { + "acc": 0.76471095, + "epoch": 0.8835357762580381, + "grad_norm": 5.34375, + "learning_rate": 6.17471086287843e-06, + "loss": 0.83969116, + "memory(GiB)": 147.13, + "step": 37870, + "train_speed(iter/s)": 0.201663 + }, + { + "acc": 0.78056355, + "epoch": 0.883769083830327, + "grad_norm": 7.3125, + "learning_rate": 6.172874544903122e-06, + "loss": 0.76982565, + "memory(GiB)": 147.13, + "step": 37880, + "train_speed(iter/s)": 0.201689 + }, + { + "acc": 0.78541079, + "epoch": 0.8840023914026159, + "grad_norm": 6.625, + "learning_rate": 6.171038059500315e-06, + "loss": 0.77315903, + "memory(GiB)": 147.13, + "step": 37890, + "train_speed(iter/s)": 0.201715 + }, + { + "acc": 0.78337684, + "epoch": 0.8842356989749048, + "grad_norm": 4.15625, + "learning_rate": 6.169201406932163e-06, + "loss": 0.78238239, + "memory(GiB)": 147.13, + "step": 37900, + "train_speed(iter/s)": 0.201744 + }, + { + "acc": 0.7873363, + "epoch": 0.8844690065471937, + "grad_norm": 6.03125, + "learning_rate": 6.167364587460849e-06, + "loss": 0.75477743, + "memory(GiB)": 147.13, + "step": 37910, + "train_speed(iter/s)": 0.201772 + }, + { + "acc": 0.7772418, + "epoch": 0.8847023141194826, + "grad_norm": 5.71875, + "learning_rate": 6.16552760134858e-06, + "loss": 0.84283714, + "memory(GiB)": 147.13, + "step": 37920, + "train_speed(iter/s)": 0.201801 + }, + { + "acc": 0.78096313, + "epoch": 0.8849356216917715, + "grad_norm": 5.34375, + "learning_rate": 6.1636904488575845e-06, + "loss": 0.79128194, + "memory(GiB)": 147.13, + "step": 37930, + "train_speed(iter/s)": 0.201826 + }, + { + "acc": 0.79532461, + "epoch": 0.8851689292640604, + "grad_norm": 5.5625, + "learning_rate": 6.161853130250117e-06, + "loss": 0.74347682, + "memory(GiB)": 147.13, + "step": 37940, + "train_speed(iter/s)": 0.201852 + }, + { + "acc": 0.77315941, + "epoch": 0.8854022368363493, + "grad_norm": 4.75, + "learning_rate": 6.160015645788451e-06, + "loss": 0.8172802, + "memory(GiB)": 147.13, + "step": 37950, + "train_speed(iter/s)": 0.201879 + }, + { + "acc": 0.77103496, + "epoch": 0.8856355444086382, + "grad_norm": 5.1875, + "learning_rate": 6.15817799573489e-06, + "loss": 0.83767815, + "memory(GiB)": 147.13, + "step": 37960, + "train_speed(iter/s)": 0.201905 + }, + { + "acc": 0.80051785, + "epoch": 0.8858688519809271, + "grad_norm": 5.53125, + "learning_rate": 6.1563401803517545e-06, + "loss": 0.70509105, + "memory(GiB)": 147.13, + "step": 37970, + "train_speed(iter/s)": 0.201931 + }, + { + "acc": 0.77665696, + "epoch": 0.886102159553216, + "grad_norm": 5.375, + "learning_rate": 6.154502199901396e-06, + "loss": 0.81156635, + "memory(GiB)": 147.13, + "step": 37980, + "train_speed(iter/s)": 0.201956 + }, + { + "acc": 0.76781158, + "epoch": 0.8863354671255049, + "grad_norm": 4.46875, + "learning_rate": 6.152664054646183e-06, + "loss": 0.83875208, + "memory(GiB)": 147.13, + "step": 37990, + "train_speed(iter/s)": 0.201984 + }, + { + "acc": 0.75077744, + "epoch": 0.8865687746977938, + "grad_norm": 5.875, + "learning_rate": 6.150825744848511e-06, + "loss": 0.90200367, + "memory(GiB)": 147.13, + "step": 38000, + "train_speed(iter/s)": 0.202011 + }, + { + "epoch": 0.8865687746977938, + "eval_acc": 0.74343228485303, + "eval_loss": 0.8092561960220337, + "eval_runtime": 1268.5776, + "eval_samples_per_second": 28.371, + "eval_steps_per_second": 14.186, + "step": 38000 + }, + { + "acc": 0.7838891, + "epoch": 0.8868020822700827, + "grad_norm": 5.5625, + "learning_rate": 6.148987270770798e-06, + "loss": 0.78417902, + "memory(GiB)": 147.13, + "step": 38010, + "train_speed(iter/s)": 0.200667 + }, + { + "acc": 0.79363413, + "epoch": 0.8870353898423716, + "grad_norm": 4.78125, + "learning_rate": 6.147148632675486e-06, + "loss": 0.73398776, + "memory(GiB)": 147.13, + "step": 38020, + "train_speed(iter/s)": 0.200693 + }, + { + "acc": 0.77010832, + "epoch": 0.8872686974146604, + "grad_norm": 12.5, + "learning_rate": 6.145309830825041e-06, + "loss": 0.87349873, + "memory(GiB)": 147.13, + "step": 38030, + "train_speed(iter/s)": 0.200721 + }, + { + "acc": 0.76912684, + "epoch": 0.8875020049869493, + "grad_norm": 5.15625, + "learning_rate": 6.143470865481948e-06, + "loss": 0.81946507, + "memory(GiB)": 147.13, + "step": 38040, + "train_speed(iter/s)": 0.200748 + }, + { + "acc": 0.76783762, + "epoch": 0.8877353125592382, + "grad_norm": 5.125, + "learning_rate": 6.141631736908723e-06, + "loss": 0.82789316, + "memory(GiB)": 147.13, + "step": 38050, + "train_speed(iter/s)": 0.200774 + }, + { + "acc": 0.78408384, + "epoch": 0.8879686201315271, + "grad_norm": 7.0625, + "learning_rate": 6.1397924453679e-06, + "loss": 0.7791399, + "memory(GiB)": 147.13, + "step": 38060, + "train_speed(iter/s)": 0.2008 + }, + { + "acc": 0.77008028, + "epoch": 0.888201927703816, + "grad_norm": 7.625, + "learning_rate": 6.137952991122035e-06, + "loss": 0.83867502, + "memory(GiB)": 147.13, + "step": 38070, + "train_speed(iter/s)": 0.200828 + }, + { + "acc": 0.79044018, + "epoch": 0.8884352352761049, + "grad_norm": 3.90625, + "learning_rate": 6.136113374433712e-06, + "loss": 0.74994631, + "memory(GiB)": 147.13, + "step": 38080, + "train_speed(iter/s)": 0.200854 + }, + { + "acc": 0.76272373, + "epoch": 0.8886685428483938, + "grad_norm": 4.84375, + "learning_rate": 6.134273595565534e-06, + "loss": 0.84841614, + "memory(GiB)": 147.13, + "step": 38090, + "train_speed(iter/s)": 0.200882 + }, + { + "acc": 0.76892252, + "epoch": 0.8889018504206827, + "grad_norm": 6.8125, + "learning_rate": 6.13243365478013e-06, + "loss": 0.81232853, + "memory(GiB)": 147.13, + "step": 38100, + "train_speed(iter/s)": 0.20091 + }, + { + "acc": 0.78824024, + "epoch": 0.8891351579929716, + "grad_norm": 5.59375, + "learning_rate": 6.13059355234015e-06, + "loss": 0.75986581, + "memory(GiB)": 147.13, + "step": 38110, + "train_speed(iter/s)": 0.200939 + }, + { + "acc": 0.77105479, + "epoch": 0.8893684655652605, + "grad_norm": 6.46875, + "learning_rate": 6.128753288508271e-06, + "loss": 0.83737135, + "memory(GiB)": 147.13, + "step": 38120, + "train_speed(iter/s)": 0.200967 + }, + { + "acc": 0.79165573, + "epoch": 0.8896017731375494, + "grad_norm": 7.875, + "learning_rate": 6.126912863547186e-06, + "loss": 0.74300852, + "memory(GiB)": 147.13, + "step": 38130, + "train_speed(iter/s)": 0.200993 + }, + { + "acc": 0.79510765, + "epoch": 0.8898350807098383, + "grad_norm": 5.0, + "learning_rate": 6.125072277719618e-06, + "loss": 0.7389545, + "memory(GiB)": 147.13, + "step": 38140, + "train_speed(iter/s)": 0.201021 + }, + { + "acc": 0.79052238, + "epoch": 0.8900683882821272, + "grad_norm": 6.25, + "learning_rate": 6.123231531288308e-06, + "loss": 0.75054169, + "memory(GiB)": 147.13, + "step": 38150, + "train_speed(iter/s)": 0.201048 + }, + { + "acc": 0.76992264, + "epoch": 0.8903016958544161, + "grad_norm": 7.71875, + "learning_rate": 6.121390624516026e-06, + "loss": 0.83893986, + "memory(GiB)": 147.13, + "step": 38160, + "train_speed(iter/s)": 0.201076 + }, + { + "acc": 0.78948836, + "epoch": 0.890535003426705, + "grad_norm": 8.0625, + "learning_rate": 6.119549557665556e-06, + "loss": 0.74797955, + "memory(GiB)": 147.13, + "step": 38170, + "train_speed(iter/s)": 0.201103 + }, + { + "acc": 0.77837534, + "epoch": 0.8907683109989939, + "grad_norm": 6.4375, + "learning_rate": 6.117708330999712e-06, + "loss": 0.78834462, + "memory(GiB)": 147.13, + "step": 38180, + "train_speed(iter/s)": 0.201129 + }, + { + "acc": 0.75297174, + "epoch": 0.8910016185712828, + "grad_norm": 4.59375, + "learning_rate": 6.115866944781329e-06, + "loss": 0.86872931, + "memory(GiB)": 147.13, + "step": 38190, + "train_speed(iter/s)": 0.201158 + }, + { + "acc": 0.79166384, + "epoch": 0.8912349261435717, + "grad_norm": 6.21875, + "learning_rate": 6.114025399273264e-06, + "loss": 0.72422724, + "memory(GiB)": 147.13, + "step": 38200, + "train_speed(iter/s)": 0.201184 + }, + { + "acc": 0.788766, + "epoch": 0.8914682337158606, + "grad_norm": 5.21875, + "learning_rate": 6.112183694738395e-06, + "loss": 0.74276934, + "memory(GiB)": 147.13, + "step": 38210, + "train_speed(iter/s)": 0.20121 + }, + { + "acc": 0.78757305, + "epoch": 0.8917015412881494, + "grad_norm": 5.75, + "learning_rate": 6.110341831439628e-06, + "loss": 0.76513844, + "memory(GiB)": 147.13, + "step": 38220, + "train_speed(iter/s)": 0.201236 + }, + { + "acc": 0.77797546, + "epoch": 0.8919348488604383, + "grad_norm": 5.1875, + "learning_rate": 6.108499809639887e-06, + "loss": 0.80226593, + "memory(GiB)": 147.13, + "step": 38230, + "train_speed(iter/s)": 0.201263 + }, + { + "acc": 0.78466263, + "epoch": 0.8921681564327272, + "grad_norm": 6.09375, + "learning_rate": 6.106657629602122e-06, + "loss": 0.76339788, + "memory(GiB)": 147.13, + "step": 38240, + "train_speed(iter/s)": 0.201292 + }, + { + "acc": 0.78802223, + "epoch": 0.8924014640050161, + "grad_norm": 4.5, + "learning_rate": 6.104815291589299e-06, + "loss": 0.75786328, + "memory(GiB)": 147.13, + "step": 38250, + "train_speed(iter/s)": 0.20132 + }, + { + "acc": 0.77927275, + "epoch": 0.892634771577305, + "grad_norm": 7.09375, + "learning_rate": 6.1029727958644144e-06, + "loss": 0.80576792, + "memory(GiB)": 147.13, + "step": 38260, + "train_speed(iter/s)": 0.201349 + }, + { + "acc": 0.78089199, + "epoch": 0.8928680791495939, + "grad_norm": 5.84375, + "learning_rate": 6.1011301426904845e-06, + "loss": 0.80531197, + "memory(GiB)": 147.13, + "step": 38270, + "train_speed(iter/s)": 0.201377 + }, + { + "acc": 0.79331675, + "epoch": 0.8931013867218828, + "grad_norm": 5.59375, + "learning_rate": 6.0992873323305465e-06, + "loss": 0.74135227, + "memory(GiB)": 147.13, + "step": 38280, + "train_speed(iter/s)": 0.201404 + }, + { + "acc": 0.78814082, + "epoch": 0.8933346942941717, + "grad_norm": 4.9375, + "learning_rate": 6.097444365047662e-06, + "loss": 0.76247959, + "memory(GiB)": 147.13, + "step": 38290, + "train_speed(iter/s)": 0.201432 + }, + { + "acc": 0.76565762, + "epoch": 0.8935680018664606, + "grad_norm": 6.375, + "learning_rate": 6.095601241104911e-06, + "loss": 0.85332813, + "memory(GiB)": 147.13, + "step": 38300, + "train_speed(iter/s)": 0.201459 + }, + { + "acc": 0.79775143, + "epoch": 0.8938013094387495, + "grad_norm": 4.78125, + "learning_rate": 6.093757960765404e-06, + "loss": 0.72894554, + "memory(GiB)": 147.13, + "step": 38310, + "train_speed(iter/s)": 0.201486 + }, + { + "acc": 0.76751165, + "epoch": 0.8940346170110384, + "grad_norm": 4.90625, + "learning_rate": 6.091914524292264e-06, + "loss": 0.86291084, + "memory(GiB)": 147.13, + "step": 38320, + "train_speed(iter/s)": 0.201515 + }, + { + "acc": 0.76315336, + "epoch": 0.8942679245833273, + "grad_norm": 6.5, + "learning_rate": 6.090070931948643e-06, + "loss": 0.85644455, + "memory(GiB)": 147.13, + "step": 38330, + "train_speed(iter/s)": 0.201542 + }, + { + "acc": 0.79633956, + "epoch": 0.8945012321556162, + "grad_norm": 6.65625, + "learning_rate": 6.088227183997715e-06, + "loss": 0.72798471, + "memory(GiB)": 147.13, + "step": 38340, + "train_speed(iter/s)": 0.20157 + }, + { + "acc": 0.78646612, + "epoch": 0.8947345397279051, + "grad_norm": 5.65625, + "learning_rate": 6.08638328070267e-06, + "loss": 0.75003166, + "memory(GiB)": 147.13, + "step": 38350, + "train_speed(iter/s)": 0.201596 + }, + { + "acc": 0.77957439, + "epoch": 0.894967847300194, + "grad_norm": 5.875, + "learning_rate": 6.084539222326728e-06, + "loss": 0.78398819, + "memory(GiB)": 147.13, + "step": 38360, + "train_speed(iter/s)": 0.201624 + }, + { + "acc": 0.78460279, + "epoch": 0.8952011548724829, + "grad_norm": 6.96875, + "learning_rate": 6.082695009133126e-06, + "loss": 0.77256527, + "memory(GiB)": 147.13, + "step": 38370, + "train_speed(iter/s)": 0.201652 + }, + { + "acc": 0.78010826, + "epoch": 0.8954344624447718, + "grad_norm": 6.8125, + "learning_rate": 6.080850641385129e-06, + "loss": 0.79071956, + "memory(GiB)": 147.13, + "step": 38380, + "train_speed(iter/s)": 0.201679 + }, + { + "acc": 0.76988091, + "epoch": 0.8956677700170607, + "grad_norm": 4.6875, + "learning_rate": 6.079006119346015e-06, + "loss": 0.83236942, + "memory(GiB)": 147.13, + "step": 38390, + "train_speed(iter/s)": 0.201707 + }, + { + "acc": 0.78275881, + "epoch": 0.8959010775893496, + "grad_norm": 4.90625, + "learning_rate": 6.0771614432790915e-06, + "loss": 0.76272345, + "memory(GiB)": 147.13, + "step": 38400, + "train_speed(iter/s)": 0.201736 + }, + { + "acc": 0.79821758, + "epoch": 0.8961343851616383, + "grad_norm": 3.765625, + "learning_rate": 6.075316613447684e-06, + "loss": 0.7420167, + "memory(GiB)": 147.13, + "step": 38410, + "train_speed(iter/s)": 0.20176 + }, + { + "acc": 0.78013487, + "epoch": 0.8963676927339272, + "grad_norm": 5.03125, + "learning_rate": 6.073471630115142e-06, + "loss": 0.77953615, + "memory(GiB)": 147.13, + "step": 38420, + "train_speed(iter/s)": 0.201788 + }, + { + "acc": 0.76554446, + "epoch": 0.8966010003062161, + "grad_norm": 5.4375, + "learning_rate": 6.071626493544838e-06, + "loss": 0.85112772, + "memory(GiB)": 147.13, + "step": 38430, + "train_speed(iter/s)": 0.201815 + }, + { + "acc": 0.78542604, + "epoch": 0.896834307878505, + "grad_norm": 4.34375, + "learning_rate": 6.0697812040001625e-06, + "loss": 0.7776813, + "memory(GiB)": 147.13, + "step": 38440, + "train_speed(iter/s)": 0.201842 + }, + { + "acc": 0.79344821, + "epoch": 0.8970676154507939, + "grad_norm": 8.5, + "learning_rate": 6.067935761744531e-06, + "loss": 0.74864655, + "memory(GiB)": 147.13, + "step": 38450, + "train_speed(iter/s)": 0.201867 + }, + { + "acc": 0.77485456, + "epoch": 0.8973009230230828, + "grad_norm": 5.4375, + "learning_rate": 6.066090167041381e-06, + "loss": 0.78871832, + "memory(GiB)": 147.13, + "step": 38460, + "train_speed(iter/s)": 0.201894 + }, + { + "acc": 0.7692565, + "epoch": 0.8975342305953717, + "grad_norm": 6.0625, + "learning_rate": 6.0642444201541686e-06, + "loss": 0.83406401, + "memory(GiB)": 147.13, + "step": 38470, + "train_speed(iter/s)": 0.201921 + }, + { + "acc": 0.78231134, + "epoch": 0.8977675381676606, + "grad_norm": 4.28125, + "learning_rate": 6.062398521346374e-06, + "loss": 0.77111473, + "memory(GiB)": 147.13, + "step": 38480, + "train_speed(iter/s)": 0.201948 + }, + { + "acc": 0.78063517, + "epoch": 0.8980008457399495, + "grad_norm": 5.96875, + "learning_rate": 6.060552470881498e-06, + "loss": 0.77957792, + "memory(GiB)": 147.13, + "step": 38490, + "train_speed(iter/s)": 0.201974 + }, + { + "acc": 0.77691154, + "epoch": 0.8982341533122384, + "grad_norm": 4.65625, + "learning_rate": 6.0587062690230654e-06, + "loss": 0.79885607, + "memory(GiB)": 147.13, + "step": 38500, + "train_speed(iter/s)": 0.202001 + }, + { + "epoch": 0.8982341533122384, + "eval_acc": 0.7434691180264593, + "eval_loss": 0.8091056942939758, + "eval_runtime": 1269.2357, + "eval_samples_per_second": 28.356, + "eval_steps_per_second": 14.179, + "step": 38500 + }, + { + "acc": 0.76533337, + "epoch": 0.8984674608845273, + "grad_norm": 5.875, + "learning_rate": 6.056859916034621e-06, + "loss": 0.85079603, + "memory(GiB)": 147.13, + "step": 38510, + "train_speed(iter/s)": 0.200674 + }, + { + "acc": 0.80131989, + "epoch": 0.8987007684568162, + "grad_norm": 5.15625, + "learning_rate": 6.055013412179732e-06, + "loss": 0.73174257, + "memory(GiB)": 147.13, + "step": 38520, + "train_speed(iter/s)": 0.200701 + }, + { + "acc": 0.77752538, + "epoch": 0.8989340760291051, + "grad_norm": 5.59375, + "learning_rate": 6.053166757721984e-06, + "loss": 0.79698305, + "memory(GiB)": 147.13, + "step": 38530, + "train_speed(iter/s)": 0.200726 + }, + { + "acc": 0.78054886, + "epoch": 0.899167383601394, + "grad_norm": 5.03125, + "learning_rate": 6.051319952924987e-06, + "loss": 0.77370358, + "memory(GiB)": 147.13, + "step": 38540, + "train_speed(iter/s)": 0.200752 + }, + { + "acc": 0.79228392, + "epoch": 0.8994006911736829, + "grad_norm": 6.125, + "learning_rate": 6.049472998052371e-06, + "loss": 0.73808813, + "memory(GiB)": 147.13, + "step": 38550, + "train_speed(iter/s)": 0.200779 + }, + { + "acc": 0.78893967, + "epoch": 0.8996339987459718, + "grad_norm": 11.9375, + "learning_rate": 6.047625893367791e-06, + "loss": 0.74676943, + "memory(GiB)": 147.13, + "step": 38560, + "train_speed(iter/s)": 0.200807 + }, + { + "acc": 0.77075801, + "epoch": 0.8998673063182607, + "grad_norm": 6.25, + "learning_rate": 6.0457786391349195e-06, + "loss": 0.83967133, + "memory(GiB)": 147.13, + "step": 38570, + "train_speed(iter/s)": 0.200834 + }, + { + "acc": 0.77375088, + "epoch": 0.9001006138905496, + "grad_norm": 35.5, + "learning_rate": 6.0439312356174495e-06, + "loss": 0.83246117, + "memory(GiB)": 147.13, + "step": 38580, + "train_speed(iter/s)": 0.200859 + }, + { + "acc": 0.77322402, + "epoch": 0.9003339214628385, + "grad_norm": 4.71875, + "learning_rate": 6.042083683079099e-06, + "loss": 0.80312271, + "memory(GiB)": 147.13, + "step": 38590, + "train_speed(iter/s)": 0.200886 + }, + { + "acc": 0.78943686, + "epoch": 0.9005672290351274, + "grad_norm": 7.0, + "learning_rate": 6.0402359817836065e-06, + "loss": 0.76622558, + "memory(GiB)": 147.13, + "step": 38600, + "train_speed(iter/s)": 0.200912 + }, + { + "acc": 0.78096523, + "epoch": 0.9008005366074162, + "grad_norm": 6.9375, + "learning_rate": 6.038388131994729e-06, + "loss": 0.78436923, + "memory(GiB)": 147.13, + "step": 38610, + "train_speed(iter/s)": 0.200938 + }, + { + "acc": 0.7831727, + "epoch": 0.9010338441797051, + "grad_norm": 6.21875, + "learning_rate": 6.036540133976247e-06, + "loss": 0.77339869, + "memory(GiB)": 147.13, + "step": 38620, + "train_speed(iter/s)": 0.200965 + }, + { + "acc": 0.77260571, + "epoch": 0.901267151751994, + "grad_norm": 5.3125, + "learning_rate": 6.034691987991963e-06, + "loss": 0.81804924, + "memory(GiB)": 147.13, + "step": 38630, + "train_speed(iter/s)": 0.200992 + }, + { + "acc": 0.7917419, + "epoch": 0.9015004593242829, + "grad_norm": 3.796875, + "learning_rate": 6.032843694305698e-06, + "loss": 0.74714317, + "memory(GiB)": 147.13, + "step": 38640, + "train_speed(iter/s)": 0.201018 + }, + { + "acc": 0.78762455, + "epoch": 0.9017337668965718, + "grad_norm": 7.6875, + "learning_rate": 6.0309952531812955e-06, + "loss": 0.74814758, + "memory(GiB)": 147.13, + "step": 38650, + "train_speed(iter/s)": 0.201046 + }, + { + "acc": 0.78616996, + "epoch": 0.9019670744688607, + "grad_norm": 6.15625, + "learning_rate": 6.029146664882619e-06, + "loss": 0.77387581, + "memory(GiB)": 147.13, + "step": 38660, + "train_speed(iter/s)": 0.201075 + }, + { + "acc": 0.7877409, + "epoch": 0.9022003820411496, + "grad_norm": 4.65625, + "learning_rate": 6.027297929673557e-06, + "loss": 0.75647035, + "memory(GiB)": 147.13, + "step": 38670, + "train_speed(iter/s)": 0.201104 + }, + { + "acc": 0.77807121, + "epoch": 0.9024336896134385, + "grad_norm": 5.65625, + "learning_rate": 6.025449047818012e-06, + "loss": 0.80177183, + "memory(GiB)": 147.13, + "step": 38680, + "train_speed(iter/s)": 0.201132 + }, + { + "acc": 0.79884205, + "epoch": 0.9026669971857274, + "grad_norm": 3.53125, + "learning_rate": 6.0236000195799164e-06, + "loss": 0.73010259, + "memory(GiB)": 147.13, + "step": 38690, + "train_speed(iter/s)": 0.201158 + }, + { + "acc": 0.78179312, + "epoch": 0.9029003047580163, + "grad_norm": 6.6875, + "learning_rate": 6.0217508452232135e-06, + "loss": 0.76933064, + "memory(GiB)": 147.13, + "step": 38700, + "train_speed(iter/s)": 0.201183 + }, + { + "acc": 0.77845345, + "epoch": 0.9031336123303052, + "grad_norm": 8.9375, + "learning_rate": 6.019901525011873e-06, + "loss": 0.78353348, + "memory(GiB)": 147.13, + "step": 38710, + "train_speed(iter/s)": 0.201211 + }, + { + "acc": 0.78779402, + "epoch": 0.9033669199025941, + "grad_norm": 3.765625, + "learning_rate": 6.018052059209887e-06, + "loss": 0.76179886, + "memory(GiB)": 147.13, + "step": 38720, + "train_speed(iter/s)": 0.201239 + }, + { + "acc": 0.75972013, + "epoch": 0.903600227474883, + "grad_norm": 5.5625, + "learning_rate": 6.016202448081266e-06, + "loss": 0.8852212, + "memory(GiB)": 147.13, + "step": 38730, + "train_speed(iter/s)": 0.201265 + }, + { + "acc": 0.77685909, + "epoch": 0.9038335350471719, + "grad_norm": 7.03125, + "learning_rate": 6.014352691890041e-06, + "loss": 0.79779468, + "memory(GiB)": 147.13, + "step": 38740, + "train_speed(iter/s)": 0.201293 + }, + { + "acc": 0.79123516, + "epoch": 0.9040668426194608, + "grad_norm": 4.5625, + "learning_rate": 6.012502790900263e-06, + "loss": 0.76187844, + "memory(GiB)": 147.13, + "step": 38750, + "train_speed(iter/s)": 0.201321 + }, + { + "acc": 0.78490562, + "epoch": 0.9043001501917497, + "grad_norm": 5.65625, + "learning_rate": 6.010652745376006e-06, + "loss": 0.77625852, + "memory(GiB)": 147.13, + "step": 38760, + "train_speed(iter/s)": 0.201347 + }, + { + "acc": 0.77033277, + "epoch": 0.9045334577640386, + "grad_norm": 5.15625, + "learning_rate": 6.008802555581364e-06, + "loss": 0.81603584, + "memory(GiB)": 147.13, + "step": 38770, + "train_speed(iter/s)": 0.201374 + }, + { + "acc": 0.76615601, + "epoch": 0.9047667653363275, + "grad_norm": 6.5, + "learning_rate": 6.006952221780447e-06, + "loss": 0.86489754, + "memory(GiB)": 147.13, + "step": 38780, + "train_speed(iter/s)": 0.201402 + }, + { + "acc": 0.77902718, + "epoch": 0.9050000729086164, + "grad_norm": 6.21875, + "learning_rate": 6.005101744237396e-06, + "loss": 0.79199033, + "memory(GiB)": 147.13, + "step": 38790, + "train_speed(iter/s)": 0.20143 + }, + { + "acc": 0.78239946, + "epoch": 0.9052333804809052, + "grad_norm": 6.84375, + "learning_rate": 6.003251123216362e-06, + "loss": 0.76015949, + "memory(GiB)": 147.13, + "step": 38800, + "train_speed(iter/s)": 0.201457 + }, + { + "acc": 0.79029064, + "epoch": 0.9054666880531941, + "grad_norm": 5.625, + "learning_rate": 6.001400358981522e-06, + "loss": 0.73135719, + "memory(GiB)": 147.13, + "step": 38810, + "train_speed(iter/s)": 0.201484 + }, + { + "acc": 0.75774117, + "epoch": 0.905699995625483, + "grad_norm": 8.25, + "learning_rate": 5.999549451797073e-06, + "loss": 0.88968811, + "memory(GiB)": 147.13, + "step": 38820, + "train_speed(iter/s)": 0.201513 + }, + { + "acc": 0.78999863, + "epoch": 0.9059333031977719, + "grad_norm": 5.9375, + "learning_rate": 5.997698401927228e-06, + "loss": 0.77355022, + "memory(GiB)": 147.13, + "step": 38830, + "train_speed(iter/s)": 0.201538 + }, + { + "acc": 0.76618147, + "epoch": 0.9061666107700608, + "grad_norm": 5.78125, + "learning_rate": 5.995847209636227e-06, + "loss": 0.85498562, + "memory(GiB)": 147.13, + "step": 38840, + "train_speed(iter/s)": 0.201565 + }, + { + "acc": 0.7809248, + "epoch": 0.9063999183423497, + "grad_norm": 6.0625, + "learning_rate": 5.993995875188324e-06, + "loss": 0.76933775, + "memory(GiB)": 147.13, + "step": 38850, + "train_speed(iter/s)": 0.201591 + }, + { + "acc": 0.77357092, + "epoch": 0.9066332259146386, + "grad_norm": 4.6875, + "learning_rate": 5.992144398847801e-06, + "loss": 0.81348381, + "memory(GiB)": 147.13, + "step": 38860, + "train_speed(iter/s)": 0.201618 + }, + { + "acc": 0.77968731, + "epoch": 0.9068665334869275, + "grad_norm": 5.40625, + "learning_rate": 5.990292780878952e-06, + "loss": 0.79122777, + "memory(GiB)": 147.13, + "step": 38870, + "train_speed(iter/s)": 0.201646 + }, + { + "acc": 0.77165852, + "epoch": 0.9070998410592164, + "grad_norm": 7.46875, + "learning_rate": 5.988441021546097e-06, + "loss": 0.82253704, + "memory(GiB)": 147.13, + "step": 38880, + "train_speed(iter/s)": 0.201674 + }, + { + "acc": 0.798492, + "epoch": 0.9073331486315053, + "grad_norm": 5.15625, + "learning_rate": 5.986589121113574e-06, + "loss": 0.71198359, + "memory(GiB)": 147.13, + "step": 38890, + "train_speed(iter/s)": 0.2017 + }, + { + "acc": 0.78536911, + "epoch": 0.9075664562037942, + "grad_norm": 4.78125, + "learning_rate": 5.9847370798457395e-06, + "loss": 0.77795515, + "memory(GiB)": 147.13, + "step": 38900, + "train_speed(iter/s)": 0.201727 + }, + { + "acc": 0.80710964, + "epoch": 0.9077997637760831, + "grad_norm": 5.375, + "learning_rate": 5.982884898006973e-06, + "loss": 0.67784872, + "memory(GiB)": 147.13, + "step": 38910, + "train_speed(iter/s)": 0.201755 + }, + { + "acc": 0.76790614, + "epoch": 0.908033071348372, + "grad_norm": 4.5, + "learning_rate": 5.981032575861674e-06, + "loss": 0.83228683, + "memory(GiB)": 147.13, + "step": 38920, + "train_speed(iter/s)": 0.201782 + }, + { + "acc": 0.78043723, + "epoch": 0.9082663789206609, + "grad_norm": 8.8125, + "learning_rate": 5.979180113674258e-06, + "loss": 0.79833183, + "memory(GiB)": 147.13, + "step": 38930, + "train_speed(iter/s)": 0.201808 + }, + { + "acc": 0.78106966, + "epoch": 0.9084996864929498, + "grad_norm": 4.65625, + "learning_rate": 5.9773275117091655e-06, + "loss": 0.76774416, + "memory(GiB)": 147.13, + "step": 38940, + "train_speed(iter/s)": 0.201834 + }, + { + "acc": 0.77814817, + "epoch": 0.9087329940652387, + "grad_norm": 6.0, + "learning_rate": 5.975474770230856e-06, + "loss": 0.80169296, + "memory(GiB)": 147.13, + "step": 38950, + "train_speed(iter/s)": 0.201858 + }, + { + "acc": 0.79056687, + "epoch": 0.9089663016375276, + "grad_norm": 4.59375, + "learning_rate": 5.973621889503804e-06, + "loss": 0.74093547, + "memory(GiB)": 147.13, + "step": 38960, + "train_speed(iter/s)": 0.201886 + }, + { + "acc": 0.75867233, + "epoch": 0.9091996092098165, + "grad_norm": 10.8125, + "learning_rate": 5.9717688697925134e-06, + "loss": 0.87084265, + "memory(GiB)": 147.13, + "step": 38970, + "train_speed(iter/s)": 0.201913 + }, + { + "acc": 0.77255449, + "epoch": 0.9094329167821054, + "grad_norm": 4.78125, + "learning_rate": 5.969915711361497e-06, + "loss": 0.82182369, + "memory(GiB)": 147.13, + "step": 38980, + "train_speed(iter/s)": 0.201939 + }, + { + "acc": 0.77857056, + "epoch": 0.9096662243543941, + "grad_norm": 5.15625, + "learning_rate": 5.968062414475294e-06, + "loss": 0.80886345, + "memory(GiB)": 147.13, + "step": 38990, + "train_speed(iter/s)": 0.201965 + }, + { + "acc": 0.77122827, + "epoch": 0.909899531926683, + "grad_norm": 8.625, + "learning_rate": 5.966208979398462e-06, + "loss": 0.82270679, + "memory(GiB)": 147.13, + "step": 39000, + "train_speed(iter/s)": 0.201991 + }, + { + "epoch": 0.909899531926683, + "eval_acc": 0.7434351674492115, + "eval_loss": 0.8086429834365845, + "eval_runtime": 1270.6235, + "eval_samples_per_second": 28.325, + "eval_steps_per_second": 14.163, + "step": 39000 + }, + { + "acc": 0.78865824, + "epoch": 0.910132839498972, + "grad_norm": 5.09375, + "learning_rate": 5.964355406395581e-06, + "loss": 0.73996811, + "memory(GiB)": 147.13, + "step": 39010, + "train_speed(iter/s)": 0.200679 + }, + { + "acc": 0.77401848, + "epoch": 0.9103661470712608, + "grad_norm": 7.21875, + "learning_rate": 5.962501695731245e-06, + "loss": 0.81361609, + "memory(GiB)": 147.13, + "step": 39020, + "train_speed(iter/s)": 0.200707 + }, + { + "acc": 0.76471219, + "epoch": 0.9105994546435497, + "grad_norm": 6.40625, + "learning_rate": 5.9606478476700714e-06, + "loss": 0.85037766, + "memory(GiB)": 147.13, + "step": 39030, + "train_speed(iter/s)": 0.200733 + }, + { + "acc": 0.79004974, + "epoch": 0.9108327622158386, + "grad_norm": 7.375, + "learning_rate": 5.958793862476699e-06, + "loss": 0.75956354, + "memory(GiB)": 147.13, + "step": 39040, + "train_speed(iter/s)": 0.20076 + }, + { + "acc": 0.78604002, + "epoch": 0.9110660697881275, + "grad_norm": 11.625, + "learning_rate": 5.956939740415778e-06, + "loss": 0.77915764, + "memory(GiB)": 147.13, + "step": 39050, + "train_speed(iter/s)": 0.200785 + }, + { + "acc": 0.78488879, + "epoch": 0.9112993773604164, + "grad_norm": 4.65625, + "learning_rate": 5.9550854817519875e-06, + "loss": 0.75412149, + "memory(GiB)": 147.13, + "step": 39060, + "train_speed(iter/s)": 0.200811 + }, + { + "acc": 0.79100518, + "epoch": 0.9115326849327053, + "grad_norm": 3.953125, + "learning_rate": 5.953231086750022e-06, + "loss": 0.7392344, + "memory(GiB)": 147.13, + "step": 39070, + "train_speed(iter/s)": 0.200837 + }, + { + "acc": 0.76816645, + "epoch": 0.9117659925049942, + "grad_norm": 7.78125, + "learning_rate": 5.951376555674596e-06, + "loss": 0.84252634, + "memory(GiB)": 147.13, + "step": 39080, + "train_speed(iter/s)": 0.200863 + }, + { + "acc": 0.7637064, + "epoch": 0.9119993000772831, + "grad_norm": 6.3125, + "learning_rate": 5.949521888790444e-06, + "loss": 0.85107002, + "memory(GiB)": 147.13, + "step": 39090, + "train_speed(iter/s)": 0.200889 + }, + { + "acc": 0.77783241, + "epoch": 0.912232607649572, + "grad_norm": 5.8125, + "learning_rate": 5.947667086362318e-06, + "loss": 0.78836126, + "memory(GiB)": 147.13, + "step": 39100, + "train_speed(iter/s)": 0.200914 + }, + { + "acc": 0.79197435, + "epoch": 0.9124659152218609, + "grad_norm": 5.75, + "learning_rate": 5.945812148654991e-06, + "loss": 0.74622936, + "memory(GiB)": 147.13, + "step": 39110, + "train_speed(iter/s)": 0.200941 + }, + { + "acc": 0.78090453, + "epoch": 0.9126992227941498, + "grad_norm": 6.84375, + "learning_rate": 5.943957075933253e-06, + "loss": 0.77442908, + "memory(GiB)": 147.13, + "step": 39120, + "train_speed(iter/s)": 0.200968 + }, + { + "acc": 0.80023155, + "epoch": 0.9129325303664387, + "grad_norm": 4.4375, + "learning_rate": 5.9421018684619165e-06, + "loss": 0.71343641, + "memory(GiB)": 147.13, + "step": 39130, + "train_speed(iter/s)": 0.200995 + }, + { + "acc": 0.7824955, + "epoch": 0.9131658379387276, + "grad_norm": 4.6875, + "learning_rate": 5.940246526505814e-06, + "loss": 0.78219862, + "memory(GiB)": 147.13, + "step": 39140, + "train_speed(iter/s)": 0.201021 + }, + { + "acc": 0.78558354, + "epoch": 0.9133991455110165, + "grad_norm": 7.34375, + "learning_rate": 5.9383910503297915e-06, + "loss": 0.77994785, + "memory(GiB)": 147.13, + "step": 39150, + "train_speed(iter/s)": 0.201046 + }, + { + "acc": 0.77539968, + "epoch": 0.9136324530833054, + "grad_norm": 5.75, + "learning_rate": 5.9365354401987195e-06, + "loss": 0.80972853, + "memory(GiB)": 147.13, + "step": 39160, + "train_speed(iter/s)": 0.201072 + }, + { + "acc": 0.78717179, + "epoch": 0.9138657606555943, + "grad_norm": 5.78125, + "learning_rate": 5.934679696377486e-06, + "loss": 0.77642155, + "memory(GiB)": 147.13, + "step": 39170, + "train_speed(iter/s)": 0.2011 + }, + { + "acc": 0.76665878, + "epoch": 0.9140990682278831, + "grad_norm": 4.21875, + "learning_rate": 5.932823819130997e-06, + "loss": 0.84620333, + "memory(GiB)": 147.13, + "step": 39180, + "train_speed(iter/s)": 0.201124 + }, + { + "acc": 0.77948418, + "epoch": 0.914332375800172, + "grad_norm": 4.78125, + "learning_rate": 5.930967808724178e-06, + "loss": 0.79968572, + "memory(GiB)": 147.13, + "step": 39190, + "train_speed(iter/s)": 0.201152 + }, + { + "acc": 0.79536867, + "epoch": 0.9145656833724609, + "grad_norm": 4.875, + "learning_rate": 5.929111665421976e-06, + "loss": 0.7362793, + "memory(GiB)": 147.13, + "step": 39200, + "train_speed(iter/s)": 0.201179 + }, + { + "acc": 0.76817694, + "epoch": 0.9147989909447498, + "grad_norm": 5.4375, + "learning_rate": 5.927255389489354e-06, + "loss": 0.81463871, + "memory(GiB)": 147.13, + "step": 39210, + "train_speed(iter/s)": 0.201206 + }, + { + "acc": 0.78327427, + "epoch": 0.9150322985170387, + "grad_norm": 7.375, + "learning_rate": 5.925398981191293e-06, + "loss": 0.75333605, + "memory(GiB)": 147.13, + "step": 39220, + "train_speed(iter/s)": 0.201233 + }, + { + "acc": 0.78863673, + "epoch": 0.9152656060893276, + "grad_norm": 4.53125, + "learning_rate": 5.9235424407927965e-06, + "loss": 0.749193, + "memory(GiB)": 147.13, + "step": 39230, + "train_speed(iter/s)": 0.201259 + }, + { + "acc": 0.76760859, + "epoch": 0.9154989136616165, + "grad_norm": 6.9375, + "learning_rate": 5.9216857685588855e-06, + "loss": 0.82770119, + "memory(GiB)": 147.13, + "step": 39240, + "train_speed(iter/s)": 0.201284 + }, + { + "acc": 0.78653831, + "epoch": 0.9157322212339054, + "grad_norm": 5.09375, + "learning_rate": 5.919828964754599e-06, + "loss": 0.76294737, + "memory(GiB)": 147.13, + "step": 39250, + "train_speed(iter/s)": 0.201312 + }, + { + "acc": 0.76094551, + "epoch": 0.9159655288061943, + "grad_norm": 6.21875, + "learning_rate": 5.917972029644995e-06, + "loss": 0.88056946, + "memory(GiB)": 147.13, + "step": 39260, + "train_speed(iter/s)": 0.201338 + }, + { + "acc": 0.7901597, + "epoch": 0.9161988363784832, + "grad_norm": 6.28125, + "learning_rate": 5.91611496349515e-06, + "loss": 0.76451311, + "memory(GiB)": 147.13, + "step": 39270, + "train_speed(iter/s)": 0.201364 + }, + { + "acc": 0.78220377, + "epoch": 0.9164321439507721, + "grad_norm": 4.46875, + "learning_rate": 5.91425776657016e-06, + "loss": 0.78790097, + "memory(GiB)": 147.13, + "step": 39280, + "train_speed(iter/s)": 0.201389 + }, + { + "acc": 0.76608167, + "epoch": 0.916665451523061, + "grad_norm": 5.03125, + "learning_rate": 5.912400439135139e-06, + "loss": 0.8713728, + "memory(GiB)": 147.13, + "step": 39290, + "train_speed(iter/s)": 0.201416 + }, + { + "acc": 0.77654085, + "epoch": 0.9168987590953499, + "grad_norm": 5.5625, + "learning_rate": 5.9105429814552204e-06, + "loss": 0.81738625, + "memory(GiB)": 147.13, + "step": 39300, + "train_speed(iter/s)": 0.201442 + }, + { + "acc": 0.79101648, + "epoch": 0.9171320666676388, + "grad_norm": 7.65625, + "learning_rate": 5.908685393795557e-06, + "loss": 0.74483733, + "memory(GiB)": 147.13, + "step": 39310, + "train_speed(iter/s)": 0.201467 + }, + { + "acc": 0.79285164, + "epoch": 0.9173653742399277, + "grad_norm": 11.4375, + "learning_rate": 5.9068276764213175e-06, + "loss": 0.73478308, + "memory(GiB)": 147.13, + "step": 39320, + "train_speed(iter/s)": 0.201495 + }, + { + "acc": 0.78043051, + "epoch": 0.9175986818122166, + "grad_norm": 6.78125, + "learning_rate": 5.90496982959769e-06, + "loss": 0.80263834, + "memory(GiB)": 147.13, + "step": 39330, + "train_speed(iter/s)": 0.201522 + }, + { + "acc": 0.77075157, + "epoch": 0.9178319893845055, + "grad_norm": 6.25, + "learning_rate": 5.903111853589881e-06, + "loss": 0.81982746, + "memory(GiB)": 147.13, + "step": 39340, + "train_speed(iter/s)": 0.201548 + }, + { + "acc": 0.78020802, + "epoch": 0.9180652969567944, + "grad_norm": 4.84375, + "learning_rate": 5.9012537486631185e-06, + "loss": 0.78267994, + "memory(GiB)": 147.13, + "step": 39350, + "train_speed(iter/s)": 0.201574 + }, + { + "acc": 0.77198472, + "epoch": 0.9182986045290833, + "grad_norm": 5.0625, + "learning_rate": 5.899395515082644e-06, + "loss": 0.82205286, + "memory(GiB)": 147.13, + "step": 39360, + "train_speed(iter/s)": 0.2016 + }, + { + "acc": 0.77900686, + "epoch": 0.9185319121013722, + "grad_norm": 4.40625, + "learning_rate": 5.897537153113724e-06, + "loss": 0.77138987, + "memory(GiB)": 147.13, + "step": 39370, + "train_speed(iter/s)": 0.201626 + }, + { + "acc": 0.7678503, + "epoch": 0.918765219673661, + "grad_norm": 5.96875, + "learning_rate": 5.895678663021634e-06, + "loss": 0.83069754, + "memory(GiB)": 147.13, + "step": 39380, + "train_speed(iter/s)": 0.201654 + }, + { + "acc": 0.775527, + "epoch": 0.9189985272459499, + "grad_norm": 5.5, + "learning_rate": 5.893820045071675e-06, + "loss": 0.80461073, + "memory(GiB)": 147.13, + "step": 39390, + "train_speed(iter/s)": 0.201678 + }, + { + "acc": 0.77663894, + "epoch": 0.9192318348182388, + "grad_norm": 6.15625, + "learning_rate": 5.891961299529165e-06, + "loss": 0.79502859, + "memory(GiB)": 147.13, + "step": 39400, + "train_speed(iter/s)": 0.201704 + }, + { + "acc": 0.76121426, + "epoch": 0.9194651423905277, + "grad_norm": 5.75, + "learning_rate": 5.890102426659438e-06, + "loss": 0.8578661, + "memory(GiB)": 147.13, + "step": 39410, + "train_speed(iter/s)": 0.20173 + }, + { + "acc": 0.78363619, + "epoch": 0.9196984499628166, + "grad_norm": 9.0625, + "learning_rate": 5.888243426727847e-06, + "loss": 0.78227863, + "memory(GiB)": 147.13, + "step": 39420, + "train_speed(iter/s)": 0.201755 + }, + { + "acc": 0.7768178, + "epoch": 0.9199317575351055, + "grad_norm": 6.5625, + "learning_rate": 5.886384299999767e-06, + "loss": 0.81008139, + "memory(GiB)": 147.13, + "step": 39430, + "train_speed(iter/s)": 0.20178 + }, + { + "acc": 0.77163954, + "epoch": 0.9201650651073944, + "grad_norm": 6.0, + "learning_rate": 5.884525046740586e-06, + "loss": 0.83757038, + "memory(GiB)": 147.13, + "step": 39440, + "train_speed(iter/s)": 0.201805 + }, + { + "acc": 0.76802855, + "epoch": 0.9203983726796833, + "grad_norm": 4.625, + "learning_rate": 5.882665667215709e-06, + "loss": 0.84138384, + "memory(GiB)": 147.13, + "step": 39450, + "train_speed(iter/s)": 0.201832 + }, + { + "acc": 0.77422495, + "epoch": 0.9206316802519722, + "grad_norm": 6.0, + "learning_rate": 5.880806161690567e-06, + "loss": 0.82321262, + "memory(GiB)": 147.13, + "step": 39460, + "train_speed(iter/s)": 0.201859 + }, + { + "acc": 0.7739378, + "epoch": 0.9208649878242611, + "grad_norm": 6.25, + "learning_rate": 5.878946530430599e-06, + "loss": 0.81013727, + "memory(GiB)": 147.13, + "step": 39470, + "train_speed(iter/s)": 0.201885 + }, + { + "acc": 0.77839308, + "epoch": 0.92109829539655, + "grad_norm": 10.25, + "learning_rate": 5.877086773701271e-06, + "loss": 0.80809059, + "memory(GiB)": 147.13, + "step": 39480, + "train_speed(iter/s)": 0.201909 + }, + { + "acc": 0.78637233, + "epoch": 0.9213316029688389, + "grad_norm": 8.4375, + "learning_rate": 5.87522689176806e-06, + "loss": 0.78325071, + "memory(GiB)": 147.13, + "step": 39490, + "train_speed(iter/s)": 0.201935 + }, + { + "acc": 0.78502064, + "epoch": 0.9215649105411278, + "grad_norm": 19.625, + "learning_rate": 5.873366884896464e-06, + "loss": 0.78469729, + "memory(GiB)": 147.13, + "step": 39500, + "train_speed(iter/s)": 0.20196 + }, + { + "epoch": 0.9215649105411278, + "eval_acc": 0.7435581582196189, + "eval_loss": 0.8083301782608032, + "eval_runtime": 1269.8332, + "eval_samples_per_second": 28.343, + "eval_steps_per_second": 14.172, + "step": 39500 + }, + { + "acc": 0.758465, + "epoch": 0.9217982181134167, + "grad_norm": 4.4375, + "learning_rate": 5.871506753352e-06, + "loss": 0.88601971, + "memory(GiB)": 147.13, + "step": 39510, + "train_speed(iter/s)": 0.200665 + }, + { + "acc": 0.77248645, + "epoch": 0.9220315256857056, + "grad_norm": 5.03125, + "learning_rate": 5.869646497400199e-06, + "loss": 0.81327343, + "memory(GiB)": 147.13, + "step": 39520, + "train_speed(iter/s)": 0.200692 + }, + { + "acc": 0.78898239, + "epoch": 0.9222648332579945, + "grad_norm": 4.875, + "learning_rate": 5.867786117306614e-06, + "loss": 0.73193016, + "memory(GiB)": 147.13, + "step": 39530, + "train_speed(iter/s)": 0.200719 + }, + { + "acc": 0.77655249, + "epoch": 0.9224981408302834, + "grad_norm": 8.25, + "learning_rate": 5.865925613336814e-06, + "loss": 0.80876665, + "memory(GiB)": 147.13, + "step": 39540, + "train_speed(iter/s)": 0.200747 + }, + { + "acc": 0.79803247, + "epoch": 0.9227314484025723, + "grad_norm": 4.625, + "learning_rate": 5.864064985756382e-06, + "loss": 0.73142552, + "memory(GiB)": 147.13, + "step": 39550, + "train_speed(iter/s)": 0.200773 + }, + { + "acc": 0.78445044, + "epoch": 0.9229647559748612, + "grad_norm": 5.0, + "learning_rate": 5.862204234830925e-06, + "loss": 0.78575296, + "memory(GiB)": 147.13, + "step": 39560, + "train_speed(iter/s)": 0.200799 + }, + { + "acc": 0.77423639, + "epoch": 0.92319806354715, + "grad_norm": 7.25, + "learning_rate": 5.860343360826063e-06, + "loss": 0.79959049, + "memory(GiB)": 147.13, + "step": 39570, + "train_speed(iter/s)": 0.200826 + }, + { + "acc": 0.75270905, + "epoch": 0.9234313711194388, + "grad_norm": 5.40625, + "learning_rate": 5.858482364007438e-06, + "loss": 0.88813477, + "memory(GiB)": 147.13, + "step": 39580, + "train_speed(iter/s)": 0.200853 + }, + { + "acc": 0.77825441, + "epoch": 0.9236646786917277, + "grad_norm": 11.4375, + "learning_rate": 5.856621244640704e-06, + "loss": 0.79800529, + "memory(GiB)": 147.13, + "step": 39590, + "train_speed(iter/s)": 0.200877 + }, + { + "acc": 0.78405609, + "epoch": 0.9238979862640166, + "grad_norm": 5.34375, + "learning_rate": 5.8547600029915366e-06, + "loss": 0.77924662, + "memory(GiB)": 147.13, + "step": 39600, + "train_speed(iter/s)": 0.200903 + }, + { + "acc": 0.76475625, + "epoch": 0.9241312938363055, + "grad_norm": 5.25, + "learning_rate": 5.852898639325627e-06, + "loss": 0.84154186, + "memory(GiB)": 147.13, + "step": 39610, + "train_speed(iter/s)": 0.20093 + }, + { + "acc": 0.79988174, + "epoch": 0.9243646014085944, + "grad_norm": 5.5625, + "learning_rate": 5.851037153908684e-06, + "loss": 0.71289425, + "memory(GiB)": 147.13, + "step": 39620, + "train_speed(iter/s)": 0.200956 + }, + { + "acc": 0.77636728, + "epoch": 0.9245979089808833, + "grad_norm": 5.5625, + "learning_rate": 5.849175547006433e-06, + "loss": 0.8403513, + "memory(GiB)": 147.13, + "step": 39630, + "train_speed(iter/s)": 0.200981 + }, + { + "acc": 0.76683617, + "epoch": 0.9248312165531722, + "grad_norm": 5.75, + "learning_rate": 5.8473138188846216e-06, + "loss": 0.8414938, + "memory(GiB)": 147.13, + "step": 39640, + "train_speed(iter/s)": 0.201007 + }, + { + "acc": 0.78084307, + "epoch": 0.9250645241254611, + "grad_norm": 4.65625, + "learning_rate": 5.845451969809009e-06, + "loss": 0.79839873, + "memory(GiB)": 147.13, + "step": 39650, + "train_speed(iter/s)": 0.201035 + }, + { + "acc": 0.79516888, + "epoch": 0.92529783169775, + "grad_norm": 3.96875, + "learning_rate": 5.843590000045372e-06, + "loss": 0.71090298, + "memory(GiB)": 147.13, + "step": 39660, + "train_speed(iter/s)": 0.201063 + }, + { + "acc": 0.7792943, + "epoch": 0.9255311392700389, + "grad_norm": 5.03125, + "learning_rate": 5.841727909859508e-06, + "loss": 0.78876328, + "memory(GiB)": 147.13, + "step": 39670, + "train_speed(iter/s)": 0.201089 + }, + { + "acc": 0.76696959, + "epoch": 0.9257644468423278, + "grad_norm": 8.625, + "learning_rate": 5.83986569951723e-06, + "loss": 0.85549431, + "memory(GiB)": 147.13, + "step": 39680, + "train_speed(iter/s)": 0.201115 + }, + { + "acc": 0.78547182, + "epoch": 0.9259977544146167, + "grad_norm": 4.59375, + "learning_rate": 5.838003369284366e-06, + "loss": 0.7880579, + "memory(GiB)": 147.13, + "step": 39690, + "train_speed(iter/s)": 0.20114 + }, + { + "acc": 0.7796061, + "epoch": 0.9262310619869056, + "grad_norm": 4.0625, + "learning_rate": 5.836140919426765e-06, + "loss": 0.78260422, + "memory(GiB)": 147.13, + "step": 39700, + "train_speed(iter/s)": 0.201165 + }, + { + "acc": 0.77718563, + "epoch": 0.9264643695591945, + "grad_norm": 4.15625, + "learning_rate": 5.834278350210292e-06, + "loss": 0.81985626, + "memory(GiB)": 147.13, + "step": 39710, + "train_speed(iter/s)": 0.201192 + }, + { + "acc": 0.78238964, + "epoch": 0.9266976771314834, + "grad_norm": 6.65625, + "learning_rate": 5.832415661900826e-06, + "loss": 0.7647913, + "memory(GiB)": 147.13, + "step": 39720, + "train_speed(iter/s)": 0.201217 + }, + { + "acc": 0.78833084, + "epoch": 0.9269309847037723, + "grad_norm": 4.90625, + "learning_rate": 5.830552854764265e-06, + "loss": 0.73814125, + "memory(GiB)": 147.13, + "step": 39730, + "train_speed(iter/s)": 0.201244 + }, + { + "acc": 0.77150917, + "epoch": 0.9271642922760612, + "grad_norm": 5.3125, + "learning_rate": 5.828689929066526e-06, + "loss": 0.81474161, + "memory(GiB)": 147.13, + "step": 39740, + "train_speed(iter/s)": 0.20127 + }, + { + "acc": 0.78371716, + "epoch": 0.9273975998483501, + "grad_norm": 5.34375, + "learning_rate": 5.826826885073541e-06, + "loss": 0.77574019, + "memory(GiB)": 147.13, + "step": 39750, + "train_speed(iter/s)": 0.201296 + }, + { + "acc": 0.79163008, + "epoch": 0.9276309074206389, + "grad_norm": 6.09375, + "learning_rate": 5.824963723051258e-06, + "loss": 0.75398407, + "memory(GiB)": 147.13, + "step": 39760, + "train_speed(iter/s)": 0.201322 + }, + { + "acc": 0.77016096, + "epoch": 0.9278642149929278, + "grad_norm": 5.71875, + "learning_rate": 5.823100443265643e-06, + "loss": 0.84416866, + "memory(GiB)": 147.13, + "step": 39770, + "train_speed(iter/s)": 0.201349 + }, + { + "acc": 0.7718441, + "epoch": 0.9280975225652167, + "grad_norm": 6.71875, + "learning_rate": 5.821237045982679e-06, + "loss": 0.82021675, + "memory(GiB)": 147.13, + "step": 39780, + "train_speed(iter/s)": 0.201376 + }, + { + "acc": 0.76817198, + "epoch": 0.9283308301375056, + "grad_norm": 7.84375, + "learning_rate": 5.819373531468364e-06, + "loss": 0.82225189, + "memory(GiB)": 147.13, + "step": 39790, + "train_speed(iter/s)": 0.201401 + }, + { + "acc": 0.77029276, + "epoch": 0.9285641377097945, + "grad_norm": 10.125, + "learning_rate": 5.817509899988717e-06, + "loss": 0.81980934, + "memory(GiB)": 147.13, + "step": 39800, + "train_speed(iter/s)": 0.201428 + }, + { + "acc": 0.77408419, + "epoch": 0.9287974452820834, + "grad_norm": 5.0625, + "learning_rate": 5.8156461518097695e-06, + "loss": 0.81796417, + "memory(GiB)": 147.13, + "step": 39810, + "train_speed(iter/s)": 0.201454 + }, + { + "acc": 0.78498955, + "epoch": 0.9290307528543723, + "grad_norm": 20.0, + "learning_rate": 5.813782287197569e-06, + "loss": 0.78012381, + "memory(GiB)": 147.13, + "step": 39820, + "train_speed(iter/s)": 0.201481 + }, + { + "acc": 0.76299672, + "epoch": 0.9292640604266612, + "grad_norm": 8.3125, + "learning_rate": 5.8119183064181864e-06, + "loss": 0.84655218, + "memory(GiB)": 147.13, + "step": 39830, + "train_speed(iter/s)": 0.201508 + }, + { + "acc": 0.77199259, + "epoch": 0.9294973679989501, + "grad_norm": 8.125, + "learning_rate": 5.810054209737699e-06, + "loss": 0.807763, + "memory(GiB)": 147.13, + "step": 39840, + "train_speed(iter/s)": 0.201533 + }, + { + "acc": 0.77625284, + "epoch": 0.929730675571239, + "grad_norm": 4.28125, + "learning_rate": 5.8081899974222076e-06, + "loss": 0.8012434, + "memory(GiB)": 147.13, + "step": 39850, + "train_speed(iter/s)": 0.20156 + }, + { + "acc": 0.79133472, + "epoch": 0.9299639831435279, + "grad_norm": 4.84375, + "learning_rate": 5.80632566973783e-06, + "loss": 0.75545387, + "memory(GiB)": 147.13, + "step": 39860, + "train_speed(iter/s)": 0.201585 + }, + { + "acc": 0.78476238, + "epoch": 0.9301972907158168, + "grad_norm": 6.0, + "learning_rate": 5.804461226950697e-06, + "loss": 0.75581503, + "memory(GiB)": 147.13, + "step": 39870, + "train_speed(iter/s)": 0.201611 + }, + { + "acc": 0.75795188, + "epoch": 0.9304305982881057, + "grad_norm": 6.6875, + "learning_rate": 5.80259666932696e-06, + "loss": 0.87952824, + "memory(GiB)": 147.13, + "step": 39880, + "train_speed(iter/s)": 0.201638 + }, + { + "acc": 0.75466013, + "epoch": 0.9306639058603946, + "grad_norm": 5.78125, + "learning_rate": 5.800731997132779e-06, + "loss": 0.8734972, + "memory(GiB)": 147.13, + "step": 39890, + "train_speed(iter/s)": 0.201665 + }, + { + "acc": 0.78315744, + "epoch": 0.9308972134326835, + "grad_norm": 6.375, + "learning_rate": 5.7988672106343395e-06, + "loss": 0.8057559, + "memory(GiB)": 147.13, + "step": 39900, + "train_speed(iter/s)": 0.20169 + }, + { + "acc": 0.76869955, + "epoch": 0.9311305210049724, + "grad_norm": 6.21875, + "learning_rate": 5.797002310097836e-06, + "loss": 0.82583561, + "memory(GiB)": 147.13, + "step": 39910, + "train_speed(iter/s)": 0.201718 + }, + { + "acc": 0.78581076, + "epoch": 0.9313638285772613, + "grad_norm": 5.0625, + "learning_rate": 5.795137295789486e-06, + "loss": 0.78243923, + "memory(GiB)": 147.13, + "step": 39920, + "train_speed(iter/s)": 0.201744 + }, + { + "acc": 0.77420025, + "epoch": 0.9315971361495502, + "grad_norm": 5.4375, + "learning_rate": 5.7932721679755164e-06, + "loss": 0.81065769, + "memory(GiB)": 147.13, + "step": 39930, + "train_speed(iter/s)": 0.20177 + }, + { + "acc": 0.76268339, + "epoch": 0.9318304437218391, + "grad_norm": 4.09375, + "learning_rate": 5.791406926922176e-06, + "loss": 0.88640518, + "memory(GiB)": 147.13, + "step": 39940, + "train_speed(iter/s)": 0.201797 + }, + { + "acc": 0.78374519, + "epoch": 0.9320637512941279, + "grad_norm": 5.59375, + "learning_rate": 5.789541572895727e-06, + "loss": 0.78053789, + "memory(GiB)": 147.13, + "step": 39950, + "train_speed(iter/s)": 0.201824 + }, + { + "acc": 0.79211001, + "epoch": 0.9322970588664168, + "grad_norm": 4.96875, + "learning_rate": 5.787676106162449e-06, + "loss": 0.74461236, + "memory(GiB)": 147.13, + "step": 39960, + "train_speed(iter/s)": 0.20185 + }, + { + "acc": 0.80399532, + "epoch": 0.9325303664387057, + "grad_norm": 5.0625, + "learning_rate": 5.785810526988633e-06, + "loss": 0.70544991, + "memory(GiB)": 147.13, + "step": 39970, + "train_speed(iter/s)": 0.201875 + }, + { + "acc": 0.76267509, + "epoch": 0.9327636740109946, + "grad_norm": 4.125, + "learning_rate": 5.783944835640594e-06, + "loss": 0.82315941, + "memory(GiB)": 147.13, + "step": 39980, + "train_speed(iter/s)": 0.201902 + }, + { + "acc": 0.76198874, + "epoch": 0.9329969815832835, + "grad_norm": 4.4375, + "learning_rate": 5.7820790323846566e-06, + "loss": 0.88518867, + "memory(GiB)": 147.13, + "step": 39990, + "train_speed(iter/s)": 0.201927 + }, + { + "acc": 0.76849947, + "epoch": 0.9332302891555724, + "grad_norm": 7.75, + "learning_rate": 5.780213117487167e-06, + "loss": 0.81474934, + "memory(GiB)": 147.13, + "step": 40000, + "train_speed(iter/s)": 0.201955 + }, + { + "epoch": 0.9332302891555724, + "eval_acc": 0.7435186025942405, + "eval_loss": 0.8081688284873962, + "eval_runtime": 1269.5667, + "eval_samples_per_second": 28.349, + "eval_steps_per_second": 14.175, + "step": 40000 + }, + { + "acc": 0.77274613, + "epoch": 0.9334635967278613, + "grad_norm": 4.8125, + "learning_rate": 5.778347091214479e-06, + "loss": 0.81792908, + "memory(GiB)": 147.13, + "step": 40010, + "train_speed(iter/s)": 0.200676 + }, + { + "acc": 0.77955446, + "epoch": 0.9336969043001502, + "grad_norm": 4.9375, + "learning_rate": 5.77648095383297e-06, + "loss": 0.7724124, + "memory(GiB)": 147.13, + "step": 40020, + "train_speed(iter/s)": 0.200702 + }, + { + "acc": 0.76370978, + "epoch": 0.9339302118724391, + "grad_norm": 11.625, + "learning_rate": 5.774614705609032e-06, + "loss": 0.85831079, + "memory(GiB)": 147.13, + "step": 40030, + "train_speed(iter/s)": 0.200726 + }, + { + "acc": 0.77389021, + "epoch": 0.934163519444728, + "grad_norm": 4.5, + "learning_rate": 5.7727483468090686e-06, + "loss": 0.80834856, + "memory(GiB)": 147.13, + "step": 40040, + "train_speed(iter/s)": 0.200752 + }, + { + "acc": 0.77940512, + "epoch": 0.9343968270170169, + "grad_norm": 4.375, + "learning_rate": 5.770881877699502e-06, + "loss": 0.78713818, + "memory(GiB)": 147.13, + "step": 40050, + "train_speed(iter/s)": 0.200777 + }, + { + "acc": 0.77909374, + "epoch": 0.9346301345893058, + "grad_norm": 7.21875, + "learning_rate": 5.769015298546774e-06, + "loss": 0.79142208, + "memory(GiB)": 147.13, + "step": 40060, + "train_speed(iter/s)": 0.200801 + }, + { + "acc": 0.77855005, + "epoch": 0.9348634421615947, + "grad_norm": 4.84375, + "learning_rate": 5.7671486096173336e-06, + "loss": 0.78886166, + "memory(GiB)": 147.13, + "step": 40070, + "train_speed(iter/s)": 0.200827 + }, + { + "acc": 0.77905307, + "epoch": 0.9350967497338836, + "grad_norm": 5.90625, + "learning_rate": 5.765281811177652e-06, + "loss": 0.81017637, + "memory(GiB)": 147.13, + "step": 40080, + "train_speed(iter/s)": 0.200853 + }, + { + "acc": 0.79088497, + "epoch": 0.9353300573061725, + "grad_norm": 5.5625, + "learning_rate": 5.763414903494216e-06, + "loss": 0.75551167, + "memory(GiB)": 147.13, + "step": 40090, + "train_speed(iter/s)": 0.200879 + }, + { + "acc": 0.75535116, + "epoch": 0.9355633648784614, + "grad_norm": 5.75, + "learning_rate": 5.761547886833523e-06, + "loss": 0.89576149, + "memory(GiB)": 147.13, + "step": 40100, + "train_speed(iter/s)": 0.200905 + }, + { + "acc": 0.80530338, + "epoch": 0.9357966724507503, + "grad_norm": 6.34375, + "learning_rate": 5.759680761462091e-06, + "loss": 0.69478006, + "memory(GiB)": 147.13, + "step": 40110, + "train_speed(iter/s)": 0.20093 + }, + { + "acc": 0.77864857, + "epoch": 0.9360299800230392, + "grad_norm": 4.875, + "learning_rate": 5.757813527646449e-06, + "loss": 0.8089098, + "memory(GiB)": 147.13, + "step": 40120, + "train_speed(iter/s)": 0.200956 + }, + { + "acc": 0.79146218, + "epoch": 0.9362632875953281, + "grad_norm": 4.59375, + "learning_rate": 5.755946185653148e-06, + "loss": 0.74217372, + "memory(GiB)": 147.13, + "step": 40130, + "train_speed(iter/s)": 0.200982 + }, + { + "acc": 0.78336248, + "epoch": 0.936496595167617, + "grad_norm": 15.5625, + "learning_rate": 5.7540787357487485e-06, + "loss": 0.78665681, + "memory(GiB)": 147.13, + "step": 40140, + "train_speed(iter/s)": 0.201007 + }, + { + "acc": 0.78126898, + "epoch": 0.9367299027399058, + "grad_norm": 5.59375, + "learning_rate": 5.752211178199828e-06, + "loss": 0.77836123, + "memory(GiB)": 147.13, + "step": 40150, + "train_speed(iter/s)": 0.201033 + }, + { + "acc": 0.78209891, + "epoch": 0.9369632103121946, + "grad_norm": 4.75, + "learning_rate": 5.7503435132729805e-06, + "loss": 0.76724977, + "memory(GiB)": 147.13, + "step": 40160, + "train_speed(iter/s)": 0.201059 + }, + { + "acc": 0.77962379, + "epoch": 0.9371965178844835, + "grad_norm": 6.0625, + "learning_rate": 5.7484757412348146e-06, + "loss": 0.78311558, + "memory(GiB)": 147.13, + "step": 40170, + "train_speed(iter/s)": 0.201084 + }, + { + "acc": 0.78048811, + "epoch": 0.9374298254567724, + "grad_norm": 7.125, + "learning_rate": 5.746607862351955e-06, + "loss": 0.78884706, + "memory(GiB)": 147.13, + "step": 40180, + "train_speed(iter/s)": 0.201109 + }, + { + "acc": 0.76181178, + "epoch": 0.9376631330290613, + "grad_norm": 5.09375, + "learning_rate": 5.744739876891038e-06, + "loss": 0.85814571, + "memory(GiB)": 147.13, + "step": 40190, + "train_speed(iter/s)": 0.201136 + }, + { + "acc": 0.77918482, + "epoch": 0.9378964406013502, + "grad_norm": 4.6875, + "learning_rate": 5.742871785118721e-06, + "loss": 0.76165657, + "memory(GiB)": 147.13, + "step": 40200, + "train_speed(iter/s)": 0.201162 + }, + { + "acc": 0.8000536, + "epoch": 0.9381297481736391, + "grad_norm": 13.25, + "learning_rate": 5.741003587301673e-06, + "loss": 0.71440544, + "memory(GiB)": 147.13, + "step": 40210, + "train_speed(iter/s)": 0.201187 + }, + { + "acc": 0.77322521, + "epoch": 0.938363055745928, + "grad_norm": 4.90625, + "learning_rate": 5.739135283706576e-06, + "loss": 0.82055111, + "memory(GiB)": 147.13, + "step": 40220, + "train_speed(iter/s)": 0.201214 + }, + { + "acc": 0.77559209, + "epoch": 0.9385963633182169, + "grad_norm": 5.875, + "learning_rate": 5.737266874600134e-06, + "loss": 0.79832792, + "memory(GiB)": 147.13, + "step": 40230, + "train_speed(iter/s)": 0.20124 + }, + { + "acc": 0.76249466, + "epoch": 0.9388296708905058, + "grad_norm": 4.59375, + "learning_rate": 5.735398360249059e-06, + "loss": 0.84215164, + "memory(GiB)": 147.13, + "step": 40240, + "train_speed(iter/s)": 0.201266 + }, + { + "acc": 0.79150157, + "epoch": 0.9390629784627947, + "grad_norm": 6.25, + "learning_rate": 5.733529740920083e-06, + "loss": 0.73628078, + "memory(GiB)": 147.13, + "step": 40250, + "train_speed(iter/s)": 0.201291 + }, + { + "acc": 0.77681289, + "epoch": 0.9392962860350836, + "grad_norm": 6.0625, + "learning_rate": 5.731661016879948e-06, + "loss": 0.79612532, + "memory(GiB)": 147.13, + "step": 40260, + "train_speed(iter/s)": 0.201318 + }, + { + "acc": 0.7825942, + "epoch": 0.9395295936073725, + "grad_norm": 4.9375, + "learning_rate": 5.729792188395415e-06, + "loss": 0.7699985, + "memory(GiB)": 147.13, + "step": 40270, + "train_speed(iter/s)": 0.201342 + }, + { + "acc": 0.76840849, + "epoch": 0.9397629011796614, + "grad_norm": 5.34375, + "learning_rate": 5.7279232557332595e-06, + "loss": 0.82999878, + "memory(GiB)": 147.13, + "step": 40280, + "train_speed(iter/s)": 0.201367 + }, + { + "acc": 0.77449684, + "epoch": 0.9399962087519503, + "grad_norm": 5.34375, + "learning_rate": 5.726054219160273e-06, + "loss": 0.81352596, + "memory(GiB)": 147.13, + "step": 40290, + "train_speed(iter/s)": 0.201392 + }, + { + "acc": 0.77998219, + "epoch": 0.9402295163242392, + "grad_norm": 4.65625, + "learning_rate": 5.7241850789432555e-06, + "loss": 0.79409661, + "memory(GiB)": 147.13, + "step": 40300, + "train_speed(iter/s)": 0.201417 + }, + { + "acc": 0.78303413, + "epoch": 0.9404628238965281, + "grad_norm": 5.8125, + "learning_rate": 5.722315835349029e-06, + "loss": 0.76099334, + "memory(GiB)": 147.13, + "step": 40310, + "train_speed(iter/s)": 0.201444 + }, + { + "acc": 0.78695984, + "epoch": 0.940696131468817, + "grad_norm": 5.59375, + "learning_rate": 5.7204464886444265e-06, + "loss": 0.76036892, + "memory(GiB)": 147.13, + "step": 40320, + "train_speed(iter/s)": 0.201469 + }, + { + "acc": 0.78353043, + "epoch": 0.9409294390411059, + "grad_norm": 5.25, + "learning_rate": 5.718577039096297e-06, + "loss": 0.78870249, + "memory(GiB)": 147.13, + "step": 40330, + "train_speed(iter/s)": 0.201495 + }, + { + "acc": 0.78347793, + "epoch": 0.9411627466133947, + "grad_norm": 5.25, + "learning_rate": 5.7167074869715045e-06, + "loss": 0.76787477, + "memory(GiB)": 147.13, + "step": 40340, + "train_speed(iter/s)": 0.20152 + }, + { + "acc": 0.79285727, + "epoch": 0.9413960541856836, + "grad_norm": 7.1875, + "learning_rate": 5.714837832536926e-06, + "loss": 0.73996181, + "memory(GiB)": 147.13, + "step": 40350, + "train_speed(iter/s)": 0.201547 + }, + { + "acc": 0.77512045, + "epoch": 0.9416293617579725, + "grad_norm": 5.78125, + "learning_rate": 5.712968076059454e-06, + "loss": 0.81619711, + "memory(GiB)": 147.13, + "step": 40360, + "train_speed(iter/s)": 0.201572 + }, + { + "acc": 0.78800678, + "epoch": 0.9418626693302614, + "grad_norm": 5.375, + "learning_rate": 5.711098217805997e-06, + "loss": 0.74943008, + "memory(GiB)": 147.13, + "step": 40370, + "train_speed(iter/s)": 0.201597 + }, + { + "acc": 0.77038002, + "epoch": 0.9420959769025503, + "grad_norm": 5.125, + "learning_rate": 5.709228258043476e-06, + "loss": 0.82832499, + "memory(GiB)": 147.13, + "step": 40380, + "train_speed(iter/s)": 0.201622 + }, + { + "acc": 0.7822566, + "epoch": 0.9423292844748392, + "grad_norm": 4.625, + "learning_rate": 5.707358197038827e-06, + "loss": 0.78541126, + "memory(GiB)": 147.13, + "step": 40390, + "train_speed(iter/s)": 0.201646 + }, + { + "acc": 0.77454891, + "epoch": 0.9425625920471281, + "grad_norm": 5.71875, + "learning_rate": 5.7054880350590015e-06, + "loss": 0.83365231, + "memory(GiB)": 147.13, + "step": 40400, + "train_speed(iter/s)": 0.201673 + }, + { + "acc": 0.77726865, + "epoch": 0.942795899619417, + "grad_norm": 4.96875, + "learning_rate": 5.703617772370963e-06, + "loss": 0.79805484, + "memory(GiB)": 147.13, + "step": 40410, + "train_speed(iter/s)": 0.201698 + }, + { + "acc": 0.7605464, + "epoch": 0.9430292071917059, + "grad_norm": 5.3125, + "learning_rate": 5.701747409241691e-06, + "loss": 0.85971928, + "memory(GiB)": 147.13, + "step": 40420, + "train_speed(iter/s)": 0.201723 + }, + { + "acc": 0.79386868, + "epoch": 0.9432625147639948, + "grad_norm": 6.03125, + "learning_rate": 5.699876945938182e-06, + "loss": 0.74754467, + "memory(GiB)": 147.13, + "step": 40430, + "train_speed(iter/s)": 0.201748 + }, + { + "acc": 0.7801425, + "epoch": 0.9434958223362837, + "grad_norm": 4.90625, + "learning_rate": 5.698006382727441e-06, + "loss": 0.7978281, + "memory(GiB)": 147.13, + "step": 40440, + "train_speed(iter/s)": 0.201775 + }, + { + "acc": 0.74835367, + "epoch": 0.9437291299085726, + "grad_norm": 8.5, + "learning_rate": 5.696135719876492e-06, + "loss": 0.92766171, + "memory(GiB)": 147.13, + "step": 40450, + "train_speed(iter/s)": 0.201803 + }, + { + "acc": 0.78347101, + "epoch": 0.9439624374808615, + "grad_norm": 5.0, + "learning_rate": 5.694264957652373e-06, + "loss": 0.75946207, + "memory(GiB)": 147.13, + "step": 40460, + "train_speed(iter/s)": 0.201827 + }, + { + "acc": 0.78046608, + "epoch": 0.9441957450531504, + "grad_norm": 5.59375, + "learning_rate": 5.692394096322131e-06, + "loss": 0.81162415, + "memory(GiB)": 147.13, + "step": 40470, + "train_speed(iter/s)": 0.201854 + }, + { + "acc": 0.77506027, + "epoch": 0.9444290526254393, + "grad_norm": 5.09375, + "learning_rate": 5.690523136152834e-06, + "loss": 0.83319483, + "memory(GiB)": 147.13, + "step": 40480, + "train_speed(iter/s)": 0.201879 + }, + { + "acc": 0.77044592, + "epoch": 0.9446623601977282, + "grad_norm": 5.75, + "learning_rate": 5.688652077411558e-06, + "loss": 0.82162943, + "memory(GiB)": 147.13, + "step": 40490, + "train_speed(iter/s)": 0.201904 + }, + { + "acc": 0.78238449, + "epoch": 0.9448956677700171, + "grad_norm": 6.28125, + "learning_rate": 5.6867809203654004e-06, + "loss": 0.78513522, + "memory(GiB)": 147.13, + "step": 40500, + "train_speed(iter/s)": 0.201929 + }, + { + "epoch": 0.9448956677700171, + "eval_acc": 0.7436441556723647, + "eval_loss": 0.8078497052192688, + "eval_runtime": 1270.4605, + "eval_samples_per_second": 28.329, + "eval_steps_per_second": 14.165, + "step": 40500 + }, + { + "acc": 0.77993817, + "epoch": 0.945128975342306, + "grad_norm": 6.125, + "learning_rate": 5.684909665281465e-06, + "loss": 0.7786458, + "memory(GiB)": 147.13, + "step": 40510, + "train_speed(iter/s)": 0.200667 + }, + { + "acc": 0.7921401, + "epoch": 0.9453622829145949, + "grad_norm": 5.0, + "learning_rate": 5.683038312426873e-06, + "loss": 0.73767538, + "memory(GiB)": 147.13, + "step": 40520, + "train_speed(iter/s)": 0.200693 + }, + { + "acc": 0.77997236, + "epoch": 0.9455955904868837, + "grad_norm": 4.9375, + "learning_rate": 5.681166862068761e-06, + "loss": 0.80369387, + "memory(GiB)": 147.13, + "step": 40530, + "train_speed(iter/s)": 0.200719 + }, + { + "acc": 0.75641575, + "epoch": 0.9458288980591726, + "grad_norm": 4.34375, + "learning_rate": 5.679295314474278e-06, + "loss": 0.89748631, + "memory(GiB)": 147.13, + "step": 40540, + "train_speed(iter/s)": 0.200743 + }, + { + "acc": 0.77325706, + "epoch": 0.9460622056314615, + "grad_norm": 4.125, + "learning_rate": 5.677423669910584e-06, + "loss": 0.82863417, + "memory(GiB)": 147.13, + "step": 40550, + "train_speed(iter/s)": 0.20077 + }, + { + "acc": 0.7895503, + "epoch": 0.9462955132037504, + "grad_norm": 5.53125, + "learning_rate": 5.67555192864486e-06, + "loss": 0.74407339, + "memory(GiB)": 147.13, + "step": 40560, + "train_speed(iter/s)": 0.200795 + }, + { + "acc": 0.77496853, + "epoch": 0.9465288207760393, + "grad_norm": 11.0, + "learning_rate": 5.673680090944294e-06, + "loss": 0.81249332, + "memory(GiB)": 147.13, + "step": 40570, + "train_speed(iter/s)": 0.200821 + }, + { + "acc": 0.79530725, + "epoch": 0.9467621283483282, + "grad_norm": 9.1875, + "learning_rate": 5.671808157076091e-06, + "loss": 0.72718515, + "memory(GiB)": 147.13, + "step": 40580, + "train_speed(iter/s)": 0.200843 + }, + { + "acc": 0.76706448, + "epoch": 0.9469954359206171, + "grad_norm": 4.0, + "learning_rate": 5.669936127307468e-06, + "loss": 0.85577278, + "memory(GiB)": 147.13, + "step": 40590, + "train_speed(iter/s)": 0.200868 + }, + { + "acc": 0.77201781, + "epoch": 0.947228743492906, + "grad_norm": 6.84375, + "learning_rate": 5.668064001905658e-06, + "loss": 0.82298203, + "memory(GiB)": 147.13, + "step": 40600, + "train_speed(iter/s)": 0.200893 + }, + { + "acc": 0.76372414, + "epoch": 0.9474620510651949, + "grad_norm": 5.40625, + "learning_rate": 5.666191781137905e-06, + "loss": 0.85342846, + "memory(GiB)": 147.13, + "step": 40610, + "train_speed(iter/s)": 0.200919 + }, + { + "acc": 0.77155447, + "epoch": 0.9476953586374838, + "grad_norm": 6.09375, + "learning_rate": 5.66431946527147e-06, + "loss": 0.80941982, + "memory(GiB)": 147.13, + "step": 40620, + "train_speed(iter/s)": 0.200945 + }, + { + "acc": 0.78885984, + "epoch": 0.9479286662097727, + "grad_norm": 4.8125, + "learning_rate": 5.662447054573624e-06, + "loss": 0.76518712, + "memory(GiB)": 147.13, + "step": 40630, + "train_speed(iter/s)": 0.20097 + }, + { + "acc": 0.7786191, + "epoch": 0.9481619737820616, + "grad_norm": 4.3125, + "learning_rate": 5.660574549311653e-06, + "loss": 0.80177717, + "memory(GiB)": 147.13, + "step": 40640, + "train_speed(iter/s)": 0.200994 + }, + { + "acc": 0.77824173, + "epoch": 0.9483952813543505, + "grad_norm": 5.71875, + "learning_rate": 5.658701949752856e-06, + "loss": 0.79966569, + "memory(GiB)": 147.13, + "step": 40650, + "train_speed(iter/s)": 0.201019 + }, + { + "acc": 0.77022877, + "epoch": 0.9486285889266394, + "grad_norm": 5.09375, + "learning_rate": 5.656829256164549e-06, + "loss": 0.85357208, + "memory(GiB)": 147.13, + "step": 40660, + "train_speed(iter/s)": 0.201045 + }, + { + "acc": 0.78125162, + "epoch": 0.9488618964989283, + "grad_norm": 5.28125, + "learning_rate": 5.6549564688140555e-06, + "loss": 0.76331964, + "memory(GiB)": 147.13, + "step": 40670, + "train_speed(iter/s)": 0.201072 + }, + { + "acc": 0.79808364, + "epoch": 0.9490952040712172, + "grad_norm": 4.8125, + "learning_rate": 5.653083587968716e-06, + "loss": 0.71971464, + "memory(GiB)": 147.13, + "step": 40680, + "train_speed(iter/s)": 0.201099 + }, + { + "acc": 0.77893338, + "epoch": 0.9493285116435061, + "grad_norm": 6.21875, + "learning_rate": 5.651210613895885e-06, + "loss": 0.78710918, + "memory(GiB)": 147.13, + "step": 40690, + "train_speed(iter/s)": 0.201125 + }, + { + "acc": 0.77154942, + "epoch": 0.949561819215795, + "grad_norm": 7.03125, + "learning_rate": 5.649337546862927e-06, + "loss": 0.81971178, + "memory(GiB)": 147.13, + "step": 40700, + "train_speed(iter/s)": 0.201151 + }, + { + "acc": 0.7884603, + "epoch": 0.9497951267880839, + "grad_norm": 4.5, + "learning_rate": 5.647464387137224e-06, + "loss": 0.75167475, + "memory(GiB)": 147.13, + "step": 40710, + "train_speed(iter/s)": 0.201176 + }, + { + "acc": 0.77500906, + "epoch": 0.9500284343603727, + "grad_norm": 6.5, + "learning_rate": 5.645591134986166e-06, + "loss": 0.81845722, + "memory(GiB)": 147.13, + "step": 40720, + "train_speed(iter/s)": 0.201201 + }, + { + "acc": 0.78605471, + "epoch": 0.9502617419326616, + "grad_norm": 5.09375, + "learning_rate": 5.643717790677162e-06, + "loss": 0.7811121, + "memory(GiB)": 147.13, + "step": 40730, + "train_speed(iter/s)": 0.201226 + }, + { + "acc": 0.77256351, + "epoch": 0.9504950495049505, + "grad_norm": 5.46875, + "learning_rate": 5.641844354477631e-06, + "loss": 0.84232235, + "memory(GiB)": 147.13, + "step": 40740, + "train_speed(iter/s)": 0.201251 + }, + { + "acc": 0.76221151, + "epoch": 0.9507283570772393, + "grad_norm": 6.34375, + "learning_rate": 5.639970826655005e-06, + "loss": 0.85408916, + "memory(GiB)": 147.13, + "step": 40750, + "train_speed(iter/s)": 0.201277 + }, + { + "acc": 0.78343468, + "epoch": 0.9509616646495282, + "grad_norm": 6.96875, + "learning_rate": 5.63809720747673e-06, + "loss": 0.76852407, + "memory(GiB)": 147.13, + "step": 40760, + "train_speed(iter/s)": 0.201302 + }, + { + "acc": 0.78977461, + "epoch": 0.9511949722218171, + "grad_norm": 7.5, + "learning_rate": 5.636223497210261e-06, + "loss": 0.74346333, + "memory(GiB)": 147.13, + "step": 40770, + "train_speed(iter/s)": 0.201328 + }, + { + "acc": 0.7746685, + "epoch": 0.951428279794106, + "grad_norm": 7.0625, + "learning_rate": 5.634349696123075e-06, + "loss": 0.81890717, + "memory(GiB)": 147.13, + "step": 40780, + "train_speed(iter/s)": 0.201354 + }, + { + "acc": 0.77995763, + "epoch": 0.951661587366395, + "grad_norm": 15.1875, + "learning_rate": 5.6324758044826535e-06, + "loss": 0.79575968, + "memory(GiB)": 147.13, + "step": 40790, + "train_speed(iter/s)": 0.201378 + }, + { + "acc": 0.78342981, + "epoch": 0.9518948949386838, + "grad_norm": 4.0, + "learning_rate": 5.6306018225564955e-06, + "loss": 0.79688606, + "memory(GiB)": 147.13, + "step": 40800, + "train_speed(iter/s)": 0.201404 + }, + { + "acc": 0.77644749, + "epoch": 0.9521282025109727, + "grad_norm": 7.59375, + "learning_rate": 5.6287277506121084e-06, + "loss": 0.80725603, + "memory(GiB)": 147.13, + "step": 40810, + "train_speed(iter/s)": 0.20143 + }, + { + "acc": 0.78804812, + "epoch": 0.9523615100832616, + "grad_norm": 5.0, + "learning_rate": 5.626853588917021e-06, + "loss": 0.7547946, + "memory(GiB)": 147.13, + "step": 40820, + "train_speed(iter/s)": 0.201454 + }, + { + "acc": 0.76683331, + "epoch": 0.9525948176555505, + "grad_norm": 4.78125, + "learning_rate": 5.624979337738763e-06, + "loss": 0.8316308, + "memory(GiB)": 147.13, + "step": 40830, + "train_speed(iter/s)": 0.20148 + }, + { + "acc": 0.77537379, + "epoch": 0.9528281252278394, + "grad_norm": 4.9375, + "learning_rate": 5.623104997344886e-06, + "loss": 0.80050964, + "memory(GiB)": 147.13, + "step": 40840, + "train_speed(iter/s)": 0.201507 + }, + { + "acc": 0.77048178, + "epoch": 0.9530614328001283, + "grad_norm": 7.84375, + "learning_rate": 5.621230568002952e-06, + "loss": 0.84840851, + "memory(GiB)": 147.13, + "step": 40850, + "train_speed(iter/s)": 0.201531 + }, + { + "acc": 0.77276115, + "epoch": 0.9532947403724172, + "grad_norm": 6.125, + "learning_rate": 5.619356049980536e-06, + "loss": 0.84306707, + "memory(GiB)": 147.13, + "step": 40860, + "train_speed(iter/s)": 0.201556 + }, + { + "acc": 0.75870538, + "epoch": 0.9535280479447061, + "grad_norm": 5.21875, + "learning_rate": 5.617481443545223e-06, + "loss": 0.88285389, + "memory(GiB)": 147.13, + "step": 40870, + "train_speed(iter/s)": 0.201582 + }, + { + "acc": 0.78115153, + "epoch": 0.953761355516995, + "grad_norm": 5.3125, + "learning_rate": 5.615606748964613e-06, + "loss": 0.77917652, + "memory(GiB)": 147.13, + "step": 40880, + "train_speed(iter/s)": 0.201608 + }, + { + "acc": 0.77769594, + "epoch": 0.9539946630892839, + "grad_norm": 7.40625, + "learning_rate": 5.613731966506321e-06, + "loss": 0.78834753, + "memory(GiB)": 147.13, + "step": 40890, + "train_speed(iter/s)": 0.201632 + }, + { + "acc": 0.78029628, + "epoch": 0.9542279706615728, + "grad_norm": 7.28125, + "learning_rate": 5.611857096437966e-06, + "loss": 0.80404196, + "memory(GiB)": 147.13, + "step": 40900, + "train_speed(iter/s)": 0.201658 + }, + { + "acc": 0.78504705, + "epoch": 0.9544612782338617, + "grad_norm": 5.34375, + "learning_rate": 5.60998213902719e-06, + "loss": 0.78944445, + "memory(GiB)": 147.13, + "step": 40910, + "train_speed(iter/s)": 0.201684 + }, + { + "acc": 0.78178701, + "epoch": 0.9546945858061505, + "grad_norm": 6.03125, + "learning_rate": 5.60810709454164e-06, + "loss": 0.791785, + "memory(GiB)": 147.13, + "step": 40920, + "train_speed(iter/s)": 0.20171 + }, + { + "acc": 0.77981234, + "epoch": 0.9549278933784394, + "grad_norm": 6.3125, + "learning_rate": 5.606231963248978e-06, + "loss": 0.80088806, + "memory(GiB)": 147.13, + "step": 40930, + "train_speed(iter/s)": 0.201736 + }, + { + "acc": 0.79295478, + "epoch": 0.9551612009507283, + "grad_norm": 3.71875, + "learning_rate": 5.60435674541688e-06, + "loss": 0.7574069, + "memory(GiB)": 147.13, + "step": 40940, + "train_speed(iter/s)": 0.20176 + }, + { + "acc": 0.77834711, + "epoch": 0.9553945085230172, + "grad_norm": 6.34375, + "learning_rate": 5.602481441313032e-06, + "loss": 0.78354673, + "memory(GiB)": 147.13, + "step": 40950, + "train_speed(iter/s)": 0.201786 + }, + { + "acc": 0.79002304, + "epoch": 0.9556278160953061, + "grad_norm": 4.09375, + "learning_rate": 5.6006060512051355e-06, + "loss": 0.75721602, + "memory(GiB)": 147.13, + "step": 40960, + "train_speed(iter/s)": 0.20181 + }, + { + "acc": 0.78958197, + "epoch": 0.955861123667595, + "grad_norm": 6.34375, + "learning_rate": 5.598730575360898e-06, + "loss": 0.77207351, + "memory(GiB)": 147.13, + "step": 40970, + "train_speed(iter/s)": 0.201836 + }, + { + "acc": 0.78223519, + "epoch": 0.9560944312398839, + "grad_norm": 8.1875, + "learning_rate": 5.596855014048045e-06, + "loss": 0.77432485, + "memory(GiB)": 147.13, + "step": 40980, + "train_speed(iter/s)": 0.201861 + }, + { + "acc": 0.76258755, + "epoch": 0.9563277388121728, + "grad_norm": 5.375, + "learning_rate": 5.594979367534311e-06, + "loss": 0.84754496, + "memory(GiB)": 147.13, + "step": 40990, + "train_speed(iter/s)": 0.201888 + }, + { + "acc": 0.79521761, + "epoch": 0.9565610463844617, + "grad_norm": 4.3125, + "learning_rate": 5.593103636087446e-06, + "loss": 0.74911346, + "memory(GiB)": 147.13, + "step": 41000, + "train_speed(iter/s)": 0.201913 + }, + { + "epoch": 0.9565610463844617, + "eval_acc": 0.7435477488445194, + "eval_loss": 0.8075899481773376, + "eval_runtime": 1270.0956, + "eval_samples_per_second": 28.337, + "eval_steps_per_second": 14.169, + "step": 41000 + }, + { + "acc": 0.78175507, + "epoch": 0.9567943539567506, + "grad_norm": 5.21875, + "learning_rate": 5.591227819975209e-06, + "loss": 0.77779932, + "memory(GiB)": 147.13, + "step": 41010, + "train_speed(iter/s)": 0.200667 + }, + { + "acc": 0.77938576, + "epoch": 0.9570276615290395, + "grad_norm": 5.59375, + "learning_rate": 5.589351919465373e-06, + "loss": 0.80347643, + "memory(GiB)": 147.13, + "step": 41020, + "train_speed(iter/s)": 0.200691 + }, + { + "acc": 0.77089553, + "epoch": 0.9572609691013284, + "grad_norm": 5.03125, + "learning_rate": 5.587475934825721e-06, + "loss": 0.84662561, + "memory(GiB)": 147.13, + "step": 41030, + "train_speed(iter/s)": 0.200716 + }, + { + "acc": 0.77880993, + "epoch": 0.9574942766736173, + "grad_norm": 5.3125, + "learning_rate": 5.585599866324052e-06, + "loss": 0.77378798, + "memory(GiB)": 147.13, + "step": 41040, + "train_speed(iter/s)": 0.200739 + }, + { + "acc": 0.77963066, + "epoch": 0.9577275842459062, + "grad_norm": 5.46875, + "learning_rate": 5.583723714228169e-06, + "loss": 0.78790627, + "memory(GiB)": 147.13, + "step": 41050, + "train_speed(iter/s)": 0.200765 + }, + { + "acc": 0.78368587, + "epoch": 0.9579608918181951, + "grad_norm": 5.71875, + "learning_rate": 5.581847478805898e-06, + "loss": 0.78440642, + "memory(GiB)": 147.13, + "step": 41060, + "train_speed(iter/s)": 0.20079 + }, + { + "acc": 0.79512386, + "epoch": 0.958194199390484, + "grad_norm": 5.15625, + "learning_rate": 5.579971160325066e-06, + "loss": 0.73103809, + "memory(GiB)": 147.13, + "step": 41070, + "train_speed(iter/s)": 0.200817 + }, + { + "acc": 0.79529629, + "epoch": 0.9584275069627729, + "grad_norm": 5.90625, + "learning_rate": 5.578094759053521e-06, + "loss": 0.73167953, + "memory(GiB)": 147.13, + "step": 41080, + "train_speed(iter/s)": 0.200842 + }, + { + "acc": 0.77233238, + "epoch": 0.9586608145350618, + "grad_norm": 5.65625, + "learning_rate": 5.576218275259116e-06, + "loss": 0.83091927, + "memory(GiB)": 147.13, + "step": 41090, + "train_speed(iter/s)": 0.200868 + }, + { + "acc": 0.76945944, + "epoch": 0.9588941221073507, + "grad_norm": 7.71875, + "learning_rate": 5.574341709209721e-06, + "loss": 0.83968544, + "memory(GiB)": 147.13, + "step": 41100, + "train_speed(iter/s)": 0.200893 + }, + { + "acc": 0.78422494, + "epoch": 0.9591274296796395, + "grad_norm": 5.71875, + "learning_rate": 5.572465061173215e-06, + "loss": 0.76671977, + "memory(GiB)": 147.13, + "step": 41110, + "train_speed(iter/s)": 0.200919 + }, + { + "acc": 0.76734838, + "epoch": 0.9593607372519284, + "grad_norm": 4.75, + "learning_rate": 5.5705883314174845e-06, + "loss": 0.83322086, + "memory(GiB)": 147.13, + "step": 41120, + "train_speed(iter/s)": 0.200946 + }, + { + "acc": 0.77143869, + "epoch": 0.9595940448242173, + "grad_norm": 4.59375, + "learning_rate": 5.568711520210437e-06, + "loss": 0.83908682, + "memory(GiB)": 147.13, + "step": 41130, + "train_speed(iter/s)": 0.200972 + }, + { + "acc": 0.77202053, + "epoch": 0.9598273523965062, + "grad_norm": 5.34375, + "learning_rate": 5.566834627819986e-06, + "loss": 0.8184268, + "memory(GiB)": 147.13, + "step": 41140, + "train_speed(iter/s)": 0.200998 + }, + { + "acc": 0.77577133, + "epoch": 0.9600606599687951, + "grad_norm": 4.875, + "learning_rate": 5.564957654514055e-06, + "loss": 0.80979881, + "memory(GiB)": 147.13, + "step": 41150, + "train_speed(iter/s)": 0.201022 + }, + { + "acc": 0.77774215, + "epoch": 0.960293967541084, + "grad_norm": 3.46875, + "learning_rate": 5.563080600560584e-06, + "loss": 0.80543184, + "memory(GiB)": 147.13, + "step": 41160, + "train_speed(iter/s)": 0.201048 + }, + { + "acc": 0.77905116, + "epoch": 0.9605272751133729, + "grad_norm": 5.46875, + "learning_rate": 5.5612034662275205e-06, + "loss": 0.78646588, + "memory(GiB)": 147.13, + "step": 41170, + "train_speed(iter/s)": 0.201072 + }, + { + "acc": 0.77790079, + "epoch": 0.9607605826856618, + "grad_norm": 4.375, + "learning_rate": 5.559326251782825e-06, + "loss": 0.78109341, + "memory(GiB)": 147.13, + "step": 41180, + "train_speed(iter/s)": 0.201096 + }, + { + "acc": 0.7724268, + "epoch": 0.9609938902579507, + "grad_norm": 5.03125, + "learning_rate": 5.55744895749447e-06, + "loss": 0.82191772, + "memory(GiB)": 147.13, + "step": 41190, + "train_speed(iter/s)": 0.201123 + }, + { + "acc": 0.78639202, + "epoch": 0.9612271978302396, + "grad_norm": 4.53125, + "learning_rate": 5.555571583630439e-06, + "loss": 0.75079656, + "memory(GiB)": 147.13, + "step": 41200, + "train_speed(iter/s)": 0.201149 + }, + { + "acc": 0.78967714, + "epoch": 0.9614605054025285, + "grad_norm": 4.78125, + "learning_rate": 5.553694130458725e-06, + "loss": 0.77342958, + "memory(GiB)": 147.13, + "step": 41210, + "train_speed(iter/s)": 0.201173 + }, + { + "acc": 0.77171679, + "epoch": 0.9616938129748174, + "grad_norm": 6.59375, + "learning_rate": 5.551816598247334e-06, + "loss": 0.81496878, + "memory(GiB)": 147.13, + "step": 41220, + "train_speed(iter/s)": 0.201198 + }, + { + "acc": 0.79381685, + "epoch": 0.9619271205471063, + "grad_norm": 4.84375, + "learning_rate": 5.549938987264284e-06, + "loss": 0.73036442, + "memory(GiB)": 147.13, + "step": 41230, + "train_speed(iter/s)": 0.201223 + }, + { + "acc": 0.79161181, + "epoch": 0.9621604281193952, + "grad_norm": 4.71875, + "learning_rate": 5.548061297777604e-06, + "loss": 0.73119068, + "memory(GiB)": 147.13, + "step": 41240, + "train_speed(iter/s)": 0.201249 + }, + { + "acc": 0.8105217, + "epoch": 0.9623937356916841, + "grad_norm": 5.0, + "learning_rate": 5.546183530055334e-06, + "loss": 0.68218474, + "memory(GiB)": 147.13, + "step": 41250, + "train_speed(iter/s)": 0.201274 + }, + { + "acc": 0.75500202, + "epoch": 0.962627043263973, + "grad_norm": 7.15625, + "learning_rate": 5.544305684365522e-06, + "loss": 0.89543095, + "memory(GiB)": 147.13, + "step": 41260, + "train_speed(iter/s)": 0.2013 + }, + { + "acc": 0.77084942, + "epoch": 0.9628603508362619, + "grad_norm": 6.6875, + "learning_rate": 5.542427760976232e-06, + "loss": 0.82826271, + "memory(GiB)": 147.13, + "step": 41270, + "train_speed(iter/s)": 0.201325 + }, + { + "acc": 0.76677494, + "epoch": 0.9630936584085508, + "grad_norm": 8.4375, + "learning_rate": 5.540549760155537e-06, + "loss": 0.86246529, + "memory(GiB)": 147.13, + "step": 41280, + "train_speed(iter/s)": 0.201348 + }, + { + "acc": 0.79330645, + "epoch": 0.9633269659808397, + "grad_norm": 4.40625, + "learning_rate": 5.53867168217152e-06, + "loss": 0.74018192, + "memory(GiB)": 147.13, + "step": 41290, + "train_speed(iter/s)": 0.201371 + }, + { + "acc": 0.77103343, + "epoch": 0.9635602735531285, + "grad_norm": 5.84375, + "learning_rate": 5.536793527292278e-06, + "loss": 0.83305159, + "memory(GiB)": 147.13, + "step": 41300, + "train_speed(iter/s)": 0.201395 + }, + { + "acc": 0.76550531, + "epoch": 0.9637935811254174, + "grad_norm": 5.90625, + "learning_rate": 5.5349152957859155e-06, + "loss": 0.83599548, + "memory(GiB)": 147.13, + "step": 41310, + "train_speed(iter/s)": 0.201419 + }, + { + "acc": 0.77552824, + "epoch": 0.9640268886977063, + "grad_norm": 4.03125, + "learning_rate": 5.53303698792055e-06, + "loss": 0.82170572, + "memory(GiB)": 147.13, + "step": 41320, + "train_speed(iter/s)": 0.201445 + }, + { + "acc": 0.7870533, + "epoch": 0.9642601962699952, + "grad_norm": 8.375, + "learning_rate": 5.531158603964309e-06, + "loss": 0.75256863, + "memory(GiB)": 147.13, + "step": 41330, + "train_speed(iter/s)": 0.20147 + }, + { + "acc": 0.77868299, + "epoch": 0.964493503842284, + "grad_norm": 5.25, + "learning_rate": 5.529280144185331e-06, + "loss": 0.79275427, + "memory(GiB)": 147.13, + "step": 41340, + "train_speed(iter/s)": 0.201495 + }, + { + "acc": 0.77070007, + "epoch": 0.964726811414573, + "grad_norm": 4.53125, + "learning_rate": 5.5274016088517676e-06, + "loss": 0.82578239, + "memory(GiB)": 147.13, + "step": 41350, + "train_speed(iter/s)": 0.201519 + }, + { + "acc": 0.78871584, + "epoch": 0.9649601189868618, + "grad_norm": 5.125, + "learning_rate": 5.525522998231777e-06, + "loss": 0.76853447, + "memory(GiB)": 147.13, + "step": 41360, + "train_speed(iter/s)": 0.201544 + }, + { + "acc": 0.78402672, + "epoch": 0.9651934265591507, + "grad_norm": 4.65625, + "learning_rate": 5.523644312593533e-06, + "loss": 0.76367111, + "memory(GiB)": 147.13, + "step": 41370, + "train_speed(iter/s)": 0.20157 + }, + { + "acc": 0.79334326, + "epoch": 0.9654267341314396, + "grad_norm": 5.09375, + "learning_rate": 5.521765552205213e-06, + "loss": 0.734587, + "memory(GiB)": 147.13, + "step": 41380, + "train_speed(iter/s)": 0.201596 + }, + { + "acc": 0.7742136, + "epoch": 0.9656600417037285, + "grad_norm": 4.78125, + "learning_rate": 5.519886717335012e-06, + "loss": 0.82144842, + "memory(GiB)": 147.13, + "step": 41390, + "train_speed(iter/s)": 0.201618 + }, + { + "acc": 0.75570812, + "epoch": 0.9658933492760174, + "grad_norm": 5.75, + "learning_rate": 5.518007808251135e-06, + "loss": 0.89175406, + "memory(GiB)": 147.13, + "step": 41400, + "train_speed(iter/s)": 0.201643 + }, + { + "acc": 0.7830318, + "epoch": 0.9661266568483063, + "grad_norm": 5.34375, + "learning_rate": 5.516128825221792e-06, + "loss": 0.77546549, + "memory(GiB)": 147.13, + "step": 41410, + "train_speed(iter/s)": 0.201668 + }, + { + "acc": 0.77321138, + "epoch": 0.9663599644205952, + "grad_norm": 4.625, + "learning_rate": 5.514249768515209e-06, + "loss": 0.80514297, + "memory(GiB)": 147.13, + "step": 41420, + "train_speed(iter/s)": 0.201691 + }, + { + "acc": 0.78639116, + "epoch": 0.9665932719928841, + "grad_norm": 6.0625, + "learning_rate": 5.512370638399622e-06, + "loss": 0.77311621, + "memory(GiB)": 147.13, + "step": 41430, + "train_speed(iter/s)": 0.201717 + }, + { + "acc": 0.78260765, + "epoch": 0.966826579565173, + "grad_norm": 5.40625, + "learning_rate": 5.510491435143275e-06, + "loss": 0.76550779, + "memory(GiB)": 147.13, + "step": 41440, + "train_speed(iter/s)": 0.201741 + }, + { + "acc": 0.79104238, + "epoch": 0.9670598871374619, + "grad_norm": 7.21875, + "learning_rate": 5.508612159014424e-06, + "loss": 0.7574954, + "memory(GiB)": 147.13, + "step": 41450, + "train_speed(iter/s)": 0.201766 + }, + { + "acc": 0.77882171, + "epoch": 0.9672931947097508, + "grad_norm": 5.34375, + "learning_rate": 5.506732810281335e-06, + "loss": 0.80786209, + "memory(GiB)": 147.13, + "step": 41460, + "train_speed(iter/s)": 0.201792 + }, + { + "acc": 0.80258312, + "epoch": 0.9675265022820397, + "grad_norm": 3.875, + "learning_rate": 5.504853389212285e-06, + "loss": 0.72713561, + "memory(GiB)": 147.13, + "step": 41470, + "train_speed(iter/s)": 0.201817 + }, + { + "acc": 0.79807262, + "epoch": 0.9677598098543286, + "grad_norm": 5.125, + "learning_rate": 5.502973896075559e-06, + "loss": 0.72810946, + "memory(GiB)": 147.13, + "step": 41480, + "train_speed(iter/s)": 0.201842 + }, + { + "acc": 0.77658319, + "epoch": 0.9679931174266175, + "grad_norm": 6.90625, + "learning_rate": 5.501094331139457e-06, + "loss": 0.79982195, + "memory(GiB)": 147.13, + "step": 41490, + "train_speed(iter/s)": 0.201866 + }, + { + "acc": 0.75606503, + "epoch": 0.9682264249989063, + "grad_norm": 5.1875, + "learning_rate": 5.499214694672283e-06, + "loss": 0.88801994, + "memory(GiB)": 147.13, + "step": 41500, + "train_speed(iter/s)": 0.201891 + }, + { + "epoch": 0.9682264249989063, + "eval_acc": 0.7436978039901857, + "eval_loss": 0.8076462745666504, + "eval_runtime": 1271.1744, + "eval_samples_per_second": 28.313, + "eval_steps_per_second": 14.157, + "step": 41500 + }, + { + "acc": 0.78003526, + "epoch": 0.9684597325711952, + "grad_norm": 5.21875, + "learning_rate": 5.497334986942358e-06, + "loss": 0.77953706, + "memory(GiB)": 147.13, + "step": 41510, + "train_speed(iter/s)": 0.20066 + }, + { + "acc": 0.78291988, + "epoch": 0.9686930401434841, + "grad_norm": 5.1875, + "learning_rate": 5.495455208218008e-06, + "loss": 0.7734293, + "memory(GiB)": 147.13, + "step": 41520, + "train_speed(iter/s)": 0.200685 + }, + { + "acc": 0.76620455, + "epoch": 0.968926347715773, + "grad_norm": 5.96875, + "learning_rate": 5.493575358767571e-06, + "loss": 0.82494164, + "memory(GiB)": 147.13, + "step": 41530, + "train_speed(iter/s)": 0.200708 + }, + { + "acc": 0.77570481, + "epoch": 0.9691596552880619, + "grad_norm": 6.125, + "learning_rate": 5.491695438859394e-06, + "loss": 0.79977193, + "memory(GiB)": 147.13, + "step": 41540, + "train_speed(iter/s)": 0.200734 + }, + { + "acc": 0.79774528, + "epoch": 0.9693929628603508, + "grad_norm": 4.875, + "learning_rate": 5.489815448761837e-06, + "loss": 0.72081814, + "memory(GiB)": 147.13, + "step": 41550, + "train_speed(iter/s)": 0.200759 + }, + { + "acc": 0.78871336, + "epoch": 0.9696262704326397, + "grad_norm": 6.8125, + "learning_rate": 5.487935388743266e-06, + "loss": 0.74919105, + "memory(GiB)": 147.13, + "step": 41560, + "train_speed(iter/s)": 0.200783 + }, + { + "acc": 0.76116972, + "epoch": 0.9698595780049286, + "grad_norm": 7.3125, + "learning_rate": 5.486055259072059e-06, + "loss": 0.86874237, + "memory(GiB)": 147.13, + "step": 41570, + "train_speed(iter/s)": 0.200808 + }, + { + "acc": 0.77257109, + "epoch": 0.9700928855772175, + "grad_norm": 5.21875, + "learning_rate": 5.484175060016607e-06, + "loss": 0.820648, + "memory(GiB)": 147.13, + "step": 41580, + "train_speed(iter/s)": 0.200834 + }, + { + "acc": 0.78412466, + "epoch": 0.9703261931495064, + "grad_norm": 5.78125, + "learning_rate": 5.482294791845305e-06, + "loss": 0.78075528, + "memory(GiB)": 147.13, + "step": 41590, + "train_speed(iter/s)": 0.20086 + }, + { + "acc": 0.79700346, + "epoch": 0.9705595007217953, + "grad_norm": 5.15625, + "learning_rate": 5.480414454826563e-06, + "loss": 0.7259798, + "memory(GiB)": 147.13, + "step": 41600, + "train_speed(iter/s)": 0.200882 + }, + { + "acc": 0.7838769, + "epoch": 0.9707928082940842, + "grad_norm": 4.65625, + "learning_rate": 5.478534049228794e-06, + "loss": 0.78670282, + "memory(GiB)": 147.13, + "step": 41610, + "train_speed(iter/s)": 0.200909 + }, + { + "acc": 0.77976408, + "epoch": 0.9710261158663731, + "grad_norm": 6.71875, + "learning_rate": 5.476653575320432e-06, + "loss": 0.79632206, + "memory(GiB)": 147.13, + "step": 41620, + "train_speed(iter/s)": 0.200935 + }, + { + "acc": 0.78164692, + "epoch": 0.971259423438662, + "grad_norm": 6.96875, + "learning_rate": 5.474773033369908e-06, + "loss": 0.77794366, + "memory(GiB)": 147.13, + "step": 41630, + "train_speed(iter/s)": 0.200959 + }, + { + "acc": 0.81677799, + "epoch": 0.9714927310109509, + "grad_norm": 5.84375, + "learning_rate": 5.472892423645673e-06, + "loss": 0.63997331, + "memory(GiB)": 147.13, + "step": 41640, + "train_speed(iter/s)": 0.200985 + }, + { + "acc": 0.77521987, + "epoch": 0.9717260385832398, + "grad_norm": 8.3125, + "learning_rate": 5.47101174641618e-06, + "loss": 0.81140451, + "memory(GiB)": 147.13, + "step": 41650, + "train_speed(iter/s)": 0.20101 + }, + { + "acc": 0.77916822, + "epoch": 0.9719593461555287, + "grad_norm": 7.21875, + "learning_rate": 5.469131001949899e-06, + "loss": 0.80082006, + "memory(GiB)": 147.13, + "step": 41660, + "train_speed(iter/s)": 0.201035 + }, + { + "acc": 0.76695161, + "epoch": 0.9721926537278176, + "grad_norm": 6.375, + "learning_rate": 5.467250190515303e-06, + "loss": 0.8524477, + "memory(GiB)": 147.13, + "step": 41670, + "train_speed(iter/s)": 0.201061 + }, + { + "acc": 0.79857035, + "epoch": 0.9724259613001065, + "grad_norm": 7.375, + "learning_rate": 5.465369312380879e-06, + "loss": 0.72482586, + "memory(GiB)": 147.13, + "step": 41680, + "train_speed(iter/s)": 0.201086 + }, + { + "acc": 0.77130175, + "epoch": 0.9726592688723953, + "grad_norm": 8.1875, + "learning_rate": 5.463488367815119e-06, + "loss": 0.8281208, + "memory(GiB)": 147.13, + "step": 41690, + "train_speed(iter/s)": 0.201113 + }, + { + "acc": 0.78064547, + "epoch": 0.9728925764446842, + "grad_norm": 5.65625, + "learning_rate": 5.46160735708653e-06, + "loss": 0.78882265, + "memory(GiB)": 147.13, + "step": 41700, + "train_speed(iter/s)": 0.201139 + }, + { + "acc": 0.79099216, + "epoch": 0.9731258840169731, + "grad_norm": 5.8125, + "learning_rate": 5.459726280463625e-06, + "loss": 0.75952692, + "memory(GiB)": 147.13, + "step": 41710, + "train_speed(iter/s)": 0.201163 + }, + { + "acc": 0.77816892, + "epoch": 0.973359191589262, + "grad_norm": 3.90625, + "learning_rate": 5.4578451382149275e-06, + "loss": 0.7955409, + "memory(GiB)": 147.13, + "step": 41720, + "train_speed(iter/s)": 0.201188 + }, + { + "acc": 0.79420767, + "epoch": 0.9735924991615509, + "grad_norm": 8.5625, + "learning_rate": 5.455963930608969e-06, + "loss": 0.75089536, + "memory(GiB)": 147.13, + "step": 41730, + "train_speed(iter/s)": 0.201211 + }, + { + "acc": 0.7890696, + "epoch": 0.9738258067338398, + "grad_norm": 7.65625, + "learning_rate": 5.454082657914292e-06, + "loss": 0.75170484, + "memory(GiB)": 147.13, + "step": 41740, + "train_speed(iter/s)": 0.201236 + }, + { + "acc": 0.79078021, + "epoch": 0.9740591143061287, + "grad_norm": 16.625, + "learning_rate": 5.452201320399447e-06, + "loss": 0.7524622, + "memory(GiB)": 147.13, + "step": 41750, + "train_speed(iter/s)": 0.201261 + }, + { + "acc": 0.80642824, + "epoch": 0.9742924218784176, + "grad_norm": 6.5, + "learning_rate": 5.450319918332995e-06, + "loss": 0.69903097, + "memory(GiB)": 147.13, + "step": 41760, + "train_speed(iter/s)": 0.201285 + }, + { + "acc": 0.79178162, + "epoch": 0.9745257294507065, + "grad_norm": 4.34375, + "learning_rate": 5.448438451983507e-06, + "loss": 0.74413743, + "memory(GiB)": 147.13, + "step": 41770, + "train_speed(iter/s)": 0.201311 + }, + { + "acc": 0.77486391, + "epoch": 0.9747590370229954, + "grad_norm": 4.0625, + "learning_rate": 5.4465569216195576e-06, + "loss": 0.82327709, + "memory(GiB)": 147.13, + "step": 41780, + "train_speed(iter/s)": 0.201336 + }, + { + "acc": 0.78499551, + "epoch": 0.9749923445952843, + "grad_norm": 6.21875, + "learning_rate": 5.444675327509738e-06, + "loss": 0.7698328, + "memory(GiB)": 147.13, + "step": 41790, + "train_speed(iter/s)": 0.201362 + }, + { + "acc": 0.78951769, + "epoch": 0.9752256521675732, + "grad_norm": 6.375, + "learning_rate": 5.4427936699226455e-06, + "loss": 0.75837417, + "memory(GiB)": 147.13, + "step": 41800, + "train_speed(iter/s)": 0.201387 + }, + { + "acc": 0.7888494, + "epoch": 0.9754589597398621, + "grad_norm": 9.125, + "learning_rate": 5.440911949126885e-06, + "loss": 0.75651331, + "memory(GiB)": 147.13, + "step": 41810, + "train_speed(iter/s)": 0.201411 + }, + { + "acc": 0.79928493, + "epoch": 0.975692267312151, + "grad_norm": 7.96875, + "learning_rate": 5.4390301653910726e-06, + "loss": 0.71884995, + "memory(GiB)": 147.13, + "step": 41820, + "train_speed(iter/s)": 0.201435 + }, + { + "acc": 0.76790247, + "epoch": 0.9759255748844399, + "grad_norm": 5.84375, + "learning_rate": 5.4371483189838315e-06, + "loss": 0.83689575, + "memory(GiB)": 147.13, + "step": 41830, + "train_speed(iter/s)": 0.201459 + }, + { + "acc": 0.78529429, + "epoch": 0.9761588824567288, + "grad_norm": 10.25, + "learning_rate": 5.435266410173794e-06, + "loss": 0.77676764, + "memory(GiB)": 147.13, + "step": 41840, + "train_speed(iter/s)": 0.201483 + }, + { + "acc": 0.75995874, + "epoch": 0.9763921900290177, + "grad_norm": 4.875, + "learning_rate": 5.433384439229603e-06, + "loss": 0.85082378, + "memory(GiB)": 147.13, + "step": 41850, + "train_speed(iter/s)": 0.201509 + }, + { + "acc": 0.7614418, + "epoch": 0.9766254976013066, + "grad_norm": 5.625, + "learning_rate": 5.431502406419908e-06, + "loss": 0.87694912, + "memory(GiB)": 147.13, + "step": 41860, + "train_speed(iter/s)": 0.201535 + }, + { + "acc": 0.80430679, + "epoch": 0.9768588051735955, + "grad_norm": 9.625, + "learning_rate": 5.429620312013372e-06, + "loss": 0.68969584, + "memory(GiB)": 147.13, + "step": 41870, + "train_speed(iter/s)": 0.201561 + }, + { + "acc": 0.76424179, + "epoch": 0.9770921127458843, + "grad_norm": 6.75, + "learning_rate": 5.427738156278662e-06, + "loss": 0.86263018, + "memory(GiB)": 147.13, + "step": 41880, + "train_speed(iter/s)": 0.201587 + }, + { + "acc": 0.77523551, + "epoch": 0.9773254203181732, + "grad_norm": 4.25, + "learning_rate": 5.4258559394844515e-06, + "loss": 0.80997353, + "memory(GiB)": 147.13, + "step": 41890, + "train_speed(iter/s)": 0.201613 + }, + { + "acc": 0.76072025, + "epoch": 0.977558727890462, + "grad_norm": 4.78125, + "learning_rate": 5.423973661899431e-06, + "loss": 0.87792006, + "memory(GiB)": 147.13, + "step": 41900, + "train_speed(iter/s)": 0.201639 + }, + { + "acc": 0.78689508, + "epoch": 0.977792035462751, + "grad_norm": 6.0, + "learning_rate": 5.4220913237922936e-06, + "loss": 0.76750278, + "memory(GiB)": 147.13, + "step": 41910, + "train_speed(iter/s)": 0.201663 + }, + { + "acc": 0.77254424, + "epoch": 0.9780253430350399, + "grad_norm": 11.1875, + "learning_rate": 5.4202089254317415e-06, + "loss": 0.83702393, + "memory(GiB)": 147.13, + "step": 41920, + "train_speed(iter/s)": 0.20169 + }, + { + "acc": 0.77660599, + "epoch": 0.9782586506073288, + "grad_norm": 5.21875, + "learning_rate": 5.418326467086488e-06, + "loss": 0.79623809, + "memory(GiB)": 147.13, + "step": 41930, + "train_speed(iter/s)": 0.201717 + }, + { + "acc": 0.76378241, + "epoch": 0.9784919581796176, + "grad_norm": 5.40625, + "learning_rate": 5.416443949025253e-06, + "loss": 0.84370394, + "memory(GiB)": 147.13, + "step": 41940, + "train_speed(iter/s)": 0.201744 + }, + { + "acc": 0.76264544, + "epoch": 0.9787252657519065, + "grad_norm": 4.8125, + "learning_rate": 5.414561371516764e-06, + "loss": 0.86721058, + "memory(GiB)": 147.13, + "step": 41950, + "train_speed(iter/s)": 0.20177 + }, + { + "acc": 0.81001492, + "epoch": 0.9789585733241954, + "grad_norm": 6.375, + "learning_rate": 5.41267873482976e-06, + "loss": 0.70287361, + "memory(GiB)": 147.13, + "step": 41960, + "train_speed(iter/s)": 0.201794 + }, + { + "acc": 0.78907146, + "epoch": 0.9791918808964843, + "grad_norm": 7.46875, + "learning_rate": 5.410796039232989e-06, + "loss": 0.73023872, + "memory(GiB)": 147.13, + "step": 41970, + "train_speed(iter/s)": 0.201819 + }, + { + "acc": 0.77236247, + "epoch": 0.9794251884687732, + "grad_norm": 6.0, + "learning_rate": 5.4089132849952e-06, + "loss": 0.83747387, + "memory(GiB)": 147.13, + "step": 41980, + "train_speed(iter/s)": 0.201845 + }, + { + "acc": 0.78121223, + "epoch": 0.9796584960410621, + "grad_norm": 4.875, + "learning_rate": 5.407030472385158e-06, + "loss": 0.78266506, + "memory(GiB)": 147.13, + "step": 41990, + "train_speed(iter/s)": 0.201871 + }, + { + "acc": 0.76154213, + "epoch": 0.979891803613351, + "grad_norm": 4.875, + "learning_rate": 5.4051476016716365e-06, + "loss": 0.85525732, + "memory(GiB)": 147.13, + "step": 42000, + "train_speed(iter/s)": 0.201896 + }, + { + "epoch": 0.979891803613351, + "eval_acc": 0.743733035721292, + "eval_loss": 0.8074895739555359, + "eval_runtime": 1270.6849, + "eval_samples_per_second": 28.324, + "eval_steps_per_second": 14.162, + "step": 42000 + }, + { + "acc": 0.79927797, + "epoch": 0.9801251111856399, + "grad_norm": 9.125, + "learning_rate": 5.4032646731234115e-06, + "loss": 0.72784662, + "memory(GiB)": 147.13, + "step": 42010, + "train_speed(iter/s)": 0.200679 + }, + { + "acc": 0.78354425, + "epoch": 0.9803584187579288, + "grad_norm": 6.53125, + "learning_rate": 5.401381687009271e-06, + "loss": 0.78162694, + "memory(GiB)": 147.13, + "step": 42020, + "train_speed(iter/s)": 0.200704 + }, + { + "acc": 0.77189026, + "epoch": 0.9805917263302177, + "grad_norm": 5.625, + "learning_rate": 5.399498643598011e-06, + "loss": 0.81935215, + "memory(GiB)": 147.13, + "step": 42030, + "train_speed(iter/s)": 0.200729 + }, + { + "acc": 0.77528439, + "epoch": 0.9808250339025066, + "grad_norm": 6.21875, + "learning_rate": 5.3976155431584375e-06, + "loss": 0.82163353, + "memory(GiB)": 147.13, + "step": 42040, + "train_speed(iter/s)": 0.200752 + }, + { + "acc": 0.78729029, + "epoch": 0.9810583414747955, + "grad_norm": 4.9375, + "learning_rate": 5.3957323859593604e-06, + "loss": 0.77468286, + "memory(GiB)": 147.13, + "step": 42050, + "train_speed(iter/s)": 0.200777 + }, + { + "acc": 0.77076778, + "epoch": 0.9812916490470844, + "grad_norm": 4.4375, + "learning_rate": 5.3938491722695996e-06, + "loss": 0.82037773, + "memory(GiB)": 147.13, + "step": 42060, + "train_speed(iter/s)": 0.2008 + }, + { + "acc": 0.75968246, + "epoch": 0.9815249566193732, + "grad_norm": 6.15625, + "learning_rate": 5.391965902357983e-06, + "loss": 0.88906879, + "memory(GiB)": 147.13, + "step": 42070, + "train_speed(iter/s)": 0.200825 + }, + { + "acc": 0.78322248, + "epoch": 0.9817582641916621, + "grad_norm": 5.40625, + "learning_rate": 5.390082576493348e-06, + "loss": 0.76822419, + "memory(GiB)": 147.13, + "step": 42080, + "train_speed(iter/s)": 0.20085 + }, + { + "acc": 0.77662072, + "epoch": 0.981991571763951, + "grad_norm": 6.71875, + "learning_rate": 5.388199194944539e-06, + "loss": 0.80375099, + "memory(GiB)": 147.13, + "step": 42090, + "train_speed(iter/s)": 0.200875 + }, + { + "acc": 0.79028854, + "epoch": 0.9822248793362399, + "grad_norm": 4.5625, + "learning_rate": 5.3863157579804075e-06, + "loss": 0.72856355, + "memory(GiB)": 147.13, + "step": 42100, + "train_speed(iter/s)": 0.200899 + }, + { + "acc": 0.76783481, + "epoch": 0.9824581869085288, + "grad_norm": 5.09375, + "learning_rate": 5.384432265869815e-06, + "loss": 0.84242039, + "memory(GiB)": 147.13, + "step": 42110, + "train_speed(iter/s)": 0.200924 + }, + { + "acc": 0.7857872, + "epoch": 0.9826914944808177, + "grad_norm": 6.96875, + "learning_rate": 5.382548718881627e-06, + "loss": 0.77328167, + "memory(GiB)": 147.13, + "step": 42120, + "train_speed(iter/s)": 0.20095 + }, + { + "acc": 0.78016534, + "epoch": 0.9829248020531066, + "grad_norm": 4.59375, + "learning_rate": 5.380665117284721e-06, + "loss": 0.78974218, + "memory(GiB)": 147.13, + "step": 42130, + "train_speed(iter/s)": 0.200974 + }, + { + "acc": 0.78708715, + "epoch": 0.9831581096253955, + "grad_norm": 5.0625, + "learning_rate": 5.378781461347979e-06, + "loss": 0.76465178, + "memory(GiB)": 147.13, + "step": 42140, + "train_speed(iter/s)": 0.200997 + }, + { + "acc": 0.77445755, + "epoch": 0.9833914171976844, + "grad_norm": 4.6875, + "learning_rate": 5.376897751340294e-06, + "loss": 0.82658815, + "memory(GiB)": 147.13, + "step": 42150, + "train_speed(iter/s)": 0.201023 + }, + { + "acc": 0.78046808, + "epoch": 0.9836247247699733, + "grad_norm": 8.625, + "learning_rate": 5.375013987530565e-06, + "loss": 0.78468456, + "memory(GiB)": 147.13, + "step": 42160, + "train_speed(iter/s)": 0.201047 + }, + { + "acc": 0.77549314, + "epoch": 0.9838580323422622, + "grad_norm": 7.78125, + "learning_rate": 5.3731301701876985e-06, + "loss": 0.81372795, + "memory(GiB)": 147.13, + "step": 42170, + "train_speed(iter/s)": 0.201073 + }, + { + "acc": 0.77141991, + "epoch": 0.9840913399145511, + "grad_norm": 4.96875, + "learning_rate": 5.371246299580608e-06, + "loss": 0.82050037, + "memory(GiB)": 147.13, + "step": 42180, + "train_speed(iter/s)": 0.201097 + }, + { + "acc": 0.79150095, + "epoch": 0.98432464748684, + "grad_norm": 6.59375, + "learning_rate": 5.3693623759782165e-06, + "loss": 0.73675623, + "memory(GiB)": 147.13, + "step": 42190, + "train_speed(iter/s)": 0.201123 + }, + { + "acc": 0.75877752, + "epoch": 0.9845579550591289, + "grad_norm": 4.71875, + "learning_rate": 5.367478399649453e-06, + "loss": 0.85596247, + "memory(GiB)": 147.13, + "step": 42200, + "train_speed(iter/s)": 0.201147 + }, + { + "acc": 0.77118468, + "epoch": 0.9847912626314178, + "grad_norm": 6.4375, + "learning_rate": 5.365594370863254e-06, + "loss": 0.81483583, + "memory(GiB)": 147.13, + "step": 42210, + "train_speed(iter/s)": 0.201172 + }, + { + "acc": 0.77900705, + "epoch": 0.9850245702037067, + "grad_norm": 6.0, + "learning_rate": 5.363710289888564e-06, + "loss": 0.79868593, + "memory(GiB)": 147.13, + "step": 42220, + "train_speed(iter/s)": 0.201196 + }, + { + "acc": 0.77433815, + "epoch": 0.9852578777759956, + "grad_norm": 5.5625, + "learning_rate": 5.361826156994338e-06, + "loss": 0.81312532, + "memory(GiB)": 147.13, + "step": 42230, + "train_speed(iter/s)": 0.201221 + }, + { + "acc": 0.79729567, + "epoch": 0.9854911853482845, + "grad_norm": 5.875, + "learning_rate": 5.359941972449532e-06, + "loss": 0.73013325, + "memory(GiB)": 147.13, + "step": 42240, + "train_speed(iter/s)": 0.201245 + }, + { + "acc": 0.78885775, + "epoch": 0.9857244929205734, + "grad_norm": 15.5625, + "learning_rate": 5.358057736523114e-06, + "loss": 0.75900469, + "memory(GiB)": 147.13, + "step": 42250, + "train_speed(iter/s)": 0.20127 + }, + { + "acc": 0.76385336, + "epoch": 0.9859578004928623, + "grad_norm": 6.15625, + "learning_rate": 5.356173449484059e-06, + "loss": 0.87608242, + "memory(GiB)": 147.13, + "step": 42260, + "train_speed(iter/s)": 0.201292 + }, + { + "acc": 0.78156281, + "epoch": 0.9861911080651511, + "grad_norm": 6.4375, + "learning_rate": 5.3542891116013465e-06, + "loss": 0.75918655, + "memory(GiB)": 147.13, + "step": 42270, + "train_speed(iter/s)": 0.201318 + }, + { + "acc": 0.78791151, + "epoch": 0.98642441563744, + "grad_norm": 5.1875, + "learning_rate": 5.352404723143968e-06, + "loss": 0.74543934, + "memory(GiB)": 147.13, + "step": 42280, + "train_speed(iter/s)": 0.201342 + }, + { + "acc": 0.7900342, + "epoch": 0.9866577232097289, + "grad_norm": 4.71875, + "learning_rate": 5.350520284380916e-06, + "loss": 0.74275174, + "memory(GiB)": 147.13, + "step": 42290, + "train_speed(iter/s)": 0.201366 + }, + { + "acc": 0.78988333, + "epoch": 0.9868910307820178, + "grad_norm": 5.375, + "learning_rate": 5.3486357955811945e-06, + "loss": 0.75508475, + "memory(GiB)": 147.13, + "step": 42300, + "train_speed(iter/s)": 0.20139 + }, + { + "acc": 0.76127229, + "epoch": 0.9871243383543067, + "grad_norm": 10.625, + "learning_rate": 5.346751257013815e-06, + "loss": 0.85649261, + "memory(GiB)": 147.13, + "step": 42310, + "train_speed(iter/s)": 0.201414 + }, + { + "acc": 0.76956244, + "epoch": 0.9873576459265956, + "grad_norm": 7.53125, + "learning_rate": 5.344866668947794e-06, + "loss": 0.8190443, + "memory(GiB)": 147.13, + "step": 42320, + "train_speed(iter/s)": 0.201439 + }, + { + "acc": 0.76760712, + "epoch": 0.9875909534988845, + "grad_norm": 5.03125, + "learning_rate": 5.342982031652159e-06, + "loss": 0.84836092, + "memory(GiB)": 147.13, + "step": 42330, + "train_speed(iter/s)": 0.201462 + }, + { + "acc": 0.77389174, + "epoch": 0.9878242610711734, + "grad_norm": 8.375, + "learning_rate": 5.341097345395937e-06, + "loss": 0.7971365, + "memory(GiB)": 147.13, + "step": 42340, + "train_speed(iter/s)": 0.201487 + }, + { + "acc": 0.78406467, + "epoch": 0.9880575686434623, + "grad_norm": 7.625, + "learning_rate": 5.339212610448167e-06, + "loss": 0.76403122, + "memory(GiB)": 147.13, + "step": 42350, + "train_speed(iter/s)": 0.201509 + }, + { + "acc": 0.7485312, + "epoch": 0.9882908762157512, + "grad_norm": 4.40625, + "learning_rate": 5.3373278270778965e-06, + "loss": 0.91936789, + "memory(GiB)": 147.13, + "step": 42360, + "train_speed(iter/s)": 0.201534 + }, + { + "acc": 0.78323016, + "epoch": 0.9885241837880401, + "grad_norm": 4.46875, + "learning_rate": 5.3354429955541755e-06, + "loss": 0.76532645, + "memory(GiB)": 147.13, + "step": 42370, + "train_speed(iter/s)": 0.201559 + }, + { + "acc": 0.774506, + "epoch": 0.988757491360329, + "grad_norm": 6.3125, + "learning_rate": 5.333558116146063e-06, + "loss": 0.78331833, + "memory(GiB)": 147.13, + "step": 42380, + "train_speed(iter/s)": 0.201585 + }, + { + "acc": 0.77661877, + "epoch": 0.9889907989326179, + "grad_norm": 4.90625, + "learning_rate": 5.33167318912263e-06, + "loss": 0.79144692, + "memory(GiB)": 147.13, + "step": 42390, + "train_speed(iter/s)": 0.201608 + }, + { + "acc": 0.79478455, + "epoch": 0.9892241065049068, + "grad_norm": 5.21875, + "learning_rate": 5.329788214752944e-06, + "loss": 0.72164011, + "memory(GiB)": 147.13, + "step": 42400, + "train_speed(iter/s)": 0.201632 + }, + { + "acc": 0.77908754, + "epoch": 0.9894574140771957, + "grad_norm": 10.3125, + "learning_rate": 5.327903193306087e-06, + "loss": 0.77795801, + "memory(GiB)": 147.13, + "step": 42410, + "train_speed(iter/s)": 0.201655 + }, + { + "acc": 0.76483593, + "epoch": 0.9896907216494846, + "grad_norm": 8.0, + "learning_rate": 5.326018125051142e-06, + "loss": 0.84128876, + "memory(GiB)": 147.13, + "step": 42420, + "train_speed(iter/s)": 0.20168 + }, + { + "acc": 0.79741845, + "epoch": 0.9899240292217735, + "grad_norm": 4.0625, + "learning_rate": 5.324133010257206e-06, + "loss": 0.73309522, + "memory(GiB)": 147.13, + "step": 42430, + "train_speed(iter/s)": 0.201704 + }, + { + "acc": 0.78365011, + "epoch": 0.9901573367940624, + "grad_norm": 5.78125, + "learning_rate": 5.3222478491933775e-06, + "loss": 0.78707895, + "memory(GiB)": 147.13, + "step": 42440, + "train_speed(iter/s)": 0.201729 + }, + { + "acc": 0.77475691, + "epoch": 0.9903906443663513, + "grad_norm": 8.5, + "learning_rate": 5.320362642128761e-06, + "loss": 0.82604122, + "memory(GiB)": 147.13, + "step": 42450, + "train_speed(iter/s)": 0.201754 + }, + { + "acc": 0.80049133, + "epoch": 0.9906239519386401, + "grad_norm": 5.875, + "learning_rate": 5.318477389332471e-06, + "loss": 0.72088904, + "memory(GiB)": 147.13, + "step": 42460, + "train_speed(iter/s)": 0.201781 + }, + { + "acc": 0.75600042, + "epoch": 0.990857259510929, + "grad_norm": 5.875, + "learning_rate": 5.316592091073626e-06, + "loss": 0.88159542, + "memory(GiB)": 147.13, + "step": 42470, + "train_speed(iter/s)": 0.201805 + }, + { + "acc": 0.7931004, + "epoch": 0.9910905670832179, + "grad_norm": 4.0625, + "learning_rate": 5.314706747621352e-06, + "loss": 0.73830881, + "memory(GiB)": 147.13, + "step": 42480, + "train_speed(iter/s)": 0.201829 + }, + { + "acc": 0.77918282, + "epoch": 0.9913238746555068, + "grad_norm": 4.65625, + "learning_rate": 5.312821359244781e-06, + "loss": 0.81584482, + "memory(GiB)": 147.13, + "step": 42490, + "train_speed(iter/s)": 0.201854 + }, + { + "acc": 0.76876907, + "epoch": 0.9915571822277957, + "grad_norm": 16.625, + "learning_rate": 5.310935926213052e-06, + "loss": 0.82897224, + "memory(GiB)": 147.13, + "step": 42500, + "train_speed(iter/s)": 0.201878 + }, + { + "epoch": 0.9915571822277957, + "eval_acc": 0.7438225563471484, + "eval_loss": 0.8072462677955627, + "eval_runtime": 1270.4976, + "eval_samples_per_second": 28.328, + "eval_steps_per_second": 14.165, + "step": 42500 + }, + { + "acc": 0.76333456, + "epoch": 0.9917904898000846, + "grad_norm": 4.8125, + "learning_rate": 5.309050448795311e-06, + "loss": 0.84974442, + "memory(GiB)": 147.13, + "step": 42510, + "train_speed(iter/s)": 0.200677 + }, + { + "acc": 0.77313266, + "epoch": 0.9920237973723735, + "grad_norm": 5.6875, + "learning_rate": 5.307164927260706e-06, + "loss": 0.82176523, + "memory(GiB)": 147.13, + "step": 42520, + "train_speed(iter/s)": 0.200701 + }, + { + "acc": 0.78784313, + "epoch": 0.9922571049446623, + "grad_norm": 4.4375, + "learning_rate": 5.305279361878398e-06, + "loss": 0.75282192, + "memory(GiB)": 147.13, + "step": 42530, + "train_speed(iter/s)": 0.200728 + }, + { + "acc": 0.79874549, + "epoch": 0.9924904125169512, + "grad_norm": 4.5, + "learning_rate": 5.30339375291755e-06, + "loss": 0.71279907, + "memory(GiB)": 147.13, + "step": 42540, + "train_speed(iter/s)": 0.200753 + }, + { + "acc": 0.77315574, + "epoch": 0.9927237200892401, + "grad_norm": 5.25, + "learning_rate": 5.3015081006473315e-06, + "loss": 0.81193829, + "memory(GiB)": 147.13, + "step": 42550, + "train_speed(iter/s)": 0.200777 + }, + { + "acc": 0.77819738, + "epoch": 0.992957027661529, + "grad_norm": 5.375, + "learning_rate": 5.299622405336919e-06, + "loss": 0.78049955, + "memory(GiB)": 147.13, + "step": 42560, + "train_speed(iter/s)": 0.200802 + }, + { + "acc": 0.78241773, + "epoch": 0.993190335233818, + "grad_norm": 6.71875, + "learning_rate": 5.297736667255497e-06, + "loss": 0.79108753, + "memory(GiB)": 147.13, + "step": 42570, + "train_speed(iter/s)": 0.200825 + }, + { + "acc": 0.78949213, + "epoch": 0.9934236428061068, + "grad_norm": 5.5625, + "learning_rate": 5.2958508866722506e-06, + "loss": 0.75022783, + "memory(GiB)": 147.13, + "step": 42580, + "train_speed(iter/s)": 0.200849 + }, + { + "acc": 0.75920568, + "epoch": 0.9936569503783957, + "grad_norm": 6.25, + "learning_rate": 5.293965063856375e-06, + "loss": 0.8708231, + "memory(GiB)": 147.13, + "step": 42590, + "train_speed(iter/s)": 0.200872 + }, + { + "acc": 0.76359487, + "epoch": 0.9938902579506846, + "grad_norm": 7.6875, + "learning_rate": 5.292079199077073e-06, + "loss": 0.85245781, + "memory(GiB)": 147.13, + "step": 42600, + "train_speed(iter/s)": 0.200898 + }, + { + "acc": 0.77966127, + "epoch": 0.9941235655229735, + "grad_norm": 5.9375, + "learning_rate": 5.290193292603551e-06, + "loss": 0.7903245, + "memory(GiB)": 147.13, + "step": 42610, + "train_speed(iter/s)": 0.200923 + }, + { + "acc": 0.78285179, + "epoch": 0.9943568730952624, + "grad_norm": 7.59375, + "learning_rate": 5.2883073447050205e-06, + "loss": 0.78474703, + "memory(GiB)": 147.13, + "step": 42620, + "train_speed(iter/s)": 0.200946 + }, + { + "acc": 0.78321028, + "epoch": 0.9945901806675513, + "grad_norm": 5.09375, + "learning_rate": 5.2864213556507e-06, + "loss": 0.79646807, + "memory(GiB)": 147.13, + "step": 42630, + "train_speed(iter/s)": 0.200971 + }, + { + "acc": 0.76653428, + "epoch": 0.9948234882398402, + "grad_norm": 7.8125, + "learning_rate": 5.2845353257098146e-06, + "loss": 0.84334564, + "memory(GiB)": 147.13, + "step": 42640, + "train_speed(iter/s)": 0.200996 + }, + { + "acc": 0.77440205, + "epoch": 0.995056795812129, + "grad_norm": 5.8125, + "learning_rate": 5.282649255151593e-06, + "loss": 0.80458755, + "memory(GiB)": 147.13, + "step": 42650, + "train_speed(iter/s)": 0.201021 + }, + { + "acc": 0.77449627, + "epoch": 0.9952901033844179, + "grad_norm": 4.875, + "learning_rate": 5.280763144245272e-06, + "loss": 0.81251926, + "memory(GiB)": 147.13, + "step": 42660, + "train_speed(iter/s)": 0.201043 + }, + { + "acc": 0.77925887, + "epoch": 0.9955234109567068, + "grad_norm": 6.125, + "learning_rate": 5.2788769932600944e-06, + "loss": 0.79605002, + "memory(GiB)": 147.13, + "step": 42670, + "train_speed(iter/s)": 0.201068 + }, + { + "acc": 0.77389841, + "epoch": 0.9957567185289957, + "grad_norm": 6.71875, + "learning_rate": 5.276990802465309e-06, + "loss": 0.80099325, + "memory(GiB)": 147.13, + "step": 42680, + "train_speed(iter/s)": 0.201092 + }, + { + "acc": 0.77797318, + "epoch": 0.9959900261012846, + "grad_norm": 7.3125, + "learning_rate": 5.275104572130167e-06, + "loss": 0.78075852, + "memory(GiB)": 147.13, + "step": 42690, + "train_speed(iter/s)": 0.201116 + }, + { + "acc": 0.78155584, + "epoch": 0.9962233336735735, + "grad_norm": 5.125, + "learning_rate": 5.273218302523925e-06, + "loss": 0.78047514, + "memory(GiB)": 147.13, + "step": 42700, + "train_speed(iter/s)": 0.201141 + }, + { + "acc": 0.78246479, + "epoch": 0.9964566412458624, + "grad_norm": 5.4375, + "learning_rate": 5.2713319939158494e-06, + "loss": 0.79133334, + "memory(GiB)": 147.13, + "step": 42710, + "train_speed(iter/s)": 0.201163 + }, + { + "acc": 0.78450847, + "epoch": 0.9966899488181513, + "grad_norm": 4.625, + "learning_rate": 5.2694456465752104e-06, + "loss": 0.76527557, + "memory(GiB)": 147.13, + "step": 42720, + "train_speed(iter/s)": 0.201188 + }, + { + "acc": 0.77672338, + "epoch": 0.9969232563904402, + "grad_norm": 5.40625, + "learning_rate": 5.267559260771285e-06, + "loss": 0.78869748, + "memory(GiB)": 147.13, + "step": 42730, + "train_speed(iter/s)": 0.201211 + }, + { + "acc": 0.79831858, + "epoch": 0.9971565639627291, + "grad_norm": 5.4375, + "learning_rate": 5.265672836773353e-06, + "loss": 0.73061762, + "memory(GiB)": 147.13, + "step": 42740, + "train_speed(iter/s)": 0.201234 + }, + { + "acc": 0.76231604, + "epoch": 0.997389871535018, + "grad_norm": 6.9375, + "learning_rate": 5.2637863748507e-06, + "loss": 0.87828217, + "memory(GiB)": 147.13, + "step": 42750, + "train_speed(iter/s)": 0.201257 + }, + { + "acc": 0.77685285, + "epoch": 0.9976231791073069, + "grad_norm": 4.75, + "learning_rate": 5.261899875272619e-06, + "loss": 0.79581809, + "memory(GiB)": 147.13, + "step": 42760, + "train_speed(iter/s)": 0.201281 + }, + { + "acc": 0.77012229, + "epoch": 0.9978564866795958, + "grad_norm": 5.59375, + "learning_rate": 5.260013338308408e-06, + "loss": 0.84313641, + "memory(GiB)": 147.13, + "step": 42770, + "train_speed(iter/s)": 0.201305 + }, + { + "acc": 0.7849412, + "epoch": 0.9980897942518847, + "grad_norm": 3.953125, + "learning_rate": 5.258126764227366e-06, + "loss": 0.76588078, + "memory(GiB)": 147.13, + "step": 42780, + "train_speed(iter/s)": 0.201328 + }, + { + "acc": 0.77974372, + "epoch": 0.9983231018241736, + "grad_norm": 6.03125, + "learning_rate": 5.256240153298804e-06, + "loss": 0.78483295, + "memory(GiB)": 147.13, + "step": 42790, + "train_speed(iter/s)": 0.201355 + }, + { + "acc": 0.78103266, + "epoch": 0.9985564093964625, + "grad_norm": 5.21875, + "learning_rate": 5.254353505792036e-06, + "loss": 0.78471365, + "memory(GiB)": 147.13, + "step": 42800, + "train_speed(iter/s)": 0.201379 + }, + { + "acc": 0.76989121, + "epoch": 0.9987897169687514, + "grad_norm": 6.0625, + "learning_rate": 5.252466821976377e-06, + "loss": 0.82569427, + "memory(GiB)": 147.13, + "step": 42810, + "train_speed(iter/s)": 0.201404 + }, + { + "acc": 0.77489061, + "epoch": 0.9990230245410403, + "grad_norm": 4.59375, + "learning_rate": 5.250580102121153e-06, + "loss": 0.81538868, + "memory(GiB)": 147.13, + "step": 42820, + "train_speed(iter/s)": 0.201429 + }, + { + "acc": 0.78204842, + "epoch": 0.9992563321133292, + "grad_norm": 4.625, + "learning_rate": 5.248693346495694e-06, + "loss": 0.78283563, + "memory(GiB)": 147.13, + "step": 42830, + "train_speed(iter/s)": 0.201452 + }, + { + "acc": 0.79220905, + "epoch": 0.999489639685618, + "grad_norm": 4.78125, + "learning_rate": 5.2468065553693306e-06, + "loss": 0.73160219, + "memory(GiB)": 147.13, + "step": 42840, + "train_speed(iter/s)": 0.201476 + }, + { + "acc": 0.77306366, + "epoch": 0.9997229472579069, + "grad_norm": 5.90625, + "learning_rate": 5.244919729011403e-06, + "loss": 0.8217906, + "memory(GiB)": 147.13, + "step": 42850, + "train_speed(iter/s)": 0.201501 + }, + { + "acc": 0.78262877, + "epoch": 0.9999562548301958, + "grad_norm": 6.875, + "learning_rate": 5.243032867691257e-06, + "loss": 0.78975267, + "memory(GiB)": 147.13, + "step": 42860, + "train_speed(iter/s)": 0.201522 + }, + { + "acc": 0.78576965, + "epoch": 1.0001895624024848, + "grad_norm": 3.8125, + "learning_rate": 5.241145971678238e-06, + "loss": 0.76854234, + "memory(GiB)": 147.13, + "step": 42870, + "train_speed(iter/s)": 0.201543 + }, + { + "acc": 0.7733151, + "epoch": 1.0004228699747737, + "grad_norm": 4.6875, + "learning_rate": 5.239259041241701e-06, + "loss": 0.82483797, + "memory(GiB)": 147.13, + "step": 42880, + "train_speed(iter/s)": 0.201568 + }, + { + "acc": 0.80650272, + "epoch": 1.0006561775470626, + "grad_norm": 4.28125, + "learning_rate": 5.237372076651006e-06, + "loss": 0.67205696, + "memory(GiB)": 147.13, + "step": 42890, + "train_speed(iter/s)": 0.20159 + }, + { + "acc": 0.78537331, + "epoch": 1.0008894851193515, + "grad_norm": 6.03125, + "learning_rate": 5.2354850781755175e-06, + "loss": 0.76955976, + "memory(GiB)": 147.13, + "step": 42900, + "train_speed(iter/s)": 0.201612 + }, + { + "acc": 0.76812172, + "epoch": 1.0011227926916404, + "grad_norm": 3.875, + "learning_rate": 5.233598046084602e-06, + "loss": 0.85648232, + "memory(GiB)": 147.13, + "step": 42910, + "train_speed(iter/s)": 0.201637 + }, + { + "acc": 0.77023239, + "epoch": 1.0013561002639293, + "grad_norm": 5.125, + "learning_rate": 5.231710980647632e-06, + "loss": 0.8251276, + "memory(GiB)": 147.13, + "step": 42920, + "train_speed(iter/s)": 0.201662 + }, + { + "acc": 0.78217449, + "epoch": 1.0015894078362182, + "grad_norm": 5.625, + "learning_rate": 5.229823882133987e-06, + "loss": 0.78995705, + "memory(GiB)": 147.13, + "step": 42930, + "train_speed(iter/s)": 0.201687 + }, + { + "acc": 0.77538042, + "epoch": 1.0018227154085069, + "grad_norm": 6.96875, + "learning_rate": 5.22793675081305e-06, + "loss": 0.78225121, + "memory(GiB)": 147.13, + "step": 42940, + "train_speed(iter/s)": 0.201711 + }, + { + "acc": 0.77648048, + "epoch": 1.0020560229807958, + "grad_norm": 6.875, + "learning_rate": 5.226049586954207e-06, + "loss": 0.81052818, + "memory(GiB)": 147.13, + "step": 42950, + "train_speed(iter/s)": 0.201735 + }, + { + "acc": 0.78938169, + "epoch": 1.0022893305530847, + "grad_norm": 4.78125, + "learning_rate": 5.2241623908268524e-06, + "loss": 0.77443514, + "memory(GiB)": 147.13, + "step": 42960, + "train_speed(iter/s)": 0.201759 + }, + { + "acc": 0.76893907, + "epoch": 1.0025226381253736, + "grad_norm": 4.75, + "learning_rate": 5.222275162700382e-06, + "loss": 0.84441872, + "memory(GiB)": 147.13, + "step": 42970, + "train_speed(iter/s)": 0.201781 + }, + { + "acc": 0.77256842, + "epoch": 1.0027559456976625, + "grad_norm": 8.75, + "learning_rate": 5.2203879028441975e-06, + "loss": 0.83643208, + "memory(GiB)": 147.13, + "step": 42980, + "train_speed(iter/s)": 0.201807 + }, + { + "acc": 0.76644096, + "epoch": 1.0029892532699514, + "grad_norm": 5.375, + "learning_rate": 5.218500611527701e-06, + "loss": 0.82999525, + "memory(GiB)": 147.13, + "step": 42990, + "train_speed(iter/s)": 0.201831 + }, + { + "acc": 0.77953863, + "epoch": 1.0032225608422403, + "grad_norm": 5.1875, + "learning_rate": 5.216613289020307e-06, + "loss": 0.77615538, + "memory(GiB)": 147.13, + "step": 43000, + "train_speed(iter/s)": 0.201856 + }, + { + "epoch": 1.0032225608422403, + "eval_acc": 0.7438829307227262, + "eval_loss": 0.8073763847351074, + "eval_runtime": 1269.4805, + "eval_samples_per_second": 28.351, + "eval_steps_per_second": 14.176, + "step": 43000 + }, + { + "acc": 0.77727814, + "epoch": 1.0034558684145292, + "grad_norm": 4.84375, + "learning_rate": 5.214725935591429e-06, + "loss": 0.80646038, + "memory(GiB)": 147.13, + "step": 43010, + "train_speed(iter/s)": 0.200669 + }, + { + "acc": 0.7740561, + "epoch": 1.003689175986818, + "grad_norm": 6.75, + "learning_rate": 5.2128385515104865e-06, + "loss": 0.81611605, + "memory(GiB)": 147.13, + "step": 43020, + "train_speed(iter/s)": 0.200692 + }, + { + "acc": 0.77674236, + "epoch": 1.003922483559107, + "grad_norm": 6.1875, + "learning_rate": 5.210951137046903e-06, + "loss": 0.80263958, + "memory(GiB)": 147.13, + "step": 43030, + "train_speed(iter/s)": 0.200718 + }, + { + "acc": 0.77732944, + "epoch": 1.0041557911313959, + "grad_norm": 6.78125, + "learning_rate": 5.209063692470104e-06, + "loss": 0.80889053, + "memory(GiB)": 147.13, + "step": 43040, + "train_speed(iter/s)": 0.200743 + }, + { + "acc": 0.75513368, + "epoch": 1.0043890987036848, + "grad_norm": 5.40625, + "learning_rate": 5.207176218049526e-06, + "loss": 0.88391256, + "memory(GiB)": 147.13, + "step": 43050, + "train_speed(iter/s)": 0.200767 + }, + { + "acc": 0.77179451, + "epoch": 1.0046224062759737, + "grad_norm": 4.71875, + "learning_rate": 5.205288714054602e-06, + "loss": 0.80851946, + "memory(GiB)": 147.13, + "step": 43060, + "train_speed(iter/s)": 0.200791 + }, + { + "acc": 0.77560835, + "epoch": 1.0048557138482626, + "grad_norm": 5.1875, + "learning_rate": 5.203401180754772e-06, + "loss": 0.79950652, + "memory(GiB)": 147.13, + "step": 43070, + "train_speed(iter/s)": 0.200815 + }, + { + "acc": 0.76174564, + "epoch": 1.0050890214205515, + "grad_norm": 4.53125, + "learning_rate": 5.201513618419486e-06, + "loss": 0.84228649, + "memory(GiB)": 147.13, + "step": 43080, + "train_speed(iter/s)": 0.200839 + }, + { + "acc": 0.78287921, + "epoch": 1.0053223289928404, + "grad_norm": 5.9375, + "learning_rate": 5.199626027318188e-06, + "loss": 0.78160782, + "memory(GiB)": 147.13, + "step": 43090, + "train_speed(iter/s)": 0.200863 + }, + { + "acc": 0.78439932, + "epoch": 1.0055556365651293, + "grad_norm": 5.09375, + "learning_rate": 5.197738407720331e-06, + "loss": 0.76160207, + "memory(GiB)": 147.13, + "step": 43100, + "train_speed(iter/s)": 0.200886 + }, + { + "acc": 0.77529421, + "epoch": 1.0057889441374182, + "grad_norm": 5.96875, + "learning_rate": 5.195850759895374e-06, + "loss": 0.8274806, + "memory(GiB)": 147.13, + "step": 43110, + "train_speed(iter/s)": 0.200911 + }, + { + "acc": 0.79326754, + "epoch": 1.006022251709707, + "grad_norm": 5.96875, + "learning_rate": 5.193963084112781e-06, + "loss": 0.73483434, + "memory(GiB)": 147.13, + "step": 43120, + "train_speed(iter/s)": 0.200935 + }, + { + "acc": 0.78251967, + "epoch": 1.006255559281996, + "grad_norm": 4.96875, + "learning_rate": 5.192075380642011e-06, + "loss": 0.76476879, + "memory(GiB)": 147.13, + "step": 43130, + "train_speed(iter/s)": 0.200958 + }, + { + "acc": 0.78560572, + "epoch": 1.0064888668542848, + "grad_norm": 6.3125, + "learning_rate": 5.190187649752538e-06, + "loss": 0.76422467, + "memory(GiB)": 147.13, + "step": 43140, + "train_speed(iter/s)": 0.200984 + }, + { + "acc": 0.77720866, + "epoch": 1.0067221744265737, + "grad_norm": 6.53125, + "learning_rate": 5.1882998917138324e-06, + "loss": 0.79875774, + "memory(GiB)": 147.13, + "step": 43150, + "train_speed(iter/s)": 0.201008 + }, + { + "acc": 0.78121185, + "epoch": 1.0069554819988626, + "grad_norm": 5.28125, + "learning_rate": 5.186412106795371e-06, + "loss": 0.78524699, + "memory(GiB)": 147.13, + "step": 43160, + "train_speed(iter/s)": 0.201033 + }, + { + "acc": 0.7672533, + "epoch": 1.0071887895711515, + "grad_norm": 5.0, + "learning_rate": 5.1845242952666365e-06, + "loss": 0.84109287, + "memory(GiB)": 147.13, + "step": 43170, + "train_speed(iter/s)": 0.201057 + }, + { + "acc": 0.79040389, + "epoch": 1.0074220971434404, + "grad_norm": 4.96875, + "learning_rate": 5.1826364573971125e-06, + "loss": 0.75233536, + "memory(GiB)": 147.13, + "step": 43180, + "train_speed(iter/s)": 0.201079 + }, + { + "acc": 0.77585616, + "epoch": 1.0076554047157293, + "grad_norm": 4.53125, + "learning_rate": 5.180748593456289e-06, + "loss": 0.8176096, + "memory(GiB)": 147.13, + "step": 43190, + "train_speed(iter/s)": 0.201103 + }, + { + "acc": 0.76432118, + "epoch": 1.0078887122880182, + "grad_norm": 4.84375, + "learning_rate": 5.178860703713654e-06, + "loss": 0.84310722, + "memory(GiB)": 147.13, + "step": 43200, + "train_speed(iter/s)": 0.201128 + }, + { + "acc": 0.78325276, + "epoch": 1.0081220198603071, + "grad_norm": 5.34375, + "learning_rate": 5.176972788438705e-06, + "loss": 0.7665885, + "memory(GiB)": 147.13, + "step": 43210, + "train_speed(iter/s)": 0.201154 + }, + { + "acc": 0.77819371, + "epoch": 1.008355327432596, + "grad_norm": 6.0, + "learning_rate": 5.175084847900943e-06, + "loss": 0.79968734, + "memory(GiB)": 147.13, + "step": 43220, + "train_speed(iter/s)": 0.201178 + }, + { + "acc": 0.78666801, + "epoch": 1.008588635004885, + "grad_norm": 4.5625, + "learning_rate": 5.17319688236987e-06, + "loss": 0.77615905, + "memory(GiB)": 147.13, + "step": 43230, + "train_speed(iter/s)": 0.201203 + }, + { + "acc": 0.78133011, + "epoch": 1.0088219425771738, + "grad_norm": 5.28125, + "learning_rate": 5.171308892114991e-06, + "loss": 0.80085545, + "memory(GiB)": 147.13, + "step": 43240, + "train_speed(iter/s)": 0.201226 + }, + { + "acc": 0.79328427, + "epoch": 1.0090552501494627, + "grad_norm": 4.25, + "learning_rate": 5.16942087740582e-06, + "loss": 0.73711023, + "memory(GiB)": 147.13, + "step": 43250, + "train_speed(iter/s)": 0.201249 + }, + { + "acc": 0.78356128, + "epoch": 1.0092885577217516, + "grad_norm": 4.71875, + "learning_rate": 5.167532838511866e-06, + "loss": 0.76406059, + "memory(GiB)": 147.13, + "step": 43260, + "train_speed(iter/s)": 0.201274 + }, + { + "acc": 0.76661291, + "epoch": 1.0095218652940405, + "grad_norm": 4.9375, + "learning_rate": 5.16564477570265e-06, + "loss": 0.81206608, + "memory(GiB)": 147.13, + "step": 43270, + "train_speed(iter/s)": 0.201297 + }, + { + "acc": 0.78339691, + "epoch": 1.0097551728663294, + "grad_norm": 5.65625, + "learning_rate": 5.163756689247687e-06, + "loss": 0.76879001, + "memory(GiB)": 147.13, + "step": 43280, + "train_speed(iter/s)": 0.201321 + }, + { + "acc": 0.78117151, + "epoch": 1.0099884804386183, + "grad_norm": 4.625, + "learning_rate": 5.1618685794165066e-06, + "loss": 0.78589344, + "memory(GiB)": 147.13, + "step": 43290, + "train_speed(iter/s)": 0.201347 + }, + { + "acc": 0.75611968, + "epoch": 1.0102217880109072, + "grad_norm": 5.75, + "learning_rate": 5.159980446478633e-06, + "loss": 0.88358593, + "memory(GiB)": 147.13, + "step": 43300, + "train_speed(iter/s)": 0.201372 + }, + { + "acc": 0.77154584, + "epoch": 1.0104550955831961, + "grad_norm": 5.375, + "learning_rate": 5.158092290703597e-06, + "loss": 0.81572247, + "memory(GiB)": 147.13, + "step": 43310, + "train_speed(iter/s)": 0.201395 + }, + { + "acc": 0.76835303, + "epoch": 1.010688403155485, + "grad_norm": 5.9375, + "learning_rate": 5.156204112360933e-06, + "loss": 0.80885944, + "memory(GiB)": 147.13, + "step": 43320, + "train_speed(iter/s)": 0.201419 + }, + { + "acc": 0.77416339, + "epoch": 1.0109217107277737, + "grad_norm": 5.03125, + "learning_rate": 5.154315911720176e-06, + "loss": 0.81178932, + "memory(GiB)": 147.13, + "step": 43330, + "train_speed(iter/s)": 0.201444 + }, + { + "acc": 0.78630424, + "epoch": 1.0111550183000626, + "grad_norm": 6.625, + "learning_rate": 5.152427689050869e-06, + "loss": 0.77561946, + "memory(GiB)": 147.13, + "step": 43340, + "train_speed(iter/s)": 0.201467 + }, + { + "acc": 0.7755363, + "epoch": 1.0113883258723515, + "grad_norm": 3.859375, + "learning_rate": 5.150539444622552e-06, + "loss": 0.80117035, + "memory(GiB)": 147.13, + "step": 43350, + "train_speed(iter/s)": 0.201491 + }, + { + "acc": 0.78446379, + "epoch": 1.0116216334446404, + "grad_norm": 8.1875, + "learning_rate": 5.148651178704775e-06, + "loss": 0.82833748, + "memory(GiB)": 147.13, + "step": 43360, + "train_speed(iter/s)": 0.201516 + }, + { + "acc": 0.76374173, + "epoch": 1.0118549410169293, + "grad_norm": 7.96875, + "learning_rate": 5.146762891567084e-06, + "loss": 0.88345985, + "memory(GiB)": 147.13, + "step": 43370, + "train_speed(iter/s)": 0.20154 + }, + { + "acc": 0.79155569, + "epoch": 1.0120882485892182, + "grad_norm": 5.8125, + "learning_rate": 5.144874583479034e-06, + "loss": 0.74518204, + "memory(GiB)": 147.13, + "step": 43380, + "train_speed(iter/s)": 0.201564 + }, + { + "acc": 0.77843351, + "epoch": 1.012321556161507, + "grad_norm": 5.59375, + "learning_rate": 5.142986254710177e-06, + "loss": 0.78075495, + "memory(GiB)": 147.13, + "step": 43390, + "train_speed(iter/s)": 0.201587 + }, + { + "acc": 0.77403011, + "epoch": 1.012554863733796, + "grad_norm": 5.9375, + "learning_rate": 5.141097905530077e-06, + "loss": 0.80180321, + "memory(GiB)": 147.13, + "step": 43400, + "train_speed(iter/s)": 0.201608 + }, + { + "acc": 0.79151011, + "epoch": 1.012788171306085, + "grad_norm": 5.53125, + "learning_rate": 5.139209536208289e-06, + "loss": 0.74089231, + "memory(GiB)": 147.13, + "step": 43410, + "train_speed(iter/s)": 0.20163 + }, + { + "acc": 0.76592178, + "epoch": 1.0130214788783738, + "grad_norm": 8.125, + "learning_rate": 5.1373211470143814e-06, + "loss": 0.83816414, + "memory(GiB)": 147.13, + "step": 43420, + "train_speed(iter/s)": 0.201653 + }, + { + "acc": 0.76648312, + "epoch": 1.0132547864506627, + "grad_norm": 5.625, + "learning_rate": 5.13543273821792e-06, + "loss": 0.85734253, + "memory(GiB)": 147.13, + "step": 43430, + "train_speed(iter/s)": 0.201676 + }, + { + "acc": 0.76693745, + "epoch": 1.0134880940229516, + "grad_norm": 4.875, + "learning_rate": 5.133544310088474e-06, + "loss": 0.84319305, + "memory(GiB)": 147.13, + "step": 43440, + "train_speed(iter/s)": 0.2017 + }, + { + "acc": 0.78325624, + "epoch": 1.0137214015952405, + "grad_norm": 5.4375, + "learning_rate": 5.131655862895617e-06, + "loss": 0.77048082, + "memory(GiB)": 147.13, + "step": 43450, + "train_speed(iter/s)": 0.201723 + }, + { + "acc": 0.79426832, + "epoch": 1.0139547091675294, + "grad_norm": 4.09375, + "learning_rate": 5.129767396908923e-06, + "loss": 0.73330755, + "memory(GiB)": 147.13, + "step": 43460, + "train_speed(iter/s)": 0.201747 + }, + { + "acc": 0.78500352, + "epoch": 1.0141880167398183, + "grad_norm": 4.5625, + "learning_rate": 5.1278789123979736e-06, + "loss": 0.76361828, + "memory(GiB)": 147.13, + "step": 43470, + "train_speed(iter/s)": 0.201772 + }, + { + "acc": 0.77640676, + "epoch": 1.0144213243121072, + "grad_norm": 5.6875, + "learning_rate": 5.125990409632344e-06, + "loss": 0.82779522, + "memory(GiB)": 147.13, + "step": 43480, + "train_speed(iter/s)": 0.201796 + }, + { + "acc": 0.76425214, + "epoch": 1.014654631884396, + "grad_norm": 4.3125, + "learning_rate": 5.1241018888816205e-06, + "loss": 0.862677, + "memory(GiB)": 147.13, + "step": 43490, + "train_speed(iter/s)": 0.20182 + }, + { + "acc": 0.78395214, + "epoch": 1.014887939456685, + "grad_norm": 6.28125, + "learning_rate": 5.122213350415389e-06, + "loss": 0.76862893, + "memory(GiB)": 147.13, + "step": 43500, + "train_speed(iter/s)": 0.201844 + }, + { + "epoch": 1.014887939456685, + "eval_acc": 0.743926970386609, + "eval_loss": 0.8070799708366394, + "eval_runtime": 1270.2738, + "eval_samples_per_second": 28.333, + "eval_steps_per_second": 14.167, + "step": 43500 + }, + { + "acc": 0.77859073, + "epoch": 1.0151212470289739, + "grad_norm": 5.625, + "learning_rate": 5.1203247945032365e-06, + "loss": 0.78678293, + "memory(GiB)": 147.13, + "step": 43510, + "train_speed(iter/s)": 0.200669 + }, + { + "acc": 0.77705793, + "epoch": 1.0153545546012628, + "grad_norm": 6.6875, + "learning_rate": 5.118436221414754e-06, + "loss": 0.80260468, + "memory(GiB)": 147.13, + "step": 43520, + "train_speed(iter/s)": 0.200692 + }, + { + "acc": 0.76431408, + "epoch": 1.0155878621735517, + "grad_norm": 7.75, + "learning_rate": 5.116547631419536e-06, + "loss": 0.85512829, + "memory(GiB)": 147.13, + "step": 43530, + "train_speed(iter/s)": 0.200717 + }, + { + "acc": 0.7825469, + "epoch": 1.0158211697458406, + "grad_norm": 5.0625, + "learning_rate": 5.114659024787179e-06, + "loss": 0.76410046, + "memory(GiB)": 147.13, + "step": 43540, + "train_speed(iter/s)": 0.200742 + }, + { + "acc": 0.76927857, + "epoch": 1.0160544773181295, + "grad_norm": 5.875, + "learning_rate": 5.112770401787278e-06, + "loss": 0.81999454, + "memory(GiB)": 147.13, + "step": 43550, + "train_speed(iter/s)": 0.200765 + }, + { + "acc": 0.76841311, + "epoch": 1.0162877848904184, + "grad_norm": 5.0625, + "learning_rate": 5.110881762689435e-06, + "loss": 0.84414501, + "memory(GiB)": 147.13, + "step": 43560, + "train_speed(iter/s)": 0.20079 + }, + { + "acc": 0.77150702, + "epoch": 1.0165210924627073, + "grad_norm": 4.28125, + "learning_rate": 5.1089931077632514e-06, + "loss": 0.81745987, + "memory(GiB)": 147.13, + "step": 43570, + "train_speed(iter/s)": 0.200814 + }, + { + "acc": 0.7604578, + "epoch": 1.0167544000349962, + "grad_norm": 7.5, + "learning_rate": 5.1071044372783355e-06, + "loss": 0.86848364, + "memory(GiB)": 147.13, + "step": 43580, + "train_speed(iter/s)": 0.200837 + }, + { + "acc": 0.77862034, + "epoch": 1.016987707607285, + "grad_norm": 6.6875, + "learning_rate": 5.10521575150429e-06, + "loss": 0.80064793, + "memory(GiB)": 147.13, + "step": 43590, + "train_speed(iter/s)": 0.20086 + }, + { + "acc": 0.77443581, + "epoch": 1.017221015179574, + "grad_norm": 5.75, + "learning_rate": 5.103327050710729e-06, + "loss": 0.83215542, + "memory(GiB)": 147.13, + "step": 43600, + "train_speed(iter/s)": 0.200884 + }, + { + "acc": 0.7929235, + "epoch": 1.0174543227518629, + "grad_norm": 6.40625, + "learning_rate": 5.10143833516726e-06, + "loss": 0.74127417, + "memory(GiB)": 147.13, + "step": 43610, + "train_speed(iter/s)": 0.200907 + }, + { + "acc": 0.79245968, + "epoch": 1.0176876303241518, + "grad_norm": 6.25, + "learning_rate": 5.099549605143499e-06, + "loss": 0.75981627, + "memory(GiB)": 147.13, + "step": 43620, + "train_speed(iter/s)": 0.200931 + }, + { + "acc": 0.79105101, + "epoch": 1.0179209378964407, + "grad_norm": 5.4375, + "learning_rate": 5.0976608609090606e-06, + "loss": 0.74999475, + "memory(GiB)": 147.13, + "step": 43630, + "train_speed(iter/s)": 0.200955 + }, + { + "acc": 0.77970495, + "epoch": 1.0181542454687296, + "grad_norm": 6.1875, + "learning_rate": 5.095772102733561e-06, + "loss": 0.77877584, + "memory(GiB)": 147.13, + "step": 43640, + "train_speed(iter/s)": 0.200979 + }, + { + "acc": 0.78451672, + "epoch": 1.0183875530410185, + "grad_norm": 5.4375, + "learning_rate": 5.093883330886623e-06, + "loss": 0.75853043, + "memory(GiB)": 147.13, + "step": 43650, + "train_speed(iter/s)": 0.201002 + }, + { + "acc": 0.79117913, + "epoch": 1.0186208606133074, + "grad_norm": 7.15625, + "learning_rate": 5.091994545637867e-06, + "loss": 0.74103169, + "memory(GiB)": 147.13, + "step": 43660, + "train_speed(iter/s)": 0.201026 + }, + { + "acc": 0.79023709, + "epoch": 1.0188541681855963, + "grad_norm": 5.75, + "learning_rate": 5.090105747256916e-06, + "loss": 0.7264926, + "memory(GiB)": 147.13, + "step": 43670, + "train_speed(iter/s)": 0.201049 + }, + { + "acc": 0.78450155, + "epoch": 1.0190874757578852, + "grad_norm": 4.6875, + "learning_rate": 5.088216936013396e-06, + "loss": 0.77232718, + "memory(GiB)": 147.13, + "step": 43680, + "train_speed(iter/s)": 0.201071 + }, + { + "acc": 0.77247133, + "epoch": 1.019320783330174, + "grad_norm": 5.84375, + "learning_rate": 5.086328112176934e-06, + "loss": 0.82542839, + "memory(GiB)": 147.13, + "step": 43690, + "train_speed(iter/s)": 0.201094 + }, + { + "acc": 0.77573242, + "epoch": 1.019554090902463, + "grad_norm": 5.3125, + "learning_rate": 5.084439276017159e-06, + "loss": 0.81169853, + "memory(GiB)": 147.13, + "step": 43700, + "train_speed(iter/s)": 0.201118 + }, + { + "acc": 0.79919705, + "epoch": 1.0197873984747516, + "grad_norm": 4.34375, + "learning_rate": 5.082550427803702e-06, + "loss": 0.72000685, + "memory(GiB)": 147.13, + "step": 43710, + "train_speed(iter/s)": 0.201141 + }, + { + "acc": 0.79065461, + "epoch": 1.0200207060470405, + "grad_norm": 10.6875, + "learning_rate": 5.080661567806195e-06, + "loss": 0.746523, + "memory(GiB)": 147.13, + "step": 43720, + "train_speed(iter/s)": 0.201163 + }, + { + "acc": 0.77126436, + "epoch": 1.0202540136193294, + "grad_norm": 5.0625, + "learning_rate": 5.078772696294273e-06, + "loss": 0.81962471, + "memory(GiB)": 147.13, + "step": 43730, + "train_speed(iter/s)": 0.201186 + }, + { + "acc": 0.78477058, + "epoch": 1.0204873211916183, + "grad_norm": 6.4375, + "learning_rate": 5.076883813537571e-06, + "loss": 0.77737856, + "memory(GiB)": 147.13, + "step": 43740, + "train_speed(iter/s)": 0.201211 + }, + { + "acc": 0.78322659, + "epoch": 1.0207206287639072, + "grad_norm": 6.875, + "learning_rate": 5.074994919805727e-06, + "loss": 0.75895319, + "memory(GiB)": 147.13, + "step": 43750, + "train_speed(iter/s)": 0.201235 + }, + { + "acc": 0.77741756, + "epoch": 1.0209539363361961, + "grad_norm": 5.28125, + "learning_rate": 5.073106015368381e-06, + "loss": 0.81184292, + "memory(GiB)": 147.13, + "step": 43760, + "train_speed(iter/s)": 0.201259 + }, + { + "acc": 0.78123074, + "epoch": 1.021187243908485, + "grad_norm": 5.65625, + "learning_rate": 5.071217100495172e-06, + "loss": 0.78832426, + "memory(GiB)": 147.13, + "step": 43770, + "train_speed(iter/s)": 0.201283 + }, + { + "acc": 0.76711731, + "epoch": 1.021420551480774, + "grad_norm": 7.5, + "learning_rate": 5.069328175455742e-06, + "loss": 0.82933445, + "memory(GiB)": 147.13, + "step": 43780, + "train_speed(iter/s)": 0.201307 + }, + { + "acc": 0.7838244, + "epoch": 1.0216538590530628, + "grad_norm": 5.3125, + "learning_rate": 5.067439240519735e-06, + "loss": 0.78824844, + "memory(GiB)": 147.13, + "step": 43790, + "train_speed(iter/s)": 0.201331 + }, + { + "acc": 0.77966681, + "epoch": 1.0218871666253517, + "grad_norm": 4.8125, + "learning_rate": 5.065550295956796e-06, + "loss": 0.78713975, + "memory(GiB)": 147.13, + "step": 43800, + "train_speed(iter/s)": 0.201355 + }, + { + "acc": 0.78061857, + "epoch": 1.0221204741976406, + "grad_norm": 5.25, + "learning_rate": 5.063661342036571e-06, + "loss": 0.76254168, + "memory(GiB)": 147.13, + "step": 43810, + "train_speed(iter/s)": 0.201379 + }, + { + "acc": 0.76380448, + "epoch": 1.0223537817699295, + "grad_norm": 5.84375, + "learning_rate": 5.061772379028709e-06, + "loss": 0.87199421, + "memory(GiB)": 147.13, + "step": 43820, + "train_speed(iter/s)": 0.201402 + }, + { + "acc": 0.79298944, + "epoch": 1.0225870893422184, + "grad_norm": 8.4375, + "learning_rate": 5.059883407202858e-06, + "loss": 0.7629261, + "memory(GiB)": 147.13, + "step": 43830, + "train_speed(iter/s)": 0.201426 + }, + { + "acc": 0.78520899, + "epoch": 1.0228203969145073, + "grad_norm": 7.0625, + "learning_rate": 5.057994426828669e-06, + "loss": 0.76913924, + "memory(GiB)": 147.13, + "step": 43840, + "train_speed(iter/s)": 0.20145 + }, + { + "acc": 0.76806917, + "epoch": 1.0230537044867962, + "grad_norm": 6.25, + "learning_rate": 5.05610543817579e-06, + "loss": 0.8473484, + "memory(GiB)": 147.13, + "step": 43850, + "train_speed(iter/s)": 0.201474 + }, + { + "acc": 0.79321308, + "epoch": 1.0232870120590851, + "grad_norm": 3.625, + "learning_rate": 5.054216441513876e-06, + "loss": 0.7350997, + "memory(GiB)": 147.13, + "step": 43860, + "train_speed(iter/s)": 0.201496 + }, + { + "acc": 0.77993116, + "epoch": 1.023520319631374, + "grad_norm": 4.59375, + "learning_rate": 5.052327437112582e-06, + "loss": 0.77887244, + "memory(GiB)": 147.13, + "step": 43870, + "train_speed(iter/s)": 0.20152 + }, + { + "acc": 0.7738163, + "epoch": 1.023753627203663, + "grad_norm": 6.71875, + "learning_rate": 5.050438425241562e-06, + "loss": 0.82020864, + "memory(GiB)": 147.13, + "step": 43880, + "train_speed(iter/s)": 0.201543 + }, + { + "acc": 0.78634443, + "epoch": 1.0239869347759518, + "grad_norm": 5.8125, + "learning_rate": 5.0485494061704695e-06, + "loss": 0.76208029, + "memory(GiB)": 147.13, + "step": 43890, + "train_speed(iter/s)": 0.201566 + }, + { + "acc": 0.76844702, + "epoch": 1.0242202423482407, + "grad_norm": 6.96875, + "learning_rate": 5.0466603801689655e-06, + "loss": 0.82169828, + "memory(GiB)": 147.13, + "step": 43900, + "train_speed(iter/s)": 0.201589 + }, + { + "acc": 0.78849974, + "epoch": 1.0244535499205296, + "grad_norm": 4.34375, + "learning_rate": 5.044771347506705e-06, + "loss": 0.78166656, + "memory(GiB)": 147.13, + "step": 43910, + "train_speed(iter/s)": 0.201611 + }, + { + "acc": 0.79453068, + "epoch": 1.0246868574928185, + "grad_norm": 4.71875, + "learning_rate": 5.0428823084533475e-06, + "loss": 0.72507105, + "memory(GiB)": 147.13, + "step": 43920, + "train_speed(iter/s)": 0.201635 + }, + { + "acc": 0.77169046, + "epoch": 1.0249201650651074, + "grad_norm": 5.9375, + "learning_rate": 5.040993263278552e-06, + "loss": 0.83655701, + "memory(GiB)": 147.13, + "step": 43930, + "train_speed(iter/s)": 0.201657 + }, + { + "acc": 0.77949696, + "epoch": 1.0251534726373963, + "grad_norm": 6.3125, + "learning_rate": 5.0391042122519815e-06, + "loss": 0.81380482, + "memory(GiB)": 147.13, + "step": 43940, + "train_speed(iter/s)": 0.201681 + }, + { + "acc": 0.79480991, + "epoch": 1.0253867802096852, + "grad_norm": 5.375, + "learning_rate": 5.037215155643296e-06, + "loss": 0.74450397, + "memory(GiB)": 147.13, + "step": 43950, + "train_speed(iter/s)": 0.201704 + }, + { + "acc": 0.79364281, + "epoch": 1.025620087781974, + "grad_norm": 5.71875, + "learning_rate": 5.035326093722157e-06, + "loss": 0.73111277, + "memory(GiB)": 147.13, + "step": 43960, + "train_speed(iter/s)": 0.201728 + }, + { + "acc": 0.77582326, + "epoch": 1.025853395354263, + "grad_norm": 6.65625, + "learning_rate": 5.033437026758228e-06, + "loss": 0.83863983, + "memory(GiB)": 147.13, + "step": 43970, + "train_speed(iter/s)": 0.201752 + }, + { + "acc": 0.78823829, + "epoch": 1.026086702926552, + "grad_norm": 6.75, + "learning_rate": 5.0315479550211746e-06, + "loss": 0.75079851, + "memory(GiB)": 147.13, + "step": 43980, + "train_speed(iter/s)": 0.201776 + }, + { + "acc": 0.7844943, + "epoch": 1.0263200104988408, + "grad_norm": 6.3125, + "learning_rate": 5.029658878780659e-06, + "loss": 0.77733545, + "memory(GiB)": 147.13, + "step": 43990, + "train_speed(iter/s)": 0.201798 + }, + { + "acc": 0.78340597, + "epoch": 1.0265533180711297, + "grad_norm": 4.6875, + "learning_rate": 5.0277697983063476e-06, + "loss": 0.76800084, + "memory(GiB)": 147.13, + "step": 44000, + "train_speed(iter/s)": 0.201823 + }, + { + "epoch": 1.0265533180711297, + "eval_acc": 0.7438331258664804, + "eval_loss": 0.8070977330207825, + "eval_runtime": 1271.2669, + "eval_samples_per_second": 28.311, + "eval_steps_per_second": 14.156, + "step": 44000 + }, + { + "acc": 0.77107697, + "epoch": 1.0267866256434186, + "grad_norm": 7.34375, + "learning_rate": 5.025880713867904e-06, + "loss": 0.85529652, + "memory(GiB)": 147.13, + "step": 44010, + "train_speed(iter/s)": 0.20066 + }, + { + "acc": 0.78703861, + "epoch": 1.0270199332157075, + "grad_norm": 5.375, + "learning_rate": 5.023991625734998e-06, + "loss": 0.76970444, + "memory(GiB)": 147.13, + "step": 44020, + "train_speed(iter/s)": 0.200683 + }, + { + "acc": 0.76558523, + "epoch": 1.0272532407879964, + "grad_norm": 4.15625, + "learning_rate": 5.022102534177293e-06, + "loss": 0.83571815, + "memory(GiB)": 147.13, + "step": 44030, + "train_speed(iter/s)": 0.200707 + }, + { + "acc": 0.77779908, + "epoch": 1.0274865483602853, + "grad_norm": 5.03125, + "learning_rate": 5.020213439464458e-06, + "loss": 0.78558512, + "memory(GiB)": 147.13, + "step": 44040, + "train_speed(iter/s)": 0.200728 + }, + { + "acc": 0.79692812, + "epoch": 1.0277198559325742, + "grad_norm": 6.3125, + "learning_rate": 5.018324341866161e-06, + "loss": 0.71697149, + "memory(GiB)": 147.13, + "step": 44050, + "train_speed(iter/s)": 0.200751 + }, + { + "acc": 0.80139132, + "epoch": 1.027953163504863, + "grad_norm": 5.9375, + "learning_rate": 5.01643524165207e-06, + "loss": 0.68105035, + "memory(GiB)": 147.13, + "step": 44060, + "train_speed(iter/s)": 0.200773 + }, + { + "acc": 0.77024827, + "epoch": 1.028186471077152, + "grad_norm": 6.34375, + "learning_rate": 5.014546139091851e-06, + "loss": 0.83891077, + "memory(GiB)": 147.13, + "step": 44070, + "train_speed(iter/s)": 0.200796 + }, + { + "acc": 0.78477793, + "epoch": 1.028419778649441, + "grad_norm": 4.0, + "learning_rate": 5.012657034455176e-06, + "loss": 0.80206022, + "memory(GiB)": 147.13, + "step": 44080, + "train_speed(iter/s)": 0.200818 + }, + { + "acc": 0.79112525, + "epoch": 1.0286530862217296, + "grad_norm": 6.1875, + "learning_rate": 5.010767928011713e-06, + "loss": 0.7436903, + "memory(GiB)": 147.13, + "step": 44090, + "train_speed(iter/s)": 0.200842 + }, + { + "acc": 0.78853774, + "epoch": 1.0288863937940185, + "grad_norm": 5.28125, + "learning_rate": 5.008878820031131e-06, + "loss": 0.76734447, + "memory(GiB)": 147.13, + "step": 44100, + "train_speed(iter/s)": 0.200865 + }, + { + "acc": 0.78617477, + "epoch": 1.0291197013663074, + "grad_norm": 6.09375, + "learning_rate": 5.006989710783101e-06, + "loss": 0.75627079, + "memory(GiB)": 147.13, + "step": 44110, + "train_speed(iter/s)": 0.20089 + }, + { + "acc": 0.78956699, + "epoch": 1.0293530089385963, + "grad_norm": 6.1875, + "learning_rate": 5.005100600537292e-06, + "loss": 0.74087925, + "memory(GiB)": 147.13, + "step": 44120, + "train_speed(iter/s)": 0.200913 + }, + { + "acc": 0.77857628, + "epoch": 1.0295863165108852, + "grad_norm": 5.25, + "learning_rate": 5.003211489563373e-06, + "loss": 0.80192738, + "memory(GiB)": 147.13, + "step": 44130, + "train_speed(iter/s)": 0.200937 + }, + { + "acc": 0.79740629, + "epoch": 1.029819624083174, + "grad_norm": 7.25, + "learning_rate": 5.001322378131015e-06, + "loss": 0.72210054, + "memory(GiB)": 147.13, + "step": 44140, + "train_speed(iter/s)": 0.200958 + }, + { + "acc": 0.79078064, + "epoch": 1.030052931655463, + "grad_norm": 5.0625, + "learning_rate": 4.9994332665098885e-06, + "loss": 0.74621177, + "memory(GiB)": 147.13, + "step": 44150, + "train_speed(iter/s)": 0.200981 + }, + { + "acc": 0.7691617, + "epoch": 1.0302862392277519, + "grad_norm": 4.96875, + "learning_rate": 4.997544154969661e-06, + "loss": 0.84756422, + "memory(GiB)": 147.13, + "step": 44160, + "train_speed(iter/s)": 0.201006 + }, + { + "acc": 0.78140469, + "epoch": 1.0305195468000408, + "grad_norm": 4.9375, + "learning_rate": 4.995655043780006e-06, + "loss": 0.80409288, + "memory(GiB)": 147.13, + "step": 44170, + "train_speed(iter/s)": 0.201029 + }, + { + "acc": 0.78131447, + "epoch": 1.0307528543723297, + "grad_norm": 7.96875, + "learning_rate": 4.993765933210592e-06, + "loss": 0.79957089, + "memory(GiB)": 147.13, + "step": 44180, + "train_speed(iter/s)": 0.201052 + }, + { + "acc": 0.78027859, + "epoch": 1.0309861619446186, + "grad_norm": 11.5625, + "learning_rate": 4.991876823531089e-06, + "loss": 0.79265738, + "memory(GiB)": 147.13, + "step": 44190, + "train_speed(iter/s)": 0.201075 + }, + { + "acc": 0.77979045, + "epoch": 1.0312194695169075, + "grad_norm": 10.0625, + "learning_rate": 4.989987715011168e-06, + "loss": 0.80041943, + "memory(GiB)": 147.13, + "step": 44200, + "train_speed(iter/s)": 0.201097 + }, + { + "acc": 0.77964592, + "epoch": 1.0314527770891964, + "grad_norm": 4.09375, + "learning_rate": 4.988098607920497e-06, + "loss": 0.79669185, + "memory(GiB)": 147.13, + "step": 44210, + "train_speed(iter/s)": 0.201122 + }, + { + "acc": 0.76723943, + "epoch": 1.0316860846614853, + "grad_norm": 6.15625, + "learning_rate": 4.986209502528746e-06, + "loss": 0.83204918, + "memory(GiB)": 147.13, + "step": 44220, + "train_speed(iter/s)": 0.201146 + }, + { + "acc": 0.782441, + "epoch": 1.0319193922337742, + "grad_norm": 4.09375, + "learning_rate": 4.984320399105585e-06, + "loss": 0.77679338, + "memory(GiB)": 147.13, + "step": 44230, + "train_speed(iter/s)": 0.201168 + }, + { + "acc": 0.78049326, + "epoch": 1.032152699806063, + "grad_norm": 3.6875, + "learning_rate": 4.982431297920682e-06, + "loss": 0.77765503, + "memory(GiB)": 147.13, + "step": 44240, + "train_speed(iter/s)": 0.201191 + }, + { + "acc": 0.79742098, + "epoch": 1.032386007378352, + "grad_norm": 16.75, + "learning_rate": 4.980542199243709e-06, + "loss": 0.72890673, + "memory(GiB)": 147.13, + "step": 44250, + "train_speed(iter/s)": 0.201214 + }, + { + "acc": 0.7838129, + "epoch": 1.0326193149506409, + "grad_norm": 5.90625, + "learning_rate": 4.978653103344328e-06, + "loss": 0.79265904, + "memory(GiB)": 147.13, + "step": 44260, + "train_speed(iter/s)": 0.201238 + }, + { + "acc": 0.78945446, + "epoch": 1.0328526225229298, + "grad_norm": 6.09375, + "learning_rate": 4.976764010492211e-06, + "loss": 0.7581913, + "memory(GiB)": 147.13, + "step": 44270, + "train_speed(iter/s)": 0.201261 + }, + { + "acc": 0.77482014, + "epoch": 1.0330859300952187, + "grad_norm": 7.21875, + "learning_rate": 4.974874920957025e-06, + "loss": 0.81373167, + "memory(GiB)": 147.13, + "step": 44280, + "train_speed(iter/s)": 0.201284 + }, + { + "acc": 0.79341946, + "epoch": 1.0333192376675076, + "grad_norm": 7.125, + "learning_rate": 4.972985835008437e-06, + "loss": 0.75202971, + "memory(GiB)": 147.13, + "step": 44290, + "train_speed(iter/s)": 0.201306 + }, + { + "acc": 0.79722977, + "epoch": 1.0335525452397964, + "grad_norm": 5.0625, + "learning_rate": 4.971096752916113e-06, + "loss": 0.73516812, + "memory(GiB)": 147.13, + "step": 44300, + "train_speed(iter/s)": 0.201329 + }, + { + "acc": 0.79803047, + "epoch": 1.0337858528120853, + "grad_norm": 8.5, + "learning_rate": 4.969207674949719e-06, + "loss": 0.71913786, + "memory(GiB)": 147.13, + "step": 44310, + "train_speed(iter/s)": 0.201351 + }, + { + "acc": 0.77934179, + "epoch": 1.0340191603843742, + "grad_norm": 6.28125, + "learning_rate": 4.96731860137892e-06, + "loss": 0.8005085, + "memory(GiB)": 147.13, + "step": 44320, + "train_speed(iter/s)": 0.201374 + }, + { + "acc": 0.76904945, + "epoch": 1.0342524679566631, + "grad_norm": 5.71875, + "learning_rate": 4.965429532473383e-06, + "loss": 0.84740162, + "memory(GiB)": 147.13, + "step": 44330, + "train_speed(iter/s)": 0.201398 + }, + { + "acc": 0.78627586, + "epoch": 1.034485775528952, + "grad_norm": 7.21875, + "learning_rate": 4.963540468502768e-06, + "loss": 0.75449095, + "memory(GiB)": 147.13, + "step": 44340, + "train_speed(iter/s)": 0.201421 + }, + { + "acc": 0.7668519, + "epoch": 1.034719083101241, + "grad_norm": 6.375, + "learning_rate": 4.961651409736741e-06, + "loss": 0.82782583, + "memory(GiB)": 147.13, + "step": 44350, + "train_speed(iter/s)": 0.201445 + }, + { + "acc": 0.77557964, + "epoch": 1.0349523906735298, + "grad_norm": 6.5, + "learning_rate": 4.959762356444964e-06, + "loss": 0.81699305, + "memory(GiB)": 147.13, + "step": 44360, + "train_speed(iter/s)": 0.201469 + }, + { + "acc": 0.77146072, + "epoch": 1.0351856982458187, + "grad_norm": 6.84375, + "learning_rate": 4.957873308897102e-06, + "loss": 0.83753395, + "memory(GiB)": 147.13, + "step": 44370, + "train_speed(iter/s)": 0.201493 + }, + { + "acc": 0.77347927, + "epoch": 1.0354190058181076, + "grad_norm": 4.1875, + "learning_rate": 4.95598426736281e-06, + "loss": 0.82432508, + "memory(GiB)": 147.13, + "step": 44380, + "train_speed(iter/s)": 0.201518 + }, + { + "acc": 0.78271379, + "epoch": 1.0356523133903965, + "grad_norm": 4.5, + "learning_rate": 4.954095232111751e-06, + "loss": 0.80724306, + "memory(GiB)": 147.13, + "step": 44390, + "train_speed(iter/s)": 0.201542 + }, + { + "acc": 0.78393545, + "epoch": 1.0358856209626854, + "grad_norm": 7.78125, + "learning_rate": 4.9522062034135845e-06, + "loss": 0.76437645, + "memory(GiB)": 147.13, + "step": 44400, + "train_speed(iter/s)": 0.201561 + }, + { + "acc": 0.77311645, + "epoch": 1.0361189285349743, + "grad_norm": 5.6875, + "learning_rate": 4.9503171815379695e-06, + "loss": 0.82633915, + "memory(GiB)": 147.13, + "step": 44410, + "train_speed(iter/s)": 0.201584 + }, + { + "acc": 0.78500485, + "epoch": 1.0363522361072632, + "grad_norm": 5.65625, + "learning_rate": 4.948428166754561e-06, + "loss": 0.76627178, + "memory(GiB)": 147.13, + "step": 44420, + "train_speed(iter/s)": 0.201605 + }, + { + "acc": 0.77328243, + "epoch": 1.0365855436795521, + "grad_norm": 4.15625, + "learning_rate": 4.946539159333017e-06, + "loss": 0.81605453, + "memory(GiB)": 147.13, + "step": 44430, + "train_speed(iter/s)": 0.201628 + }, + { + "acc": 0.78427544, + "epoch": 1.036818851251841, + "grad_norm": 4.25, + "learning_rate": 4.944650159542993e-06, + "loss": 0.76791339, + "memory(GiB)": 147.13, + "step": 44440, + "train_speed(iter/s)": 0.201651 + }, + { + "acc": 0.77760768, + "epoch": 1.03705215882413, + "grad_norm": 5.4375, + "learning_rate": 4.942761167654142e-06, + "loss": 0.7886517, + "memory(GiB)": 147.13, + "step": 44450, + "train_speed(iter/s)": 0.201675 + }, + { + "acc": 0.7762989, + "epoch": 1.0372854663964188, + "grad_norm": 6.46875, + "learning_rate": 4.940872183936118e-06, + "loss": 0.82030087, + "memory(GiB)": 147.13, + "step": 44460, + "train_speed(iter/s)": 0.201699 + }, + { + "acc": 0.77942324, + "epoch": 1.0375187739687077, + "grad_norm": 5.625, + "learning_rate": 4.938983208658574e-06, + "loss": 0.7806747, + "memory(GiB)": 147.13, + "step": 44470, + "train_speed(iter/s)": 0.201721 + }, + { + "acc": 0.76620932, + "epoch": 1.0377520815409964, + "grad_norm": 4.5, + "learning_rate": 4.937094242091158e-06, + "loss": 0.83421717, + "memory(GiB)": 147.13, + "step": 44480, + "train_speed(iter/s)": 0.201745 + }, + { + "acc": 0.79280834, + "epoch": 1.0379853891132853, + "grad_norm": 5.53125, + "learning_rate": 4.935205284503522e-06, + "loss": 0.74198709, + "memory(GiB)": 147.13, + "step": 44490, + "train_speed(iter/s)": 0.201768 + }, + { + "acc": 0.79166679, + "epoch": 1.0382186966855742, + "grad_norm": 7.03125, + "learning_rate": 4.933316336165311e-06, + "loss": 0.7469903, + "memory(GiB)": 147.13, + "step": 44500, + "train_speed(iter/s)": 0.201791 + }, + { + "epoch": 1.0382186966855742, + "eval_acc": 0.743963643415806, + "eval_loss": 0.8068258166313171, + "eval_runtime": 1270.5076, + "eval_samples_per_second": 28.328, + "eval_steps_per_second": 14.164, + "step": 44500 + }, + { + "acc": 0.78040266, + "epoch": 1.038452004257863, + "grad_norm": 4.4375, + "learning_rate": 4.931427397346174e-06, + "loss": 0.77377152, + "memory(GiB)": 147.13, + "step": 44510, + "train_speed(iter/s)": 0.200643 + }, + { + "acc": 0.76233931, + "epoch": 1.038685311830152, + "grad_norm": 5.5625, + "learning_rate": 4.929538468315756e-06, + "loss": 0.8748394, + "memory(GiB)": 147.13, + "step": 44520, + "train_speed(iter/s)": 0.200668 + }, + { + "acc": 0.77354889, + "epoch": 1.038918619402441, + "grad_norm": 5.25, + "learning_rate": 4.927649549343701e-06, + "loss": 0.81373053, + "memory(GiB)": 147.13, + "step": 44530, + "train_speed(iter/s)": 0.20069 + }, + { + "acc": 0.77879763, + "epoch": 1.0391519269747298, + "grad_norm": 4.9375, + "learning_rate": 4.9257606406996525e-06, + "loss": 0.78499241, + "memory(GiB)": 147.13, + "step": 44540, + "train_speed(iter/s)": 0.200714 + }, + { + "acc": 0.77786951, + "epoch": 1.0393852345470187, + "grad_norm": 4.625, + "learning_rate": 4.923871742653251e-06, + "loss": 0.80249662, + "memory(GiB)": 147.13, + "step": 44550, + "train_speed(iter/s)": 0.200736 + }, + { + "acc": 0.76491838, + "epoch": 1.0396185421193076, + "grad_norm": 5.5625, + "learning_rate": 4.921982855474136e-06, + "loss": 0.84366484, + "memory(GiB)": 147.13, + "step": 44560, + "train_speed(iter/s)": 0.20076 + }, + { + "acc": 0.80446472, + "epoch": 1.0398518496915965, + "grad_norm": 5.78125, + "learning_rate": 4.9200939794319444e-06, + "loss": 0.70642214, + "memory(GiB)": 147.13, + "step": 44570, + "train_speed(iter/s)": 0.200782 + }, + { + "acc": 0.78261833, + "epoch": 1.0400851572638854, + "grad_norm": 6.09375, + "learning_rate": 4.918205114796315e-06, + "loss": 0.7593545, + "memory(GiB)": 147.13, + "step": 44580, + "train_speed(iter/s)": 0.200805 + }, + { + "acc": 0.77353225, + "epoch": 1.0403184648361743, + "grad_norm": 6.625, + "learning_rate": 4.916316261836882e-06, + "loss": 0.84205828, + "memory(GiB)": 147.13, + "step": 44590, + "train_speed(iter/s)": 0.200829 + }, + { + "acc": 0.77843504, + "epoch": 1.0405517724084632, + "grad_norm": 8.75, + "learning_rate": 4.91442742082328e-06, + "loss": 0.79012375, + "memory(GiB)": 147.13, + "step": 44600, + "train_speed(iter/s)": 0.200852 + }, + { + "acc": 0.79202127, + "epoch": 1.040785079980752, + "grad_norm": 5.5, + "learning_rate": 4.912538592025137e-06, + "loss": 0.75029149, + "memory(GiB)": 147.13, + "step": 44610, + "train_speed(iter/s)": 0.200874 + }, + { + "acc": 0.79142704, + "epoch": 1.041018387553041, + "grad_norm": 4.90625, + "learning_rate": 4.910649775712084e-06, + "loss": 0.77472105, + "memory(GiB)": 147.13, + "step": 44620, + "train_speed(iter/s)": 0.200896 + }, + { + "acc": 0.79433427, + "epoch": 1.04125169512533, + "grad_norm": 5.4375, + "learning_rate": 4.908760972153751e-06, + "loss": 0.7298358, + "memory(GiB)": 147.13, + "step": 44630, + "train_speed(iter/s)": 0.200919 + }, + { + "acc": 0.77725754, + "epoch": 1.0414850026976188, + "grad_norm": 6.46875, + "learning_rate": 4.9068721816197615e-06, + "loss": 0.83918152, + "memory(GiB)": 147.13, + "step": 44640, + "train_speed(iter/s)": 0.200941 + }, + { + "acc": 0.79220066, + "epoch": 1.0417183102699077, + "grad_norm": 5.34375, + "learning_rate": 4.904983404379741e-06, + "loss": 0.7297925, + "memory(GiB)": 147.13, + "step": 44650, + "train_speed(iter/s)": 0.200964 + }, + { + "acc": 0.77652454, + "epoch": 1.0419516178421966, + "grad_norm": 4.75, + "learning_rate": 4.903094640703312e-06, + "loss": 0.78495092, + "memory(GiB)": 147.13, + "step": 44660, + "train_speed(iter/s)": 0.200988 + }, + { + "acc": 0.79377813, + "epoch": 1.0421849254144855, + "grad_norm": 4.9375, + "learning_rate": 4.901205890860095e-06, + "loss": 0.73209939, + "memory(GiB)": 147.13, + "step": 44670, + "train_speed(iter/s)": 0.20101 + }, + { + "acc": 0.79054551, + "epoch": 1.0424182329867744, + "grad_norm": 5.34375, + "learning_rate": 4.899317155119708e-06, + "loss": 0.7544507, + "memory(GiB)": 147.13, + "step": 44680, + "train_speed(iter/s)": 0.201033 + }, + { + "acc": 0.79657516, + "epoch": 1.0426515405590633, + "grad_norm": 4.5625, + "learning_rate": 4.89742843375177e-06, + "loss": 0.75656972, + "memory(GiB)": 147.13, + "step": 44690, + "train_speed(iter/s)": 0.201056 + }, + { + "acc": 0.78189001, + "epoch": 1.0428848481313522, + "grad_norm": 6.46875, + "learning_rate": 4.895539727025891e-06, + "loss": 0.77084103, + "memory(GiB)": 147.13, + "step": 44700, + "train_speed(iter/s)": 0.201079 + }, + { + "acc": 0.78460836, + "epoch": 1.043118155703641, + "grad_norm": 4.6875, + "learning_rate": 4.8936510352116886e-06, + "loss": 0.77186456, + "memory(GiB)": 147.13, + "step": 44710, + "train_speed(iter/s)": 0.201102 + }, + { + "acc": 0.75842123, + "epoch": 1.04335146327593, + "grad_norm": 7.78125, + "learning_rate": 4.891762358578767e-06, + "loss": 0.86604462, + "memory(GiB)": 147.13, + "step": 44720, + "train_speed(iter/s)": 0.201126 + }, + { + "acc": 0.78901138, + "epoch": 1.0435847708482189, + "grad_norm": 4.5, + "learning_rate": 4.889873697396738e-06, + "loss": 0.76355696, + "memory(GiB)": 147.13, + "step": 44730, + "train_speed(iter/s)": 0.201147 + }, + { + "acc": 0.78258648, + "epoch": 1.0438180784205078, + "grad_norm": 5.46875, + "learning_rate": 4.887985051935206e-06, + "loss": 0.77364783, + "memory(GiB)": 147.13, + "step": 44740, + "train_speed(iter/s)": 0.201171 + }, + { + "acc": 0.7676549, + "epoch": 1.0440513859927967, + "grad_norm": 5.21875, + "learning_rate": 4.8860964224637756e-06, + "loss": 0.83399296, + "memory(GiB)": 147.13, + "step": 44750, + "train_speed(iter/s)": 0.201196 + }, + { + "acc": 0.76923347, + "epoch": 1.0442846935650856, + "grad_norm": 3.890625, + "learning_rate": 4.884207809252049e-06, + "loss": 0.81878109, + "memory(GiB)": 147.13, + "step": 44760, + "train_speed(iter/s)": 0.201219 + }, + { + "acc": 0.76931305, + "epoch": 1.0445180011373745, + "grad_norm": 3.890625, + "learning_rate": 4.882319212569623e-06, + "loss": 0.84117594, + "memory(GiB)": 147.13, + "step": 44770, + "train_speed(iter/s)": 0.201242 + }, + { + "acc": 0.78014016, + "epoch": 1.0447513087096634, + "grad_norm": 5.15625, + "learning_rate": 4.880430632686096e-06, + "loss": 0.78205185, + "memory(GiB)": 147.13, + "step": 44780, + "train_speed(iter/s)": 0.201265 + }, + { + "acc": 0.7773912, + "epoch": 1.0449846162819523, + "grad_norm": 4.84375, + "learning_rate": 4.87854206987106e-06, + "loss": 0.80879612, + "memory(GiB)": 147.13, + "step": 44790, + "train_speed(iter/s)": 0.201287 + }, + { + "acc": 0.78483715, + "epoch": 1.0452179238542412, + "grad_norm": 4.5625, + "learning_rate": 4.876653524394109e-06, + "loss": 0.77698545, + "memory(GiB)": 147.13, + "step": 44800, + "train_speed(iter/s)": 0.20131 + }, + { + "acc": 0.76030092, + "epoch": 1.04545123142653, + "grad_norm": 6.0625, + "learning_rate": 4.874764996524831e-06, + "loss": 0.86837893, + "memory(GiB)": 147.13, + "step": 44810, + "train_speed(iter/s)": 0.201335 + }, + { + "acc": 0.78192863, + "epoch": 1.045684538998819, + "grad_norm": 9.875, + "learning_rate": 4.872876486532814e-06, + "loss": 0.77877698, + "memory(GiB)": 147.13, + "step": 44820, + "train_speed(iter/s)": 0.201359 + }, + { + "acc": 0.77943573, + "epoch": 1.0459178465711079, + "grad_norm": 5.1875, + "learning_rate": 4.870987994687644e-06, + "loss": 0.78068628, + "memory(GiB)": 147.13, + "step": 44830, + "train_speed(iter/s)": 0.201381 + }, + { + "acc": 0.76940956, + "epoch": 1.0461511541433968, + "grad_norm": 4.8125, + "learning_rate": 4.869099521258897e-06, + "loss": 0.83160362, + "memory(GiB)": 147.13, + "step": 44840, + "train_speed(iter/s)": 0.201403 + }, + { + "acc": 0.77414241, + "epoch": 1.0463844617156857, + "grad_norm": 6.4375, + "learning_rate": 4.867211066516157e-06, + "loss": 0.82083912, + "memory(GiB)": 147.13, + "step": 44850, + "train_speed(iter/s)": 0.201428 + }, + { + "acc": 0.80341206, + "epoch": 1.0466177692879746, + "grad_norm": 5.40625, + "learning_rate": 4.865322630728998e-06, + "loss": 0.70446091, + "memory(GiB)": 147.13, + "step": 44860, + "train_speed(iter/s)": 0.201452 + }, + { + "acc": 0.79915457, + "epoch": 1.0468510768602632, + "grad_norm": 5.59375, + "learning_rate": 4.863434214166994e-06, + "loss": 0.69466219, + "memory(GiB)": 147.13, + "step": 44870, + "train_speed(iter/s)": 0.201475 + }, + { + "acc": 0.77168198, + "epoch": 1.0470843844325521, + "grad_norm": 5.75, + "learning_rate": 4.8615458170997166e-06, + "loss": 0.82629623, + "memory(GiB)": 147.13, + "step": 44880, + "train_speed(iter/s)": 0.201499 + }, + { + "acc": 0.78714318, + "epoch": 1.047317692004841, + "grad_norm": 6.3125, + "learning_rate": 4.8596574397967324e-06, + "loss": 0.74908113, + "memory(GiB)": 147.13, + "step": 44890, + "train_speed(iter/s)": 0.201522 + }, + { + "acc": 0.79111013, + "epoch": 1.04755099957713, + "grad_norm": 6.3125, + "learning_rate": 4.857769082527609e-06, + "loss": 0.77098122, + "memory(GiB)": 147.13, + "step": 44900, + "train_speed(iter/s)": 0.201545 + }, + { + "acc": 0.75890102, + "epoch": 1.0477843071494188, + "grad_norm": 5.875, + "learning_rate": 4.855880745561909e-06, + "loss": 0.86920433, + "memory(GiB)": 147.13, + "step": 44910, + "train_speed(iter/s)": 0.201569 + }, + { + "acc": 0.78598218, + "epoch": 1.0480176147217077, + "grad_norm": 6.53125, + "learning_rate": 4.853992429169189e-06, + "loss": 0.76193895, + "memory(GiB)": 147.13, + "step": 44920, + "train_speed(iter/s)": 0.201591 + }, + { + "acc": 0.75896392, + "epoch": 1.0482509222939966, + "grad_norm": 6.5, + "learning_rate": 4.852104133619008e-06, + "loss": 0.88328629, + "memory(GiB)": 147.13, + "step": 44930, + "train_speed(iter/s)": 0.201613 + }, + { + "acc": 0.78158102, + "epoch": 1.0484842298662855, + "grad_norm": 8.125, + "learning_rate": 4.85021585918092e-06, + "loss": 0.77389903, + "memory(GiB)": 147.13, + "step": 44940, + "train_speed(iter/s)": 0.201638 + }, + { + "acc": 0.78255506, + "epoch": 1.0487175374385744, + "grad_norm": 6.21875, + "learning_rate": 4.848327606124473e-06, + "loss": 0.75741982, + "memory(GiB)": 147.13, + "step": 44950, + "train_speed(iter/s)": 0.20166 + }, + { + "acc": 0.78304787, + "epoch": 1.0489508450108633, + "grad_norm": 5.03125, + "learning_rate": 4.846439374719217e-06, + "loss": 0.79422493, + "memory(GiB)": 147.13, + "step": 44960, + "train_speed(iter/s)": 0.201682 + }, + { + "acc": 0.77571564, + "epoch": 1.0491841525831522, + "grad_norm": 4.875, + "learning_rate": 4.844551165234694e-06, + "loss": 0.80409365, + "memory(GiB)": 147.13, + "step": 44970, + "train_speed(iter/s)": 0.201706 + }, + { + "acc": 0.78164635, + "epoch": 1.0494174601554411, + "grad_norm": 4.65625, + "learning_rate": 4.842662977940448e-06, + "loss": 0.80020761, + "memory(GiB)": 147.13, + "step": 44980, + "train_speed(iter/s)": 0.201728 + }, + { + "acc": 0.78135223, + "epoch": 1.04965076772773, + "grad_norm": 5.46875, + "learning_rate": 4.8407748131060175e-06, + "loss": 0.76936512, + "memory(GiB)": 147.13, + "step": 44990, + "train_speed(iter/s)": 0.201751 + }, + { + "acc": 0.7863555, + "epoch": 1.049884075300019, + "grad_norm": 6.15625, + "learning_rate": 4.838886671000934e-06, + "loss": 0.7614399, + "memory(GiB)": 147.13, + "step": 45000, + "train_speed(iter/s)": 0.201774 + }, + { + "epoch": 1.049884075300019, + "eval_acc": 0.7440030388969522, + "eval_loss": 0.8069285750389099, + "eval_runtime": 1270.5322, + "eval_samples_per_second": 28.327, + "eval_steps_per_second": 14.164, + "step": 45000 + }, + { + "acc": 0.77712164, + "epoch": 1.0501173828723078, + "grad_norm": 6.6875, + "learning_rate": 4.8369985518947336e-06, + "loss": 0.79156408, + "memory(GiB)": 147.13, + "step": 45010, + "train_speed(iter/s)": 0.200639 + }, + { + "acc": 0.78002462, + "epoch": 1.0503506904445967, + "grad_norm": 6.0, + "learning_rate": 4.83511045605694e-06, + "loss": 0.79216776, + "memory(GiB)": 147.13, + "step": 45020, + "train_speed(iter/s)": 0.200662 + }, + { + "acc": 0.78218923, + "epoch": 1.0505839980168856, + "grad_norm": 6.40625, + "learning_rate": 4.8332223837570824e-06, + "loss": 0.7720716, + "memory(GiB)": 147.13, + "step": 45030, + "train_speed(iter/s)": 0.200681 + }, + { + "acc": 0.78232145, + "epoch": 1.0508173055891745, + "grad_norm": 6.15625, + "learning_rate": 4.831334335264682e-06, + "loss": 0.77959723, + "memory(GiB)": 147.13, + "step": 45040, + "train_speed(iter/s)": 0.200704 + }, + { + "acc": 0.78184223, + "epoch": 1.0510506131614634, + "grad_norm": 5.75, + "learning_rate": 4.829446310849256e-06, + "loss": 0.76550894, + "memory(GiB)": 147.13, + "step": 45050, + "train_speed(iter/s)": 0.200728 + }, + { + "acc": 0.78313303, + "epoch": 1.0512839207337523, + "grad_norm": 5.9375, + "learning_rate": 4.827558310780319e-06, + "loss": 0.77661057, + "memory(GiB)": 147.13, + "step": 45060, + "train_speed(iter/s)": 0.20075 + }, + { + "acc": 0.79087114, + "epoch": 1.0515172283060412, + "grad_norm": 4.40625, + "learning_rate": 4.825670335327383e-06, + "loss": 0.74372959, + "memory(GiB)": 147.13, + "step": 45070, + "train_speed(iter/s)": 0.200773 + }, + { + "acc": 0.80854607, + "epoch": 1.0517505358783301, + "grad_norm": 4.5625, + "learning_rate": 4.823782384759955e-06, + "loss": 0.68139353, + "memory(GiB)": 147.13, + "step": 45080, + "train_speed(iter/s)": 0.200796 + }, + { + "acc": 0.78581572, + "epoch": 1.051983843450619, + "grad_norm": 4.9375, + "learning_rate": 4.821894459347542e-06, + "loss": 0.7871974, + "memory(GiB)": 147.13, + "step": 45090, + "train_speed(iter/s)": 0.200818 + }, + { + "acc": 0.78187127, + "epoch": 1.052217151022908, + "grad_norm": 9.375, + "learning_rate": 4.820006559359642e-06, + "loss": 0.78979106, + "memory(GiB)": 147.13, + "step": 45100, + "train_speed(iter/s)": 0.20084 + }, + { + "acc": 0.76723819, + "epoch": 1.0524504585951968, + "grad_norm": 6.875, + "learning_rate": 4.818118685065754e-06, + "loss": 0.84743042, + "memory(GiB)": 147.13, + "step": 45110, + "train_speed(iter/s)": 0.200863 + }, + { + "acc": 0.77373695, + "epoch": 1.0526837661674857, + "grad_norm": 9.5, + "learning_rate": 4.8162308367353705e-06, + "loss": 0.81677189, + "memory(GiB)": 147.13, + "step": 45120, + "train_speed(iter/s)": 0.200887 + }, + { + "acc": 0.78724155, + "epoch": 1.0529170737397746, + "grad_norm": 5.21875, + "learning_rate": 4.814343014637982e-06, + "loss": 0.75338268, + "memory(GiB)": 147.13, + "step": 45130, + "train_speed(iter/s)": 0.200909 + }, + { + "acc": 0.77668324, + "epoch": 1.0531503813120635, + "grad_norm": 5.625, + "learning_rate": 4.812455219043074e-06, + "loss": 0.80819511, + "memory(GiB)": 147.13, + "step": 45140, + "train_speed(iter/s)": 0.200933 + }, + { + "acc": 0.77539773, + "epoch": 1.0533836888843524, + "grad_norm": 4.875, + "learning_rate": 4.810567450220128e-06, + "loss": 0.78674135, + "memory(GiB)": 147.13, + "step": 45150, + "train_speed(iter/s)": 0.200956 + }, + { + "acc": 0.7868578, + "epoch": 1.0536169964566413, + "grad_norm": 6.34375, + "learning_rate": 4.808679708438624e-06, + "loss": 0.76853762, + "memory(GiB)": 147.13, + "step": 45160, + "train_speed(iter/s)": 0.200979 + }, + { + "acc": 0.76111031, + "epoch": 1.0538503040289302, + "grad_norm": 5.375, + "learning_rate": 4.806791993968039e-06, + "loss": 0.86612825, + "memory(GiB)": 147.13, + "step": 45170, + "train_speed(iter/s)": 0.201001 + }, + { + "acc": 0.78183184, + "epoch": 1.054083611601219, + "grad_norm": 6.9375, + "learning_rate": 4.804904307077838e-06, + "loss": 0.76009259, + "memory(GiB)": 147.13, + "step": 45180, + "train_speed(iter/s)": 0.201022 + }, + { + "acc": 0.78163671, + "epoch": 1.054316919173508, + "grad_norm": 7.125, + "learning_rate": 4.80301664803749e-06, + "loss": 0.78068633, + "memory(GiB)": 147.13, + "step": 45190, + "train_speed(iter/s)": 0.201045 + }, + { + "acc": 0.78807421, + "epoch": 1.054550226745797, + "grad_norm": 5.625, + "learning_rate": 4.80112901711646e-06, + "loss": 0.75582342, + "memory(GiB)": 147.13, + "step": 45200, + "train_speed(iter/s)": 0.201068 + }, + { + "acc": 0.77289944, + "epoch": 1.0547835343180858, + "grad_norm": 6.28125, + "learning_rate": 4.799241414584204e-06, + "loss": 0.82699385, + "memory(GiB)": 147.13, + "step": 45210, + "train_speed(iter/s)": 0.201091 + }, + { + "acc": 0.78645868, + "epoch": 1.0550168418903747, + "grad_norm": 3.78125, + "learning_rate": 4.797353840710178e-06, + "loss": 0.7695097, + "memory(GiB)": 147.13, + "step": 45220, + "train_speed(iter/s)": 0.201114 + }, + { + "acc": 0.77393785, + "epoch": 1.0552501494626636, + "grad_norm": 6.125, + "learning_rate": 4.795466295763832e-06, + "loss": 0.81475391, + "memory(GiB)": 147.13, + "step": 45230, + "train_speed(iter/s)": 0.201135 + }, + { + "acc": 0.78816233, + "epoch": 1.0554834570349523, + "grad_norm": 6.1875, + "learning_rate": 4.793578780014612e-06, + "loss": 0.76425867, + "memory(GiB)": 147.13, + "step": 45240, + "train_speed(iter/s)": 0.201158 + }, + { + "acc": 0.76952739, + "epoch": 1.0557167646072414, + "grad_norm": 7.21875, + "learning_rate": 4.791691293731962e-06, + "loss": 0.81708698, + "memory(GiB)": 147.13, + "step": 45250, + "train_speed(iter/s)": 0.201181 + }, + { + "acc": 0.80263548, + "epoch": 1.05595007217953, + "grad_norm": 4.1875, + "learning_rate": 4.78980383718532e-06, + "loss": 0.68140831, + "memory(GiB)": 147.13, + "step": 45260, + "train_speed(iter/s)": 0.201206 + }, + { + "acc": 0.78784084, + "epoch": 1.056183379751819, + "grad_norm": 4.78125, + "learning_rate": 4.787916410644119e-06, + "loss": 0.75726004, + "memory(GiB)": 147.13, + "step": 45270, + "train_speed(iter/s)": 0.201229 + }, + { + "acc": 0.76332178, + "epoch": 1.0564166873241079, + "grad_norm": 5.5625, + "learning_rate": 4.786029014377789e-06, + "loss": 0.82791462, + "memory(GiB)": 147.13, + "step": 45280, + "train_speed(iter/s)": 0.201253 + }, + { + "acc": 0.77394991, + "epoch": 1.0566499948963968, + "grad_norm": 7.125, + "learning_rate": 4.784141648655756e-06, + "loss": 0.81197491, + "memory(GiB)": 147.13, + "step": 45290, + "train_speed(iter/s)": 0.201276 + }, + { + "acc": 0.76897669, + "epoch": 1.0568833024686857, + "grad_norm": 6.375, + "learning_rate": 4.782254313747438e-06, + "loss": 0.82489567, + "memory(GiB)": 147.13, + "step": 45300, + "train_speed(iter/s)": 0.201299 + }, + { + "acc": 0.78103123, + "epoch": 1.0571166100409746, + "grad_norm": 4.9375, + "learning_rate": 4.780367009922253e-06, + "loss": 0.785322, + "memory(GiB)": 147.13, + "step": 45310, + "train_speed(iter/s)": 0.201322 + }, + { + "acc": 0.76720481, + "epoch": 1.0573499176132635, + "grad_norm": 5.4375, + "learning_rate": 4.778479737449614e-06, + "loss": 0.85537357, + "memory(GiB)": 147.13, + "step": 45320, + "train_speed(iter/s)": 0.201346 + }, + { + "acc": 0.79495482, + "epoch": 1.0575832251855524, + "grad_norm": 6.28125, + "learning_rate": 4.7765924965989286e-06, + "loss": 0.72112303, + "memory(GiB)": 147.13, + "step": 45330, + "train_speed(iter/s)": 0.201369 + }, + { + "acc": 0.7869606, + "epoch": 1.0578165327578413, + "grad_norm": 6.15625, + "learning_rate": 4.7747052876396e-06, + "loss": 0.75025473, + "memory(GiB)": 147.13, + "step": 45340, + "train_speed(iter/s)": 0.201393 + }, + { + "acc": 0.77279506, + "epoch": 1.0580498403301302, + "grad_norm": 4.59375, + "learning_rate": 4.772818110841025e-06, + "loss": 0.82170506, + "memory(GiB)": 147.13, + "step": 45350, + "train_speed(iter/s)": 0.201418 + }, + { + "acc": 0.7949152, + "epoch": 1.058283147902419, + "grad_norm": 5.375, + "learning_rate": 4.7709309664726e-06, + "loss": 0.73024497, + "memory(GiB)": 147.13, + "step": 45360, + "train_speed(iter/s)": 0.201443 + }, + { + "acc": 0.79297123, + "epoch": 1.058516455474708, + "grad_norm": 6.84375, + "learning_rate": 4.769043854803712e-06, + "loss": 0.72323437, + "memory(GiB)": 147.13, + "step": 45370, + "train_speed(iter/s)": 0.201465 + }, + { + "acc": 0.77630758, + "epoch": 1.0587497630469969, + "grad_norm": 6.03125, + "learning_rate": 4.767156776103746e-06, + "loss": 0.80375509, + "memory(GiB)": 147.13, + "step": 45380, + "train_speed(iter/s)": 0.201489 + }, + { + "acc": 0.79003696, + "epoch": 1.0589830706192858, + "grad_norm": 5.09375, + "learning_rate": 4.765269730642083e-06, + "loss": 0.73524923, + "memory(GiB)": 147.13, + "step": 45390, + "train_speed(iter/s)": 0.201511 + }, + { + "acc": 0.79320269, + "epoch": 1.0592163781915747, + "grad_norm": 4.5625, + "learning_rate": 4.7633827186881e-06, + "loss": 0.7441308, + "memory(GiB)": 147.13, + "step": 45400, + "train_speed(iter/s)": 0.201535 + }, + { + "acc": 0.77364416, + "epoch": 1.0594496857638636, + "grad_norm": 5.75, + "learning_rate": 4.7614957405111635e-06, + "loss": 0.80192499, + "memory(GiB)": 147.13, + "step": 45410, + "train_speed(iter/s)": 0.201559 + }, + { + "acc": 0.78601189, + "epoch": 1.0596829933361525, + "grad_norm": 6.78125, + "learning_rate": 4.759608796380642e-06, + "loss": 0.77241449, + "memory(GiB)": 147.13, + "step": 45420, + "train_speed(iter/s)": 0.20158 + }, + { + "acc": 0.75279102, + "epoch": 1.0599163009084414, + "grad_norm": 5.5, + "learning_rate": 4.757721886565893e-06, + "loss": 0.89964409, + "memory(GiB)": 147.13, + "step": 45430, + "train_speed(iter/s)": 0.201603 + }, + { + "acc": 0.76471562, + "epoch": 1.0601496084807303, + "grad_norm": 9.5625, + "learning_rate": 4.755835011336274e-06, + "loss": 0.85386238, + "memory(GiB)": 147.13, + "step": 45440, + "train_speed(iter/s)": 0.201626 + }, + { + "acc": 0.78351641, + "epoch": 1.0603829160530192, + "grad_norm": 6.0625, + "learning_rate": 4.753948170961137e-06, + "loss": 0.76716695, + "memory(GiB)": 147.13, + "step": 45450, + "train_speed(iter/s)": 0.201649 + }, + { + "acc": 0.78080029, + "epoch": 1.060616223625308, + "grad_norm": 4.96875, + "learning_rate": 4.752061365709827e-06, + "loss": 0.76200881, + "memory(GiB)": 147.13, + "step": 45460, + "train_speed(iter/s)": 0.201672 + }, + { + "acc": 0.7751379, + "epoch": 1.060849531197597, + "grad_norm": 5.84375, + "learning_rate": 4.750174595851685e-06, + "loss": 0.79694057, + "memory(GiB)": 147.13, + "step": 45470, + "train_speed(iter/s)": 0.201697 + }, + { + "acc": 0.7780129, + "epoch": 1.0610828387698858, + "grad_norm": 4.71875, + "learning_rate": 4.748287861656047e-06, + "loss": 0.80889072, + "memory(GiB)": 147.13, + "step": 45480, + "train_speed(iter/s)": 0.20172 + }, + { + "acc": 0.78614645, + "epoch": 1.0613161463421747, + "grad_norm": 5.4375, + "learning_rate": 4.746401163392244e-06, + "loss": 0.79196577, + "memory(GiB)": 147.13, + "step": 45490, + "train_speed(iter/s)": 0.201743 + }, + { + "acc": 0.80344467, + "epoch": 1.0615494539144636, + "grad_norm": 4.78125, + "learning_rate": 4.744514501329601e-06, + "loss": 0.71334877, + "memory(GiB)": 147.13, + "step": 45500, + "train_speed(iter/s)": 0.201765 + }, + { + "epoch": 1.0615494539144636, + "eval_acc": 0.7440579683686315, + "eval_loss": 0.8067649006843567, + "eval_runtime": 1270.3852, + "eval_samples_per_second": 28.331, + "eval_steps_per_second": 14.166, + "step": 45500 + }, + { + "acc": 0.78397212, + "epoch": 1.0617827614867525, + "grad_norm": 5.28125, + "learning_rate": 4.74262787573744e-06, + "loss": 0.78342257, + "memory(GiB)": 147.13, + "step": 45510, + "train_speed(iter/s)": 0.200643 + }, + { + "acc": 0.78444176, + "epoch": 1.0620160690590414, + "grad_norm": 11.125, + "learning_rate": 4.7407412868850734e-06, + "loss": 0.77105117, + "memory(GiB)": 147.13, + "step": 45520, + "train_speed(iter/s)": 0.200666 + }, + { + "acc": 0.79020176, + "epoch": 1.0622493766313303, + "grad_norm": 5.9375, + "learning_rate": 4.738854735041813e-06, + "loss": 0.75314264, + "memory(GiB)": 147.13, + "step": 45530, + "train_speed(iter/s)": 0.200687 + }, + { + "acc": 0.7691103, + "epoch": 1.0624826842036192, + "grad_norm": 6.375, + "learning_rate": 4.736968220476963e-06, + "loss": 0.8225666, + "memory(GiB)": 147.13, + "step": 45540, + "train_speed(iter/s)": 0.200709 + }, + { + "acc": 0.77286139, + "epoch": 1.0627159917759081, + "grad_norm": 5.78125, + "learning_rate": 4.735081743459823e-06, + "loss": 0.82172585, + "memory(GiB)": 147.13, + "step": 45550, + "train_speed(iter/s)": 0.200732 + }, + { + "acc": 0.78380318, + "epoch": 1.062949299348197, + "grad_norm": 5.03125, + "learning_rate": 4.733195304259689e-06, + "loss": 0.75959673, + "memory(GiB)": 147.13, + "step": 45560, + "train_speed(iter/s)": 0.200755 + }, + { + "acc": 0.78299665, + "epoch": 1.063182606920486, + "grad_norm": 6.28125, + "learning_rate": 4.731308903145846e-06, + "loss": 0.77152033, + "memory(GiB)": 147.13, + "step": 45570, + "train_speed(iter/s)": 0.200776 + }, + { + "acc": 0.78575029, + "epoch": 1.0634159144927748, + "grad_norm": 6.75, + "learning_rate": 4.729422540387579e-06, + "loss": 0.75135193, + "memory(GiB)": 147.13, + "step": 45580, + "train_speed(iter/s)": 0.200799 + }, + { + "acc": 0.77882252, + "epoch": 1.0636492220650637, + "grad_norm": 4.59375, + "learning_rate": 4.727536216254166e-06, + "loss": 0.77502632, + "memory(GiB)": 147.13, + "step": 45590, + "train_speed(iter/s)": 0.200821 + }, + { + "acc": 0.78998165, + "epoch": 1.0638825296373526, + "grad_norm": 5.15625, + "learning_rate": 4.725649931014879e-06, + "loss": 0.75522022, + "memory(GiB)": 147.13, + "step": 45600, + "train_speed(iter/s)": 0.200844 + }, + { + "acc": 0.78265238, + "epoch": 1.0641158372096415, + "grad_norm": 6.28125, + "learning_rate": 4.723763684938985e-06, + "loss": 0.78737378, + "memory(GiB)": 147.13, + "step": 45610, + "train_speed(iter/s)": 0.200868 + }, + { + "acc": 0.78826542, + "epoch": 1.0643491447819304, + "grad_norm": 4.53125, + "learning_rate": 4.721877478295745e-06, + "loss": 0.77142153, + "memory(GiB)": 147.13, + "step": 45620, + "train_speed(iter/s)": 0.200891 + }, + { + "acc": 0.76623106, + "epoch": 1.064582452354219, + "grad_norm": 7.78125, + "learning_rate": 4.719991311354415e-06, + "loss": 0.84487782, + "memory(GiB)": 147.13, + "step": 45630, + "train_speed(iter/s)": 0.200914 + }, + { + "acc": 0.78780022, + "epoch": 1.0648157599265082, + "grad_norm": 5.65625, + "learning_rate": 4.718105184384243e-06, + "loss": 0.75018334, + "memory(GiB)": 147.13, + "step": 45640, + "train_speed(iter/s)": 0.200937 + }, + { + "acc": 0.77883091, + "epoch": 1.065049067498797, + "grad_norm": 6.3125, + "learning_rate": 4.7162190976544735e-06, + "loss": 0.80597057, + "memory(GiB)": 147.13, + "step": 45650, + "train_speed(iter/s)": 0.20096 + }, + { + "acc": 0.78427258, + "epoch": 1.0652823750710858, + "grad_norm": 7.40625, + "learning_rate": 4.7143330514343446e-06, + "loss": 0.78061914, + "memory(GiB)": 147.13, + "step": 45660, + "train_speed(iter/s)": 0.200983 + }, + { + "acc": 0.78091021, + "epoch": 1.0655156826433747, + "grad_norm": 5.53125, + "learning_rate": 4.712447045993091e-06, + "loss": 0.78055716, + "memory(GiB)": 147.13, + "step": 45670, + "train_speed(iter/s)": 0.201006 + }, + { + "acc": 0.75933809, + "epoch": 1.0657489902156636, + "grad_norm": 7.15625, + "learning_rate": 4.710561081599937e-06, + "loss": 0.8501317, + "memory(GiB)": 147.13, + "step": 45680, + "train_speed(iter/s)": 0.201028 + }, + { + "acc": 0.77189841, + "epoch": 1.0659822977879525, + "grad_norm": 4.21875, + "learning_rate": 4.708675158524105e-06, + "loss": 0.81637125, + "memory(GiB)": 147.13, + "step": 45690, + "train_speed(iter/s)": 0.201051 + }, + { + "acc": 0.78322716, + "epoch": 1.0662156053602414, + "grad_norm": 4.46875, + "learning_rate": 4.706789277034811e-06, + "loss": 0.78145242, + "memory(GiB)": 147.13, + "step": 45700, + "train_speed(iter/s)": 0.201074 + }, + { + "acc": 0.7969058, + "epoch": 1.0664489129325303, + "grad_norm": 4.96875, + "learning_rate": 4.704903437401261e-06, + "loss": 0.70635157, + "memory(GiB)": 147.13, + "step": 45710, + "train_speed(iter/s)": 0.201097 + }, + { + "acc": 0.77726212, + "epoch": 1.0666822205048192, + "grad_norm": 6.0625, + "learning_rate": 4.703017639892659e-06, + "loss": 0.77836952, + "memory(GiB)": 147.13, + "step": 45720, + "train_speed(iter/s)": 0.20112 + }, + { + "acc": 0.77621179, + "epoch": 1.066915528077108, + "grad_norm": 6.3125, + "learning_rate": 4.701131884778204e-06, + "loss": 0.80525446, + "memory(GiB)": 147.13, + "step": 45730, + "train_speed(iter/s)": 0.201144 + }, + { + "acc": 0.77437353, + "epoch": 1.067148835649397, + "grad_norm": 9.125, + "learning_rate": 4.699246172327087e-06, + "loss": 0.79244003, + "memory(GiB)": 147.13, + "step": 45740, + "train_speed(iter/s)": 0.201167 + }, + { + "acc": 0.77285023, + "epoch": 1.067382143221686, + "grad_norm": 19.625, + "learning_rate": 4.697360502808488e-06, + "loss": 0.83040085, + "memory(GiB)": 147.13, + "step": 45750, + "train_speed(iter/s)": 0.201191 + }, + { + "acc": 0.7729352, + "epoch": 1.0676154507939748, + "grad_norm": 6.96875, + "learning_rate": 4.695474876491592e-06, + "loss": 0.84784021, + "memory(GiB)": 147.13, + "step": 45760, + "train_speed(iter/s)": 0.201213 + }, + { + "acc": 0.77671528, + "epoch": 1.0678487583662637, + "grad_norm": 6.15625, + "learning_rate": 4.6935892936455664e-06, + "loss": 0.78750763, + "memory(GiB)": 147.13, + "step": 45770, + "train_speed(iter/s)": 0.201238 + }, + { + "acc": 0.77845559, + "epoch": 1.0680820659385526, + "grad_norm": 5.90625, + "learning_rate": 4.691703754539583e-06, + "loss": 0.81025581, + "memory(GiB)": 147.13, + "step": 45780, + "train_speed(iter/s)": 0.20126 + }, + { + "acc": 0.79248986, + "epoch": 1.0683153735108415, + "grad_norm": 6.1875, + "learning_rate": 4.689818259442797e-06, + "loss": 0.75349188, + "memory(GiB)": 147.13, + "step": 45790, + "train_speed(iter/s)": 0.201284 + }, + { + "acc": 0.77852826, + "epoch": 1.0685486810831304, + "grad_norm": 9.5625, + "learning_rate": 4.687932808624365e-06, + "loss": 0.79906149, + "memory(GiB)": 147.13, + "step": 45800, + "train_speed(iter/s)": 0.201307 + }, + { + "acc": 0.78552589, + "epoch": 1.0687819886554193, + "grad_norm": 4.46875, + "learning_rate": 4.686047402353433e-06, + "loss": 0.74826708, + "memory(GiB)": 147.13, + "step": 45810, + "train_speed(iter/s)": 0.201331 + }, + { + "acc": 0.7994451, + "epoch": 1.0690152962277082, + "grad_norm": 3.953125, + "learning_rate": 4.684162040899144e-06, + "loss": 0.70688953, + "memory(GiB)": 147.13, + "step": 45820, + "train_speed(iter/s)": 0.201354 + }, + { + "acc": 0.78053541, + "epoch": 1.069248603799997, + "grad_norm": 5.09375, + "learning_rate": 4.682276724530633e-06, + "loss": 0.79235182, + "memory(GiB)": 147.13, + "step": 45830, + "train_speed(iter/s)": 0.201377 + }, + { + "acc": 0.77587891, + "epoch": 1.069481911372286, + "grad_norm": 4.6875, + "learning_rate": 4.680391453517026e-06, + "loss": 0.81508999, + "memory(GiB)": 147.13, + "step": 45840, + "train_speed(iter/s)": 0.201399 + }, + { + "acc": 0.79230328, + "epoch": 1.0697152189445749, + "grad_norm": 7.84375, + "learning_rate": 4.678506228127447e-06, + "loss": 0.7369689, + "memory(GiB)": 147.13, + "step": 45850, + "train_speed(iter/s)": 0.201421 + }, + { + "acc": 0.78080015, + "epoch": 1.0699485265168638, + "grad_norm": 5.6875, + "learning_rate": 4.67662104863101e-06, + "loss": 0.80230227, + "memory(GiB)": 147.13, + "step": 45860, + "train_speed(iter/s)": 0.201444 + }, + { + "acc": 0.79010229, + "epoch": 1.0701818340891527, + "grad_norm": 5.09375, + "learning_rate": 4.674735915296824e-06, + "loss": 0.75055971, + "memory(GiB)": 147.13, + "step": 45870, + "train_speed(iter/s)": 0.201467 + }, + { + "acc": 0.75482688, + "epoch": 1.0704151416614416, + "grad_norm": 6.71875, + "learning_rate": 4.672850828393992e-06, + "loss": 0.89350119, + "memory(GiB)": 147.13, + "step": 45880, + "train_speed(iter/s)": 0.20149 + }, + { + "acc": 0.76217985, + "epoch": 1.0706484492337305, + "grad_norm": 5.5625, + "learning_rate": 4.670965788191609e-06, + "loss": 0.86493149, + "memory(GiB)": 147.13, + "step": 45890, + "train_speed(iter/s)": 0.201512 + }, + { + "acc": 0.7592453, + "epoch": 1.0708817568060194, + "grad_norm": 5.46875, + "learning_rate": 4.669080794958764e-06, + "loss": 0.85552521, + "memory(GiB)": 147.13, + "step": 45900, + "train_speed(iter/s)": 0.201532 + }, + { + "acc": 0.76834116, + "epoch": 1.0711150643783083, + "grad_norm": 4.40625, + "learning_rate": 4.6671958489645394e-06, + "loss": 0.85116701, + "memory(GiB)": 147.13, + "step": 45910, + "train_speed(iter/s)": 0.201556 + }, + { + "acc": 0.79082565, + "epoch": 1.0713483719505972, + "grad_norm": 6.0625, + "learning_rate": 4.665310950478011e-06, + "loss": 0.74479675, + "memory(GiB)": 147.13, + "step": 45920, + "train_speed(iter/s)": 0.201579 + }, + { + "acc": 0.7921772, + "epoch": 1.071581679522886, + "grad_norm": 5.5625, + "learning_rate": 4.663426099768247e-06, + "loss": 0.73003244, + "memory(GiB)": 147.13, + "step": 45930, + "train_speed(iter/s)": 0.201601 + }, + { + "acc": 0.76949821, + "epoch": 1.071814987095175, + "grad_norm": 5.375, + "learning_rate": 4.661541297104309e-06, + "loss": 0.83000374, + "memory(GiB)": 147.13, + "step": 45940, + "train_speed(iter/s)": 0.201623 + }, + { + "acc": 0.77961545, + "epoch": 1.0720482946674639, + "grad_norm": 6.34375, + "learning_rate": 4.659656542755253e-06, + "loss": 0.76818419, + "memory(GiB)": 147.13, + "step": 45950, + "train_speed(iter/s)": 0.201647 + }, + { + "acc": 0.78886552, + "epoch": 1.0722816022397528, + "grad_norm": 7.03125, + "learning_rate": 4.657771836990127e-06, + "loss": 0.73285408, + "memory(GiB)": 147.13, + "step": 45960, + "train_speed(iter/s)": 0.20167 + }, + { + "acc": 0.77267704, + "epoch": 1.0725149098120417, + "grad_norm": 5.125, + "learning_rate": 4.655887180077973e-06, + "loss": 0.80965977, + "memory(GiB)": 147.13, + "step": 45970, + "train_speed(iter/s)": 0.201693 + }, + { + "acc": 0.7777483, + "epoch": 1.0727482173843306, + "grad_norm": 5.625, + "learning_rate": 4.654002572287822e-06, + "loss": 0.79634504, + "memory(GiB)": 147.13, + "step": 45980, + "train_speed(iter/s)": 0.201714 + }, + { + "acc": 0.7671217, + "epoch": 1.0729815249566195, + "grad_norm": 5.125, + "learning_rate": 4.652118013888704e-06, + "loss": 0.84396877, + "memory(GiB)": 147.13, + "step": 45990, + "train_speed(iter/s)": 0.201736 + }, + { + "acc": 0.7591877, + "epoch": 1.0732148325289084, + "grad_norm": 6.5625, + "learning_rate": 4.650233505149639e-06, + "loss": 0.86070147, + "memory(GiB)": 147.13, + "step": 46000, + "train_speed(iter/s)": 0.201761 + }, + { + "epoch": 1.0732148325289084, + "eval_acc": 0.7440973638497776, + "eval_loss": 0.8066761493682861, + "eval_runtime": 1270.8927, + "eval_samples_per_second": 28.319, + "eval_steps_per_second": 14.16, + "step": 46000 + }, + { + "acc": 0.77020998, + "epoch": 1.0734481401011973, + "grad_norm": 5.5, + "learning_rate": 4.648349046339639e-06, + "loss": 0.83288393, + "memory(GiB)": 147.13, + "step": 46010, + "train_speed(iter/s)": 0.20065 + }, + { + "acc": 0.78606977, + "epoch": 1.073681447673486, + "grad_norm": 5.6875, + "learning_rate": 4.64646463772771e-06, + "loss": 0.76476479, + "memory(GiB)": 147.13, + "step": 46020, + "train_speed(iter/s)": 0.200672 + }, + { + "acc": 0.79527607, + "epoch": 1.0739147552457748, + "grad_norm": 9.0625, + "learning_rate": 4.6445802795828515e-06, + "loss": 0.72505322, + "memory(GiB)": 147.13, + "step": 46030, + "train_speed(iter/s)": 0.200694 + }, + { + "acc": 0.77043562, + "epoch": 1.0741480628180637, + "grad_norm": 5.59375, + "learning_rate": 4.642695972174055e-06, + "loss": 0.81447153, + "memory(GiB)": 147.13, + "step": 46040, + "train_speed(iter/s)": 0.200717 + }, + { + "acc": 0.78194885, + "epoch": 1.0743813703903526, + "grad_norm": 4.84375, + "learning_rate": 4.640811715770305e-06, + "loss": 0.78343391, + "memory(GiB)": 147.13, + "step": 46050, + "train_speed(iter/s)": 0.200739 + }, + { + "acc": 0.78372726, + "epoch": 1.0746146779626415, + "grad_norm": 4.78125, + "learning_rate": 4.638927510640578e-06, + "loss": 0.76459522, + "memory(GiB)": 147.13, + "step": 46060, + "train_speed(iter/s)": 0.200761 + }, + { + "acc": 0.79502363, + "epoch": 1.0748479855349304, + "grad_norm": 5.28125, + "learning_rate": 4.637043357053844e-06, + "loss": 0.72123518, + "memory(GiB)": 147.13, + "step": 46070, + "train_speed(iter/s)": 0.200781 + }, + { + "acc": 0.78148775, + "epoch": 1.0750812931072193, + "grad_norm": 6.90625, + "learning_rate": 4.635159255279066e-06, + "loss": 0.76372013, + "memory(GiB)": 147.13, + "step": 46080, + "train_speed(iter/s)": 0.200803 + }, + { + "acc": 0.7984148, + "epoch": 1.0753146006795082, + "grad_norm": 4.1875, + "learning_rate": 4.633275205585198e-06, + "loss": 0.69609632, + "memory(GiB)": 147.13, + "step": 46090, + "train_speed(iter/s)": 0.200823 + }, + { + "acc": 0.77495604, + "epoch": 1.0755479082517971, + "grad_norm": 6.375, + "learning_rate": 4.631391208241187e-06, + "loss": 0.80969296, + "memory(GiB)": 147.13, + "step": 46100, + "train_speed(iter/s)": 0.200846 + }, + { + "acc": 0.7904788, + "epoch": 1.075781215824086, + "grad_norm": 4.96875, + "learning_rate": 4.6295072635159744e-06, + "loss": 0.74659705, + "memory(GiB)": 147.13, + "step": 46110, + "train_speed(iter/s)": 0.200868 + }, + { + "acc": 0.77688103, + "epoch": 1.076014523396375, + "grad_norm": 4.75, + "learning_rate": 4.627623371678492e-06, + "loss": 0.8099329, + "memory(GiB)": 147.13, + "step": 46120, + "train_speed(iter/s)": 0.200891 + }, + { + "acc": 0.78068047, + "epoch": 1.0762478309686638, + "grad_norm": 4.375, + "learning_rate": 4.625739532997665e-06, + "loss": 0.7743185, + "memory(GiB)": 147.13, + "step": 46130, + "train_speed(iter/s)": 0.200914 + }, + { + "acc": 0.79984784, + "epoch": 1.0764811385409527, + "grad_norm": 5.28125, + "learning_rate": 4.623855747742412e-06, + "loss": 0.69627056, + "memory(GiB)": 147.13, + "step": 46140, + "train_speed(iter/s)": 0.200936 + }, + { + "acc": 0.77657719, + "epoch": 1.0767144461132416, + "grad_norm": 5.84375, + "learning_rate": 4.62197201618164e-06, + "loss": 0.78484745, + "memory(GiB)": 147.13, + "step": 46150, + "train_speed(iter/s)": 0.200958 + }, + { + "acc": 0.78479581, + "epoch": 1.0769477536855305, + "grad_norm": 4.75, + "learning_rate": 4.620088338584254e-06, + "loss": 0.76777401, + "memory(GiB)": 147.13, + "step": 46160, + "train_speed(iter/s)": 0.200981 + }, + { + "acc": 0.75793247, + "epoch": 1.0771810612578194, + "grad_norm": 5.03125, + "learning_rate": 4.618204715219147e-06, + "loss": 0.87096195, + "memory(GiB)": 147.13, + "step": 46170, + "train_speed(iter/s)": 0.201004 + }, + { + "acc": 0.78456974, + "epoch": 1.0774143688301083, + "grad_norm": 5.5, + "learning_rate": 4.616321146355206e-06, + "loss": 0.77649322, + "memory(GiB)": 147.13, + "step": 46180, + "train_speed(iter/s)": 0.201026 + }, + { + "acc": 0.76846437, + "epoch": 1.0776476764023972, + "grad_norm": 5.78125, + "learning_rate": 4.614437632261311e-06, + "loss": 0.82811508, + "memory(GiB)": 147.13, + "step": 46190, + "train_speed(iter/s)": 0.201049 + }, + { + "acc": 0.77970672, + "epoch": 1.0778809839746861, + "grad_norm": 10.0625, + "learning_rate": 4.6125541732063315e-06, + "loss": 0.80484219, + "memory(GiB)": 147.13, + "step": 46200, + "train_speed(iter/s)": 0.20107 + }, + { + "acc": 0.78650169, + "epoch": 1.078114291546975, + "grad_norm": 4.6875, + "learning_rate": 4.6106707694591324e-06, + "loss": 0.78840094, + "memory(GiB)": 147.13, + "step": 46210, + "train_speed(iter/s)": 0.201093 + }, + { + "acc": 0.77818255, + "epoch": 1.078347599119264, + "grad_norm": 5.28125, + "learning_rate": 4.608787421288566e-06, + "loss": 0.79254904, + "memory(GiB)": 147.13, + "step": 46220, + "train_speed(iter/s)": 0.201118 + }, + { + "acc": 0.77555122, + "epoch": 1.0785809066915528, + "grad_norm": 8.5, + "learning_rate": 4.606904128963482e-06, + "loss": 0.80696802, + "memory(GiB)": 147.13, + "step": 46230, + "train_speed(iter/s)": 0.201142 + }, + { + "acc": 0.77328091, + "epoch": 1.0788142142638417, + "grad_norm": 5.75, + "learning_rate": 4.605020892752718e-06, + "loss": 0.81441078, + "memory(GiB)": 147.13, + "step": 46240, + "train_speed(iter/s)": 0.201163 + }, + { + "acc": 0.7695261, + "epoch": 1.0790475218361306, + "grad_norm": 4.625, + "learning_rate": 4.603137712925108e-06, + "loss": 0.82396679, + "memory(GiB)": 147.13, + "step": 46250, + "train_speed(iter/s)": 0.201185 + }, + { + "acc": 0.77598724, + "epoch": 1.0792808294084195, + "grad_norm": 5.59375, + "learning_rate": 4.601254589749474e-06, + "loss": 0.8179862, + "memory(GiB)": 147.13, + "step": 46260, + "train_speed(iter/s)": 0.201206 + }, + { + "acc": 0.78526249, + "epoch": 1.0795141369807084, + "grad_norm": 5.4375, + "learning_rate": 4.599371523494632e-06, + "loss": 0.77546606, + "memory(GiB)": 147.13, + "step": 46270, + "train_speed(iter/s)": 0.201228 + }, + { + "acc": 0.78726249, + "epoch": 1.0797474445529973, + "grad_norm": 4.15625, + "learning_rate": 4.597488514429388e-06, + "loss": 0.75475173, + "memory(GiB)": 147.13, + "step": 46280, + "train_speed(iter/s)": 0.20125 + }, + { + "acc": 0.78735552, + "epoch": 1.0799807521252862, + "grad_norm": 5.21875, + "learning_rate": 4.595605562822542e-06, + "loss": 0.75245075, + "memory(GiB)": 147.13, + "step": 46290, + "train_speed(iter/s)": 0.201272 + }, + { + "acc": 0.7679944, + "epoch": 1.0802140596975751, + "grad_norm": 6.5, + "learning_rate": 4.593722668942884e-06, + "loss": 0.83475809, + "memory(GiB)": 147.13, + "step": 46300, + "train_speed(iter/s)": 0.201295 + }, + { + "acc": 0.78632035, + "epoch": 1.080447367269864, + "grad_norm": 5.8125, + "learning_rate": 4.5918398330592e-06, + "loss": 0.75767183, + "memory(GiB)": 147.13, + "step": 46310, + "train_speed(iter/s)": 0.201317 + }, + { + "acc": 0.7837204, + "epoch": 1.080680674842153, + "grad_norm": 8.5625, + "learning_rate": 4.589957055440259e-06, + "loss": 0.77630086, + "memory(GiB)": 147.13, + "step": 46320, + "train_speed(iter/s)": 0.20134 + }, + { + "acc": 0.77561617, + "epoch": 1.0809139824144418, + "grad_norm": 6.125, + "learning_rate": 4.588074336354828e-06, + "loss": 0.80536242, + "memory(GiB)": 147.13, + "step": 46330, + "train_speed(iter/s)": 0.201363 + }, + { + "acc": 0.77097578, + "epoch": 1.0811472899867307, + "grad_norm": 4.96875, + "learning_rate": 4.586191676071666e-06, + "loss": 0.83300571, + "memory(GiB)": 147.13, + "step": 46340, + "train_speed(iter/s)": 0.201385 + }, + { + "acc": 0.78117247, + "epoch": 1.0813805975590196, + "grad_norm": 5.21875, + "learning_rate": 4.584309074859524e-06, + "loss": 0.80158958, + "memory(GiB)": 147.13, + "step": 46350, + "train_speed(iter/s)": 0.201407 + }, + { + "acc": 0.78816423, + "epoch": 1.0816139051313085, + "grad_norm": 4.625, + "learning_rate": 4.5824265329871395e-06, + "loss": 0.7538969, + "memory(GiB)": 147.13, + "step": 46360, + "train_speed(iter/s)": 0.201429 + }, + { + "acc": 0.77398424, + "epoch": 1.0818472127035974, + "grad_norm": 5.90625, + "learning_rate": 4.580544050723246e-06, + "loss": 0.8361598, + "memory(GiB)": 147.13, + "step": 46370, + "train_speed(iter/s)": 0.201451 + }, + { + "acc": 0.79671211, + "epoch": 1.0820805202758863, + "grad_norm": 5.40625, + "learning_rate": 4.578661628336567e-06, + "loss": 0.73220673, + "memory(GiB)": 147.13, + "step": 46380, + "train_speed(iter/s)": 0.201472 + }, + { + "acc": 0.7743166, + "epoch": 1.082313827848175, + "grad_norm": 4.96875, + "learning_rate": 4.576779266095818e-06, + "loss": 0.8441576, + "memory(GiB)": 147.13, + "step": 46390, + "train_speed(iter/s)": 0.201494 + }, + { + "acc": 0.76517391, + "epoch": 1.082547135420464, + "grad_norm": 6.0, + "learning_rate": 4.574896964269707e-06, + "loss": 0.84825592, + "memory(GiB)": 147.13, + "step": 46400, + "train_speed(iter/s)": 0.201518 + }, + { + "acc": 0.80378437, + "epoch": 1.0827804429927528, + "grad_norm": 4.28125, + "learning_rate": 4.573014723126931e-06, + "loss": 0.70220609, + "memory(GiB)": 147.13, + "step": 46410, + "train_speed(iter/s)": 0.201541 + }, + { + "acc": 0.78200688, + "epoch": 1.0830137505650417, + "grad_norm": 3.625, + "learning_rate": 4.571132542936179e-06, + "loss": 0.79165678, + "memory(GiB)": 147.13, + "step": 46420, + "train_speed(iter/s)": 0.201563 + }, + { + "acc": 0.79855556, + "epoch": 1.0832470581373306, + "grad_norm": 4.78125, + "learning_rate": 4.569250423966132e-06, + "loss": 0.72647519, + "memory(GiB)": 147.13, + "step": 46430, + "train_speed(iter/s)": 0.201586 + }, + { + "acc": 0.77577281, + "epoch": 1.0834803657096195, + "grad_norm": 4.875, + "learning_rate": 4.567368366485462e-06, + "loss": 0.7856792, + "memory(GiB)": 147.13, + "step": 46440, + "train_speed(iter/s)": 0.201609 + }, + { + "acc": 0.77464085, + "epoch": 1.0837136732819084, + "grad_norm": 5.59375, + "learning_rate": 4.56548637076283e-06, + "loss": 0.80171261, + "memory(GiB)": 147.13, + "step": 46450, + "train_speed(iter/s)": 0.201632 + }, + { + "acc": 0.79902134, + "epoch": 1.0839469808541973, + "grad_norm": 5.875, + "learning_rate": 4.563604437066894e-06, + "loss": 0.714886, + "memory(GiB)": 147.13, + "step": 46460, + "train_speed(iter/s)": 0.201654 + }, + { + "acc": 0.78830729, + "epoch": 1.0841802884264862, + "grad_norm": 4.71875, + "learning_rate": 4.561722565666298e-06, + "loss": 0.74691525, + "memory(GiB)": 147.13, + "step": 46470, + "train_speed(iter/s)": 0.201676 + }, + { + "acc": 0.77470942, + "epoch": 1.084413595998775, + "grad_norm": 5.34375, + "learning_rate": 4.559840756829677e-06, + "loss": 0.80980682, + "memory(GiB)": 147.13, + "step": 46480, + "train_speed(iter/s)": 0.201698 + }, + { + "acc": 0.79907804, + "epoch": 1.084646903571064, + "grad_norm": 4.375, + "learning_rate": 4.557959010825662e-06, + "loss": 0.70692854, + "memory(GiB)": 147.13, + "step": 46490, + "train_speed(iter/s)": 0.201721 + }, + { + "acc": 0.76710625, + "epoch": 1.0848802111433529, + "grad_norm": 5.40625, + "learning_rate": 4.5560773279228686e-06, + "loss": 0.82628555, + "memory(GiB)": 147.13, + "step": 46500, + "train_speed(iter/s)": 0.201745 + }, + { + "epoch": 1.0848802111433529, + "eval_acc": 0.7440590893782576, + "eval_loss": 0.8066306710243225, + "eval_runtime": 1271.0341, + "eval_samples_per_second": 28.316, + "eval_steps_per_second": 14.159, + "step": 46500 + }, + { + "acc": 0.76087437, + "epoch": 1.0851135187156418, + "grad_norm": 7.875, + "learning_rate": 4.5541957083899075e-06, + "loss": 0.90284786, + "memory(GiB)": 147.13, + "step": 46510, + "train_speed(iter/s)": 0.200645 + }, + { + "acc": 0.78702526, + "epoch": 1.0853468262879307, + "grad_norm": 7.34375, + "learning_rate": 4.55231415249538e-06, + "loss": 0.76634626, + "memory(GiB)": 147.13, + "step": 46520, + "train_speed(iter/s)": 0.200665 + }, + { + "acc": 0.75945778, + "epoch": 1.0855801338602196, + "grad_norm": 5.5, + "learning_rate": 4.550432660507877e-06, + "loss": 0.8995718, + "memory(GiB)": 147.13, + "step": 46530, + "train_speed(iter/s)": 0.200686 + }, + { + "acc": 0.78146367, + "epoch": 1.0858134414325085, + "grad_norm": 5.5625, + "learning_rate": 4.548551232695983e-06, + "loss": 0.78113079, + "memory(GiB)": 147.13, + "step": 46540, + "train_speed(iter/s)": 0.20071 + }, + { + "acc": 0.77994041, + "epoch": 1.0860467490047974, + "grad_norm": 5.125, + "learning_rate": 4.5466698693282675e-06, + "loss": 0.79309092, + "memory(GiB)": 147.13, + "step": 46550, + "train_speed(iter/s)": 0.200731 + }, + { + "acc": 0.79029708, + "epoch": 1.0862800565770863, + "grad_norm": 6.3125, + "learning_rate": 4.544788570673296e-06, + "loss": 0.76324611, + "memory(GiB)": 147.13, + "step": 46560, + "train_speed(iter/s)": 0.200754 + }, + { + "acc": 0.77777467, + "epoch": 1.0865133641493752, + "grad_norm": 14.5625, + "learning_rate": 4.542907336999625e-06, + "loss": 0.81118917, + "memory(GiB)": 147.13, + "step": 46570, + "train_speed(iter/s)": 0.200776 + }, + { + "acc": 0.78099852, + "epoch": 1.086746671721664, + "grad_norm": 5.34375, + "learning_rate": 4.541026168575798e-06, + "loss": 0.76369138, + "memory(GiB)": 147.13, + "step": 46580, + "train_speed(iter/s)": 0.200799 + }, + { + "acc": 0.79265385, + "epoch": 1.086979979293953, + "grad_norm": 4.53125, + "learning_rate": 4.539145065670353e-06, + "loss": 0.74946361, + "memory(GiB)": 147.13, + "step": 46590, + "train_speed(iter/s)": 0.200822 + }, + { + "acc": 0.7763402, + "epoch": 1.0872132868662419, + "grad_norm": 7.15625, + "learning_rate": 4.537264028551814e-06, + "loss": 0.81608562, + "memory(GiB)": 147.13, + "step": 46600, + "train_speed(iter/s)": 0.200843 + }, + { + "acc": 0.79085789, + "epoch": 1.0874465944385308, + "grad_norm": 6.6875, + "learning_rate": 4.535383057488702e-06, + "loss": 0.75591068, + "memory(GiB)": 147.13, + "step": 46610, + "train_speed(iter/s)": 0.200865 + }, + { + "acc": 0.78365879, + "epoch": 1.0876799020108197, + "grad_norm": 5.75, + "learning_rate": 4.533502152749523e-06, + "loss": 0.78133993, + "memory(GiB)": 147.13, + "step": 46620, + "train_speed(iter/s)": 0.200887 + }, + { + "acc": 0.81641464, + "epoch": 1.0879132095831086, + "grad_norm": 4.9375, + "learning_rate": 4.531621314602777e-06, + "loss": 0.6454319, + "memory(GiB)": 147.13, + "step": 46630, + "train_speed(iter/s)": 0.20091 + }, + { + "acc": 0.7887289, + "epoch": 1.0881465171553975, + "grad_norm": 6.1875, + "learning_rate": 4.529740543316952e-06, + "loss": 0.75883055, + "memory(GiB)": 147.13, + "step": 46640, + "train_speed(iter/s)": 0.200931 + }, + { + "acc": 0.76232872, + "epoch": 1.0883798247276864, + "grad_norm": 4.0, + "learning_rate": 4.52785983916053e-06, + "loss": 0.8628355, + "memory(GiB)": 147.13, + "step": 46650, + "train_speed(iter/s)": 0.200952 + }, + { + "acc": 0.76253319, + "epoch": 1.0886131322999753, + "grad_norm": 6.25, + "learning_rate": 4.525979202401976e-06, + "loss": 0.85835104, + "memory(GiB)": 147.13, + "step": 46660, + "train_speed(iter/s)": 0.200974 + }, + { + "acc": 0.78178864, + "epoch": 1.0888464398722641, + "grad_norm": 5.8125, + "learning_rate": 4.524098633309753e-06, + "loss": 0.79391246, + "memory(GiB)": 147.13, + "step": 46670, + "train_speed(iter/s)": 0.200996 + }, + { + "acc": 0.7927947, + "epoch": 1.089079747444553, + "grad_norm": 8.4375, + "learning_rate": 4.522218132152313e-06, + "loss": 0.73775425, + "memory(GiB)": 147.13, + "step": 46680, + "train_speed(iter/s)": 0.201018 + }, + { + "acc": 0.77629023, + "epoch": 1.089313055016842, + "grad_norm": 5.25, + "learning_rate": 4.520337699198095e-06, + "loss": 0.80242271, + "memory(GiB)": 147.13, + "step": 46690, + "train_speed(iter/s)": 0.201041 + }, + { + "acc": 0.79058509, + "epoch": 1.0895463625891308, + "grad_norm": 4.4375, + "learning_rate": 4.5184573347155316e-06, + "loss": 0.73654985, + "memory(GiB)": 147.13, + "step": 46700, + "train_speed(iter/s)": 0.201065 + }, + { + "acc": 0.77961817, + "epoch": 1.0897796701614197, + "grad_norm": 6.5, + "learning_rate": 4.516577038973044e-06, + "loss": 0.7927855, + "memory(GiB)": 147.13, + "step": 46710, + "train_speed(iter/s)": 0.201088 + }, + { + "acc": 0.78442645, + "epoch": 1.0900129777337086, + "grad_norm": 5.34375, + "learning_rate": 4.514696812239043e-06, + "loss": 0.78609529, + "memory(GiB)": 147.13, + "step": 46720, + "train_speed(iter/s)": 0.20111 + }, + { + "acc": 0.79100533, + "epoch": 1.0902462853059975, + "grad_norm": 8.9375, + "learning_rate": 4.512816654781931e-06, + "loss": 0.74049482, + "memory(GiB)": 147.13, + "step": 46730, + "train_speed(iter/s)": 0.201132 + }, + { + "acc": 0.78928852, + "epoch": 1.0904795928782864, + "grad_norm": 8.875, + "learning_rate": 4.5109365668701e-06, + "loss": 0.73953748, + "memory(GiB)": 147.13, + "step": 46740, + "train_speed(iter/s)": 0.201155 + }, + { + "acc": 0.78580799, + "epoch": 1.0907129004505753, + "grad_norm": 6.53125, + "learning_rate": 4.5090565487719326e-06, + "loss": 0.75008278, + "memory(GiB)": 147.13, + "step": 46750, + "train_speed(iter/s)": 0.201177 + }, + { + "acc": 0.77508364, + "epoch": 1.0909462080228642, + "grad_norm": 5.125, + "learning_rate": 4.5071766007558e-06, + "loss": 0.81171427, + "memory(GiB)": 147.13, + "step": 46760, + "train_speed(iter/s)": 0.2012 + }, + { + "acc": 0.78406925, + "epoch": 1.0911795155951531, + "grad_norm": 5.96875, + "learning_rate": 4.505296723090066e-06, + "loss": 0.7646771, + "memory(GiB)": 147.13, + "step": 46770, + "train_speed(iter/s)": 0.201222 + }, + { + "acc": 0.78110132, + "epoch": 1.0914128231674418, + "grad_norm": 5.6875, + "learning_rate": 4.503416916043079e-06, + "loss": 0.76935401, + "memory(GiB)": 147.13, + "step": 46780, + "train_speed(iter/s)": 0.201242 + }, + { + "acc": 0.79187145, + "epoch": 1.091646130739731, + "grad_norm": 4.8125, + "learning_rate": 4.501537179883184e-06, + "loss": 0.74730501, + "memory(GiB)": 147.13, + "step": 46790, + "train_speed(iter/s)": 0.201264 + }, + { + "acc": 0.78256168, + "epoch": 1.0918794383120196, + "grad_norm": 4.96875, + "learning_rate": 4.499657514878711e-06, + "loss": 0.77862916, + "memory(GiB)": 147.13, + "step": 46800, + "train_speed(iter/s)": 0.201287 + }, + { + "acc": 0.77598977, + "epoch": 1.0921127458843085, + "grad_norm": 6.75, + "learning_rate": 4.497777921297983e-06, + "loss": 0.8359479, + "memory(GiB)": 147.13, + "step": 46810, + "train_speed(iter/s)": 0.201309 + }, + { + "acc": 0.77790146, + "epoch": 1.0923460534565974, + "grad_norm": 5.96875, + "learning_rate": 4.49589839940931e-06, + "loss": 0.81699162, + "memory(GiB)": 147.13, + "step": 46820, + "train_speed(iter/s)": 0.201332 + }, + { + "acc": 0.78884344, + "epoch": 1.0925793610288863, + "grad_norm": 4.9375, + "learning_rate": 4.494018949480994e-06, + "loss": 0.74211559, + "memory(GiB)": 147.13, + "step": 46830, + "train_speed(iter/s)": 0.201354 + }, + { + "acc": 0.77702522, + "epoch": 1.0928126686011752, + "grad_norm": 6.0625, + "learning_rate": 4.492139571781328e-06, + "loss": 0.79819546, + "memory(GiB)": 147.13, + "step": 46840, + "train_speed(iter/s)": 0.201376 + }, + { + "acc": 0.77434607, + "epoch": 1.093045976173464, + "grad_norm": 5.3125, + "learning_rate": 4.490260266578589e-06, + "loss": 0.81396513, + "memory(GiB)": 147.13, + "step": 46850, + "train_speed(iter/s)": 0.201398 + }, + { + "acc": 0.79454117, + "epoch": 1.093279283745753, + "grad_norm": 6.28125, + "learning_rate": 4.4883810341410485e-06, + "loss": 0.74300327, + "memory(GiB)": 147.13, + "step": 46860, + "train_speed(iter/s)": 0.20142 + }, + { + "acc": 0.77728949, + "epoch": 1.093512591318042, + "grad_norm": 4.96875, + "learning_rate": 4.486501874736967e-06, + "loss": 0.80336361, + "memory(GiB)": 147.13, + "step": 46870, + "train_speed(iter/s)": 0.201442 + }, + { + "acc": 0.79518509, + "epoch": 1.0937458988903308, + "grad_norm": 5.3125, + "learning_rate": 4.484622788634596e-06, + "loss": 0.7204237, + "memory(GiB)": 147.13, + "step": 46880, + "train_speed(iter/s)": 0.201465 + }, + { + "acc": 0.78115616, + "epoch": 1.0939792064626197, + "grad_norm": 5.8125, + "learning_rate": 4.48274377610217e-06, + "loss": 0.78942556, + "memory(GiB)": 147.13, + "step": 46890, + "train_speed(iter/s)": 0.201487 + }, + { + "acc": 0.78453555, + "epoch": 1.0942125140349086, + "grad_norm": 4.6875, + "learning_rate": 4.480864837407919e-06, + "loss": 0.76847844, + "memory(GiB)": 147.13, + "step": 46900, + "train_speed(iter/s)": 0.201509 + }, + { + "acc": 0.80208855, + "epoch": 1.0944458216071975, + "grad_norm": 5.125, + "learning_rate": 4.478985972820063e-06, + "loss": 0.69647503, + "memory(GiB)": 147.13, + "step": 46910, + "train_speed(iter/s)": 0.20153 + }, + { + "acc": 0.77438059, + "epoch": 1.0946791291794864, + "grad_norm": 5.25, + "learning_rate": 4.477107182606807e-06, + "loss": 0.82461748, + "memory(GiB)": 147.13, + "step": 46920, + "train_speed(iter/s)": 0.201553 + }, + { + "acc": 0.7742578, + "epoch": 1.0949124367517753, + "grad_norm": 6.0625, + "learning_rate": 4.4752284670363495e-06, + "loss": 0.80661144, + "memory(GiB)": 147.13, + "step": 46930, + "train_speed(iter/s)": 0.201575 + }, + { + "acc": 0.78175735, + "epoch": 1.0951457443240642, + "grad_norm": 5.75, + "learning_rate": 4.473349826376876e-06, + "loss": 0.80005064, + "memory(GiB)": 147.13, + "step": 46940, + "train_speed(iter/s)": 0.201597 + }, + { + "acc": 0.78072362, + "epoch": 1.095379051896353, + "grad_norm": 6.78125, + "learning_rate": 4.471471260896561e-06, + "loss": 0.80625315, + "memory(GiB)": 147.13, + "step": 46950, + "train_speed(iter/s)": 0.20162 + }, + { + "acc": 0.78413506, + "epoch": 1.095612359468642, + "grad_norm": 4.4375, + "learning_rate": 4.46959277086357e-06, + "loss": 0.76228724, + "memory(GiB)": 147.13, + "step": 46960, + "train_speed(iter/s)": 0.201642 + }, + { + "acc": 0.7963975, + "epoch": 1.095845667040931, + "grad_norm": 5.03125, + "learning_rate": 4.467714356546057e-06, + "loss": 0.71428413, + "memory(GiB)": 147.13, + "step": 46970, + "train_speed(iter/s)": 0.201664 + }, + { + "acc": 0.79295254, + "epoch": 1.0960789746132198, + "grad_norm": 5.09375, + "learning_rate": 4.465836018212166e-06, + "loss": 0.72961626, + "memory(GiB)": 147.13, + "step": 46980, + "train_speed(iter/s)": 0.201686 + }, + { + "acc": 0.76461864, + "epoch": 1.0963122821855087, + "grad_norm": 5.84375, + "learning_rate": 4.463957756130028e-06, + "loss": 0.82763643, + "memory(GiB)": 147.13, + "step": 46990, + "train_speed(iter/s)": 0.201709 + }, + { + "acc": 0.77132835, + "epoch": 1.0965455897577976, + "grad_norm": 6.9375, + "learning_rate": 4.462079570567765e-06, + "loss": 0.81077747, + "memory(GiB)": 147.13, + "step": 47000, + "train_speed(iter/s)": 0.20173 + }, + { + "epoch": 1.0965455897577976, + "eval_acc": 0.7441737526485854, + "eval_loss": 0.8062352538108826, + "eval_runtime": 1270.2715, + "eval_samples_per_second": 28.333, + "eval_steps_per_second": 14.167, + "step": 47000 + }, + { + "acc": 0.77072382, + "epoch": 1.0967788973300865, + "grad_norm": 4.21875, + "learning_rate": 4.460201461793486e-06, + "loss": 0.83631325, + "memory(GiB)": 147.13, + "step": 47010, + "train_speed(iter/s)": 0.200644 + }, + { + "acc": 0.79384909, + "epoch": 1.0970122049023754, + "grad_norm": 6.21875, + "learning_rate": 4.458323430075292e-06, + "loss": 0.74116993, + "memory(GiB)": 147.13, + "step": 47020, + "train_speed(iter/s)": 0.200664 + }, + { + "acc": 0.7897253, + "epoch": 1.0972455124746643, + "grad_norm": 4.375, + "learning_rate": 4.45644547568127e-06, + "loss": 0.75881343, + "memory(GiB)": 147.13, + "step": 47030, + "train_speed(iter/s)": 0.200686 + }, + { + "acc": 0.78165913, + "epoch": 1.0974788200469532, + "grad_norm": 4.9375, + "learning_rate": 4.4545675988795e-06, + "loss": 0.79373956, + "memory(GiB)": 147.13, + "step": 47040, + "train_speed(iter/s)": 0.200708 + }, + { + "acc": 0.78019323, + "epoch": 1.097712127619242, + "grad_norm": 5.5625, + "learning_rate": 4.452689799938045e-06, + "loss": 0.79432139, + "memory(GiB)": 147.13, + "step": 47050, + "train_speed(iter/s)": 0.200729 + }, + { + "acc": 0.76499996, + "epoch": 1.097945435191531, + "grad_norm": 6.0625, + "learning_rate": 4.450812079124964e-06, + "loss": 0.85962629, + "memory(GiB)": 147.13, + "step": 47060, + "train_speed(iter/s)": 0.200751 + }, + { + "acc": 0.78296137, + "epoch": 1.0981787427638199, + "grad_norm": 4.84375, + "learning_rate": 4.448934436708297e-06, + "loss": 0.78475742, + "memory(GiB)": 147.13, + "step": 47070, + "train_speed(iter/s)": 0.200773 + }, + { + "acc": 0.78817444, + "epoch": 1.0984120503361088, + "grad_norm": 6.28125, + "learning_rate": 4.44705687295608e-06, + "loss": 0.77784424, + "memory(GiB)": 147.13, + "step": 47080, + "train_speed(iter/s)": 0.200795 + }, + { + "acc": 0.78499746, + "epoch": 1.0986453579083977, + "grad_norm": 7.09375, + "learning_rate": 4.445179388136335e-06, + "loss": 0.760812, + "memory(GiB)": 147.13, + "step": 47090, + "train_speed(iter/s)": 0.200816 + }, + { + "acc": 0.78877263, + "epoch": 1.0988786654806866, + "grad_norm": 6.59375, + "learning_rate": 4.44330198251707e-06, + "loss": 0.73901072, + "memory(GiB)": 147.13, + "step": 47100, + "train_speed(iter/s)": 0.200837 + }, + { + "acc": 0.75595312, + "epoch": 1.0991119730529755, + "grad_norm": 6.3125, + "learning_rate": 4.441424656366287e-06, + "loss": 0.85828323, + "memory(GiB)": 147.13, + "step": 47110, + "train_speed(iter/s)": 0.20086 + }, + { + "acc": 0.78338614, + "epoch": 1.0993452806252644, + "grad_norm": 6.90625, + "learning_rate": 4.43954740995197e-06, + "loss": 0.77686892, + "memory(GiB)": 147.13, + "step": 47120, + "train_speed(iter/s)": 0.200882 + }, + { + "acc": 0.79634991, + "epoch": 1.0995785881975533, + "grad_norm": 7.125, + "learning_rate": 4.437670243542097e-06, + "loss": 0.72251873, + "memory(GiB)": 147.13, + "step": 47130, + "train_speed(iter/s)": 0.200903 + }, + { + "acc": 0.7735898, + "epoch": 1.0998118957698422, + "grad_norm": 5.3125, + "learning_rate": 4.435793157404636e-06, + "loss": 0.81141148, + "memory(GiB)": 147.13, + "step": 47140, + "train_speed(iter/s)": 0.200925 + }, + { + "acc": 0.76752191, + "epoch": 1.100045203342131, + "grad_norm": 5.78125, + "learning_rate": 4.433916151807535e-06, + "loss": 0.83682642, + "memory(GiB)": 147.13, + "step": 47150, + "train_speed(iter/s)": 0.200948 + }, + { + "acc": 0.78204641, + "epoch": 1.10027851091442, + "grad_norm": 5.5625, + "learning_rate": 4.43203922701874e-06, + "loss": 0.77274141, + "memory(GiB)": 147.13, + "step": 47160, + "train_speed(iter/s)": 0.20097 + }, + { + "acc": 0.7881916, + "epoch": 1.1005118184867086, + "grad_norm": 4.9375, + "learning_rate": 4.43016238330618e-06, + "loss": 0.74384875, + "memory(GiB)": 147.13, + "step": 47170, + "train_speed(iter/s)": 0.200992 + }, + { + "acc": 0.78752022, + "epoch": 1.1007451260589978, + "grad_norm": 3.984375, + "learning_rate": 4.428285620937774e-06, + "loss": 0.76339617, + "memory(GiB)": 147.13, + "step": 47180, + "train_speed(iter/s)": 0.201013 + }, + { + "acc": 0.76733403, + "epoch": 1.1009784336312864, + "grad_norm": 5.78125, + "learning_rate": 4.4264089401814306e-06, + "loss": 0.84128456, + "memory(GiB)": 147.13, + "step": 47190, + "train_speed(iter/s)": 0.201035 + }, + { + "acc": 0.78480959, + "epoch": 1.1012117412035753, + "grad_norm": 4.625, + "learning_rate": 4.4245323413050446e-06, + "loss": 0.76962199, + "memory(GiB)": 147.13, + "step": 47200, + "train_speed(iter/s)": 0.201057 + }, + { + "acc": 0.7589119, + "epoch": 1.1014450487758642, + "grad_norm": 4.6875, + "learning_rate": 4.422655824576499e-06, + "loss": 0.88679562, + "memory(GiB)": 147.13, + "step": 47210, + "train_speed(iter/s)": 0.201079 + }, + { + "acc": 0.80034866, + "epoch": 1.1016783563481531, + "grad_norm": 4.90625, + "learning_rate": 4.420779390263669e-06, + "loss": 0.70025153, + "memory(GiB)": 147.13, + "step": 47220, + "train_speed(iter/s)": 0.201102 + }, + { + "acc": 0.78663492, + "epoch": 1.101911663920442, + "grad_norm": 4.90625, + "learning_rate": 4.4189030386344094e-06, + "loss": 0.76303225, + "memory(GiB)": 147.13, + "step": 47230, + "train_speed(iter/s)": 0.201123 + }, + { + "acc": 0.77694702, + "epoch": 1.102144971492731, + "grad_norm": 6.90625, + "learning_rate": 4.417026769956573e-06, + "loss": 0.79356718, + "memory(GiB)": 147.13, + "step": 47240, + "train_speed(iter/s)": 0.201143 + }, + { + "acc": 0.77413359, + "epoch": 1.1023782790650198, + "grad_norm": 4.9375, + "learning_rate": 4.415150584497996e-06, + "loss": 0.81121407, + "memory(GiB)": 147.13, + "step": 47250, + "train_speed(iter/s)": 0.201165 + }, + { + "acc": 0.74965243, + "epoch": 1.1026115866373087, + "grad_norm": 6.5, + "learning_rate": 4.413274482526503e-06, + "loss": 0.91974735, + "memory(GiB)": 147.13, + "step": 47260, + "train_speed(iter/s)": 0.201187 + }, + { + "acc": 0.78864617, + "epoch": 1.1028448942095976, + "grad_norm": 5.28125, + "learning_rate": 4.4113984643099075e-06, + "loss": 0.75029535, + "memory(GiB)": 147.13, + "step": 47270, + "train_speed(iter/s)": 0.201207 + }, + { + "acc": 0.79483843, + "epoch": 1.1030782017818865, + "grad_norm": 5.21875, + "learning_rate": 4.409522530116011e-06, + "loss": 0.73387432, + "memory(GiB)": 147.13, + "step": 47280, + "train_speed(iter/s)": 0.20123 + }, + { + "acc": 0.7674356, + "epoch": 1.1033115093541754, + "grad_norm": 4.6875, + "learning_rate": 4.407646680212601e-06, + "loss": 0.85647984, + "memory(GiB)": 147.13, + "step": 47290, + "train_speed(iter/s)": 0.201252 + }, + { + "acc": 0.78836451, + "epoch": 1.1035448169264643, + "grad_norm": 5.90625, + "learning_rate": 4.405770914867455e-06, + "loss": 0.74849033, + "memory(GiB)": 147.13, + "step": 47300, + "train_speed(iter/s)": 0.201274 + }, + { + "acc": 0.78828807, + "epoch": 1.1037781244987532, + "grad_norm": 7.03125, + "learning_rate": 4.403895234348338e-06, + "loss": 0.74482512, + "memory(GiB)": 147.13, + "step": 47310, + "train_speed(iter/s)": 0.201297 + }, + { + "acc": 0.77510147, + "epoch": 1.1040114320710421, + "grad_norm": 5.4375, + "learning_rate": 4.402019638923003e-06, + "loss": 0.82446518, + "memory(GiB)": 147.13, + "step": 47320, + "train_speed(iter/s)": 0.201319 + }, + { + "acc": 0.78557339, + "epoch": 1.104244739643331, + "grad_norm": 5.3125, + "learning_rate": 4.400144128859192e-06, + "loss": 0.76738482, + "memory(GiB)": 147.13, + "step": 47330, + "train_speed(iter/s)": 0.201342 + }, + { + "acc": 0.79229355, + "epoch": 1.10447804721562, + "grad_norm": 4.28125, + "learning_rate": 4.3982687044246336e-06, + "loss": 0.76299677, + "memory(GiB)": 147.13, + "step": 47340, + "train_speed(iter/s)": 0.201363 + }, + { + "acc": 0.79036603, + "epoch": 1.1047113547879088, + "grad_norm": 6.1875, + "learning_rate": 4.396393365887041e-06, + "loss": 0.74940066, + "memory(GiB)": 147.13, + "step": 47350, + "train_speed(iter/s)": 0.201384 + }, + { + "acc": 0.78618107, + "epoch": 1.1049446623601977, + "grad_norm": 6.125, + "learning_rate": 4.394518113514121e-06, + "loss": 0.74857254, + "memory(GiB)": 147.13, + "step": 47360, + "train_speed(iter/s)": 0.201407 + }, + { + "acc": 0.78618221, + "epoch": 1.1051779699324866, + "grad_norm": 5.65625, + "learning_rate": 4.392642947573563e-06, + "loss": 0.75713739, + "memory(GiB)": 147.13, + "step": 47370, + "train_speed(iter/s)": 0.20143 + }, + { + "acc": 0.77709045, + "epoch": 1.1054112775047755, + "grad_norm": 4.9375, + "learning_rate": 4.3907678683330486e-06, + "loss": 0.7963932, + "memory(GiB)": 147.13, + "step": 47380, + "train_speed(iter/s)": 0.201452 + }, + { + "acc": 0.78100262, + "epoch": 1.1056445850770644, + "grad_norm": 5.375, + "learning_rate": 4.388892876060243e-06, + "loss": 0.79459877, + "memory(GiB)": 147.13, + "step": 47390, + "train_speed(iter/s)": 0.201473 + }, + { + "acc": 0.78773174, + "epoch": 1.1058778926493533, + "grad_norm": 8.875, + "learning_rate": 4.387017971022803e-06, + "loss": 0.74201722, + "memory(GiB)": 147.13, + "step": 47400, + "train_speed(iter/s)": 0.201496 + }, + { + "acc": 0.7911149, + "epoch": 1.1061112002216422, + "grad_norm": 47.25, + "learning_rate": 4.385143153488369e-06, + "loss": 0.74900484, + "memory(GiB)": 147.13, + "step": 47410, + "train_speed(iter/s)": 0.201518 + }, + { + "acc": 0.77765427, + "epoch": 1.1063445077939311, + "grad_norm": 12.8125, + "learning_rate": 4.383268423724572e-06, + "loss": 0.78825626, + "memory(GiB)": 147.13, + "step": 47420, + "train_speed(iter/s)": 0.201539 + }, + { + "acc": 0.76809454, + "epoch": 1.10657781536622, + "grad_norm": 5.375, + "learning_rate": 4.381393781999027e-06, + "loss": 0.84127693, + "memory(GiB)": 147.13, + "step": 47430, + "train_speed(iter/s)": 0.201561 + }, + { + "acc": 0.80026913, + "epoch": 1.106811122938509, + "grad_norm": 4.75, + "learning_rate": 4.379519228579342e-06, + "loss": 0.69325418, + "memory(GiB)": 147.13, + "step": 47440, + "train_speed(iter/s)": 0.201583 + }, + { + "acc": 0.78321247, + "epoch": 1.1070444305107978, + "grad_norm": 7.125, + "learning_rate": 4.377644763733106e-06, + "loss": 0.79312048, + "memory(GiB)": 147.13, + "step": 47450, + "train_speed(iter/s)": 0.201604 + }, + { + "acc": 0.7822238, + "epoch": 1.1072777380830867, + "grad_norm": 4.625, + "learning_rate": 4.375770387727899e-06, + "loss": 0.79141903, + "memory(GiB)": 147.13, + "step": 47460, + "train_speed(iter/s)": 0.201627 + }, + { + "acc": 0.7735548, + "epoch": 1.1075110456553756, + "grad_norm": 5.5625, + "learning_rate": 4.373896100831288e-06, + "loss": 0.81294842, + "memory(GiB)": 147.13, + "step": 47470, + "train_speed(iter/s)": 0.20165 + }, + { + "acc": 0.77273912, + "epoch": 1.1077443532276645, + "grad_norm": 5.96875, + "learning_rate": 4.372021903310826e-06, + "loss": 0.81545506, + "memory(GiB)": 147.13, + "step": 47480, + "train_speed(iter/s)": 0.201672 + }, + { + "acc": 0.77078466, + "epoch": 1.1079776607999534, + "grad_norm": 6.375, + "learning_rate": 4.370147795434054e-06, + "loss": 0.84389324, + "memory(GiB)": 147.13, + "step": 47490, + "train_speed(iter/s)": 0.201694 + }, + { + "acc": 0.81073818, + "epoch": 1.1082109683722423, + "grad_norm": 6.0625, + "learning_rate": 4.3682737774685035e-06, + "loss": 0.68347592, + "memory(GiB)": 147.13, + "step": 47500, + "train_speed(iter/s)": 0.201717 + }, + { + "epoch": 1.1082109683722423, + "eval_acc": 0.7442459776973533, + "eval_loss": 0.8062799572944641, + "eval_runtime": 1271.1815, + "eval_samples_per_second": 28.313, + "eval_steps_per_second": 14.157, + "step": 47500 + }, + { + "acc": 0.77623062, + "epoch": 1.1084442759445312, + "grad_norm": 5.125, + "learning_rate": 4.366399849681686e-06, + "loss": 0.78513613, + "memory(GiB)": 147.13, + "step": 47510, + "train_speed(iter/s)": 0.200641 + }, + { + "acc": 0.78764448, + "epoch": 1.10867758351682, + "grad_norm": 7.3125, + "learning_rate": 4.364526012341107e-06, + "loss": 0.74058805, + "memory(GiB)": 147.13, + "step": 47520, + "train_speed(iter/s)": 0.200661 + }, + { + "acc": 0.77476292, + "epoch": 1.108910891089109, + "grad_norm": 4.53125, + "learning_rate": 4.362652265714254e-06, + "loss": 0.79294233, + "memory(GiB)": 147.13, + "step": 47530, + "train_speed(iter/s)": 0.200683 + }, + { + "acc": 0.75863619, + "epoch": 1.109144198661398, + "grad_norm": 6.375, + "learning_rate": 4.360778610068605e-06, + "loss": 0.88454666, + "memory(GiB)": 147.13, + "step": 47540, + "train_speed(iter/s)": 0.200705 + }, + { + "acc": 0.78705368, + "epoch": 1.1093775062336868, + "grad_norm": 6.1875, + "learning_rate": 4.3589050456716254e-06, + "loss": 0.74554396, + "memory(GiB)": 147.13, + "step": 47550, + "train_speed(iter/s)": 0.200727 + }, + { + "acc": 0.79435959, + "epoch": 1.1096108138059755, + "grad_norm": 4.65625, + "learning_rate": 4.357031572790763e-06, + "loss": 0.74123554, + "memory(GiB)": 147.13, + "step": 47560, + "train_speed(iter/s)": 0.200747 + }, + { + "acc": 0.76363263, + "epoch": 1.1098441213782644, + "grad_norm": 5.9375, + "learning_rate": 4.355158191693458e-06, + "loss": 0.87095814, + "memory(GiB)": 147.13, + "step": 47570, + "train_speed(iter/s)": 0.200768 + }, + { + "acc": 0.77314081, + "epoch": 1.1100774289505533, + "grad_norm": 6.1875, + "learning_rate": 4.353284902647133e-06, + "loss": 0.82467804, + "memory(GiB)": 147.13, + "step": 47580, + "train_speed(iter/s)": 0.200788 + }, + { + "acc": 0.7841754, + "epoch": 1.1103107365228422, + "grad_norm": 4.6875, + "learning_rate": 4.351411705919201e-06, + "loss": 0.78773432, + "memory(GiB)": 147.13, + "step": 47590, + "train_speed(iter/s)": 0.200809 + }, + { + "acc": 0.80135212, + "epoch": 1.110544044095131, + "grad_norm": 5.75, + "learning_rate": 4.349538601777058e-06, + "loss": 0.74643879, + "memory(GiB)": 147.13, + "step": 47600, + "train_speed(iter/s)": 0.20083 + }, + { + "acc": 0.77409286, + "epoch": 1.11077735166742, + "grad_norm": 4.28125, + "learning_rate": 4.347665590488091e-06, + "loss": 0.81729527, + "memory(GiB)": 147.13, + "step": 47610, + "train_speed(iter/s)": 0.200851 + }, + { + "acc": 0.80320225, + "epoch": 1.1110106592397089, + "grad_norm": 4.34375, + "learning_rate": 4.3457926723196716e-06, + "loss": 0.69598637, + "memory(GiB)": 147.13, + "step": 47620, + "train_speed(iter/s)": 0.200873 + }, + { + "acc": 0.79413824, + "epoch": 1.1112439668119978, + "grad_norm": 5.5, + "learning_rate": 4.343919847539157e-06, + "loss": 0.7392138, + "memory(GiB)": 147.13, + "step": 47630, + "train_speed(iter/s)": 0.200895 + }, + { + "acc": 0.76051364, + "epoch": 1.1114772743842867, + "grad_norm": 6.71875, + "learning_rate": 4.342047116413897e-06, + "loss": 0.88393078, + "memory(GiB)": 147.13, + "step": 47640, + "train_speed(iter/s)": 0.200917 + }, + { + "acc": 0.77696719, + "epoch": 1.1117105819565756, + "grad_norm": 4.84375, + "learning_rate": 4.340174479211217e-06, + "loss": 0.80170889, + "memory(GiB)": 147.13, + "step": 47650, + "train_speed(iter/s)": 0.200938 + }, + { + "acc": 0.77932134, + "epoch": 1.1119438895288645, + "grad_norm": 5.65625, + "learning_rate": 4.338301936198439e-06, + "loss": 0.79549494, + "memory(GiB)": 147.13, + "step": 47660, + "train_speed(iter/s)": 0.200958 + }, + { + "acc": 0.77386909, + "epoch": 1.1121771971011534, + "grad_norm": 5.4375, + "learning_rate": 4.336429487642867e-06, + "loss": 0.82580423, + "memory(GiB)": 147.13, + "step": 47670, + "train_speed(iter/s)": 0.20098 + }, + { + "acc": 0.79550128, + "epoch": 1.1124105046734423, + "grad_norm": 5.28125, + "learning_rate": 4.334557133811796e-06, + "loss": 0.71878395, + "memory(GiB)": 147.13, + "step": 47680, + "train_speed(iter/s)": 0.201003 + }, + { + "acc": 0.77494755, + "epoch": 1.1126438122457312, + "grad_norm": 5.9375, + "learning_rate": 4.332684874972498e-06, + "loss": 0.80830698, + "memory(GiB)": 147.13, + "step": 47690, + "train_speed(iter/s)": 0.201024 + }, + { + "acc": 0.80317068, + "epoch": 1.11287711981802, + "grad_norm": 5.78125, + "learning_rate": 4.330812711392241e-06, + "loss": 0.7105298, + "memory(GiB)": 147.13, + "step": 47700, + "train_speed(iter/s)": 0.201045 + }, + { + "acc": 0.78965735, + "epoch": 1.113110427390309, + "grad_norm": 3.765625, + "learning_rate": 4.328940643338274e-06, + "loss": 0.742593, + "memory(GiB)": 147.13, + "step": 47710, + "train_speed(iter/s)": 0.201068 + }, + { + "acc": 0.78511152, + "epoch": 1.1133437349625979, + "grad_norm": 5.40625, + "learning_rate": 4.327068671077836e-06, + "loss": 0.78913937, + "memory(GiB)": 147.13, + "step": 47720, + "train_speed(iter/s)": 0.201088 + }, + { + "acc": 0.78472223, + "epoch": 1.1135770425348868, + "grad_norm": 5.71875, + "learning_rate": 4.32519679487815e-06, + "loss": 0.77086167, + "memory(GiB)": 147.13, + "step": 47730, + "train_speed(iter/s)": 0.20111 + }, + { + "acc": 0.77630825, + "epoch": 1.1138103501071757, + "grad_norm": 6.6875, + "learning_rate": 4.323325015006425e-06, + "loss": 0.82653818, + "memory(GiB)": 147.13, + "step": 47740, + "train_speed(iter/s)": 0.201132 + }, + { + "acc": 0.76846933, + "epoch": 1.1140436576794646, + "grad_norm": 5.65625, + "learning_rate": 4.321453331729857e-06, + "loss": 0.82080832, + "memory(GiB)": 147.13, + "step": 47750, + "train_speed(iter/s)": 0.201153 + }, + { + "acc": 0.78095446, + "epoch": 1.1142769652517535, + "grad_norm": 3.890625, + "learning_rate": 4.319581745315629e-06, + "loss": 0.78204536, + "memory(GiB)": 147.13, + "step": 47760, + "train_speed(iter/s)": 0.201175 + }, + { + "acc": 0.77750607, + "epoch": 1.1145102728240424, + "grad_norm": 5.4375, + "learning_rate": 4.317710256030911e-06, + "loss": 0.82766047, + "memory(GiB)": 147.13, + "step": 47770, + "train_speed(iter/s)": 0.201196 + }, + { + "acc": 0.77918196, + "epoch": 1.1147435803963313, + "grad_norm": 7.03125, + "learning_rate": 4.3158388641428536e-06, + "loss": 0.7857625, + "memory(GiB)": 147.13, + "step": 47780, + "train_speed(iter/s)": 0.201219 + }, + { + "acc": 0.77556639, + "epoch": 1.1149768879686202, + "grad_norm": 5.21875, + "learning_rate": 4.3139675699186e-06, + "loss": 0.81662083, + "memory(GiB)": 147.13, + "step": 47790, + "train_speed(iter/s)": 0.20124 + }, + { + "acc": 0.79196119, + "epoch": 1.115210195540909, + "grad_norm": 6.40625, + "learning_rate": 4.312096373625279e-06, + "loss": 0.75588217, + "memory(GiB)": 147.13, + "step": 47800, + "train_speed(iter/s)": 0.201261 + }, + { + "acc": 0.77584, + "epoch": 1.115443503113198, + "grad_norm": 6.6875, + "learning_rate": 4.310225275529998e-06, + "loss": 0.79082928, + "memory(GiB)": 147.13, + "step": 47810, + "train_speed(iter/s)": 0.201282 + }, + { + "acc": 0.76462359, + "epoch": 1.1156768106854869, + "grad_norm": 4.78125, + "learning_rate": 4.308354275899859e-06, + "loss": 0.8640892, + "memory(GiB)": 147.13, + "step": 47820, + "train_speed(iter/s)": 0.201304 + }, + { + "acc": 0.8109581, + "epoch": 1.1159101182577758, + "grad_norm": 7.6875, + "learning_rate": 4.306483375001946e-06, + "loss": 0.65745254, + "memory(GiB)": 147.13, + "step": 47830, + "train_speed(iter/s)": 0.201326 + }, + { + "acc": 0.770755, + "epoch": 1.1161434258300647, + "grad_norm": 4.6875, + "learning_rate": 4.30461257310333e-06, + "loss": 0.84625854, + "memory(GiB)": 147.13, + "step": 47840, + "train_speed(iter/s)": 0.201348 + }, + { + "acc": 0.77912922, + "epoch": 1.1163767334023535, + "grad_norm": 4.78125, + "learning_rate": 4.302741870471069e-06, + "loss": 0.76887741, + "memory(GiB)": 147.13, + "step": 47850, + "train_speed(iter/s)": 0.201369 + }, + { + "acc": 0.78028278, + "epoch": 1.1166100409746424, + "grad_norm": 8.1875, + "learning_rate": 4.3008712673722005e-06, + "loss": 0.80194454, + "memory(GiB)": 147.13, + "step": 47860, + "train_speed(iter/s)": 0.201392 + }, + { + "acc": 0.79243369, + "epoch": 1.1168433485469313, + "grad_norm": 5.6875, + "learning_rate": 4.299000764073757e-06, + "loss": 0.733636, + "memory(GiB)": 147.13, + "step": 47870, + "train_speed(iter/s)": 0.201412 + }, + { + "acc": 0.78628654, + "epoch": 1.1170766561192202, + "grad_norm": 5.1875, + "learning_rate": 4.29713036084275e-06, + "loss": 0.76849709, + "memory(GiB)": 147.13, + "step": 47880, + "train_speed(iter/s)": 0.201434 + }, + { + "acc": 0.78145351, + "epoch": 1.1173099636915091, + "grad_norm": 5.53125, + "learning_rate": 4.29526005794618e-06, + "loss": 0.79676495, + "memory(GiB)": 147.13, + "step": 47890, + "train_speed(iter/s)": 0.201454 + }, + { + "acc": 0.7888092, + "epoch": 1.117543271263798, + "grad_norm": 6.4375, + "learning_rate": 4.2933898556510325e-06, + "loss": 0.7636147, + "memory(GiB)": 147.13, + "step": 47900, + "train_speed(iter/s)": 0.201475 + }, + { + "acc": 0.77102165, + "epoch": 1.117776578836087, + "grad_norm": 10.4375, + "learning_rate": 4.29151975422428e-06, + "loss": 0.8149704, + "memory(GiB)": 147.13, + "step": 47910, + "train_speed(iter/s)": 0.201496 + }, + { + "acc": 0.7934557, + "epoch": 1.1180098864083758, + "grad_norm": 5.0, + "learning_rate": 4.289649753932874e-06, + "loss": 0.72969637, + "memory(GiB)": 147.13, + "step": 47920, + "train_speed(iter/s)": 0.201517 + }, + { + "acc": 0.79851007, + "epoch": 1.1182431939806645, + "grad_norm": 5.03125, + "learning_rate": 4.28777985504376e-06, + "loss": 0.72625418, + "memory(GiB)": 147.13, + "step": 47930, + "train_speed(iter/s)": 0.201538 + }, + { + "acc": 0.77621775, + "epoch": 1.1184765015529536, + "grad_norm": 6.03125, + "learning_rate": 4.285910057823864e-06, + "loss": 0.83182402, + "memory(GiB)": 147.13, + "step": 47940, + "train_speed(iter/s)": 0.201559 + }, + { + "acc": 0.79367595, + "epoch": 1.1187098091252423, + "grad_norm": 5.25, + "learning_rate": 4.284040362540101e-06, + "loss": 0.73981581, + "memory(GiB)": 147.13, + "step": 47950, + "train_speed(iter/s)": 0.201581 + }, + { + "acc": 0.78099489, + "epoch": 1.1189431166975312, + "grad_norm": 4.21875, + "learning_rate": 4.282170769459367e-06, + "loss": 0.79089422, + "memory(GiB)": 147.13, + "step": 47960, + "train_speed(iter/s)": 0.201603 + }, + { + "acc": 0.79144993, + "epoch": 1.1191764242698201, + "grad_norm": 4.5625, + "learning_rate": 4.2803012788485475e-06, + "loss": 0.74069662, + "memory(GiB)": 147.13, + "step": 47970, + "train_speed(iter/s)": 0.201622 + }, + { + "acc": 0.77184448, + "epoch": 1.119409731842109, + "grad_norm": 5.5625, + "learning_rate": 4.278431890974511e-06, + "loss": 0.82890167, + "memory(GiB)": 147.13, + "step": 47980, + "train_speed(iter/s)": 0.201646 + }, + { + "acc": 0.77109141, + "epoch": 1.119643039414398, + "grad_norm": 8.125, + "learning_rate": 4.276562606104114e-06, + "loss": 0.80681925, + "memory(GiB)": 147.13, + "step": 47990, + "train_speed(iter/s)": 0.201667 + }, + { + "acc": 0.78441682, + "epoch": 1.1198763469866868, + "grad_norm": 4.875, + "learning_rate": 4.274693424504194e-06, + "loss": 0.78442831, + "memory(GiB)": 147.13, + "step": 48000, + "train_speed(iter/s)": 0.201688 + }, + { + "epoch": 1.1198763469866868, + "eval_acc": 0.744195852552643, + "eval_loss": 0.806307315826416, + "eval_runtime": 1268.7398, + "eval_samples_per_second": 28.368, + "eval_steps_per_second": 14.184, + "step": 48000 + }, + { + "acc": 0.78372445, + "epoch": 1.1201096545589757, + "grad_norm": 11.0, + "learning_rate": 4.272824346441576e-06, + "loss": 0.77319117, + "memory(GiB)": 147.13, + "step": 48010, + "train_speed(iter/s)": 0.200624 + }, + { + "acc": 0.77591352, + "epoch": 1.1203429621312646, + "grad_norm": 7.625, + "learning_rate": 4.270955372183074e-06, + "loss": 0.80051012, + "memory(GiB)": 147.13, + "step": 48020, + "train_speed(iter/s)": 0.200643 + }, + { + "acc": 0.77649298, + "epoch": 1.1205762697035535, + "grad_norm": 4.59375, + "learning_rate": 4.269086501995478e-06, + "loss": 0.81262932, + "memory(GiB)": 147.13, + "step": 48030, + "train_speed(iter/s)": 0.200664 + }, + { + "acc": 0.79521112, + "epoch": 1.1208095772758424, + "grad_norm": 5.34375, + "learning_rate": 4.267217736145573e-06, + "loss": 0.7146986, + "memory(GiB)": 147.13, + "step": 48040, + "train_speed(iter/s)": 0.200684 + }, + { + "acc": 0.76602602, + "epoch": 1.1210428848481313, + "grad_norm": 5.75, + "learning_rate": 4.265349074900123e-06, + "loss": 0.83298635, + "memory(GiB)": 147.13, + "step": 48050, + "train_speed(iter/s)": 0.200707 + }, + { + "acc": 0.78246584, + "epoch": 1.1212761924204202, + "grad_norm": 5.21875, + "learning_rate": 4.263480518525878e-06, + "loss": 0.78094912, + "memory(GiB)": 147.13, + "step": 48060, + "train_speed(iter/s)": 0.200728 + }, + { + "acc": 0.77939157, + "epoch": 1.121509499992709, + "grad_norm": 4.78125, + "learning_rate": 4.261612067289577e-06, + "loss": 0.78741126, + "memory(GiB)": 147.13, + "step": 48070, + "train_speed(iter/s)": 0.200751 + }, + { + "acc": 0.78157773, + "epoch": 1.121742807564998, + "grad_norm": 5.25, + "learning_rate": 4.259743721457937e-06, + "loss": 0.76782036, + "memory(GiB)": 147.13, + "step": 48080, + "train_speed(iter/s)": 0.200772 + }, + { + "acc": 0.76486025, + "epoch": 1.121976115137287, + "grad_norm": 6.53125, + "learning_rate": 4.257875481297667e-06, + "loss": 0.86555462, + "memory(GiB)": 147.13, + "step": 48090, + "train_speed(iter/s)": 0.200794 + }, + { + "acc": 0.79323206, + "epoch": 1.1222094227095758, + "grad_norm": 5.40625, + "learning_rate": 4.256007347075455e-06, + "loss": 0.73063769, + "memory(GiB)": 147.13, + "step": 48100, + "train_speed(iter/s)": 0.200816 + }, + { + "acc": 0.76760402, + "epoch": 1.1224427302818647, + "grad_norm": 5.46875, + "learning_rate": 4.254139319057979e-06, + "loss": 0.84659023, + "memory(GiB)": 147.13, + "step": 48110, + "train_speed(iter/s)": 0.200837 + }, + { + "acc": 0.78787589, + "epoch": 1.1226760378541536, + "grad_norm": 8.625, + "learning_rate": 4.252271397511898e-06, + "loss": 0.76824799, + "memory(GiB)": 147.13, + "step": 48120, + "train_speed(iter/s)": 0.200856 + }, + { + "acc": 0.78889484, + "epoch": 1.1229093454264425, + "grad_norm": 5.4375, + "learning_rate": 4.2504035827038595e-06, + "loss": 0.75601206, + "memory(GiB)": 147.13, + "step": 48130, + "train_speed(iter/s)": 0.200878 + }, + { + "acc": 0.78154402, + "epoch": 1.1231426529987314, + "grad_norm": 6.90625, + "learning_rate": 4.248535874900491e-06, + "loss": 0.79978552, + "memory(GiB)": 147.13, + "step": 48140, + "train_speed(iter/s)": 0.200899 + }, + { + "acc": 0.79487672, + "epoch": 1.1233759605710203, + "grad_norm": 4.25, + "learning_rate": 4.246668274368409e-06, + "loss": 0.71404648, + "memory(GiB)": 147.13, + "step": 48150, + "train_speed(iter/s)": 0.20092 + }, + { + "acc": 0.7581543, + "epoch": 1.1236092681433092, + "grad_norm": 4.8125, + "learning_rate": 4.24480078137421e-06, + "loss": 0.88291979, + "memory(GiB)": 147.13, + "step": 48160, + "train_speed(iter/s)": 0.200941 + }, + { + "acc": 0.75657516, + "epoch": 1.123842575715598, + "grad_norm": 4.46875, + "learning_rate": 4.2429333961844805e-06, + "loss": 0.87030544, + "memory(GiB)": 147.13, + "step": 48170, + "train_speed(iter/s)": 0.200963 + }, + { + "acc": 0.79136295, + "epoch": 1.124075883287887, + "grad_norm": 3.890625, + "learning_rate": 4.241066119065789e-06, + "loss": 0.74944844, + "memory(GiB)": 147.13, + "step": 48180, + "train_speed(iter/s)": 0.200985 + }, + { + "acc": 0.77575951, + "epoch": 1.1243091908601759, + "grad_norm": 4.90625, + "learning_rate": 4.239198950284688e-06, + "loss": 0.80282288, + "memory(GiB)": 147.13, + "step": 48190, + "train_speed(iter/s)": 0.201006 + }, + { + "acc": 0.78090439, + "epoch": 1.1245424984324648, + "grad_norm": 5.28125, + "learning_rate": 4.237331890107717e-06, + "loss": 0.79569597, + "memory(GiB)": 147.13, + "step": 48200, + "train_speed(iter/s)": 0.201024 + }, + { + "acc": 0.78601589, + "epoch": 1.1247758060047537, + "grad_norm": 4.09375, + "learning_rate": 4.2354649388013965e-06, + "loss": 0.77665925, + "memory(GiB)": 147.13, + "step": 48210, + "train_speed(iter/s)": 0.201045 + }, + { + "acc": 0.77956362, + "epoch": 1.1250091135770426, + "grad_norm": 4.9375, + "learning_rate": 4.233598096632234e-06, + "loss": 0.77131734, + "memory(GiB)": 147.13, + "step": 48220, + "train_speed(iter/s)": 0.201068 + }, + { + "acc": 0.79384608, + "epoch": 1.1252424211493315, + "grad_norm": 6.09375, + "learning_rate": 4.23173136386672e-06, + "loss": 0.73389316, + "memory(GiB)": 147.13, + "step": 48230, + "train_speed(iter/s)": 0.201089 + }, + { + "acc": 0.78729687, + "epoch": 1.1254757287216204, + "grad_norm": 4.53125, + "learning_rate": 4.2298647407713314e-06, + "loss": 0.7819458, + "memory(GiB)": 147.13, + "step": 48240, + "train_speed(iter/s)": 0.201111 + }, + { + "acc": 0.78039856, + "epoch": 1.1257090362939093, + "grad_norm": 5.1875, + "learning_rate": 4.227998227612529e-06, + "loss": 0.78439426, + "memory(GiB)": 147.13, + "step": 48250, + "train_speed(iter/s)": 0.201133 + }, + { + "acc": 0.78524809, + "epoch": 1.1259423438661982, + "grad_norm": 4.25, + "learning_rate": 4.226131824656752e-06, + "loss": 0.7713275, + "memory(GiB)": 147.13, + "step": 48260, + "train_speed(iter/s)": 0.201155 + }, + { + "acc": 0.77486734, + "epoch": 1.126175651438487, + "grad_norm": 4.125, + "learning_rate": 4.224265532170434e-06, + "loss": 0.79942617, + "memory(GiB)": 147.13, + "step": 48270, + "train_speed(iter/s)": 0.201175 + }, + { + "acc": 0.78316307, + "epoch": 1.126408959010776, + "grad_norm": 6.03125, + "learning_rate": 4.222399350419985e-06, + "loss": 0.76972885, + "memory(GiB)": 147.13, + "step": 48280, + "train_speed(iter/s)": 0.201196 + }, + { + "acc": 0.7986887, + "epoch": 1.1266422665830649, + "grad_norm": 6.625, + "learning_rate": 4.220533279671804e-06, + "loss": 0.71824002, + "memory(GiB)": 147.13, + "step": 48290, + "train_speed(iter/s)": 0.201218 + }, + { + "acc": 0.79125404, + "epoch": 1.1268755741553538, + "grad_norm": 4.4375, + "learning_rate": 4.21866732019227e-06, + "loss": 0.74609642, + "memory(GiB)": 147.13, + "step": 48300, + "train_speed(iter/s)": 0.201238 + }, + { + "acc": 0.78295269, + "epoch": 1.1271088817276427, + "grad_norm": 6.53125, + "learning_rate": 4.216801472247749e-06, + "loss": 0.77721262, + "memory(GiB)": 147.13, + "step": 48310, + "train_speed(iter/s)": 0.20126 + }, + { + "acc": 0.76647134, + "epoch": 1.1273421892999314, + "grad_norm": 5.59375, + "learning_rate": 4.214935736104591e-06, + "loss": 0.85473747, + "memory(GiB)": 147.13, + "step": 48320, + "train_speed(iter/s)": 0.201283 + }, + { + "acc": 0.7766263, + "epoch": 1.1275754968722205, + "grad_norm": 6.625, + "learning_rate": 4.213070112029127e-06, + "loss": 0.80300884, + "memory(GiB)": 147.13, + "step": 48330, + "train_speed(iter/s)": 0.201304 + }, + { + "acc": 0.78820081, + "epoch": 1.1278088044445091, + "grad_norm": 5.8125, + "learning_rate": 4.211204600287677e-06, + "loss": 0.7805532, + "memory(GiB)": 147.13, + "step": 48340, + "train_speed(iter/s)": 0.201325 + }, + { + "acc": 0.78450546, + "epoch": 1.128042112016798, + "grad_norm": 7.84375, + "learning_rate": 4.2093392011465425e-06, + "loss": 0.77261581, + "memory(GiB)": 147.13, + "step": 48350, + "train_speed(iter/s)": 0.201347 + }, + { + "acc": 0.75979958, + "epoch": 1.128275419589087, + "grad_norm": 5.46875, + "learning_rate": 4.207473914872006e-06, + "loss": 0.86760702, + "memory(GiB)": 147.13, + "step": 48360, + "train_speed(iter/s)": 0.201368 + }, + { + "acc": 0.78485813, + "epoch": 1.1285087271613758, + "grad_norm": 6.1875, + "learning_rate": 4.20560874173034e-06, + "loss": 0.7671032, + "memory(GiB)": 147.13, + "step": 48370, + "train_speed(iter/s)": 0.201391 + }, + { + "acc": 0.77036834, + "epoch": 1.1287420347336647, + "grad_norm": 5.6875, + "learning_rate": 4.203743681987793e-06, + "loss": 0.8098217, + "memory(GiB)": 147.13, + "step": 48380, + "train_speed(iter/s)": 0.201413 + }, + { + "acc": 0.77560768, + "epoch": 1.1289753423059536, + "grad_norm": 4.3125, + "learning_rate": 4.2018787359106045e-06, + "loss": 0.81092434, + "memory(GiB)": 147.13, + "step": 48390, + "train_speed(iter/s)": 0.201435 + }, + { + "acc": 0.78146276, + "epoch": 1.1292086498782425, + "grad_norm": 5.15625, + "learning_rate": 4.200013903764994e-06, + "loss": 0.77416515, + "memory(GiB)": 147.13, + "step": 48400, + "train_speed(iter/s)": 0.201457 + }, + { + "acc": 0.77433596, + "epoch": 1.1294419574505314, + "grad_norm": 6.34375, + "learning_rate": 4.198149185817167e-06, + "loss": 0.80879793, + "memory(GiB)": 147.13, + "step": 48410, + "train_speed(iter/s)": 0.201478 + }, + { + "acc": 0.77343392, + "epoch": 1.1296752650228203, + "grad_norm": 6.15625, + "learning_rate": 4.19628458233331e-06, + "loss": 0.81420822, + "memory(GiB)": 147.13, + "step": 48420, + "train_speed(iter/s)": 0.201501 + }, + { + "acc": 0.78997188, + "epoch": 1.1299085725951092, + "grad_norm": 5.625, + "learning_rate": 4.194420093579597e-06, + "loss": 0.74577589, + "memory(GiB)": 147.13, + "step": 48430, + "train_speed(iter/s)": 0.201523 + }, + { + "acc": 0.79533091, + "epoch": 1.1301418801673981, + "grad_norm": 6.25, + "learning_rate": 4.1925557198221805e-06, + "loss": 0.72079945, + "memory(GiB)": 147.13, + "step": 48440, + "train_speed(iter/s)": 0.201542 + }, + { + "acc": 0.76862497, + "epoch": 1.130375187739687, + "grad_norm": 4.5625, + "learning_rate": 4.1906914613272e-06, + "loss": 0.86250811, + "memory(GiB)": 147.13, + "step": 48450, + "train_speed(iter/s)": 0.201564 + }, + { + "acc": 0.78376331, + "epoch": 1.130608495311976, + "grad_norm": 4.53125, + "learning_rate": 4.188827318360779e-06, + "loss": 0.76748829, + "memory(GiB)": 147.13, + "step": 48460, + "train_speed(iter/s)": 0.201585 + }, + { + "acc": 0.77294278, + "epoch": 1.1308418028842648, + "grad_norm": 6.09375, + "learning_rate": 4.186963291189022e-06, + "loss": 0.83114796, + "memory(GiB)": 147.13, + "step": 48470, + "train_speed(iter/s)": 0.201606 + }, + { + "acc": 0.78433104, + "epoch": 1.1310751104565537, + "grad_norm": 4.625, + "learning_rate": 4.185099380078022e-06, + "loss": 0.76472044, + "memory(GiB)": 147.13, + "step": 48480, + "train_speed(iter/s)": 0.201626 + }, + { + "acc": 0.78431997, + "epoch": 1.1313084180288426, + "grad_norm": 5.84375, + "learning_rate": 4.183235585293846e-06, + "loss": 0.79146395, + "memory(GiB)": 147.13, + "step": 48490, + "train_speed(iter/s)": 0.201648 + }, + { + "acc": 0.78068762, + "epoch": 1.1315417256011315, + "grad_norm": 5.96875, + "learning_rate": 4.181371907102553e-06, + "loss": 0.79927993, + "memory(GiB)": 147.13, + "step": 48500, + "train_speed(iter/s)": 0.201669 + }, + { + "epoch": 1.1315417256011315, + "eval_acc": 0.7441143391384016, + "eval_loss": 0.8060808777809143, + "eval_runtime": 1270.8615, + "eval_samples_per_second": 28.32, + "eval_steps_per_second": 14.16, + "step": 48500 + }, + { + "acc": 0.7775506, + "epoch": 1.1317750331734204, + "grad_norm": 5.4375, + "learning_rate": 4.179508345770184e-06, + "loss": 0.77313948, + "memory(GiB)": 147.13, + "step": 48510, + "train_speed(iter/s)": 0.200616 + }, + { + "acc": 0.78219233, + "epoch": 1.1320083407457093, + "grad_norm": 5.0625, + "learning_rate": 4.177644901562758e-06, + "loss": 0.79809017, + "memory(GiB)": 147.13, + "step": 48520, + "train_speed(iter/s)": 0.200637 + }, + { + "acc": 0.7918745, + "epoch": 1.1322416483179982, + "grad_norm": 4.59375, + "learning_rate": 4.1757815747462845e-06, + "loss": 0.740306, + "memory(GiB)": 147.13, + "step": 48530, + "train_speed(iter/s)": 0.200658 + }, + { + "acc": 0.7779398, + "epoch": 1.1324749558902871, + "grad_norm": 7.3125, + "learning_rate": 4.173918365586751e-06, + "loss": 0.81543484, + "memory(GiB)": 147.13, + "step": 48540, + "train_speed(iter/s)": 0.20068 + }, + { + "acc": 0.7860157, + "epoch": 1.132708263462576, + "grad_norm": 4.875, + "learning_rate": 4.172055274350132e-06, + "loss": 0.75977716, + "memory(GiB)": 147.13, + "step": 48550, + "train_speed(iter/s)": 0.2007 + }, + { + "acc": 0.775599, + "epoch": 1.132941571034865, + "grad_norm": 5.4375, + "learning_rate": 4.170192301302382e-06, + "loss": 0.80197821, + "memory(GiB)": 147.13, + "step": 48560, + "train_speed(iter/s)": 0.200722 + }, + { + "acc": 0.79071817, + "epoch": 1.1331748786071538, + "grad_norm": 5.21875, + "learning_rate": 4.168329446709439e-06, + "loss": 0.73534966, + "memory(GiB)": 147.13, + "step": 48570, + "train_speed(iter/s)": 0.200744 + }, + { + "acc": 0.79963331, + "epoch": 1.1334081861794427, + "grad_norm": 4.5, + "learning_rate": 4.166466710837226e-06, + "loss": 0.71867814, + "memory(GiB)": 147.13, + "step": 48580, + "train_speed(iter/s)": 0.200764 + }, + { + "acc": 0.76818967, + "epoch": 1.1336414937517316, + "grad_norm": 4.40625, + "learning_rate": 4.1646040939516485e-06, + "loss": 0.83885851, + "memory(GiB)": 147.13, + "step": 48590, + "train_speed(iter/s)": 0.200784 + }, + { + "acc": 0.8015955, + "epoch": 1.1338748013240205, + "grad_norm": 4.53125, + "learning_rate": 4.162741596318596e-06, + "loss": 0.70797453, + "memory(GiB)": 147.13, + "step": 48600, + "train_speed(iter/s)": 0.200804 + }, + { + "acc": 0.78102293, + "epoch": 1.1341081088963094, + "grad_norm": 5.59375, + "learning_rate": 4.160879218203935e-06, + "loss": 0.79669533, + "memory(GiB)": 147.13, + "step": 48610, + "train_speed(iter/s)": 0.200826 + }, + { + "acc": 0.77543592, + "epoch": 1.1343414164685983, + "grad_norm": 3.890625, + "learning_rate": 4.159016959873521e-06, + "loss": 0.79444084, + "memory(GiB)": 147.13, + "step": 48620, + "train_speed(iter/s)": 0.200847 + }, + { + "acc": 0.76163797, + "epoch": 1.1345747240408872, + "grad_norm": 5.65625, + "learning_rate": 4.1571548215931925e-06, + "loss": 0.86727257, + "memory(GiB)": 147.13, + "step": 48630, + "train_speed(iter/s)": 0.200869 + }, + { + "acc": 0.78029289, + "epoch": 1.1348080316131761, + "grad_norm": 4.46875, + "learning_rate": 4.155292803628768e-06, + "loss": 0.79705906, + "memory(GiB)": 147.13, + "step": 48640, + "train_speed(iter/s)": 0.200891 + }, + { + "acc": 0.77660732, + "epoch": 1.135041339185465, + "grad_norm": 5.21875, + "learning_rate": 4.153430906246052e-06, + "loss": 0.80238552, + "memory(GiB)": 147.13, + "step": 48650, + "train_speed(iter/s)": 0.200913 + }, + { + "acc": 0.78615427, + "epoch": 1.135274646757754, + "grad_norm": 4.875, + "learning_rate": 4.151569129710827e-06, + "loss": 0.78471584, + "memory(GiB)": 147.13, + "step": 48660, + "train_speed(iter/s)": 0.200933 + }, + { + "acc": 0.77250824, + "epoch": 1.1355079543300428, + "grad_norm": 4.8125, + "learning_rate": 4.149707474288862e-06, + "loss": 0.81316957, + "memory(GiB)": 147.13, + "step": 48670, + "train_speed(iter/s)": 0.200956 + }, + { + "acc": 0.79985304, + "epoch": 1.1357412619023317, + "grad_norm": 4.53125, + "learning_rate": 4.147845940245908e-06, + "loss": 0.70010648, + "memory(GiB)": 147.13, + "step": 48680, + "train_speed(iter/s)": 0.200977 + }, + { + "acc": 0.77105551, + "epoch": 1.1359745694746204, + "grad_norm": 5.28125, + "learning_rate": 4.145984527847699e-06, + "loss": 0.81544399, + "memory(GiB)": 147.13, + "step": 48690, + "train_speed(iter/s)": 0.200998 + }, + { + "acc": 0.77478409, + "epoch": 1.1362078770469095, + "grad_norm": 6.03125, + "learning_rate": 4.14412323735995e-06, + "loss": 0.81853638, + "memory(GiB)": 147.13, + "step": 48700, + "train_speed(iter/s)": 0.20102 + }, + { + "acc": 0.78074069, + "epoch": 1.1364411846191982, + "grad_norm": 6.78125, + "learning_rate": 4.142262069048362e-06, + "loss": 0.78377013, + "memory(GiB)": 147.13, + "step": 48710, + "train_speed(iter/s)": 0.201041 + }, + { + "acc": 0.78800902, + "epoch": 1.1366744921914873, + "grad_norm": 5.03125, + "learning_rate": 4.140401023178613e-06, + "loss": 0.76176023, + "memory(GiB)": 147.13, + "step": 48720, + "train_speed(iter/s)": 0.201062 + }, + { + "acc": 0.78853445, + "epoch": 1.136907799763776, + "grad_norm": 4.4375, + "learning_rate": 4.138540100016369e-06, + "loss": 0.7591939, + "memory(GiB)": 147.13, + "step": 48730, + "train_speed(iter/s)": 0.201084 + }, + { + "acc": 0.78152742, + "epoch": 1.1371411073360649, + "grad_norm": 7.03125, + "learning_rate": 4.136679299827275e-06, + "loss": 0.77542849, + "memory(GiB)": 147.13, + "step": 48740, + "train_speed(iter/s)": 0.201105 + }, + { + "acc": 0.80343904, + "epoch": 1.1373744149083538, + "grad_norm": 5.21875, + "learning_rate": 4.134818622876959e-06, + "loss": 0.68836613, + "memory(GiB)": 147.13, + "step": 48750, + "train_speed(iter/s)": 0.201128 + }, + { + "acc": 0.77372913, + "epoch": 1.1376077224806427, + "grad_norm": 6.5625, + "learning_rate": 4.132958069431034e-06, + "loss": 0.78857126, + "memory(GiB)": 147.13, + "step": 48760, + "train_speed(iter/s)": 0.20115 + }, + { + "acc": 0.78540783, + "epoch": 1.1378410300529316, + "grad_norm": 4.6875, + "learning_rate": 4.131097639755093e-06, + "loss": 0.76112299, + "memory(GiB)": 147.13, + "step": 48770, + "train_speed(iter/s)": 0.201169 + }, + { + "acc": 0.77772455, + "epoch": 1.1380743376252205, + "grad_norm": 6.21875, + "learning_rate": 4.129237334114712e-06, + "loss": 0.81547289, + "memory(GiB)": 147.13, + "step": 48780, + "train_speed(iter/s)": 0.20119 + }, + { + "acc": 0.78677707, + "epoch": 1.1383076451975094, + "grad_norm": 4.40625, + "learning_rate": 4.127377152775448e-06, + "loss": 0.77611208, + "memory(GiB)": 147.13, + "step": 48790, + "train_speed(iter/s)": 0.201211 + }, + { + "acc": 0.79749675, + "epoch": 1.1385409527697983, + "grad_norm": 4.125, + "learning_rate": 4.125517096002842e-06, + "loss": 0.71030278, + "memory(GiB)": 147.13, + "step": 48800, + "train_speed(iter/s)": 0.201231 + }, + { + "acc": 0.79024119, + "epoch": 1.1387742603420872, + "grad_norm": 4.46875, + "learning_rate": 4.123657164062415e-06, + "loss": 0.74269834, + "memory(GiB)": 147.13, + "step": 48810, + "train_speed(iter/s)": 0.201252 + }, + { + "acc": 0.76520209, + "epoch": 1.139007567914376, + "grad_norm": 4.84375, + "learning_rate": 4.121797357219678e-06, + "loss": 0.84504023, + "memory(GiB)": 147.13, + "step": 48820, + "train_speed(iter/s)": 0.201274 + }, + { + "acc": 0.7725544, + "epoch": 1.139240875486665, + "grad_norm": 5.09375, + "learning_rate": 4.119937675740109e-06, + "loss": 0.81992121, + "memory(GiB)": 147.13, + "step": 48830, + "train_speed(iter/s)": 0.201296 + }, + { + "acc": 0.78341656, + "epoch": 1.1394741830589539, + "grad_norm": 5.65625, + "learning_rate": 4.118078119889182e-06, + "loss": 0.76839838, + "memory(GiB)": 147.13, + "step": 48840, + "train_speed(iter/s)": 0.201317 + }, + { + "acc": 0.80117064, + "epoch": 1.1397074906312428, + "grad_norm": 5.21875, + "learning_rate": 4.116218689932346e-06, + "loss": 0.71321936, + "memory(GiB)": 147.13, + "step": 48850, + "train_speed(iter/s)": 0.201338 + }, + { + "acc": 0.79148402, + "epoch": 1.1399407982035317, + "grad_norm": 5.53125, + "learning_rate": 4.114359386135038e-06, + "loss": 0.74038553, + "memory(GiB)": 147.13, + "step": 48860, + "train_speed(iter/s)": 0.20136 + }, + { + "acc": 0.77297692, + "epoch": 1.1401741057758206, + "grad_norm": 5.4375, + "learning_rate": 4.112500208762668e-06, + "loss": 0.80347652, + "memory(GiB)": 147.13, + "step": 48870, + "train_speed(iter/s)": 0.201381 + }, + { + "acc": 0.78603468, + "epoch": 1.1404074133481095, + "grad_norm": 4.40625, + "learning_rate": 4.110641158080636e-06, + "loss": 0.78780212, + "memory(GiB)": 147.13, + "step": 48880, + "train_speed(iter/s)": 0.201402 + }, + { + "acc": 0.80655193, + "epoch": 1.1406407209203984, + "grad_norm": 4.78125, + "learning_rate": 4.108782234354321e-06, + "loss": 0.7006731, + "memory(GiB)": 147.13, + "step": 48890, + "train_speed(iter/s)": 0.201423 + }, + { + "acc": 0.79013271, + "epoch": 1.1408740284926873, + "grad_norm": 5.5, + "learning_rate": 4.106923437849082e-06, + "loss": 0.75852995, + "memory(GiB)": 147.13, + "step": 48900, + "train_speed(iter/s)": 0.201443 + }, + { + "acc": 0.7818615, + "epoch": 1.1411073360649762, + "grad_norm": 4.40625, + "learning_rate": 4.105064768830263e-06, + "loss": 0.77103171, + "memory(GiB)": 147.13, + "step": 48910, + "train_speed(iter/s)": 0.201463 + }, + { + "acc": 0.76176796, + "epoch": 1.141340643637265, + "grad_norm": 4.90625, + "learning_rate": 4.1032062275631894e-06, + "loss": 0.87524958, + "memory(GiB)": 147.13, + "step": 48920, + "train_speed(iter/s)": 0.201485 + }, + { + "acc": 0.76909437, + "epoch": 1.141573951209554, + "grad_norm": 4.625, + "learning_rate": 4.101347814313166e-06, + "loss": 0.81877556, + "memory(GiB)": 147.13, + "step": 48930, + "train_speed(iter/s)": 0.201506 + }, + { + "acc": 0.80096893, + "epoch": 1.1418072587818429, + "grad_norm": 4.25, + "learning_rate": 4.099489529345483e-06, + "loss": 0.71625419, + "memory(GiB)": 147.13, + "step": 48940, + "train_speed(iter/s)": 0.201527 + }, + { + "acc": 0.77617383, + "epoch": 1.1420405663541318, + "grad_norm": 5.75, + "learning_rate": 4.097631372925405e-06, + "loss": 0.79977846, + "memory(GiB)": 147.13, + "step": 48950, + "train_speed(iter/s)": 0.201549 + }, + { + "acc": 0.78573585, + "epoch": 1.1422738739264207, + "grad_norm": 6.28125, + "learning_rate": 4.095773345318186e-06, + "loss": 0.77477055, + "memory(GiB)": 147.13, + "step": 48960, + "train_speed(iter/s)": 0.201571 + }, + { + "acc": 0.78423834, + "epoch": 1.1425071814987096, + "grad_norm": 15.9375, + "learning_rate": 4.0939154467890605e-06, + "loss": 0.76231022, + "memory(GiB)": 147.13, + "step": 48970, + "train_speed(iter/s)": 0.201591 + }, + { + "acc": 0.79738851, + "epoch": 1.1427404890709985, + "grad_norm": 8.875, + "learning_rate": 4.0920576776032415e-06, + "loss": 0.71518207, + "memory(GiB)": 147.13, + "step": 48980, + "train_speed(iter/s)": 0.201613 + }, + { + "acc": 0.77292356, + "epoch": 1.1429737966432874, + "grad_norm": 4.375, + "learning_rate": 4.090200038025926e-06, + "loss": 0.79759941, + "memory(GiB)": 147.13, + "step": 48990, + "train_speed(iter/s)": 0.201634 + }, + { + "acc": 0.8075819, + "epoch": 1.1432071042155763, + "grad_norm": 7.0625, + "learning_rate": 4.08834252832229e-06, + "loss": 0.67757139, + "memory(GiB)": 147.13, + "step": 49000, + "train_speed(iter/s)": 0.201656 + }, + { + "epoch": 1.1432071042155763, + "eval_acc": 0.744222436495205, + "eval_loss": 0.8061447739601135, + "eval_runtime": 1270.2536, + "eval_samples_per_second": 28.334, + "eval_steps_per_second": 14.167, + "step": 49000 + }, + { + "acc": 0.78951435, + "epoch": 1.1434404117878652, + "grad_norm": 6.3125, + "learning_rate": 4.086485148757493e-06, + "loss": 0.7389822, + "memory(GiB)": 147.13, + "step": 49010, + "train_speed(iter/s)": 0.200613 + }, + { + "acc": 0.78884678, + "epoch": 1.143673719360154, + "grad_norm": 5.875, + "learning_rate": 4.084627899596676e-06, + "loss": 0.75943174, + "memory(GiB)": 147.13, + "step": 49020, + "train_speed(iter/s)": 0.200633 + }, + { + "acc": 0.76504393, + "epoch": 1.143907026932443, + "grad_norm": 4.6875, + "learning_rate": 4.082770781104961e-06, + "loss": 0.84475698, + "memory(GiB)": 147.13, + "step": 49030, + "train_speed(iter/s)": 0.200653 + }, + { + "acc": 0.79473829, + "epoch": 1.1441403345047318, + "grad_norm": 6.03125, + "learning_rate": 4.080913793547449e-06, + "loss": 0.74036322, + "memory(GiB)": 147.13, + "step": 49040, + "train_speed(iter/s)": 0.200673 + }, + { + "acc": 0.77508821, + "epoch": 1.1443736420770207, + "grad_norm": 5.4375, + "learning_rate": 4.079056937189229e-06, + "loss": 0.80481701, + "memory(GiB)": 147.13, + "step": 49050, + "train_speed(iter/s)": 0.200695 + }, + { + "acc": 0.7855969, + "epoch": 1.1446069496493096, + "grad_norm": 5.5625, + "learning_rate": 4.077200212295361e-06, + "loss": 0.76578369, + "memory(GiB)": 147.13, + "step": 49060, + "train_speed(iter/s)": 0.200716 + }, + { + "acc": 0.7720355, + "epoch": 1.1448402572215985, + "grad_norm": 4.875, + "learning_rate": 4.075343619130895e-06, + "loss": 0.8170085, + "memory(GiB)": 147.13, + "step": 49070, + "train_speed(iter/s)": 0.200737 + }, + { + "acc": 0.7803793, + "epoch": 1.1450735647938872, + "grad_norm": 4.71875, + "learning_rate": 4.0734871579608606e-06, + "loss": 0.79507275, + "memory(GiB)": 147.13, + "step": 49080, + "train_speed(iter/s)": 0.200756 + }, + { + "acc": 0.78040457, + "epoch": 1.1453068723661763, + "grad_norm": 5.28125, + "learning_rate": 4.071630829050263e-06, + "loss": 0.79167814, + "memory(GiB)": 147.13, + "step": 49090, + "train_speed(iter/s)": 0.200777 + }, + { + "acc": 0.80139008, + "epoch": 1.145540179938465, + "grad_norm": 4.28125, + "learning_rate": 4.069774632664095e-06, + "loss": 0.69903708, + "memory(GiB)": 147.13, + "step": 49100, + "train_speed(iter/s)": 0.200795 + }, + { + "acc": 0.7990375, + "epoch": 1.1457734875107541, + "grad_norm": 6.46875, + "learning_rate": 4.0679185690673285e-06, + "loss": 0.707726, + "memory(GiB)": 147.13, + "step": 49110, + "train_speed(iter/s)": 0.200815 + }, + { + "acc": 0.78790112, + "epoch": 1.1460067950830428, + "grad_norm": 5.625, + "learning_rate": 4.066062638524915e-06, + "loss": 0.76991944, + "memory(GiB)": 147.13, + "step": 49120, + "train_speed(iter/s)": 0.200838 + }, + { + "acc": 0.77768164, + "epoch": 1.1462401026553317, + "grad_norm": 7.125, + "learning_rate": 4.064206841301789e-06, + "loss": 0.78967957, + "memory(GiB)": 147.13, + "step": 49130, + "train_speed(iter/s)": 0.20086 + }, + { + "acc": 0.78068905, + "epoch": 1.1464734102276206, + "grad_norm": 5.53125, + "learning_rate": 4.062351177662866e-06, + "loss": 0.77087574, + "memory(GiB)": 147.13, + "step": 49140, + "train_speed(iter/s)": 0.200883 + }, + { + "acc": 0.78477345, + "epoch": 1.1467067177999095, + "grad_norm": 4.6875, + "learning_rate": 4.060495647873038e-06, + "loss": 0.77597842, + "memory(GiB)": 147.13, + "step": 49150, + "train_speed(iter/s)": 0.200903 + }, + { + "acc": 0.77061286, + "epoch": 1.1469400253721984, + "grad_norm": 6.0625, + "learning_rate": 4.058640252197184e-06, + "loss": 0.81207161, + "memory(GiB)": 147.13, + "step": 49160, + "train_speed(iter/s)": 0.200923 + }, + { + "acc": 0.79196901, + "epoch": 1.1471733329444873, + "grad_norm": 6.90625, + "learning_rate": 4.056784990900162e-06, + "loss": 0.73144436, + "memory(GiB)": 147.13, + "step": 49170, + "train_speed(iter/s)": 0.200944 + }, + { + "acc": 0.76946406, + "epoch": 1.1474066405167762, + "grad_norm": 4.75, + "learning_rate": 4.054929864246807e-06, + "loss": 0.8269702, + "memory(GiB)": 147.13, + "step": 49180, + "train_speed(iter/s)": 0.200966 + }, + { + "acc": 0.79781704, + "epoch": 1.147639948089065, + "grad_norm": 5.03125, + "learning_rate": 4.053074872501939e-06, + "loss": 0.72426014, + "memory(GiB)": 147.13, + "step": 49190, + "train_speed(iter/s)": 0.200989 + }, + { + "acc": 0.79303007, + "epoch": 1.147873255661354, + "grad_norm": 3.4375, + "learning_rate": 4.051220015930358e-06, + "loss": 0.74754534, + "memory(GiB)": 147.13, + "step": 49200, + "train_speed(iter/s)": 0.20101 + }, + { + "acc": 0.78933134, + "epoch": 1.148106563233643, + "grad_norm": 7.0, + "learning_rate": 4.049365294796844e-06, + "loss": 0.74844933, + "memory(GiB)": 147.13, + "step": 49210, + "train_speed(iter/s)": 0.20103 + }, + { + "acc": 0.76711445, + "epoch": 1.1483398708059318, + "grad_norm": 3.390625, + "learning_rate": 4.047510709366159e-06, + "loss": 0.85391102, + "memory(GiB)": 147.13, + "step": 49220, + "train_speed(iter/s)": 0.201052 + }, + { + "acc": 0.77335253, + "epoch": 1.1485731783782207, + "grad_norm": 6.59375, + "learning_rate": 4.045656259903042e-06, + "loss": 0.81023026, + "memory(GiB)": 147.13, + "step": 49230, + "train_speed(iter/s)": 0.201073 + }, + { + "acc": 0.79616175, + "epoch": 1.1488064859505096, + "grad_norm": 5.15625, + "learning_rate": 4.043801946672217e-06, + "loss": 0.72714806, + "memory(GiB)": 147.13, + "step": 49240, + "train_speed(iter/s)": 0.201095 + }, + { + "acc": 0.77853937, + "epoch": 1.1490397935227985, + "grad_norm": 6.4375, + "learning_rate": 4.041947769938387e-06, + "loss": 0.81164007, + "memory(GiB)": 147.13, + "step": 49250, + "train_speed(iter/s)": 0.201117 + }, + { + "acc": 0.78679638, + "epoch": 1.1492731010950874, + "grad_norm": 4.6875, + "learning_rate": 4.040093729966234e-06, + "loss": 0.77023234, + "memory(GiB)": 147.13, + "step": 49260, + "train_speed(iter/s)": 0.201139 + }, + { + "acc": 0.79337749, + "epoch": 1.1495064086673763, + "grad_norm": 5.0625, + "learning_rate": 4.038239827020424e-06, + "loss": 0.72135153, + "memory(GiB)": 147.13, + "step": 49270, + "train_speed(iter/s)": 0.20116 + }, + { + "acc": 0.79230156, + "epoch": 1.1497397162396652, + "grad_norm": 6.0, + "learning_rate": 4.036386061365598e-06, + "loss": 0.74501095, + "memory(GiB)": 147.13, + "step": 49280, + "train_speed(iter/s)": 0.201181 + }, + { + "acc": 0.78310337, + "epoch": 1.149973023811954, + "grad_norm": 5.21875, + "learning_rate": 4.034532433266382e-06, + "loss": 0.78334503, + "memory(GiB)": 147.13, + "step": 49290, + "train_speed(iter/s)": 0.201201 + }, + { + "acc": 0.78788719, + "epoch": 1.150206331384243, + "grad_norm": 12.9375, + "learning_rate": 4.032678942987382e-06, + "loss": 0.74433508, + "memory(GiB)": 147.13, + "step": 49300, + "train_speed(iter/s)": 0.201221 + }, + { + "acc": 0.77241621, + "epoch": 1.150439638956532, + "grad_norm": 5.0625, + "learning_rate": 4.030825590793179e-06, + "loss": 0.81397333, + "memory(GiB)": 147.13, + "step": 49310, + "train_speed(iter/s)": 0.201242 + }, + { + "acc": 0.78942137, + "epoch": 1.1506729465288208, + "grad_norm": 4.75, + "learning_rate": 4.028972376948343e-06, + "loss": 0.75723009, + "memory(GiB)": 147.13, + "step": 49320, + "train_speed(iter/s)": 0.201261 + }, + { + "acc": 0.78589869, + "epoch": 1.1509062541011097, + "grad_norm": 6.0625, + "learning_rate": 4.027119301717417e-06, + "loss": 0.74469566, + "memory(GiB)": 147.13, + "step": 49330, + "train_speed(iter/s)": 0.201283 + }, + { + "acc": 0.79240808, + "epoch": 1.1511395616733986, + "grad_norm": 4.71875, + "learning_rate": 4.025266365364928e-06, + "loss": 0.72842922, + "memory(GiB)": 147.13, + "step": 49340, + "train_speed(iter/s)": 0.201302 + }, + { + "acc": 0.79350748, + "epoch": 1.1513728692456875, + "grad_norm": 8.3125, + "learning_rate": 4.0234135681553835e-06, + "loss": 0.74578514, + "memory(GiB)": 147.13, + "step": 49350, + "train_speed(iter/s)": 0.201324 + }, + { + "acc": 0.76579685, + "epoch": 1.1516061768179764, + "grad_norm": 5.5625, + "learning_rate": 4.021560910353268e-06, + "loss": 0.84622517, + "memory(GiB)": 147.13, + "step": 49360, + "train_speed(iter/s)": 0.201344 + }, + { + "acc": 0.77959232, + "epoch": 1.1518394843902653, + "grad_norm": 5.03125, + "learning_rate": 4.019708392223048e-06, + "loss": 0.79495625, + "memory(GiB)": 147.13, + "step": 49370, + "train_speed(iter/s)": 0.201366 + }, + { + "acc": 0.79989409, + "epoch": 1.1520727919625542, + "grad_norm": 4.875, + "learning_rate": 4.017856014029171e-06, + "loss": 0.71503725, + "memory(GiB)": 147.13, + "step": 49380, + "train_speed(iter/s)": 0.201386 + }, + { + "acc": 0.77668247, + "epoch": 1.152306099534843, + "grad_norm": 5.0, + "learning_rate": 4.016003776036064e-06, + "loss": 0.78333073, + "memory(GiB)": 147.13, + "step": 49390, + "train_speed(iter/s)": 0.201405 + }, + { + "acc": 0.77576346, + "epoch": 1.152539407107132, + "grad_norm": 6.34375, + "learning_rate": 4.01415167850813e-06, + "loss": 0.80841331, + "memory(GiB)": 147.13, + "step": 49400, + "train_speed(iter/s)": 0.201426 + }, + { + "acc": 0.77319818, + "epoch": 1.1527727146794209, + "grad_norm": 5.6875, + "learning_rate": 4.012299721709757e-06, + "loss": 0.81761799, + "memory(GiB)": 147.13, + "step": 49410, + "train_speed(iter/s)": 0.201447 + }, + { + "acc": 0.76485691, + "epoch": 1.1530060222517098, + "grad_norm": 8.375, + "learning_rate": 4.010447905905312e-06, + "loss": 0.85202141, + "memory(GiB)": 147.13, + "step": 49420, + "train_speed(iter/s)": 0.201468 + }, + { + "acc": 0.77609119, + "epoch": 1.1532393298239987, + "grad_norm": 6.0, + "learning_rate": 4.0085962313591416e-06, + "loss": 0.82331104, + "memory(GiB)": 147.13, + "step": 49430, + "train_speed(iter/s)": 0.201488 + }, + { + "acc": 0.76981869, + "epoch": 1.1534726373962876, + "grad_norm": 4.125, + "learning_rate": 4.006744698335572e-06, + "loss": 0.83070965, + "memory(GiB)": 147.13, + "step": 49440, + "train_speed(iter/s)": 0.201508 + }, + { + "acc": 0.77602968, + "epoch": 1.1537059449685765, + "grad_norm": 5.5, + "learning_rate": 4.004893307098907e-06, + "loss": 0.80735464, + "memory(GiB)": 147.13, + "step": 49450, + "train_speed(iter/s)": 0.20153 + }, + { + "acc": 0.79087119, + "epoch": 1.1539392525408654, + "grad_norm": 4.46875, + "learning_rate": 4.003042057913434e-06, + "loss": 0.74953089, + "memory(GiB)": 147.13, + "step": 49460, + "train_speed(iter/s)": 0.201551 + }, + { + "acc": 0.79258838, + "epoch": 1.154172560113154, + "grad_norm": 4.65625, + "learning_rate": 4.001190951043416e-06, + "loss": 0.73958902, + "memory(GiB)": 147.13, + "step": 49470, + "train_speed(iter/s)": 0.201572 + }, + { + "acc": 0.78888607, + "epoch": 1.1544058676854432, + "grad_norm": 5.125, + "learning_rate": 3.9993399867531e-06, + "loss": 0.74525504, + "memory(GiB)": 147.13, + "step": 49480, + "train_speed(iter/s)": 0.201592 + }, + { + "acc": 0.78004522, + "epoch": 1.1546391752577319, + "grad_norm": 7.15625, + "learning_rate": 3.997489165306713e-06, + "loss": 0.77018299, + "memory(GiB)": 147.13, + "step": 49490, + "train_speed(iter/s)": 0.201612 + }, + { + "acc": 0.78893394, + "epoch": 1.154872482830021, + "grad_norm": 7.375, + "learning_rate": 3.995638486968453e-06, + "loss": 0.74949064, + "memory(GiB)": 147.13, + "step": 49500, + "train_speed(iter/s)": 0.201633 + }, + { + "epoch": 1.154872482830021, + "eval_acc": 0.7442927398131853, + "eval_loss": 0.8059793710708618, + "eval_runtime": 1270.1643, + "eval_samples_per_second": 28.336, + "eval_steps_per_second": 14.168, + "step": 49500 + }, + { + "acc": 0.77485609, + "epoch": 1.1551057904023097, + "grad_norm": 4.84375, + "learning_rate": 3.99378795200251e-06, + "loss": 0.8245182, + "memory(GiB)": 147.13, + "step": 49510, + "train_speed(iter/s)": 0.200599 + }, + { + "acc": 0.77423291, + "epoch": 1.1553390979745986, + "grad_norm": 6.34375, + "learning_rate": 3.991937560673044e-06, + "loss": 0.79736261, + "memory(GiB)": 147.13, + "step": 49520, + "train_speed(iter/s)": 0.20062 + }, + { + "acc": 0.77984371, + "epoch": 1.1555724055468874, + "grad_norm": 5.15625, + "learning_rate": 3.990087313244197e-06, + "loss": 0.77974262, + "memory(GiB)": 147.13, + "step": 49530, + "train_speed(iter/s)": 0.200641 + }, + { + "acc": 0.77480555, + "epoch": 1.1558057131191763, + "grad_norm": 6.34375, + "learning_rate": 3.988237209980093e-06, + "loss": 0.79339681, + "memory(GiB)": 147.13, + "step": 49540, + "train_speed(iter/s)": 0.200664 + }, + { + "acc": 0.79203959, + "epoch": 1.1560390206914652, + "grad_norm": 5.0, + "learning_rate": 3.986387251144833e-06, + "loss": 0.74149151, + "memory(GiB)": 147.13, + "step": 49550, + "train_speed(iter/s)": 0.200683 + }, + { + "acc": 0.78118038, + "epoch": 1.1562723282637541, + "grad_norm": 6.0, + "learning_rate": 3.9845374370024995e-06, + "loss": 0.78897324, + "memory(GiB)": 147.13, + "step": 49560, + "train_speed(iter/s)": 0.200703 + }, + { + "acc": 0.78257017, + "epoch": 1.156505635836043, + "grad_norm": 5.03125, + "learning_rate": 3.9826877678171515e-06, + "loss": 0.78667088, + "memory(GiB)": 147.13, + "step": 49570, + "train_speed(iter/s)": 0.200723 + }, + { + "acc": 0.77982354, + "epoch": 1.156738943408332, + "grad_norm": 5.1875, + "learning_rate": 3.980838243852829e-06, + "loss": 0.77447562, + "memory(GiB)": 147.13, + "step": 49580, + "train_speed(iter/s)": 0.200745 + }, + { + "acc": 0.7748292, + "epoch": 1.1569722509806208, + "grad_norm": 4.5, + "learning_rate": 3.978988865373551e-06, + "loss": 0.80467196, + "memory(GiB)": 147.13, + "step": 49590, + "train_speed(iter/s)": 0.200766 + }, + { + "acc": 0.78131981, + "epoch": 1.1572055585529097, + "grad_norm": 5.0625, + "learning_rate": 3.977139632643316e-06, + "loss": 0.7902432, + "memory(GiB)": 147.13, + "step": 49600, + "train_speed(iter/s)": 0.200786 + }, + { + "acc": 0.79091144, + "epoch": 1.1574388661251986, + "grad_norm": 5.3125, + "learning_rate": 3.975290545926101e-06, + "loss": 0.77418213, + "memory(GiB)": 147.13, + "step": 49610, + "train_speed(iter/s)": 0.200807 + }, + { + "acc": 0.77418609, + "epoch": 1.1576721736974875, + "grad_norm": 6.0625, + "learning_rate": 3.973441605485864e-06, + "loss": 0.81481934, + "memory(GiB)": 147.13, + "step": 49620, + "train_speed(iter/s)": 0.200826 + }, + { + "acc": 0.78058105, + "epoch": 1.1579054812697764, + "grad_norm": 8.375, + "learning_rate": 3.971592811586539e-06, + "loss": 0.78378577, + "memory(GiB)": 147.13, + "step": 49630, + "train_speed(iter/s)": 0.200846 + }, + { + "acc": 0.76909003, + "epoch": 1.1581387888420653, + "grad_norm": 6.09375, + "learning_rate": 3.969744164492041e-06, + "loss": 0.83589916, + "memory(GiB)": 147.13, + "step": 49640, + "train_speed(iter/s)": 0.200868 + }, + { + "acc": 0.77044625, + "epoch": 1.1583720964143542, + "grad_norm": 5.15625, + "learning_rate": 3.967895664466265e-06, + "loss": 0.81802549, + "memory(GiB)": 147.13, + "step": 49650, + "train_speed(iter/s)": 0.200888 + }, + { + "acc": 0.77337999, + "epoch": 1.1586054039866431, + "grad_norm": 5.375, + "learning_rate": 3.966047311773083e-06, + "loss": 0.82878065, + "memory(GiB)": 147.13, + "step": 49660, + "train_speed(iter/s)": 0.200909 + }, + { + "acc": 0.76443853, + "epoch": 1.158838711558932, + "grad_norm": 5.96875, + "learning_rate": 3.964199106676345e-06, + "loss": 0.85824375, + "memory(GiB)": 147.13, + "step": 49670, + "train_speed(iter/s)": 0.20093 + }, + { + "acc": 0.7868217, + "epoch": 1.159072019131221, + "grad_norm": 13.0625, + "learning_rate": 3.962351049439885e-06, + "loss": 0.77052197, + "memory(GiB)": 147.13, + "step": 49680, + "train_speed(iter/s)": 0.200953 + }, + { + "acc": 0.78962936, + "epoch": 1.1593053267035098, + "grad_norm": 5.96875, + "learning_rate": 3.960503140327511e-06, + "loss": 0.75969067, + "memory(GiB)": 147.13, + "step": 49690, + "train_speed(iter/s)": 0.200974 + }, + { + "acc": 0.76684074, + "epoch": 1.1595386342757987, + "grad_norm": 7.0625, + "learning_rate": 3.958655379603011e-06, + "loss": 0.85526161, + "memory(GiB)": 147.13, + "step": 49700, + "train_speed(iter/s)": 0.200996 + }, + { + "acc": 0.78498111, + "epoch": 1.1597719418480876, + "grad_norm": 5.0625, + "learning_rate": 3.956807767530155e-06, + "loss": 0.78701396, + "memory(GiB)": 147.13, + "step": 49710, + "train_speed(iter/s)": 0.201017 + }, + { + "acc": 0.7782752, + "epoch": 1.1600052494203765, + "grad_norm": 4.8125, + "learning_rate": 3.954960304372686e-06, + "loss": 0.82441578, + "memory(GiB)": 147.13, + "step": 49720, + "train_speed(iter/s)": 0.201039 + }, + { + "acc": 0.76695485, + "epoch": 1.1602385569926654, + "grad_norm": 4.90625, + "learning_rate": 3.95311299039433e-06, + "loss": 0.86624813, + "memory(GiB)": 147.13, + "step": 49730, + "train_speed(iter/s)": 0.20106 + }, + { + "acc": 0.77228241, + "epoch": 1.1604718645649543, + "grad_norm": 3.875, + "learning_rate": 3.951265825858792e-06, + "loss": 0.8170001, + "memory(GiB)": 147.13, + "step": 49740, + "train_speed(iter/s)": 0.201083 + }, + { + "acc": 0.77791176, + "epoch": 1.1607051721372432, + "grad_norm": 5.75, + "learning_rate": 3.949418811029752e-06, + "loss": 0.8135541, + "memory(GiB)": 147.13, + "step": 49750, + "train_speed(iter/s)": 0.201103 + }, + { + "acc": 0.7993576, + "epoch": 1.1609384797095321, + "grad_norm": 6.09375, + "learning_rate": 3.94757194617087e-06, + "loss": 0.70589614, + "memory(GiB)": 147.13, + "step": 49760, + "train_speed(iter/s)": 0.201124 + }, + { + "acc": 0.7687006, + "epoch": 1.161171787281821, + "grad_norm": 6.90625, + "learning_rate": 3.945725231545787e-06, + "loss": 0.8219142, + "memory(GiB)": 147.13, + "step": 49770, + "train_speed(iter/s)": 0.201145 + }, + { + "acc": 0.78447962, + "epoch": 1.16140509485411, + "grad_norm": 5.375, + "learning_rate": 3.943878667418122e-06, + "loss": 0.76722612, + "memory(GiB)": 147.13, + "step": 49780, + "train_speed(iter/s)": 0.201165 + }, + { + "acc": 0.78034124, + "epoch": 1.1616384024263988, + "grad_norm": 4.46875, + "learning_rate": 3.942032254051471e-06, + "loss": 0.78942509, + "memory(GiB)": 147.13, + "step": 49790, + "train_speed(iter/s)": 0.201185 + }, + { + "acc": 0.79355407, + "epoch": 1.1618717099986877, + "grad_norm": 5.78125, + "learning_rate": 3.940185991709407e-06, + "loss": 0.72849441, + "memory(GiB)": 147.13, + "step": 49800, + "train_speed(iter/s)": 0.201206 + }, + { + "acc": 0.7907186, + "epoch": 1.1621050175709766, + "grad_norm": 4.125, + "learning_rate": 3.938339880655485e-06, + "loss": 0.76254416, + "memory(GiB)": 147.13, + "step": 49810, + "train_speed(iter/s)": 0.201226 + }, + { + "acc": 0.78690691, + "epoch": 1.1623383251432655, + "grad_norm": 5.03125, + "learning_rate": 3.9364939211532365e-06, + "loss": 0.76221375, + "memory(GiB)": 147.13, + "step": 49820, + "train_speed(iter/s)": 0.201247 + }, + { + "acc": 0.77374029, + "epoch": 1.1625716327155544, + "grad_norm": 4.96875, + "learning_rate": 3.934648113466172e-06, + "loss": 0.80819073, + "memory(GiB)": 147.13, + "step": 49830, + "train_speed(iter/s)": 0.201268 + }, + { + "acc": 0.77530112, + "epoch": 1.1628049402878433, + "grad_norm": 5.0, + "learning_rate": 3.93280245785778e-06, + "loss": 0.78218956, + "memory(GiB)": 147.13, + "step": 49840, + "train_speed(iter/s)": 0.201288 + }, + { + "acc": 0.78257365, + "epoch": 1.1630382478601322, + "grad_norm": 5.3125, + "learning_rate": 3.9309569545915285e-06, + "loss": 0.77504187, + "memory(GiB)": 147.13, + "step": 49850, + "train_speed(iter/s)": 0.201309 + }, + { + "acc": 0.77935219, + "epoch": 1.163271555432421, + "grad_norm": 7.0625, + "learning_rate": 3.9291116039308605e-06, + "loss": 0.79420118, + "memory(GiB)": 147.13, + "step": 49860, + "train_speed(iter/s)": 0.20133 + }, + { + "acc": 0.78821473, + "epoch": 1.16350486300471, + "grad_norm": 5.625, + "learning_rate": 3.9272664061392e-06, + "loss": 0.74523487, + "memory(GiB)": 147.13, + "step": 49870, + "train_speed(iter/s)": 0.201351 + }, + { + "acc": 0.75341196, + "epoch": 1.1637381705769987, + "grad_norm": 6.125, + "learning_rate": 3.925421361479947e-06, + "loss": 0.88330593, + "memory(GiB)": 147.13, + "step": 49880, + "train_speed(iter/s)": 0.20137 + }, + { + "acc": 0.77374578, + "epoch": 1.1639714781492876, + "grad_norm": 5.25, + "learning_rate": 3.923576470216483e-06, + "loss": 0.83307543, + "memory(GiB)": 147.13, + "step": 49890, + "train_speed(iter/s)": 0.201391 + }, + { + "acc": 0.7787755, + "epoch": 1.1642047857215765, + "grad_norm": 8.25, + "learning_rate": 3.9217317326121655e-06, + "loss": 0.78595629, + "memory(GiB)": 147.13, + "step": 49900, + "train_speed(iter/s)": 0.201411 + }, + { + "acc": 0.79464808, + "epoch": 1.1644380932938654, + "grad_norm": 5.5625, + "learning_rate": 3.919887148930329e-06, + "loss": 0.72763758, + "memory(GiB)": 147.13, + "step": 49910, + "train_speed(iter/s)": 0.201431 + }, + { + "acc": 0.78488331, + "epoch": 1.1646714008661543, + "grad_norm": 5.21875, + "learning_rate": 3.918042719434288e-06, + "loss": 0.77773986, + "memory(GiB)": 147.13, + "step": 49920, + "train_speed(iter/s)": 0.201453 + }, + { + "acc": 0.79530802, + "epoch": 1.1649047084384432, + "grad_norm": 4.46875, + "learning_rate": 3.916198444387337e-06, + "loss": 0.71487732, + "memory(GiB)": 147.13, + "step": 49930, + "train_speed(iter/s)": 0.201473 + }, + { + "acc": 0.78052902, + "epoch": 1.165138016010732, + "grad_norm": 6.03125, + "learning_rate": 3.914354324052741e-06, + "loss": 0.77790031, + "memory(GiB)": 147.13, + "step": 49940, + "train_speed(iter/s)": 0.201494 + }, + { + "acc": 0.7807313, + "epoch": 1.165371323583021, + "grad_norm": 4.46875, + "learning_rate": 3.91251035869375e-06, + "loss": 0.77833533, + "memory(GiB)": 147.13, + "step": 49950, + "train_speed(iter/s)": 0.201514 + }, + { + "acc": 0.78631954, + "epoch": 1.1656046311553099, + "grad_norm": 5.5625, + "learning_rate": 3.91066654857359e-06, + "loss": 0.7649682, + "memory(GiB)": 147.13, + "step": 49960, + "train_speed(iter/s)": 0.201533 + }, + { + "acc": 0.78310833, + "epoch": 1.1658379387275988, + "grad_norm": 5.28125, + "learning_rate": 3.908822893955466e-06, + "loss": 0.78763723, + "memory(GiB)": 147.13, + "step": 49970, + "train_speed(iter/s)": 0.201554 + }, + { + "acc": 0.7878459, + "epoch": 1.1660712462998877, + "grad_norm": 7.6875, + "learning_rate": 3.9069793951025544e-06, + "loss": 0.74921327, + "memory(GiB)": 147.13, + "step": 49980, + "train_speed(iter/s)": 0.201575 + }, + { + "acc": 0.76945477, + "epoch": 1.1663045538721766, + "grad_norm": 4.125, + "learning_rate": 3.9051360522780166e-06, + "loss": 0.81270313, + "memory(GiB)": 147.13, + "step": 49990, + "train_speed(iter/s)": 0.201596 + }, + { + "acc": 0.76828485, + "epoch": 1.1665378614444655, + "grad_norm": 6.5625, + "learning_rate": 3.903292865744989e-06, + "loss": 0.81358232, + "memory(GiB)": 147.13, + "step": 50000, + "train_speed(iter/s)": 0.201615 + }, + { + "epoch": 1.1665378614444655, + "eval_acc": 0.7442993057267097, + "eval_loss": 0.8060175180435181, + "eval_runtime": 1270.1101, + "eval_samples_per_second": 28.337, + "eval_steps_per_second": 14.169, + "step": 50000 + }, + { + "acc": 0.78241692, + "epoch": 1.1667711690167544, + "grad_norm": 5.375, + "learning_rate": 3.901449835766588e-06, + "loss": 0.79187217, + "memory(GiB)": 147.13, + "step": 50010, + "train_speed(iter/s)": 0.200593 + }, + { + "acc": 0.7830759, + "epoch": 1.1670044765890433, + "grad_norm": 6.4375, + "learning_rate": 3.899606962605902e-06, + "loss": 0.76939902, + "memory(GiB)": 147.13, + "step": 50020, + "train_speed(iter/s)": 0.200614 + }, + { + "acc": 0.77431717, + "epoch": 1.1672377841613322, + "grad_norm": 5.84375, + "learning_rate": 3.897764246526003e-06, + "loss": 0.80634575, + "memory(GiB)": 147.13, + "step": 50030, + "train_speed(iter/s)": 0.200635 + }, + { + "acc": 0.77010593, + "epoch": 1.167471091733621, + "grad_norm": 6.03125, + "learning_rate": 3.895921687789936e-06, + "loss": 0.83955479, + "memory(GiB)": 147.13, + "step": 50040, + "train_speed(iter/s)": 0.200657 + }, + { + "acc": 0.77356577, + "epoch": 1.16770439930591, + "grad_norm": 5.0625, + "learning_rate": 3.894079286660729e-06, + "loss": 0.80566463, + "memory(GiB)": 147.13, + "step": 50050, + "train_speed(iter/s)": 0.200677 + }, + { + "acc": 0.75591946, + "epoch": 1.1679377068781989, + "grad_norm": 25.0, + "learning_rate": 3.892237043401382e-06, + "loss": 0.87540159, + "memory(GiB)": 147.13, + "step": 50060, + "train_speed(iter/s)": 0.200697 + }, + { + "acc": 0.79346662, + "epoch": 1.1681710144504878, + "grad_norm": 9.4375, + "learning_rate": 3.890394958274877e-06, + "loss": 0.74789476, + "memory(GiB)": 147.13, + "step": 50070, + "train_speed(iter/s)": 0.200719 + }, + { + "acc": 0.78836994, + "epoch": 1.1684043220227767, + "grad_norm": 5.625, + "learning_rate": 3.888553031544169e-06, + "loss": 0.75674076, + "memory(GiB)": 147.13, + "step": 50080, + "train_speed(iter/s)": 0.20074 + }, + { + "acc": 0.78470092, + "epoch": 1.1686376295950656, + "grad_norm": 4.6875, + "learning_rate": 3.886711263472192e-06, + "loss": 0.78518977, + "memory(GiB)": 147.13, + "step": 50090, + "train_speed(iter/s)": 0.200762 + }, + { + "acc": 0.79191232, + "epoch": 1.1688709371673545, + "grad_norm": 3.90625, + "learning_rate": 3.884869654321859e-06, + "loss": 0.73618574, + "memory(GiB)": 147.13, + "step": 50100, + "train_speed(iter/s)": 0.200782 + }, + { + "acc": 0.78092413, + "epoch": 1.1691042447396434, + "grad_norm": 10.1875, + "learning_rate": 3.883028204356058e-06, + "loss": 0.77842164, + "memory(GiB)": 147.13, + "step": 50110, + "train_speed(iter/s)": 0.200803 + }, + { + "acc": 0.7757453, + "epoch": 1.1693375523119323, + "grad_norm": 26.125, + "learning_rate": 3.881186913837657e-06, + "loss": 0.79169908, + "memory(GiB)": 147.13, + "step": 50120, + "train_speed(iter/s)": 0.200824 + }, + { + "acc": 0.80329933, + "epoch": 1.1695708598842212, + "grad_norm": 6.96875, + "learning_rate": 3.879345783029498e-06, + "loss": 0.70160761, + "memory(GiB)": 147.13, + "step": 50130, + "train_speed(iter/s)": 0.200845 + }, + { + "acc": 0.77083302, + "epoch": 1.16980416745651, + "grad_norm": 6.34375, + "learning_rate": 3.877504812194404e-06, + "loss": 0.82752018, + "memory(GiB)": 147.13, + "step": 50140, + "train_speed(iter/s)": 0.200867 + }, + { + "acc": 0.79996233, + "epoch": 1.170037475028799, + "grad_norm": 4.78125, + "learning_rate": 3.875664001595172e-06, + "loss": 0.68956242, + "memory(GiB)": 147.13, + "step": 50150, + "train_speed(iter/s)": 0.200887 + }, + { + "acc": 0.78814058, + "epoch": 1.1702707826010879, + "grad_norm": 5.25, + "learning_rate": 3.873823351494576e-06, + "loss": 0.74643149, + "memory(GiB)": 147.13, + "step": 50160, + "train_speed(iter/s)": 0.200908 + }, + { + "acc": 0.7647471, + "epoch": 1.1705040901733768, + "grad_norm": 5.71875, + "learning_rate": 3.8719828621553715e-06, + "loss": 0.85008469, + "memory(GiB)": 147.13, + "step": 50170, + "train_speed(iter/s)": 0.20093 + }, + { + "acc": 0.75985403, + "epoch": 1.1707373977456657, + "grad_norm": 4.875, + "learning_rate": 3.870142533840283e-06, + "loss": 0.88021393, + "memory(GiB)": 147.13, + "step": 50180, + "train_speed(iter/s)": 0.200952 + }, + { + "acc": 0.77487907, + "epoch": 1.1709707053179546, + "grad_norm": 5.34375, + "learning_rate": 3.868302366812024e-06, + "loss": 0.81592302, + "memory(GiB)": 147.13, + "step": 50190, + "train_speed(iter/s)": 0.200972 + }, + { + "acc": 0.77659655, + "epoch": 1.1712040128902435, + "grad_norm": 4.375, + "learning_rate": 3.8664623613332705e-06, + "loss": 0.81098127, + "memory(GiB)": 147.13, + "step": 50200, + "train_speed(iter/s)": 0.200992 + }, + { + "acc": 0.79070673, + "epoch": 1.1714373204625323, + "grad_norm": 4.65625, + "learning_rate": 3.864622517666685e-06, + "loss": 0.75920906, + "memory(GiB)": 147.13, + "step": 50210, + "train_speed(iter/s)": 0.201013 + }, + { + "acc": 0.79322906, + "epoch": 1.1716706280348212, + "grad_norm": 5.15625, + "learning_rate": 3.862782836074906e-06, + "loss": 0.72068224, + "memory(GiB)": 147.13, + "step": 50220, + "train_speed(iter/s)": 0.201032 + }, + { + "acc": 0.76726875, + "epoch": 1.17190393560711, + "grad_norm": 6.5, + "learning_rate": 3.860943316820548e-06, + "loss": 0.84628353, + "memory(GiB)": 147.13, + "step": 50230, + "train_speed(iter/s)": 0.201054 + }, + { + "acc": 0.79586458, + "epoch": 1.172137243179399, + "grad_norm": 7.03125, + "learning_rate": 3.859103960166198e-06, + "loss": 0.72394934, + "memory(GiB)": 147.13, + "step": 50240, + "train_speed(iter/s)": 0.201074 + }, + { + "acc": 0.77846246, + "epoch": 1.1723705507516877, + "grad_norm": 4.65625, + "learning_rate": 3.857264766374428e-06, + "loss": 0.79643097, + "memory(GiB)": 147.13, + "step": 50250, + "train_speed(iter/s)": 0.201096 + }, + { + "acc": 0.77642593, + "epoch": 1.1726038583239768, + "grad_norm": 4.5625, + "learning_rate": 3.855425735707779e-06, + "loss": 0.82001638, + "memory(GiB)": 147.13, + "step": 50260, + "train_speed(iter/s)": 0.201116 + }, + { + "acc": 0.76520214, + "epoch": 1.1728371658962655, + "grad_norm": 5.28125, + "learning_rate": 3.853586868428775e-06, + "loss": 0.84052525, + "memory(GiB)": 147.13, + "step": 50270, + "train_speed(iter/s)": 0.201137 + }, + { + "acc": 0.7804378, + "epoch": 1.1730704734685544, + "grad_norm": 5.375, + "learning_rate": 3.851748164799914e-06, + "loss": 0.77831354, + "memory(GiB)": 147.13, + "step": 50280, + "train_speed(iter/s)": 0.201157 + }, + { + "acc": 0.79932928, + "epoch": 1.1733037810408433, + "grad_norm": 13.5, + "learning_rate": 3.849909625083666e-06, + "loss": 0.70987091, + "memory(GiB)": 147.13, + "step": 50290, + "train_speed(iter/s)": 0.201178 + }, + { + "acc": 0.7968874, + "epoch": 1.1735370886131322, + "grad_norm": 4.96875, + "learning_rate": 3.848071249542486e-06, + "loss": 0.72468081, + "memory(GiB)": 147.13, + "step": 50300, + "train_speed(iter/s)": 0.2012 + }, + { + "acc": 0.76522064, + "epoch": 1.1737703961854211, + "grad_norm": 4.5, + "learning_rate": 3.846233038438803e-06, + "loss": 0.83323078, + "memory(GiB)": 147.13, + "step": 50310, + "train_speed(iter/s)": 0.201221 + }, + { + "acc": 0.76262903, + "epoch": 1.17400370375771, + "grad_norm": 5.59375, + "learning_rate": 3.844394992035017e-06, + "loss": 0.83975763, + "memory(GiB)": 147.13, + "step": 50320, + "train_speed(iter/s)": 0.201241 + }, + { + "acc": 0.79148641, + "epoch": 1.174237011329999, + "grad_norm": 4.34375, + "learning_rate": 3.842557110593509e-06, + "loss": 0.74630098, + "memory(GiB)": 147.13, + "step": 50330, + "train_speed(iter/s)": 0.201261 + }, + { + "acc": 0.78939476, + "epoch": 1.1744703189022878, + "grad_norm": 4.90625, + "learning_rate": 3.840719394376638e-06, + "loss": 0.74165897, + "memory(GiB)": 147.13, + "step": 50340, + "train_speed(iter/s)": 0.20128 + }, + { + "acc": 0.78680506, + "epoch": 1.1747036264745767, + "grad_norm": 5.5, + "learning_rate": 3.838881843646736e-06, + "loss": 0.77483697, + "memory(GiB)": 147.13, + "step": 50350, + "train_speed(iter/s)": 0.2013 + }, + { + "acc": 0.77357845, + "epoch": 1.1749369340468656, + "grad_norm": 5.125, + "learning_rate": 3.8370444586661135e-06, + "loss": 0.82649841, + "memory(GiB)": 147.13, + "step": 50360, + "train_speed(iter/s)": 0.20132 + }, + { + "acc": 0.77592096, + "epoch": 1.1751702416191545, + "grad_norm": 37.0, + "learning_rate": 3.835207239697057e-06, + "loss": 0.78373413, + "memory(GiB)": 147.13, + "step": 50370, + "train_speed(iter/s)": 0.20134 + }, + { + "acc": 0.80010624, + "epoch": 1.1754035491914434, + "grad_norm": 7.3125, + "learning_rate": 3.8333701870018296e-06, + "loss": 0.71716213, + "memory(GiB)": 147.13, + "step": 50380, + "train_speed(iter/s)": 0.20136 + }, + { + "acc": 0.78422251, + "epoch": 1.1756368567637323, + "grad_norm": 5.25, + "learning_rate": 3.831533300842667e-06, + "loss": 0.77883387, + "memory(GiB)": 147.13, + "step": 50390, + "train_speed(iter/s)": 0.201381 + }, + { + "acc": 0.77899208, + "epoch": 1.1758701643360212, + "grad_norm": 5.5, + "learning_rate": 3.829696581481787e-06, + "loss": 0.78697596, + "memory(GiB)": 147.13, + "step": 50400, + "train_speed(iter/s)": 0.201399 + }, + { + "acc": 0.77367115, + "epoch": 1.17610347190831, + "grad_norm": 4.1875, + "learning_rate": 3.827860029181382e-06, + "loss": 0.82581224, + "memory(GiB)": 147.13, + "step": 50410, + "train_speed(iter/s)": 0.20142 + }, + { + "acc": 0.76857705, + "epoch": 1.176336779480599, + "grad_norm": 6.0625, + "learning_rate": 3.826023644203617e-06, + "loss": 0.85204086, + "memory(GiB)": 147.13, + "step": 50420, + "train_speed(iter/s)": 0.20144 + }, + { + "acc": 0.79034348, + "epoch": 1.176570087052888, + "grad_norm": 6.96875, + "learning_rate": 3.824187426810635e-06, + "loss": 0.73557873, + "memory(GiB)": 147.13, + "step": 50430, + "train_speed(iter/s)": 0.201461 + }, + { + "acc": 0.7905654, + "epoch": 1.1768033946251768, + "grad_norm": 4.96875, + "learning_rate": 3.822351377264555e-06, + "loss": 0.75825205, + "memory(GiB)": 147.13, + "step": 50440, + "train_speed(iter/s)": 0.201482 + }, + { + "acc": 0.78993406, + "epoch": 1.1770367021974657, + "grad_norm": 4.5, + "learning_rate": 3.820515495827476e-06, + "loss": 0.77078609, + "memory(GiB)": 147.13, + "step": 50450, + "train_speed(iter/s)": 0.201502 + }, + { + "acc": 0.76711464, + "epoch": 1.1772700097697546, + "grad_norm": 5.4375, + "learning_rate": 3.818679782761465e-06, + "loss": 0.83489895, + "memory(GiB)": 147.13, + "step": 50460, + "train_speed(iter/s)": 0.201523 + }, + { + "acc": 0.76933699, + "epoch": 1.1775033173420435, + "grad_norm": 8.25, + "learning_rate": 3.816844238328573e-06, + "loss": 0.83145552, + "memory(GiB)": 147.13, + "step": 50470, + "train_speed(iter/s)": 0.201543 + }, + { + "acc": 0.77108583, + "epoch": 1.1777366249143324, + "grad_norm": 6.75, + "learning_rate": 3.815008862790822e-06, + "loss": 0.82043381, + "memory(GiB)": 147.13, + "step": 50480, + "train_speed(iter/s)": 0.201563 + }, + { + "acc": 0.7991888, + "epoch": 1.1779699324866213, + "grad_norm": 5.3125, + "learning_rate": 3.813173656410211e-06, + "loss": 0.74574351, + "memory(GiB)": 147.13, + "step": 50490, + "train_speed(iter/s)": 0.201583 + }, + { + "acc": 0.7735095, + "epoch": 1.1782032400589102, + "grad_norm": 6.46875, + "learning_rate": 3.8113386194487177e-06, + "loss": 0.8170289, + "memory(GiB)": 147.13, + "step": 50500, + "train_speed(iter/s)": 0.201604 + }, + { + "epoch": 1.1782032400589102, + "eval_acc": 0.7443518330349046, + "eval_loss": 0.8057728409767151, + "eval_runtime": 1269.6788, + "eval_samples_per_second": 28.347, + "eval_steps_per_second": 14.174, + "step": 50500 + }, + { + "acc": 0.77446365, + "epoch": 1.178436547631199, + "grad_norm": 6.3125, + "learning_rate": 3.80950375216829e-06, + "loss": 0.80997276, + "memory(GiB)": 147.13, + "step": 50510, + "train_speed(iter/s)": 0.200592 + }, + { + "acc": 0.78115602, + "epoch": 1.178669855203488, + "grad_norm": 5.5625, + "learning_rate": 3.807669054830855e-06, + "loss": 0.77538776, + "memory(GiB)": 147.13, + "step": 50520, + "train_speed(iter/s)": 0.200613 + }, + { + "acc": 0.7817029, + "epoch": 1.178903162775777, + "grad_norm": 4.875, + "learning_rate": 3.8058345276983165e-06, + "loss": 0.78529196, + "memory(GiB)": 147.13, + "step": 50530, + "train_speed(iter/s)": 0.200633 + }, + { + "acc": 0.76126156, + "epoch": 1.1791364703480658, + "grad_norm": 5.46875, + "learning_rate": 3.8040001710325547e-06, + "loss": 0.87188816, + "memory(GiB)": 147.13, + "step": 50540, + "train_speed(iter/s)": 0.200652 + }, + { + "acc": 0.77631016, + "epoch": 1.1793697779203547, + "grad_norm": 6.09375, + "learning_rate": 3.8021659850954186e-06, + "loss": 0.79485397, + "memory(GiB)": 147.13, + "step": 50550, + "train_speed(iter/s)": 0.200673 + }, + { + "acc": 0.78766565, + "epoch": 1.1796030854926436, + "grad_norm": 4.78125, + "learning_rate": 3.8003319701487407e-06, + "loss": 0.77381859, + "memory(GiB)": 147.13, + "step": 50560, + "train_speed(iter/s)": 0.200696 + }, + { + "acc": 0.76473374, + "epoch": 1.1798363930649325, + "grad_norm": 4.59375, + "learning_rate": 3.7984981264543247e-06, + "loss": 0.8554389, + "memory(GiB)": 147.13, + "step": 50570, + "train_speed(iter/s)": 0.200716 + }, + { + "acc": 0.78639488, + "epoch": 1.1800697006372214, + "grad_norm": 6.28125, + "learning_rate": 3.7966644542739538e-06, + "loss": 0.76430273, + "memory(GiB)": 147.13, + "step": 50580, + "train_speed(iter/s)": 0.200738 + }, + { + "acc": 0.77539215, + "epoch": 1.1803030082095103, + "grad_norm": 4.5, + "learning_rate": 3.794830953869381e-06, + "loss": 0.81135311, + "memory(GiB)": 147.13, + "step": 50590, + "train_speed(iter/s)": 0.200759 + }, + { + "acc": 0.77136354, + "epoch": 1.1805363157817992, + "grad_norm": 4.875, + "learning_rate": 3.7929976255023398e-06, + "loss": 0.81377373, + "memory(GiB)": 147.13, + "step": 50600, + "train_speed(iter/s)": 0.200779 + }, + { + "acc": 0.77100811, + "epoch": 1.180769623354088, + "grad_norm": 7.5625, + "learning_rate": 3.7911644694345368e-06, + "loss": 0.80235748, + "memory(GiB)": 147.13, + "step": 50610, + "train_speed(iter/s)": 0.2008 + }, + { + "acc": 0.77771163, + "epoch": 1.1810029309263768, + "grad_norm": 4.5, + "learning_rate": 3.789331485927654e-06, + "loss": 0.78241501, + "memory(GiB)": 147.13, + "step": 50620, + "train_speed(iter/s)": 0.200821 + }, + { + "acc": 0.76570158, + "epoch": 1.1812362384986659, + "grad_norm": 4.71875, + "learning_rate": 3.7874986752433506e-06, + "loss": 0.8620245, + "memory(GiB)": 147.13, + "step": 50630, + "train_speed(iter/s)": 0.200841 + }, + { + "acc": 0.79454651, + "epoch": 1.1814695460709546, + "grad_norm": 6.4375, + "learning_rate": 3.78566603764326e-06, + "loss": 0.74558945, + "memory(GiB)": 147.13, + "step": 50640, + "train_speed(iter/s)": 0.200863 + }, + { + "acc": 0.78433299, + "epoch": 1.1817028536432437, + "grad_norm": 5.46875, + "learning_rate": 3.7838335733889895e-06, + "loss": 0.78314257, + "memory(GiB)": 147.13, + "step": 50650, + "train_speed(iter/s)": 0.200883 + }, + { + "acc": 0.77772627, + "epoch": 1.1819361612155324, + "grad_norm": 5.59375, + "learning_rate": 3.782001282742124e-06, + "loss": 0.80842457, + "memory(GiB)": 147.13, + "step": 50660, + "train_speed(iter/s)": 0.200904 + }, + { + "acc": 0.77893171, + "epoch": 1.1821694687878213, + "grad_norm": 6.9375, + "learning_rate": 3.7801691659642196e-06, + "loss": 0.79274435, + "memory(GiB)": 147.13, + "step": 50670, + "train_speed(iter/s)": 0.200926 + }, + { + "acc": 0.77441139, + "epoch": 1.1824027763601102, + "grad_norm": 7.625, + "learning_rate": 3.7783372233168127e-06, + "loss": 0.81310921, + "memory(GiB)": 147.13, + "step": 50680, + "train_speed(iter/s)": 0.200945 + }, + { + "acc": 0.78166981, + "epoch": 1.182636083932399, + "grad_norm": 5.875, + "learning_rate": 3.776505455061412e-06, + "loss": 0.78240037, + "memory(GiB)": 147.13, + "step": 50690, + "train_speed(iter/s)": 0.200966 + }, + { + "acc": 0.78959174, + "epoch": 1.182869391504688, + "grad_norm": 4.6875, + "learning_rate": 3.7746738614595022e-06, + "loss": 0.75032959, + "memory(GiB)": 147.13, + "step": 50700, + "train_speed(iter/s)": 0.200984 + }, + { + "acc": 0.78085089, + "epoch": 1.1831026990769768, + "grad_norm": 5.21875, + "learning_rate": 3.772842442772543e-06, + "loss": 0.78894525, + "memory(GiB)": 147.13, + "step": 50710, + "train_speed(iter/s)": 0.201005 + }, + { + "acc": 0.79085646, + "epoch": 1.1833360066492657, + "grad_norm": 6.75, + "learning_rate": 3.7710111992619696e-06, + "loss": 0.7664422, + "memory(GiB)": 147.13, + "step": 50720, + "train_speed(iter/s)": 0.201026 + }, + { + "acc": 0.77849669, + "epoch": 1.1835693142215546, + "grad_norm": 5.21875, + "learning_rate": 3.7691801311891898e-06, + "loss": 0.79077878, + "memory(GiB)": 147.13, + "step": 50730, + "train_speed(iter/s)": 0.201043 + }, + { + "acc": 0.77560053, + "epoch": 1.1838026217938435, + "grad_norm": 6.28125, + "learning_rate": 3.767349238815588e-06, + "loss": 0.8155261, + "memory(GiB)": 147.13, + "step": 50740, + "train_speed(iter/s)": 0.201063 + }, + { + "acc": 0.76759486, + "epoch": 1.1840359293661324, + "grad_norm": 5.09375, + "learning_rate": 3.7655185224025247e-06, + "loss": 0.84323349, + "memory(GiB)": 147.13, + "step": 50750, + "train_speed(iter/s)": 0.201083 + }, + { + "acc": 0.77860336, + "epoch": 1.1842692369384213, + "grad_norm": 5.9375, + "learning_rate": 3.7636879822113338e-06, + "loss": 0.78937483, + "memory(GiB)": 147.13, + "step": 50760, + "train_speed(iter/s)": 0.201104 + }, + { + "acc": 0.77916279, + "epoch": 1.1845025445107102, + "grad_norm": 8.8125, + "learning_rate": 3.761857618503326e-06, + "loss": 0.80653362, + "memory(GiB)": 147.13, + "step": 50770, + "train_speed(iter/s)": 0.201123 + }, + { + "acc": 0.75581121, + "epoch": 1.1847358520829991, + "grad_norm": 6.75, + "learning_rate": 3.7600274315397816e-06, + "loss": 0.90901031, + "memory(GiB)": 147.13, + "step": 50780, + "train_speed(iter/s)": 0.201144 + }, + { + "acc": 0.75556707, + "epoch": 1.184969159655288, + "grad_norm": 4.53125, + "learning_rate": 3.758197421581961e-06, + "loss": 0.90556011, + "memory(GiB)": 147.13, + "step": 50790, + "train_speed(iter/s)": 0.201164 + }, + { + "acc": 0.77783809, + "epoch": 1.185202467227577, + "grad_norm": 4.46875, + "learning_rate": 3.756367588891099e-06, + "loss": 0.79138689, + "memory(GiB)": 147.13, + "step": 50800, + "train_speed(iter/s)": 0.201183 + }, + { + "acc": 0.77093124, + "epoch": 1.1854357747998658, + "grad_norm": 6.28125, + "learning_rate": 3.754537933728401e-06, + "loss": 0.82287769, + "memory(GiB)": 147.13, + "step": 50810, + "train_speed(iter/s)": 0.201204 + }, + { + "acc": 0.78769693, + "epoch": 1.1856690823721547, + "grad_norm": 15.625, + "learning_rate": 3.7527084563550515e-06, + "loss": 0.75005512, + "memory(GiB)": 147.13, + "step": 50820, + "train_speed(iter/s)": 0.201226 + }, + { + "acc": 0.78668466, + "epoch": 1.1859023899444436, + "grad_norm": 5.9375, + "learning_rate": 3.750879157032207e-06, + "loss": 0.75284238, + "memory(GiB)": 147.13, + "step": 50830, + "train_speed(iter/s)": 0.201247 + }, + { + "acc": 0.78141642, + "epoch": 1.1861356975167325, + "grad_norm": 7.15625, + "learning_rate": 3.7490500360210003e-06, + "loss": 0.79019589, + "memory(GiB)": 147.13, + "step": 50840, + "train_speed(iter/s)": 0.201266 + }, + { + "acc": 0.76448064, + "epoch": 1.1863690050890214, + "grad_norm": 6.15625, + "learning_rate": 3.747221093582538e-06, + "loss": 0.86280327, + "memory(GiB)": 147.13, + "step": 50850, + "train_speed(iter/s)": 0.201286 + }, + { + "acc": 0.76448793, + "epoch": 1.1866023126613103, + "grad_norm": 5.28125, + "learning_rate": 3.7453923299779014e-06, + "loss": 0.86757164, + "memory(GiB)": 147.13, + "step": 50860, + "train_speed(iter/s)": 0.201307 + }, + { + "acc": 0.78806419, + "epoch": 1.1868356202335992, + "grad_norm": 4.3125, + "learning_rate": 3.743563745468144e-06, + "loss": 0.75735083, + "memory(GiB)": 147.13, + "step": 50870, + "train_speed(iter/s)": 0.201327 + }, + { + "acc": 0.78509789, + "epoch": 1.1870689278058881, + "grad_norm": 6.125, + "learning_rate": 3.7417353403142988e-06, + "loss": 0.77933402, + "memory(GiB)": 147.13, + "step": 50880, + "train_speed(iter/s)": 0.201348 + }, + { + "acc": 0.76924162, + "epoch": 1.187302235378177, + "grad_norm": 4.84375, + "learning_rate": 3.7399071147773668e-06, + "loss": 0.83347654, + "memory(GiB)": 147.13, + "step": 50890, + "train_speed(iter/s)": 0.201369 + }, + { + "acc": 0.76855621, + "epoch": 1.187535542950466, + "grad_norm": 4.1875, + "learning_rate": 3.7380790691183276e-06, + "loss": 0.82479677, + "memory(GiB)": 147.13, + "step": 50900, + "train_speed(iter/s)": 0.201388 + }, + { + "acc": 0.77282152, + "epoch": 1.1877688505227548, + "grad_norm": 5.0625, + "learning_rate": 3.7362512035981347e-06, + "loss": 0.81683245, + "memory(GiB)": 147.13, + "step": 50910, + "train_speed(iter/s)": 0.201409 + }, + { + "acc": 0.77848644, + "epoch": 1.1880021580950437, + "grad_norm": 5.59375, + "learning_rate": 3.7344235184777157e-06, + "loss": 0.78320317, + "memory(GiB)": 147.13, + "step": 50920, + "train_speed(iter/s)": 0.20143 + }, + { + "acc": 0.77635636, + "epoch": 1.1882354656673326, + "grad_norm": 4.5625, + "learning_rate": 3.7325960140179717e-06, + "loss": 0.82402401, + "memory(GiB)": 147.13, + "step": 50930, + "train_speed(iter/s)": 0.201451 + }, + { + "acc": 0.80113115, + "epoch": 1.1884687732396215, + "grad_norm": 3.828125, + "learning_rate": 3.730768690479779e-06, + "loss": 0.70224762, + "memory(GiB)": 147.13, + "step": 50940, + "train_speed(iter/s)": 0.20147 + }, + { + "acc": 0.78452883, + "epoch": 1.1887020808119104, + "grad_norm": 6.09375, + "learning_rate": 3.7289415481239865e-06, + "loss": 0.78695316, + "memory(GiB)": 147.13, + "step": 50950, + "train_speed(iter/s)": 0.20149 + }, + { + "acc": 0.79433985, + "epoch": 1.1889353883841993, + "grad_norm": 5.53125, + "learning_rate": 3.727114587211419e-06, + "loss": 0.7226675, + "memory(GiB)": 147.13, + "step": 50960, + "train_speed(iter/s)": 0.201511 + }, + { + "acc": 0.78255625, + "epoch": 1.1891686959564882, + "grad_norm": 4.46875, + "learning_rate": 3.7252878080028744e-06, + "loss": 0.77606573, + "memory(GiB)": 147.13, + "step": 50970, + "train_speed(iter/s)": 0.201531 + }, + { + "acc": 0.77274323, + "epoch": 1.1894020035287771, + "grad_norm": 5.75, + "learning_rate": 3.7234612107591246e-06, + "loss": 0.83783932, + "memory(GiB)": 147.13, + "step": 50980, + "train_speed(iter/s)": 0.201551 + }, + { + "acc": 0.79396834, + "epoch": 1.189635311101066, + "grad_norm": 3.640625, + "learning_rate": 3.721634795740918e-06, + "loss": 0.73608699, + "memory(GiB)": 147.13, + "step": 50990, + "train_speed(iter/s)": 0.201572 + }, + { + "acc": 0.78861499, + "epoch": 1.189868618673355, + "grad_norm": 4.125, + "learning_rate": 3.719808563208971e-06, + "loss": 0.75779295, + "memory(GiB)": 147.13, + "step": 51000, + "train_speed(iter/s)": 0.201593 + }, + { + "epoch": 1.189868618673355, + "eval_acc": 0.7443236476500195, + "eval_loss": 0.8055968880653381, + "eval_runtime": 1269.0015, + "eval_samples_per_second": 28.362, + "eval_steps_per_second": 14.181, + "step": 51000 + }, + { + "acc": 0.79447026, + "epoch": 1.1901019262456436, + "grad_norm": 5.28125, + "learning_rate": 3.71798251342398e-06, + "loss": 0.73775778, + "memory(GiB)": 147.13, + "step": 51010, + "train_speed(iter/s)": 0.20059 + }, + { + "acc": 0.78940754, + "epoch": 1.1903352338179327, + "grad_norm": 6.1875, + "learning_rate": 3.7161566466466137e-06, + "loss": 0.74937639, + "memory(GiB)": 147.13, + "step": 51020, + "train_speed(iter/s)": 0.200611 + }, + { + "acc": 0.78550367, + "epoch": 1.1905685413902214, + "grad_norm": 5.09375, + "learning_rate": 3.714330963137512e-06, + "loss": 0.78713713, + "memory(GiB)": 147.13, + "step": 51030, + "train_speed(iter/s)": 0.200632 + }, + { + "acc": 0.77606764, + "epoch": 1.1908018489625105, + "grad_norm": 7.4375, + "learning_rate": 3.7125054631572915e-06, + "loss": 0.8188735, + "memory(GiB)": 147.13, + "step": 51040, + "train_speed(iter/s)": 0.200652 + }, + { + "acc": 0.79022417, + "epoch": 1.1910351565347992, + "grad_norm": 5.5, + "learning_rate": 3.710680146966542e-06, + "loss": 0.75739288, + "memory(GiB)": 147.13, + "step": 51050, + "train_speed(iter/s)": 0.200672 + }, + { + "acc": 0.75823822, + "epoch": 1.191268464107088, + "grad_norm": 9.25, + "learning_rate": 3.7088550148258277e-06, + "loss": 0.87464418, + "memory(GiB)": 147.13, + "step": 51060, + "train_speed(iter/s)": 0.20069 + }, + { + "acc": 0.7662158, + "epoch": 1.191501771679377, + "grad_norm": 6.46875, + "learning_rate": 3.707030066995685e-06, + "loss": 0.84084873, + "memory(GiB)": 147.13, + "step": 51070, + "train_speed(iter/s)": 0.20071 + }, + { + "acc": 0.77479043, + "epoch": 1.1917350792516659, + "grad_norm": 5.4375, + "learning_rate": 3.705205303736625e-06, + "loss": 0.8108655, + "memory(GiB)": 147.13, + "step": 51080, + "train_speed(iter/s)": 0.200729 + }, + { + "acc": 0.78514309, + "epoch": 1.1919683868239548, + "grad_norm": 5.40625, + "learning_rate": 3.7033807253091313e-06, + "loss": 0.77751369, + "memory(GiB)": 147.13, + "step": 51090, + "train_speed(iter/s)": 0.200749 + }, + { + "acc": 0.76177063, + "epoch": 1.1922016943962437, + "grad_norm": 5.46875, + "learning_rate": 3.7015563319736618e-06, + "loss": 0.8812314, + "memory(GiB)": 147.13, + "step": 51100, + "train_speed(iter/s)": 0.20077 + }, + { + "acc": 0.78570685, + "epoch": 1.1924350019685326, + "grad_norm": 4.375, + "learning_rate": 3.6997321239906513e-06, + "loss": 0.76931667, + "memory(GiB)": 147.13, + "step": 51110, + "train_speed(iter/s)": 0.20079 + }, + { + "acc": 0.78043647, + "epoch": 1.1926683095408215, + "grad_norm": 5.15625, + "learning_rate": 3.6979081016204998e-06, + "loss": 0.79293995, + "memory(GiB)": 147.13, + "step": 51120, + "train_speed(iter/s)": 0.200811 + }, + { + "acc": 0.7917635, + "epoch": 1.1929016171131104, + "grad_norm": 5.59375, + "learning_rate": 3.6960842651235894e-06, + "loss": 0.73350315, + "memory(GiB)": 147.13, + "step": 51130, + "train_speed(iter/s)": 0.20083 + }, + { + "acc": 0.78342714, + "epoch": 1.1931349246853993, + "grad_norm": 4.6875, + "learning_rate": 3.6942606147602705e-06, + "loss": 0.76641498, + "memory(GiB)": 147.13, + "step": 51140, + "train_speed(iter/s)": 0.200851 + }, + { + "acc": 0.77356782, + "epoch": 1.1933682322576882, + "grad_norm": 7.90625, + "learning_rate": 3.6924371507908695e-06, + "loss": 0.81099625, + "memory(GiB)": 147.13, + "step": 51150, + "train_speed(iter/s)": 0.200871 + }, + { + "acc": 0.8011385, + "epoch": 1.193601539829977, + "grad_norm": 5.09375, + "learning_rate": 3.690613873475687e-06, + "loss": 0.71837916, + "memory(GiB)": 147.13, + "step": 51160, + "train_speed(iter/s)": 0.200891 + }, + { + "acc": 0.78795547, + "epoch": 1.193834847402266, + "grad_norm": 5.6875, + "learning_rate": 3.6887907830749923e-06, + "loss": 0.75448265, + "memory(GiB)": 147.13, + "step": 51170, + "train_speed(iter/s)": 0.200911 + }, + { + "acc": 0.78816214, + "epoch": 1.1940681549745549, + "grad_norm": 4.46875, + "learning_rate": 3.686967879849033e-06, + "loss": 0.7446311, + "memory(GiB)": 147.13, + "step": 51180, + "train_speed(iter/s)": 0.200931 + }, + { + "acc": 0.78221531, + "epoch": 1.1943014625468438, + "grad_norm": 5.71875, + "learning_rate": 3.6851451640580264e-06, + "loss": 0.7772069, + "memory(GiB)": 147.13, + "step": 51190, + "train_speed(iter/s)": 0.20095 + }, + { + "acc": 0.76984034, + "epoch": 1.1945347701191327, + "grad_norm": 6.5625, + "learning_rate": 3.6833226359621668e-06, + "loss": 0.83698673, + "memory(GiB)": 147.13, + "step": 51200, + "train_speed(iter/s)": 0.200971 + }, + { + "acc": 0.77070379, + "epoch": 1.1947680776914216, + "grad_norm": 5.59375, + "learning_rate": 3.6815002958216183e-06, + "loss": 0.81270151, + "memory(GiB)": 147.13, + "step": 51210, + "train_speed(iter/s)": 0.200991 + }, + { + "acc": 0.79680748, + "epoch": 1.1950013852637105, + "grad_norm": 5.78125, + "learning_rate": 3.67967814389652e-06, + "loss": 0.71274958, + "memory(GiB)": 147.13, + "step": 51220, + "train_speed(iter/s)": 0.201011 + }, + { + "acc": 0.78539243, + "epoch": 1.1952346928359994, + "grad_norm": 4.84375, + "learning_rate": 3.6778561804469825e-06, + "loss": 0.78309517, + "memory(GiB)": 147.13, + "step": 51230, + "train_speed(iter/s)": 0.201031 + }, + { + "acc": 0.78247108, + "epoch": 1.1954680004082883, + "grad_norm": 5.625, + "learning_rate": 3.676034405733092e-06, + "loss": 0.7762372, + "memory(GiB)": 147.13, + "step": 51240, + "train_speed(iter/s)": 0.201051 + }, + { + "acc": 0.77460847, + "epoch": 1.1957013079805772, + "grad_norm": 5.46875, + "learning_rate": 3.6742128200149042e-06, + "loss": 0.82830582, + "memory(GiB)": 147.13, + "step": 51250, + "train_speed(iter/s)": 0.201072 + }, + { + "acc": 0.77715383, + "epoch": 1.195934615552866, + "grad_norm": 5.6875, + "learning_rate": 3.672391423552451e-06, + "loss": 0.81013908, + "memory(GiB)": 147.13, + "step": 51260, + "train_speed(iter/s)": 0.201091 + }, + { + "acc": 0.76946192, + "epoch": 1.196167923125155, + "grad_norm": 5.75, + "learning_rate": 3.6705702166057366e-06, + "loss": 0.83462391, + "memory(GiB)": 147.13, + "step": 51270, + "train_speed(iter/s)": 0.201111 + }, + { + "acc": 0.77984385, + "epoch": 1.1964012306974439, + "grad_norm": 5.75, + "learning_rate": 3.668749199434738e-06, + "loss": 0.80135126, + "memory(GiB)": 147.13, + "step": 51280, + "train_speed(iter/s)": 0.20113 + }, + { + "acc": 0.77329564, + "epoch": 1.1966345382697328, + "grad_norm": 5.625, + "learning_rate": 3.6669283722994054e-06, + "loss": 0.81273098, + "memory(GiB)": 147.13, + "step": 51290, + "train_speed(iter/s)": 0.20115 + }, + { + "acc": 0.78445587, + "epoch": 1.1968678458420217, + "grad_norm": 5.78125, + "learning_rate": 3.6651077354596586e-06, + "loss": 0.76524854, + "memory(GiB)": 147.13, + "step": 51300, + "train_speed(iter/s)": 0.201171 + }, + { + "acc": 0.79493475, + "epoch": 1.1971011534143106, + "grad_norm": 7.125, + "learning_rate": 3.6632872891753956e-06, + "loss": 0.73145099, + "memory(GiB)": 147.13, + "step": 51310, + "train_speed(iter/s)": 0.201192 + }, + { + "acc": 0.77402053, + "epoch": 1.1973344609865995, + "grad_norm": 3.96875, + "learning_rate": 3.661467033706483e-06, + "loss": 0.79744701, + "memory(GiB)": 147.13, + "step": 51320, + "train_speed(iter/s)": 0.201213 + }, + { + "acc": 0.78484211, + "epoch": 1.1975677685588884, + "grad_norm": 6.1875, + "learning_rate": 3.6596469693127636e-06, + "loss": 0.77361097, + "memory(GiB)": 147.13, + "step": 51330, + "train_speed(iter/s)": 0.201234 + }, + { + "acc": 0.80426512, + "epoch": 1.1978010761311773, + "grad_norm": 7.53125, + "learning_rate": 3.6578270962540506e-06, + "loss": 0.69536757, + "memory(GiB)": 147.13, + "step": 51340, + "train_speed(iter/s)": 0.201254 + }, + { + "acc": 0.75910811, + "epoch": 1.1980343837034662, + "grad_norm": 13.5, + "learning_rate": 3.6560074147901287e-06, + "loss": 0.85062847, + "memory(GiB)": 147.13, + "step": 51350, + "train_speed(iter/s)": 0.201275 + }, + { + "acc": 0.78548346, + "epoch": 1.198267691275755, + "grad_norm": 8.625, + "learning_rate": 3.654187925180758e-06, + "loss": 0.76579847, + "memory(GiB)": 147.13, + "step": 51360, + "train_speed(iter/s)": 0.201297 + }, + { + "acc": 0.78351984, + "epoch": 1.198500998848044, + "grad_norm": 5.25, + "learning_rate": 3.65236862768567e-06, + "loss": 0.77279978, + "memory(GiB)": 147.13, + "step": 51370, + "train_speed(iter/s)": 0.201317 + }, + { + "acc": 0.79646559, + "epoch": 1.1987343064203329, + "grad_norm": 5.90625, + "learning_rate": 3.650549522564569e-06, + "loss": 0.73050995, + "memory(GiB)": 147.13, + "step": 51380, + "train_speed(iter/s)": 0.201338 + }, + { + "acc": 0.77936029, + "epoch": 1.1989676139926218, + "grad_norm": 8.0625, + "learning_rate": 3.648730610077131e-06, + "loss": 0.79830408, + "memory(GiB)": 147.13, + "step": 51390, + "train_speed(iter/s)": 0.201358 + }, + { + "acc": 0.78159542, + "epoch": 1.1992009215649104, + "grad_norm": 4.71875, + "learning_rate": 3.646911890483006e-06, + "loss": 0.80945339, + "memory(GiB)": 147.13, + "step": 51400, + "train_speed(iter/s)": 0.201379 + }, + { + "acc": 0.77410226, + "epoch": 1.1994342291371995, + "grad_norm": 5.34375, + "learning_rate": 3.645093364041815e-06, + "loss": 0.81879921, + "memory(GiB)": 147.13, + "step": 51410, + "train_speed(iter/s)": 0.201398 + }, + { + "acc": 0.78253832, + "epoch": 1.1996675367094882, + "grad_norm": 4.40625, + "learning_rate": 3.6432750310131537e-06, + "loss": 0.78207197, + "memory(GiB)": 147.13, + "step": 51420, + "train_speed(iter/s)": 0.201417 + }, + { + "acc": 0.77488294, + "epoch": 1.1999008442817771, + "grad_norm": 6.0625, + "learning_rate": 3.6414568916565884e-06, + "loss": 0.791008, + "memory(GiB)": 147.13, + "step": 51430, + "train_speed(iter/s)": 0.201438 + }, + { + "acc": 0.78896904, + "epoch": 1.200134151854066, + "grad_norm": 6.5, + "learning_rate": 3.6396389462316558e-06, + "loss": 0.76538782, + "memory(GiB)": 147.13, + "step": 51440, + "train_speed(iter/s)": 0.201457 + }, + { + "acc": 0.77477431, + "epoch": 1.200367459426355, + "grad_norm": 5.03125, + "learning_rate": 3.6378211949978693e-06, + "loss": 0.8104063, + "memory(GiB)": 147.13, + "step": 51450, + "train_speed(iter/s)": 0.201477 + }, + { + "acc": 0.77967916, + "epoch": 1.2006007669986438, + "grad_norm": 5.40625, + "learning_rate": 3.6360036382147117e-06, + "loss": 0.7820168, + "memory(GiB)": 147.13, + "step": 51460, + "train_speed(iter/s)": 0.201498 + }, + { + "acc": 0.79494371, + "epoch": 1.2008340745709327, + "grad_norm": 6.40625, + "learning_rate": 3.634186276141638e-06, + "loss": 0.73448949, + "memory(GiB)": 147.13, + "step": 51470, + "train_speed(iter/s)": 0.201519 + }, + { + "acc": 0.77666898, + "epoch": 1.2010673821432216, + "grad_norm": 5.84375, + "learning_rate": 3.6323691090380756e-06, + "loss": 0.79062681, + "memory(GiB)": 147.13, + "step": 51480, + "train_speed(iter/s)": 0.20154 + }, + { + "acc": 0.78186355, + "epoch": 1.2013006897155105, + "grad_norm": 4.625, + "learning_rate": 3.630552137163427e-06, + "loss": 0.7758111, + "memory(GiB)": 147.13, + "step": 51490, + "train_speed(iter/s)": 0.20156 + }, + { + "acc": 0.78854303, + "epoch": 1.2015339972877994, + "grad_norm": 4.375, + "learning_rate": 3.6287353607770613e-06, + "loss": 0.75728836, + "memory(GiB)": 147.13, + "step": 51500, + "train_speed(iter/s)": 0.20158 + }, + { + "epoch": 1.2015339972877994, + "eval_acc": 0.7444104458239269, + "eval_loss": 0.8055254817008972, + "eval_runtime": 1269.0052, + "eval_samples_per_second": 28.362, + "eval_steps_per_second": 14.181, + "step": 51500 + }, + { + "acc": 0.79272242, + "epoch": 1.2017673048600883, + "grad_norm": 4.34375, + "learning_rate": 3.6269187801383267e-06, + "loss": 0.74171906, + "memory(GiB)": 147.13, + "step": 51510, + "train_speed(iter/s)": 0.20059 + }, + { + "acc": 0.77594776, + "epoch": 1.2020006124323772, + "grad_norm": 6.90625, + "learning_rate": 3.6251023955065356e-06, + "loss": 0.79424891, + "memory(GiB)": 147.13, + "step": 51520, + "train_speed(iter/s)": 0.200612 + }, + { + "acc": 0.78154349, + "epoch": 1.2022339200046661, + "grad_norm": 4.5, + "learning_rate": 3.623286207140979e-06, + "loss": 0.7680542, + "memory(GiB)": 147.13, + "step": 51530, + "train_speed(iter/s)": 0.200631 + }, + { + "acc": 0.7791707, + "epoch": 1.202467227576955, + "grad_norm": 5.25, + "learning_rate": 3.6214702153009157e-06, + "loss": 0.79780264, + "memory(GiB)": 147.13, + "step": 51540, + "train_speed(iter/s)": 0.200652 + }, + { + "acc": 0.7682992, + "epoch": 1.202700535149244, + "grad_norm": 8.75, + "learning_rate": 3.6196544202455787e-06, + "loss": 0.84575796, + "memory(GiB)": 147.13, + "step": 51550, + "train_speed(iter/s)": 0.200671 + }, + { + "acc": 0.77210584, + "epoch": 1.2029338427215328, + "grad_norm": 4.5, + "learning_rate": 3.617838822234175e-06, + "loss": 0.82075291, + "memory(GiB)": 147.13, + "step": 51560, + "train_speed(iter/s)": 0.200692 + }, + { + "acc": 0.77745943, + "epoch": 1.2031671502938217, + "grad_norm": 6.71875, + "learning_rate": 3.616023421525875e-06, + "loss": 0.81119947, + "memory(GiB)": 147.13, + "step": 51570, + "train_speed(iter/s)": 0.200713 + }, + { + "acc": 0.78472643, + "epoch": 1.2034004578661106, + "grad_norm": 5.53125, + "learning_rate": 3.61420821837983e-06, + "loss": 0.76066618, + "memory(GiB)": 147.13, + "step": 51580, + "train_speed(iter/s)": 0.200733 + }, + { + "acc": 0.79031987, + "epoch": 1.2036337654383995, + "grad_norm": 4.8125, + "learning_rate": 3.61239321305516e-06, + "loss": 0.73573508, + "memory(GiB)": 147.13, + "step": 51590, + "train_speed(iter/s)": 0.200754 + }, + { + "acc": 0.78004036, + "epoch": 1.2038670730106884, + "grad_norm": 6.9375, + "learning_rate": 3.610578405810955e-06, + "loss": 0.80121174, + "memory(GiB)": 147.13, + "step": 51600, + "train_speed(iter/s)": 0.200773 + }, + { + "acc": 0.78116407, + "epoch": 1.2041003805829773, + "grad_norm": 5.15625, + "learning_rate": 3.6087637969062783e-06, + "loss": 0.77282748, + "memory(GiB)": 147.13, + "step": 51610, + "train_speed(iter/s)": 0.200793 + }, + { + "acc": 0.79813347, + "epoch": 1.2043336881552662, + "grad_norm": 7.09375, + "learning_rate": 3.606949386600166e-06, + "loss": 0.71265526, + "memory(GiB)": 147.13, + "step": 51620, + "train_speed(iter/s)": 0.200813 + }, + { + "acc": 0.76442337, + "epoch": 1.204566995727555, + "grad_norm": 7.375, + "learning_rate": 3.605135175151624e-06, + "loss": 0.8571002, + "memory(GiB)": 147.13, + "step": 51630, + "train_speed(iter/s)": 0.200833 + }, + { + "acc": 0.78852344, + "epoch": 1.204800303299844, + "grad_norm": 4.78125, + "learning_rate": 3.6033211628196308e-06, + "loss": 0.74523458, + "memory(GiB)": 147.13, + "step": 51640, + "train_speed(iter/s)": 0.200854 + }, + { + "acc": 0.77451267, + "epoch": 1.205033610872133, + "grad_norm": 6.15625, + "learning_rate": 3.601507349863137e-06, + "loss": 0.79993911, + "memory(GiB)": 147.13, + "step": 51650, + "train_speed(iter/s)": 0.200874 + }, + { + "acc": 0.77563667, + "epoch": 1.2052669184444218, + "grad_norm": 6.34375, + "learning_rate": 3.599693736541061e-06, + "loss": 0.81237659, + "memory(GiB)": 147.13, + "step": 51660, + "train_speed(iter/s)": 0.200894 + }, + { + "acc": 0.76454887, + "epoch": 1.2055002260167107, + "grad_norm": 5.71875, + "learning_rate": 3.5978803231122977e-06, + "loss": 0.86580725, + "memory(GiB)": 147.13, + "step": 51670, + "train_speed(iter/s)": 0.200914 + }, + { + "acc": 0.77970967, + "epoch": 1.2057335335889996, + "grad_norm": 5.125, + "learning_rate": 3.596067109835713e-06, + "loss": 0.78640342, + "memory(GiB)": 147.13, + "step": 51680, + "train_speed(iter/s)": 0.200932 + }, + { + "acc": 0.77567797, + "epoch": 1.2059668411612885, + "grad_norm": 3.65625, + "learning_rate": 3.5942540969701386e-06, + "loss": 0.79667077, + "memory(GiB)": 147.13, + "step": 51690, + "train_speed(iter/s)": 0.200952 + }, + { + "acc": 0.77737932, + "epoch": 1.2062001487335774, + "grad_norm": 4.84375, + "learning_rate": 3.592441284774383e-06, + "loss": 0.79587197, + "memory(GiB)": 147.13, + "step": 51700, + "train_speed(iter/s)": 0.200972 + }, + { + "acc": 0.770786, + "epoch": 1.2064334563058663, + "grad_norm": 6.46875, + "learning_rate": 3.5906286735072255e-06, + "loss": 0.82086658, + "memory(GiB)": 147.13, + "step": 51710, + "train_speed(iter/s)": 0.200992 + }, + { + "acc": 0.78262281, + "epoch": 1.2066667638781552, + "grad_norm": 4.4375, + "learning_rate": 3.5888162634274154e-06, + "loss": 0.77802362, + "memory(GiB)": 147.13, + "step": 51720, + "train_speed(iter/s)": 0.201013 + }, + { + "acc": 0.7677485, + "epoch": 1.206900071450444, + "grad_norm": 4.5625, + "learning_rate": 3.5870040547936748e-06, + "loss": 0.83916025, + "memory(GiB)": 147.13, + "step": 51730, + "train_speed(iter/s)": 0.201034 + }, + { + "acc": 0.77800913, + "epoch": 1.207133379022733, + "grad_norm": 6.09375, + "learning_rate": 3.585192047864694e-06, + "loss": 0.80633202, + "memory(GiB)": 147.13, + "step": 51740, + "train_speed(iter/s)": 0.201054 + }, + { + "acc": 0.75977621, + "epoch": 1.2073666865950219, + "grad_norm": 6.5, + "learning_rate": 3.5833802428991373e-06, + "loss": 0.86888771, + "memory(GiB)": 147.13, + "step": 51750, + "train_speed(iter/s)": 0.201074 + }, + { + "acc": 0.76785235, + "epoch": 1.2075999941673108, + "grad_norm": 5.3125, + "learning_rate": 3.581568640155639e-06, + "loss": 0.82541256, + "memory(GiB)": 147.13, + "step": 51760, + "train_speed(iter/s)": 0.201094 + }, + { + "acc": 0.78889942, + "epoch": 1.2078333017395995, + "grad_norm": 5.21875, + "learning_rate": 3.5797572398928053e-06, + "loss": 0.73905168, + "memory(GiB)": 147.13, + "step": 51770, + "train_speed(iter/s)": 0.201111 + }, + { + "acc": 0.78549714, + "epoch": 1.2080666093118886, + "grad_norm": 5.1875, + "learning_rate": 3.5779460423692136e-06, + "loss": 0.77860193, + "memory(GiB)": 147.13, + "step": 51780, + "train_speed(iter/s)": 0.201131 + }, + { + "acc": 0.77393236, + "epoch": 1.2082999168841773, + "grad_norm": 4.46875, + "learning_rate": 3.5761350478434133e-06, + "loss": 0.82066584, + "memory(GiB)": 147.13, + "step": 51790, + "train_speed(iter/s)": 0.20115 + }, + { + "acc": 0.76424036, + "epoch": 1.2085332244564664, + "grad_norm": 5.03125, + "learning_rate": 3.5743242565739183e-06, + "loss": 0.86626759, + "memory(GiB)": 147.13, + "step": 51800, + "train_speed(iter/s)": 0.20117 + }, + { + "acc": 0.77947626, + "epoch": 1.208766532028755, + "grad_norm": 5.71875, + "learning_rate": 3.572513668819223e-06, + "loss": 0.79311905, + "memory(GiB)": 147.13, + "step": 51810, + "train_speed(iter/s)": 0.20119 + }, + { + "acc": 0.77670922, + "epoch": 1.208999839601044, + "grad_norm": 6.03125, + "learning_rate": 3.570703284837786e-06, + "loss": 0.81235285, + "memory(GiB)": 147.13, + "step": 51820, + "train_speed(iter/s)": 0.201211 + }, + { + "acc": 0.79082127, + "epoch": 1.2092331471733329, + "grad_norm": 5.375, + "learning_rate": 3.5688931048880397e-06, + "loss": 0.75774207, + "memory(GiB)": 147.13, + "step": 51830, + "train_speed(iter/s)": 0.20123 + }, + { + "acc": 0.78081455, + "epoch": 1.2094664547456218, + "grad_norm": 5.375, + "learning_rate": 3.567083129228387e-06, + "loss": 0.76074839, + "memory(GiB)": 147.13, + "step": 51840, + "train_speed(iter/s)": 0.201251 + }, + { + "acc": 0.77981596, + "epoch": 1.2096997623179107, + "grad_norm": 6.0, + "learning_rate": 3.5652733581172015e-06, + "loss": 0.7848402, + "memory(GiB)": 147.13, + "step": 51850, + "train_speed(iter/s)": 0.201272 + }, + { + "acc": 0.78686342, + "epoch": 1.2099330698901996, + "grad_norm": 6.28125, + "learning_rate": 3.5634637918128267e-06, + "loss": 0.77134657, + "memory(GiB)": 147.13, + "step": 51860, + "train_speed(iter/s)": 0.201291 + }, + { + "acc": 0.7602006, + "epoch": 1.2101663774624885, + "grad_norm": 6.46875, + "learning_rate": 3.56165443057358e-06, + "loss": 0.87410669, + "memory(GiB)": 147.13, + "step": 51870, + "train_speed(iter/s)": 0.20131 + }, + { + "acc": 0.78266001, + "epoch": 1.2103996850347774, + "grad_norm": 3.96875, + "learning_rate": 3.5598452746577443e-06, + "loss": 0.76107483, + "memory(GiB)": 147.13, + "step": 51880, + "train_speed(iter/s)": 0.20133 + }, + { + "acc": 0.7808578, + "epoch": 1.2106329926070662, + "grad_norm": 5.40625, + "learning_rate": 3.5580363243235773e-06, + "loss": 0.76968455, + "memory(GiB)": 147.13, + "step": 51890, + "train_speed(iter/s)": 0.20135 + }, + { + "acc": 0.78987422, + "epoch": 1.2108663001793551, + "grad_norm": 5.3125, + "learning_rate": 3.556227579829306e-06, + "loss": 0.75738258, + "memory(GiB)": 147.13, + "step": 51900, + "train_speed(iter/s)": 0.201369 + }, + { + "acc": 0.78409944, + "epoch": 1.211099607751644, + "grad_norm": 7.03125, + "learning_rate": 3.5544190414331305e-06, + "loss": 0.77000666, + "memory(GiB)": 147.13, + "step": 51910, + "train_speed(iter/s)": 0.201389 + }, + { + "acc": 0.76755562, + "epoch": 1.211332915323933, + "grad_norm": 6.5625, + "learning_rate": 3.552610709393215e-06, + "loss": 0.83009071, + "memory(GiB)": 147.13, + "step": 51920, + "train_speed(iter/s)": 0.201409 + }, + { + "acc": 0.77854753, + "epoch": 1.2115662228962218, + "grad_norm": 4.71875, + "learning_rate": 3.5508025839676997e-06, + "loss": 0.79306674, + "memory(GiB)": 147.13, + "step": 51930, + "train_speed(iter/s)": 0.201429 + }, + { + "acc": 0.78213997, + "epoch": 1.2117995304685107, + "grad_norm": 7.90625, + "learning_rate": 3.5489946654146945e-06, + "loss": 0.77478514, + "memory(GiB)": 147.13, + "step": 51940, + "train_speed(iter/s)": 0.20145 + }, + { + "acc": 0.79072771, + "epoch": 1.2120328380407996, + "grad_norm": 6.75, + "learning_rate": 3.547186953992281e-06, + "loss": 0.74444256, + "memory(GiB)": 147.13, + "step": 51950, + "train_speed(iter/s)": 0.20147 + }, + { + "acc": 0.78262076, + "epoch": 1.2122661456130885, + "grad_norm": 9.25, + "learning_rate": 3.5453794499585057e-06, + "loss": 0.77354512, + "memory(GiB)": 147.13, + "step": 51960, + "train_speed(iter/s)": 0.20149 + }, + { + "acc": 0.79070244, + "epoch": 1.2124994531853774, + "grad_norm": 4.3125, + "learning_rate": 3.543572153571393e-06, + "loss": 0.73883228, + "memory(GiB)": 147.13, + "step": 51970, + "train_speed(iter/s)": 0.20151 + }, + { + "acc": 0.78741131, + "epoch": 1.2127327607576663, + "grad_norm": 5.40625, + "learning_rate": 3.541765065088931e-06, + "loss": 0.76005692, + "memory(GiB)": 147.13, + "step": 51980, + "train_speed(iter/s)": 0.20153 + }, + { + "acc": 0.77303057, + "epoch": 1.2129660683299552, + "grad_norm": 6.53125, + "learning_rate": 3.539958184769082e-06, + "loss": 0.80659695, + "memory(GiB)": 147.13, + "step": 51990, + "train_speed(iter/s)": 0.20155 + }, + { + "acc": 0.78834496, + "epoch": 1.2131993759022441, + "grad_norm": 5.28125, + "learning_rate": 3.53815151286978e-06, + "loss": 0.74369602, + "memory(GiB)": 147.13, + "step": 52000, + "train_speed(iter/s)": 0.20157 + }, + { + "epoch": 1.2131993759022441, + "eval_acc": 0.7444085240931393, + "eval_loss": 0.8053334355354309, + "eval_runtime": 1270.0, + "eval_samples_per_second": 28.339, + "eval_steps_per_second": 14.17, + "step": 52000 + }, + { + "acc": 0.77881718, + "epoch": 1.213432683474533, + "grad_norm": 5.96875, + "learning_rate": 3.536345049648924e-06, + "loss": 0.78626618, + "memory(GiB)": 147.13, + "step": 52010, + "train_speed(iter/s)": 0.200588 + }, + { + "acc": 0.77282524, + "epoch": 1.213665991046822, + "grad_norm": 6.09375, + "learning_rate": 3.5345387953643872e-06, + "loss": 0.82502003, + "memory(GiB)": 147.13, + "step": 52020, + "train_speed(iter/s)": 0.200607 + }, + { + "acc": 0.79441557, + "epoch": 1.2138992986191108, + "grad_norm": 4.40625, + "learning_rate": 3.5327327502740114e-06, + "loss": 0.73947439, + "memory(GiB)": 147.13, + "step": 52030, + "train_speed(iter/s)": 0.200626 + }, + { + "acc": 0.7964479, + "epoch": 1.2141326061913997, + "grad_norm": 5.375, + "learning_rate": 3.5309269146356097e-06, + "loss": 0.72861004, + "memory(GiB)": 147.13, + "step": 52040, + "train_speed(iter/s)": 0.200645 + }, + { + "acc": 0.78275375, + "epoch": 1.2143659137636886, + "grad_norm": 4.9375, + "learning_rate": 3.5291212887069624e-06, + "loss": 0.77264972, + "memory(GiB)": 147.13, + "step": 52050, + "train_speed(iter/s)": 0.200664 + }, + { + "acc": 0.77427526, + "epoch": 1.2145992213359775, + "grad_norm": 8.25, + "learning_rate": 3.5273158727458253e-06, + "loss": 0.82076292, + "memory(GiB)": 147.13, + "step": 52060, + "train_speed(iter/s)": 0.200684 + }, + { + "acc": 0.78050809, + "epoch": 1.2148325289082664, + "grad_norm": 5.0625, + "learning_rate": 3.5255106670099186e-06, + "loss": 0.79774323, + "memory(GiB)": 147.13, + "step": 52070, + "train_speed(iter/s)": 0.200704 + }, + { + "acc": 0.77670012, + "epoch": 1.2150658364805553, + "grad_norm": 5.375, + "learning_rate": 3.5237056717569363e-06, + "loss": 0.80653028, + "memory(GiB)": 147.13, + "step": 52080, + "train_speed(iter/s)": 0.200723 + }, + { + "acc": 0.77818155, + "epoch": 1.2152991440528442, + "grad_norm": 5.09375, + "learning_rate": 3.5219008872445414e-06, + "loss": 0.80688896, + "memory(GiB)": 147.13, + "step": 52090, + "train_speed(iter/s)": 0.200743 + }, + { + "acc": 0.79195747, + "epoch": 1.2155324516251331, + "grad_norm": 4.40625, + "learning_rate": 3.5200963137303644e-06, + "loss": 0.73866539, + "memory(GiB)": 147.13, + "step": 52100, + "train_speed(iter/s)": 0.200764 + }, + { + "acc": 0.77116313, + "epoch": 1.215765759197422, + "grad_norm": 4.375, + "learning_rate": 3.5182919514720087e-06, + "loss": 0.81336851, + "memory(GiB)": 147.13, + "step": 52110, + "train_speed(iter/s)": 0.200784 + }, + { + "acc": 0.79589829, + "epoch": 1.215999066769711, + "grad_norm": 4.9375, + "learning_rate": 3.5164878007270464e-06, + "loss": 0.72459707, + "memory(GiB)": 147.13, + "step": 52120, + "train_speed(iter/s)": 0.200803 + }, + { + "acc": 0.7925993, + "epoch": 1.2162323743419998, + "grad_norm": 8.5625, + "learning_rate": 3.5146838617530197e-06, + "loss": 0.7503953, + "memory(GiB)": 147.13, + "step": 52130, + "train_speed(iter/s)": 0.200822 + }, + { + "acc": 0.77429123, + "epoch": 1.2164656819142887, + "grad_norm": 5.34375, + "learning_rate": 3.5128801348074426e-06, + "loss": 0.80790281, + "memory(GiB)": 147.13, + "step": 52140, + "train_speed(iter/s)": 0.200843 + }, + { + "acc": 0.79118776, + "epoch": 1.2166989894865776, + "grad_norm": 7.59375, + "learning_rate": 3.511076620147792e-06, + "loss": 0.74122844, + "memory(GiB)": 147.13, + "step": 52150, + "train_speed(iter/s)": 0.200863 + }, + { + "acc": 0.79350772, + "epoch": 1.2169322970588663, + "grad_norm": 4.96875, + "learning_rate": 3.5092733180315206e-06, + "loss": 0.75740881, + "memory(GiB)": 147.13, + "step": 52160, + "train_speed(iter/s)": 0.200883 + }, + { + "acc": 0.80063057, + "epoch": 1.2171656046311554, + "grad_norm": 6.40625, + "learning_rate": 3.5074702287160523e-06, + "loss": 0.70421405, + "memory(GiB)": 147.13, + "step": 52170, + "train_speed(iter/s)": 0.200903 + }, + { + "acc": 0.78242369, + "epoch": 1.217398912203444, + "grad_norm": 5.9375, + "learning_rate": 3.5056673524587733e-06, + "loss": 0.79225969, + "memory(GiB)": 147.13, + "step": 52180, + "train_speed(iter/s)": 0.200923 + }, + { + "acc": 0.77852802, + "epoch": 1.2176322197757332, + "grad_norm": 4.40625, + "learning_rate": 3.503864689517046e-06, + "loss": 0.80126581, + "memory(GiB)": 147.13, + "step": 52190, + "train_speed(iter/s)": 0.200944 + }, + { + "acc": 0.78113604, + "epoch": 1.217865527348022, + "grad_norm": 4.71875, + "learning_rate": 3.5020622401481996e-06, + "loss": 0.8105751, + "memory(GiB)": 147.13, + "step": 52200, + "train_speed(iter/s)": 0.200964 + }, + { + "acc": 0.76554623, + "epoch": 1.2180988349203108, + "grad_norm": 5.71875, + "learning_rate": 3.500260004609533e-06, + "loss": 0.83915272, + "memory(GiB)": 147.13, + "step": 52210, + "train_speed(iter/s)": 0.200984 + }, + { + "acc": 0.76399746, + "epoch": 1.2183321424925997, + "grad_norm": 5.4375, + "learning_rate": 3.4984579831583166e-06, + "loss": 0.82813606, + "memory(GiB)": 147.13, + "step": 52220, + "train_speed(iter/s)": 0.201005 + }, + { + "acc": 0.79804144, + "epoch": 1.2185654500648886, + "grad_norm": 14.8125, + "learning_rate": 3.4966561760517852e-06, + "loss": 0.71784048, + "memory(GiB)": 147.13, + "step": 52230, + "train_speed(iter/s)": 0.201026 + }, + { + "acc": 0.77001858, + "epoch": 1.2187987576371775, + "grad_norm": 4.96875, + "learning_rate": 3.494854583547148e-06, + "loss": 0.8265419, + "memory(GiB)": 147.13, + "step": 52240, + "train_speed(iter/s)": 0.201046 + }, + { + "acc": 0.80135403, + "epoch": 1.2190320652094664, + "grad_norm": 5.8125, + "learning_rate": 3.4930532059015845e-06, + "loss": 0.70065279, + "memory(GiB)": 147.13, + "step": 52250, + "train_speed(iter/s)": 0.201066 + }, + { + "acc": 0.77459936, + "epoch": 1.2192653727817553, + "grad_norm": 7.46875, + "learning_rate": 3.491252043372236e-06, + "loss": 0.81747303, + "memory(GiB)": 147.13, + "step": 52260, + "train_speed(iter/s)": 0.201087 + }, + { + "acc": 0.79471178, + "epoch": 1.2194986803540442, + "grad_norm": 7.625, + "learning_rate": 3.4894510962162194e-06, + "loss": 0.72956462, + "memory(GiB)": 147.13, + "step": 52270, + "train_speed(iter/s)": 0.201106 + }, + { + "acc": 0.76919899, + "epoch": 1.219731987926333, + "grad_norm": 6.25, + "learning_rate": 3.4876503646906203e-06, + "loss": 0.82097034, + "memory(GiB)": 147.13, + "step": 52280, + "train_speed(iter/s)": 0.201125 + }, + { + "acc": 0.78185291, + "epoch": 1.219965295498622, + "grad_norm": 4.71875, + "learning_rate": 3.4858498490524924e-06, + "loss": 0.80037918, + "memory(GiB)": 147.13, + "step": 52290, + "train_speed(iter/s)": 0.201146 + }, + { + "acc": 0.7733079, + "epoch": 1.2201986030709109, + "grad_norm": 4.65625, + "learning_rate": 3.4840495495588593e-06, + "loss": 0.78827391, + "memory(GiB)": 147.13, + "step": 52300, + "train_speed(iter/s)": 0.201166 + }, + { + "acc": 0.78070698, + "epoch": 1.2204319106431998, + "grad_norm": 4.59375, + "learning_rate": 3.4822494664667117e-06, + "loss": 0.80994854, + "memory(GiB)": 147.13, + "step": 52310, + "train_speed(iter/s)": 0.201186 + }, + { + "acc": 0.78385916, + "epoch": 1.2206652182154887, + "grad_norm": 5.4375, + "learning_rate": 3.4804496000330124e-06, + "loss": 0.75973425, + "memory(GiB)": 147.13, + "step": 52320, + "train_speed(iter/s)": 0.201206 + }, + { + "acc": 0.78199615, + "epoch": 1.2208985257877776, + "grad_norm": 6.09375, + "learning_rate": 3.478649950514691e-06, + "loss": 0.76241202, + "memory(GiB)": 147.13, + "step": 52330, + "train_speed(iter/s)": 0.201227 + }, + { + "acc": 0.79231329, + "epoch": 1.2211318333600665, + "grad_norm": 8.8125, + "learning_rate": 3.4768505181686468e-06, + "loss": 0.71977391, + "memory(GiB)": 147.13, + "step": 52340, + "train_speed(iter/s)": 0.201246 + }, + { + "acc": 0.77791686, + "epoch": 1.2213651409323554, + "grad_norm": 6.875, + "learning_rate": 3.4750513032517493e-06, + "loss": 0.79545641, + "memory(GiB)": 147.13, + "step": 52350, + "train_speed(iter/s)": 0.201267 + }, + { + "acc": 0.78212147, + "epoch": 1.2215984485046443, + "grad_norm": 4.8125, + "learning_rate": 3.473252306020837e-06, + "loss": 0.7947474, + "memory(GiB)": 147.13, + "step": 52360, + "train_speed(iter/s)": 0.201287 + }, + { + "acc": 0.77962971, + "epoch": 1.2218317560769332, + "grad_norm": 6.71875, + "learning_rate": 3.471453526732712e-06, + "loss": 0.79630547, + "memory(GiB)": 147.13, + "step": 52370, + "train_speed(iter/s)": 0.201308 + }, + { + "acc": 0.78212519, + "epoch": 1.222065063649222, + "grad_norm": 5.90625, + "learning_rate": 3.4696549656441537e-06, + "loss": 0.77139959, + "memory(GiB)": 147.13, + "step": 52380, + "train_speed(iter/s)": 0.201328 + }, + { + "acc": 0.76968145, + "epoch": 1.222298371221511, + "grad_norm": 6.375, + "learning_rate": 3.467856623011903e-06, + "loss": 0.83397465, + "memory(GiB)": 147.13, + "step": 52390, + "train_speed(iter/s)": 0.201348 + }, + { + "acc": 0.79080944, + "epoch": 1.2225316787937999, + "grad_norm": 4.09375, + "learning_rate": 3.4660584990926748e-06, + "loss": 0.73990412, + "memory(GiB)": 147.13, + "step": 52400, + "train_speed(iter/s)": 0.201367 + }, + { + "acc": 0.77898731, + "epoch": 1.2227649863660888, + "grad_norm": 6.5, + "learning_rate": 3.4642605941431494e-06, + "loss": 0.78911119, + "memory(GiB)": 147.13, + "step": 52410, + "train_speed(iter/s)": 0.201387 + }, + { + "acc": 0.76685104, + "epoch": 1.2229982939383777, + "grad_norm": 4.1875, + "learning_rate": 3.462462908419979e-06, + "loss": 0.82818241, + "memory(GiB)": 147.13, + "step": 52420, + "train_speed(iter/s)": 0.201405 + }, + { + "acc": 0.78912749, + "epoch": 1.2232316015106666, + "grad_norm": 4.4375, + "learning_rate": 3.4606654421797814e-06, + "loss": 0.75287247, + "memory(GiB)": 147.13, + "step": 52430, + "train_speed(iter/s)": 0.201424 + }, + { + "acc": 0.77771249, + "epoch": 1.2234649090829555, + "grad_norm": 5.40625, + "learning_rate": 3.458868195679146e-06, + "loss": 0.81555805, + "memory(GiB)": 147.13, + "step": 52440, + "train_speed(iter/s)": 0.201444 + }, + { + "acc": 0.78952436, + "epoch": 1.2236982166552444, + "grad_norm": 4.28125, + "learning_rate": 3.4570711691746262e-06, + "loss": 0.772684, + "memory(GiB)": 147.13, + "step": 52450, + "train_speed(iter/s)": 0.201465 + }, + { + "acc": 0.77552791, + "epoch": 1.2239315242275333, + "grad_norm": 4.65625, + "learning_rate": 3.4552743629227494e-06, + "loss": 0.80711937, + "memory(GiB)": 147.13, + "step": 52460, + "train_speed(iter/s)": 0.201485 + }, + { + "acc": 0.7582983, + "epoch": 1.2241648317998222, + "grad_norm": 7.4375, + "learning_rate": 3.4534777771800083e-06, + "loss": 0.85153837, + "memory(GiB)": 147.13, + "step": 52470, + "train_speed(iter/s)": 0.201505 + }, + { + "acc": 0.77164292, + "epoch": 1.224398139372111, + "grad_norm": 5.0625, + "learning_rate": 3.4516814122028676e-06, + "loss": 0.84448309, + "memory(GiB)": 147.13, + "step": 52480, + "train_speed(iter/s)": 0.201523 + }, + { + "acc": 0.76962109, + "epoch": 1.2246314469444, + "grad_norm": 6.625, + "learning_rate": 3.449885268247753e-06, + "loss": 0.81848526, + "memory(GiB)": 147.13, + "step": 52490, + "train_speed(iter/s)": 0.201542 + }, + { + "acc": 0.78164902, + "epoch": 1.2248647545166889, + "grad_norm": 5.28125, + "learning_rate": 3.448089345571066e-06, + "loss": 0.78736157, + "memory(GiB)": 147.13, + "step": 52500, + "train_speed(iter/s)": 0.201562 + }, + { + "epoch": 1.2248647545166889, + "eval_acc": 0.7444283819779446, + "eval_loss": 0.8053562045097351, + "eval_runtime": 1270.1507, + "eval_samples_per_second": 28.336, + "eval_steps_per_second": 14.168, + "step": 52500 + }, + { + "acc": 0.77160788, + "epoch": 1.2250980620889778, + "grad_norm": 4.78125, + "learning_rate": 3.4462936444291744e-06, + "loss": 0.80924244, + "memory(GiB)": 147.13, + "step": 52510, + "train_speed(iter/s)": 0.200588 + }, + { + "acc": 0.78200493, + "epoch": 1.2253313696612667, + "grad_norm": 4.40625, + "learning_rate": 3.4444981650784147e-06, + "loss": 0.78650246, + "memory(GiB)": 147.13, + "step": 52520, + "train_speed(iter/s)": 0.200607 + }, + { + "acc": 0.77312975, + "epoch": 1.2255646772335556, + "grad_norm": 5.5, + "learning_rate": 3.4427029077750895e-06, + "loss": 0.81262064, + "memory(GiB)": 147.13, + "step": 52530, + "train_speed(iter/s)": 0.200627 + }, + { + "acc": 0.78849087, + "epoch": 1.2257979848058445, + "grad_norm": 9.75, + "learning_rate": 3.4409078727754707e-06, + "loss": 0.75306044, + "memory(GiB)": 147.13, + "step": 52540, + "train_speed(iter/s)": 0.200647 + }, + { + "acc": 0.78517804, + "epoch": 1.2260312923781331, + "grad_norm": 4.9375, + "learning_rate": 3.4391130603358013e-06, + "loss": 0.7650878, + "memory(GiB)": 147.13, + "step": 52550, + "train_speed(iter/s)": 0.200666 + }, + { + "acc": 0.81452112, + "epoch": 1.2262645999504223, + "grad_norm": 4.4375, + "learning_rate": 3.4373184707122886e-06, + "loss": 0.67917662, + "memory(GiB)": 147.13, + "step": 52560, + "train_speed(iter/s)": 0.200686 + }, + { + "acc": 0.80121727, + "epoch": 1.226497907522711, + "grad_norm": 4.375, + "learning_rate": 3.4355241041611096e-06, + "loss": 0.69461622, + "memory(GiB)": 147.13, + "step": 52570, + "train_speed(iter/s)": 0.200706 + }, + { + "acc": 0.78457727, + "epoch": 1.226731215095, + "grad_norm": 5.125, + "learning_rate": 3.4337299609384122e-06, + "loss": 0.76386065, + "memory(GiB)": 147.13, + "step": 52580, + "train_speed(iter/s)": 0.200725 + }, + { + "acc": 0.7923645, + "epoch": 1.2269645226672887, + "grad_norm": 6.375, + "learning_rate": 3.431936041300308e-06, + "loss": 0.75402551, + "memory(GiB)": 147.13, + "step": 52590, + "train_speed(iter/s)": 0.200744 + }, + { + "acc": 0.78272791, + "epoch": 1.2271978302395776, + "grad_norm": 4.96875, + "learning_rate": 3.4301423455028777e-06, + "loss": 0.77303209, + "memory(GiB)": 147.13, + "step": 52600, + "train_speed(iter/s)": 0.200763 + }, + { + "acc": 0.77151518, + "epoch": 1.2274311378118665, + "grad_norm": 4.4375, + "learning_rate": 3.4283488738021707e-06, + "loss": 0.83244991, + "memory(GiB)": 147.13, + "step": 52610, + "train_speed(iter/s)": 0.200781 + }, + { + "acc": 0.79330654, + "epoch": 1.2276644453841554, + "grad_norm": 5.3125, + "learning_rate": 3.4265556264542054e-06, + "loss": 0.72327337, + "memory(GiB)": 147.13, + "step": 52620, + "train_speed(iter/s)": 0.200798 + }, + { + "acc": 0.7816957, + "epoch": 1.2278977529564443, + "grad_norm": 6.5, + "learning_rate": 3.424762603714967e-06, + "loss": 0.794205, + "memory(GiB)": 147.13, + "step": 52630, + "train_speed(iter/s)": 0.200818 + }, + { + "acc": 0.7910058, + "epoch": 1.2281310605287332, + "grad_norm": 4.5625, + "learning_rate": 3.4229698058404106e-06, + "loss": 0.7762475, + "memory(GiB)": 147.13, + "step": 52640, + "train_speed(iter/s)": 0.200836 + }, + { + "acc": 0.7753264, + "epoch": 1.2283643681010221, + "grad_norm": 8.4375, + "learning_rate": 3.4211772330864552e-06, + "loss": 0.81560392, + "memory(GiB)": 147.13, + "step": 52650, + "train_speed(iter/s)": 0.200856 + }, + { + "acc": 0.76706119, + "epoch": 1.228597675673311, + "grad_norm": 7.78125, + "learning_rate": 3.4193848857089924e-06, + "loss": 0.85505123, + "memory(GiB)": 147.13, + "step": 52660, + "train_speed(iter/s)": 0.200878 + }, + { + "acc": 0.78135409, + "epoch": 1.2288309832456, + "grad_norm": 5.03125, + "learning_rate": 3.4175927639638767e-06, + "loss": 0.76477842, + "memory(GiB)": 147.13, + "step": 52670, + "train_speed(iter/s)": 0.200897 + }, + { + "acc": 0.79110146, + "epoch": 1.2290642908178888, + "grad_norm": 5.3125, + "learning_rate": 3.4158008681069343e-06, + "loss": 0.76938314, + "memory(GiB)": 147.13, + "step": 52680, + "train_speed(iter/s)": 0.200918 + }, + { + "acc": 0.7797955, + "epoch": 1.2292975983901777, + "grad_norm": 6.0, + "learning_rate": 3.4140091983939584e-06, + "loss": 0.78467655, + "memory(GiB)": 147.13, + "step": 52690, + "train_speed(iter/s)": 0.200938 + }, + { + "acc": 0.78217092, + "epoch": 1.2295309059624666, + "grad_norm": 5.25, + "learning_rate": 3.4122177550807077e-06, + "loss": 0.79160671, + "memory(GiB)": 147.13, + "step": 52700, + "train_speed(iter/s)": 0.200959 + }, + { + "acc": 0.79885192, + "epoch": 1.2297642135347555, + "grad_norm": 5.25, + "learning_rate": 3.410426538422914e-06, + "loss": 0.71652822, + "memory(GiB)": 147.13, + "step": 52710, + "train_speed(iter/s)": 0.20098 + }, + { + "acc": 0.78403473, + "epoch": 1.2299975211070444, + "grad_norm": 6.59375, + "learning_rate": 3.4086355486762678e-06, + "loss": 0.78251448, + "memory(GiB)": 147.13, + "step": 52720, + "train_speed(iter/s)": 0.201 + }, + { + "acc": 0.76983075, + "epoch": 1.2302308286793333, + "grad_norm": 7.3125, + "learning_rate": 3.406844786096435e-06, + "loss": 0.8267252, + "memory(GiB)": 147.13, + "step": 52730, + "train_speed(iter/s)": 0.201019 + }, + { + "acc": 0.77623463, + "epoch": 1.2304641362516222, + "grad_norm": 5.375, + "learning_rate": 3.405054250939047e-06, + "loss": 0.81395969, + "memory(GiB)": 147.13, + "step": 52740, + "train_speed(iter/s)": 0.201039 + }, + { + "acc": 0.76780348, + "epoch": 1.230697443823911, + "grad_norm": 7.53125, + "learning_rate": 3.4032639434597003e-06, + "loss": 0.83625088, + "memory(GiB)": 147.13, + "step": 52750, + "train_speed(iter/s)": 0.201059 + }, + { + "acc": 0.79240489, + "epoch": 1.2309307513962, + "grad_norm": 5.5, + "learning_rate": 3.4014738639139622e-06, + "loss": 0.75073404, + "memory(GiB)": 147.13, + "step": 52760, + "train_speed(iter/s)": 0.201079 + }, + { + "acc": 0.79068785, + "epoch": 1.231164058968489, + "grad_norm": 5.4375, + "learning_rate": 3.399684012557365e-06, + "loss": 0.75344276, + "memory(GiB)": 147.13, + "step": 52770, + "train_speed(iter/s)": 0.201097 + }, + { + "acc": 0.81060543, + "epoch": 1.2313973665407778, + "grad_norm": 4.625, + "learning_rate": 3.3978943896454107e-06, + "loss": 0.66089754, + "memory(GiB)": 147.13, + "step": 52780, + "train_speed(iter/s)": 0.201117 + }, + { + "acc": 0.79075351, + "epoch": 1.2316306741130667, + "grad_norm": 6.84375, + "learning_rate": 3.396104995433567e-06, + "loss": 0.73719893, + "memory(GiB)": 147.13, + "step": 52790, + "train_speed(iter/s)": 0.201137 + }, + { + "acc": 0.78999639, + "epoch": 1.2318639816853556, + "grad_norm": 6.25, + "learning_rate": 3.3943158301772695e-06, + "loss": 0.74546919, + "memory(GiB)": 147.13, + "step": 52800, + "train_speed(iter/s)": 0.201157 + }, + { + "acc": 0.79035387, + "epoch": 1.2320972892576445, + "grad_norm": 3.6875, + "learning_rate": 3.39252689413192e-06, + "loss": 0.75522537, + "memory(GiB)": 147.13, + "step": 52810, + "train_speed(iter/s)": 0.201176 + }, + { + "acc": 0.77618947, + "epoch": 1.2323305968299334, + "grad_norm": 4.9375, + "learning_rate": 3.3907381875528916e-06, + "loss": 0.81478825, + "memory(GiB)": 147.13, + "step": 52820, + "train_speed(iter/s)": 0.201194 + }, + { + "acc": 0.75931473, + "epoch": 1.2325639044022223, + "grad_norm": 5.125, + "learning_rate": 3.388949710695517e-06, + "loss": 0.86494942, + "memory(GiB)": 147.13, + "step": 52830, + "train_speed(iter/s)": 0.201214 + }, + { + "acc": 0.7774549, + "epoch": 1.2327972119745112, + "grad_norm": 8.5625, + "learning_rate": 3.387161463815104e-06, + "loss": 0.80290031, + "memory(GiB)": 147.13, + "step": 52840, + "train_speed(iter/s)": 0.201234 + }, + { + "acc": 0.7622488, + "epoch": 1.2330305195468, + "grad_norm": 5.0, + "learning_rate": 3.3853734471669232e-06, + "loss": 0.86460876, + "memory(GiB)": 147.13, + "step": 52850, + "train_speed(iter/s)": 0.201254 + }, + { + "acc": 0.78175044, + "epoch": 1.233263827119089, + "grad_norm": 7.09375, + "learning_rate": 3.3835856610062135e-06, + "loss": 0.76835246, + "memory(GiB)": 147.13, + "step": 52860, + "train_speed(iter/s)": 0.201275 + }, + { + "acc": 0.77455969, + "epoch": 1.233497134691378, + "grad_norm": 6.3125, + "learning_rate": 3.381798105588181e-06, + "loss": 0.79920435, + "memory(GiB)": 147.13, + "step": 52870, + "train_speed(iter/s)": 0.201295 + }, + { + "acc": 0.7796454, + "epoch": 1.2337304422636668, + "grad_norm": 6.21875, + "learning_rate": 3.3800107811680004e-06, + "loss": 0.78245277, + "memory(GiB)": 147.13, + "step": 52880, + "train_speed(iter/s)": 0.201315 + }, + { + "acc": 0.77907124, + "epoch": 1.2339637498359557, + "grad_norm": 5.46875, + "learning_rate": 3.378223688000809e-06, + "loss": 0.78705425, + "memory(GiB)": 147.13, + "step": 52890, + "train_speed(iter/s)": 0.201334 + }, + { + "acc": 0.7818182, + "epoch": 1.2341970574082446, + "grad_norm": 4.96875, + "learning_rate": 3.3764368263417146e-06, + "loss": 0.76016235, + "memory(GiB)": 147.13, + "step": 52900, + "train_speed(iter/s)": 0.201355 + }, + { + "acc": 0.78806572, + "epoch": 1.2344303649805335, + "grad_norm": 8.0, + "learning_rate": 3.3746501964457916e-06, + "loss": 0.75710969, + "memory(GiB)": 147.13, + "step": 52910, + "train_speed(iter/s)": 0.201375 + }, + { + "acc": 0.78649473, + "epoch": 1.2346636725528224, + "grad_norm": 5.28125, + "learning_rate": 3.3728637985680814e-06, + "loss": 0.7589705, + "memory(GiB)": 147.13, + "step": 52920, + "train_speed(iter/s)": 0.201395 + }, + { + "acc": 0.79017825, + "epoch": 1.2348969801251113, + "grad_norm": 6.28125, + "learning_rate": 3.371077632963592e-06, + "loss": 0.74173431, + "memory(GiB)": 147.13, + "step": 52930, + "train_speed(iter/s)": 0.201413 + }, + { + "acc": 0.77001181, + "epoch": 1.2351302876974, + "grad_norm": 9.25, + "learning_rate": 3.3692916998872972e-06, + "loss": 0.82475138, + "memory(GiB)": 147.13, + "step": 52940, + "train_speed(iter/s)": 0.201433 + }, + { + "acc": 0.78582697, + "epoch": 1.235363595269689, + "grad_norm": 5.40625, + "learning_rate": 3.367505999594138e-06, + "loss": 0.76669807, + "memory(GiB)": 147.13, + "step": 52950, + "train_speed(iter/s)": 0.201453 + }, + { + "acc": 0.7884789, + "epoch": 1.2355969028419778, + "grad_norm": 7.0, + "learning_rate": 3.3657205323390234e-06, + "loss": 0.76776428, + "memory(GiB)": 147.13, + "step": 52960, + "train_speed(iter/s)": 0.201474 + }, + { + "acc": 0.77210593, + "epoch": 1.2358302104142667, + "grad_norm": 4.9375, + "learning_rate": 3.3639352983768276e-06, + "loss": 0.8105154, + "memory(GiB)": 147.13, + "step": 52970, + "train_speed(iter/s)": 0.201495 + }, + { + "acc": 0.80149565, + "epoch": 1.2360635179865556, + "grad_norm": 9.625, + "learning_rate": 3.3621502979623923e-06, + "loss": 0.69630108, + "memory(GiB)": 147.13, + "step": 52980, + "train_speed(iter/s)": 0.201515 + }, + { + "acc": 0.77142386, + "epoch": 1.2362968255588445, + "grad_norm": 10.125, + "learning_rate": 3.360365531350527e-06, + "loss": 0.830303, + "memory(GiB)": 147.13, + "step": 52990, + "train_speed(iter/s)": 0.201536 + }, + { + "acc": 0.78944068, + "epoch": 1.2365301331311334, + "grad_norm": 5.59375, + "learning_rate": 3.358580998796005e-06, + "loss": 0.75569763, + "memory(GiB)": 147.13, + "step": 53000, + "train_speed(iter/s)": 0.201555 + }, + { + "epoch": 1.2365301331311334, + "eval_acc": 0.7444263001029247, + "eval_loss": 0.8053351044654846, + "eval_runtime": 1270.5258, + "eval_samples_per_second": 28.328, + "eval_steps_per_second": 14.164, + "step": 53000 + }, + { + "acc": 0.7961484, + "epoch": 1.2367634407034223, + "grad_norm": 5.375, + "learning_rate": 3.3567967005535696e-06, + "loss": 0.73044991, + "memory(GiB)": 147.13, + "step": 53010, + "train_speed(iter/s)": 0.200588 + }, + { + "acc": 0.77993622, + "epoch": 1.2369967482757112, + "grad_norm": 4.75, + "learning_rate": 3.355012636877927e-06, + "loss": 0.77450337, + "memory(GiB)": 147.13, + "step": 53020, + "train_speed(iter/s)": 0.200607 + }, + { + "acc": 0.78480597, + "epoch": 1.237230055848, + "grad_norm": 5.0, + "learning_rate": 3.353228808023752e-06, + "loss": 0.77071924, + "memory(GiB)": 147.13, + "step": 53030, + "train_speed(iter/s)": 0.200626 + }, + { + "acc": 0.8039917, + "epoch": 1.237463363420289, + "grad_norm": 4.46875, + "learning_rate": 3.351445214245687e-06, + "loss": 0.68680391, + "memory(GiB)": 147.13, + "step": 53040, + "train_speed(iter/s)": 0.200645 + }, + { + "acc": 0.78955479, + "epoch": 1.2376966709925779, + "grad_norm": 5.40625, + "learning_rate": 3.3496618557983405e-06, + "loss": 0.73595805, + "memory(GiB)": 147.13, + "step": 53050, + "train_speed(iter/s)": 0.200665 + }, + { + "acc": 0.79907274, + "epoch": 1.2379299785648668, + "grad_norm": 4.46875, + "learning_rate": 3.347878732936283e-06, + "loss": 0.73365273, + "memory(GiB)": 147.13, + "step": 53060, + "train_speed(iter/s)": 0.200684 + }, + { + "acc": 0.77731886, + "epoch": 1.2381632861371556, + "grad_norm": 4.65625, + "learning_rate": 3.346095845914056e-06, + "loss": 0.81012163, + "memory(GiB)": 147.13, + "step": 53070, + "train_speed(iter/s)": 0.200704 + }, + { + "acc": 0.76856656, + "epoch": 1.2383965937094445, + "grad_norm": 5.40625, + "learning_rate": 3.3443131949861667e-06, + "loss": 0.84078465, + "memory(GiB)": 147.13, + "step": 53080, + "train_speed(iter/s)": 0.200722 + }, + { + "acc": 0.78171597, + "epoch": 1.2386299012817334, + "grad_norm": 12.3125, + "learning_rate": 3.3425307804070896e-06, + "loss": 0.79482059, + "memory(GiB)": 147.13, + "step": 53090, + "train_speed(iter/s)": 0.200742 + }, + { + "acc": 0.78333645, + "epoch": 1.2388632088540223, + "grad_norm": 6.78125, + "learning_rate": 3.3407486024312596e-06, + "loss": 0.78734274, + "memory(GiB)": 147.13, + "step": 53100, + "train_speed(iter/s)": 0.200761 + }, + { + "acc": 0.7784162, + "epoch": 1.2390965164263112, + "grad_norm": 6.375, + "learning_rate": 3.3389666613130856e-06, + "loss": 0.79651308, + "memory(GiB)": 147.13, + "step": 53110, + "train_speed(iter/s)": 0.200781 + }, + { + "acc": 0.81201239, + "epoch": 1.2393298239986001, + "grad_norm": 3.671875, + "learning_rate": 3.337184957306938e-06, + "loss": 0.6528636, + "memory(GiB)": 147.13, + "step": 53120, + "train_speed(iter/s)": 0.200801 + }, + { + "acc": 0.79003553, + "epoch": 1.239563131570889, + "grad_norm": 7.8125, + "learning_rate": 3.3354034906671545e-06, + "loss": 0.77086411, + "memory(GiB)": 147.13, + "step": 53130, + "train_speed(iter/s)": 0.200821 + }, + { + "acc": 0.78376465, + "epoch": 1.239796439143178, + "grad_norm": 7.25, + "learning_rate": 3.333622261648039e-06, + "loss": 0.76185646, + "memory(GiB)": 147.13, + "step": 53140, + "train_speed(iter/s)": 0.20084 + }, + { + "acc": 0.7993577, + "epoch": 1.2400297467154668, + "grad_norm": 5.9375, + "learning_rate": 3.3318412705038626e-06, + "loss": 0.70314708, + "memory(GiB)": 147.13, + "step": 53150, + "train_speed(iter/s)": 0.200859 + }, + { + "acc": 0.77541742, + "epoch": 1.2402630542877557, + "grad_norm": 6.5625, + "learning_rate": 3.330060517488861e-06, + "loss": 0.81968384, + "memory(GiB)": 147.13, + "step": 53160, + "train_speed(iter/s)": 0.20088 + }, + { + "acc": 0.78192739, + "epoch": 1.2404963618600446, + "grad_norm": 6.625, + "learning_rate": 3.328280002857234e-06, + "loss": 0.77632504, + "memory(GiB)": 147.13, + "step": 53170, + "train_speed(iter/s)": 0.2009 + }, + { + "acc": 0.79778252, + "epoch": 1.2407296694323335, + "grad_norm": 6.5625, + "learning_rate": 3.3264997268631515e-06, + "loss": 0.75389705, + "memory(GiB)": 147.13, + "step": 53180, + "train_speed(iter/s)": 0.200921 + }, + { + "acc": 0.77507801, + "epoch": 1.2409629770046224, + "grad_norm": 5.0, + "learning_rate": 3.324719689760746e-06, + "loss": 0.80741787, + "memory(GiB)": 147.13, + "step": 53190, + "train_speed(iter/s)": 0.200941 + }, + { + "acc": 0.78459139, + "epoch": 1.2411962845769113, + "grad_norm": 5.78125, + "learning_rate": 3.3229398918041184e-06, + "loss": 0.76652284, + "memory(GiB)": 147.13, + "step": 53200, + "train_speed(iter/s)": 0.200958 + }, + { + "acc": 0.7865819, + "epoch": 1.2414295921492002, + "grad_norm": 4.125, + "learning_rate": 3.321160333247334e-06, + "loss": 0.76798935, + "memory(GiB)": 147.13, + "step": 53210, + "train_speed(iter/s)": 0.200978 + }, + { + "acc": 0.77011223, + "epoch": 1.2416628997214891, + "grad_norm": 8.125, + "learning_rate": 3.319381014344424e-06, + "loss": 0.81456013, + "memory(GiB)": 147.13, + "step": 53220, + "train_speed(iter/s)": 0.200995 + }, + { + "acc": 0.78842793, + "epoch": 1.241896207293778, + "grad_norm": 5.65625, + "learning_rate": 3.3176019353493873e-06, + "loss": 0.74759521, + "memory(GiB)": 147.13, + "step": 53230, + "train_speed(iter/s)": 0.201014 + }, + { + "acc": 0.75929575, + "epoch": 1.242129514866067, + "grad_norm": 5.09375, + "learning_rate": 3.315823096516184e-06, + "loss": 0.8730217, + "memory(GiB)": 147.13, + "step": 53240, + "train_speed(iter/s)": 0.201033 + }, + { + "acc": 0.78181305, + "epoch": 1.2423628224383558, + "grad_norm": 7.375, + "learning_rate": 3.314044498098745e-06, + "loss": 0.77940979, + "memory(GiB)": 147.13, + "step": 53250, + "train_speed(iter/s)": 0.201051 + }, + { + "acc": 0.79055223, + "epoch": 1.2425961300106447, + "grad_norm": 5.09375, + "learning_rate": 3.3122661403509643e-06, + "loss": 0.73277559, + "memory(GiB)": 147.13, + "step": 53260, + "train_speed(iter/s)": 0.20107 + }, + { + "acc": 0.78648615, + "epoch": 1.2428294375829336, + "grad_norm": 7.40625, + "learning_rate": 3.3104880235267014e-06, + "loss": 0.7687428, + "memory(GiB)": 147.13, + "step": 53270, + "train_speed(iter/s)": 0.201089 + }, + { + "acc": 0.77741222, + "epoch": 1.2430627451552225, + "grad_norm": 7.5625, + "learning_rate": 3.3087101478797846e-06, + "loss": 0.80210009, + "memory(GiB)": 147.13, + "step": 53280, + "train_speed(iter/s)": 0.201108 + }, + { + "acc": 0.77117157, + "epoch": 1.2432960527275114, + "grad_norm": 4.46875, + "learning_rate": 3.3069325136640007e-06, + "loss": 0.81199608, + "memory(GiB)": 147.13, + "step": 53290, + "train_speed(iter/s)": 0.201126 + }, + { + "acc": 0.77286162, + "epoch": 1.2435293602998003, + "grad_norm": 6.21875, + "learning_rate": 3.305155121133109e-06, + "loss": 0.80843086, + "memory(GiB)": 147.13, + "step": 53300, + "train_speed(iter/s)": 0.201146 + }, + { + "acc": 0.78854752, + "epoch": 1.243762667872089, + "grad_norm": 5.5625, + "learning_rate": 3.303377970540832e-06, + "loss": 0.76747618, + "memory(GiB)": 147.13, + "step": 53310, + "train_speed(iter/s)": 0.201164 + }, + { + "acc": 0.77600183, + "epoch": 1.2439959754443781, + "grad_norm": 5.75, + "learning_rate": 3.3016010621408558e-06, + "loss": 0.7977457, + "memory(GiB)": 147.13, + "step": 53320, + "train_speed(iter/s)": 0.201183 + }, + { + "acc": 0.79537611, + "epoch": 1.2442292830166668, + "grad_norm": 9.9375, + "learning_rate": 3.299824396186835e-06, + "loss": 0.73316031, + "memory(GiB)": 147.13, + "step": 53330, + "train_speed(iter/s)": 0.201202 + }, + { + "acc": 0.78379049, + "epoch": 1.244462590588956, + "grad_norm": 4.90625, + "learning_rate": 3.2980479729323867e-06, + "loss": 0.77379413, + "memory(GiB)": 147.13, + "step": 53340, + "train_speed(iter/s)": 0.201222 + }, + { + "acc": 0.75966711, + "epoch": 1.2446958981612446, + "grad_norm": 4.625, + "learning_rate": 3.2962717926310966e-06, + "loss": 0.87534847, + "memory(GiB)": 147.13, + "step": 53350, + "train_speed(iter/s)": 0.20124 + }, + { + "acc": 0.79759836, + "epoch": 1.2449292057335335, + "grad_norm": 3.828125, + "learning_rate": 3.2944958555365135e-06, + "loss": 0.71614137, + "memory(GiB)": 147.13, + "step": 53360, + "train_speed(iter/s)": 0.201259 + }, + { + "acc": 0.80230551, + "epoch": 1.2451625133058224, + "grad_norm": 5.625, + "learning_rate": 3.292720161902152e-06, + "loss": 0.69462061, + "memory(GiB)": 147.13, + "step": 53370, + "train_speed(iter/s)": 0.201278 + }, + { + "acc": 0.79200816, + "epoch": 1.2453958208781113, + "grad_norm": 6.5625, + "learning_rate": 3.2909447119814907e-06, + "loss": 0.74009314, + "memory(GiB)": 147.13, + "step": 53380, + "train_speed(iter/s)": 0.201299 + }, + { + "acc": 0.78173537, + "epoch": 1.2456291284504002, + "grad_norm": 4.8125, + "learning_rate": 3.289169506027977e-06, + "loss": 0.769735, + "memory(GiB)": 147.13, + "step": 53390, + "train_speed(iter/s)": 0.201318 + }, + { + "acc": 0.78138733, + "epoch": 1.245862436022689, + "grad_norm": 4.84375, + "learning_rate": 3.287394544295018e-06, + "loss": 0.79002943, + "memory(GiB)": 147.13, + "step": 53400, + "train_speed(iter/s)": 0.201338 + }, + { + "acc": 0.77280626, + "epoch": 1.246095743594978, + "grad_norm": 6.21875, + "learning_rate": 3.2856198270359895e-06, + "loss": 0.82154875, + "memory(GiB)": 147.13, + "step": 53410, + "train_speed(iter/s)": 0.201357 + }, + { + "acc": 0.79443498, + "epoch": 1.2463290511672669, + "grad_norm": 4.90625, + "learning_rate": 3.2838453545042326e-06, + "loss": 0.72624025, + "memory(GiB)": 147.13, + "step": 53420, + "train_speed(iter/s)": 0.201376 + }, + { + "acc": 0.78037777, + "epoch": 1.2465623587395558, + "grad_norm": 4.71875, + "learning_rate": 3.2820711269530535e-06, + "loss": 0.7707921, + "memory(GiB)": 147.13, + "step": 53430, + "train_speed(iter/s)": 0.201395 + }, + { + "acc": 0.78268867, + "epoch": 1.2467956663118447, + "grad_norm": 5.90625, + "learning_rate": 3.280297144635721e-06, + "loss": 0.76652794, + "memory(GiB)": 147.13, + "step": 53440, + "train_speed(iter/s)": 0.201415 + }, + { + "acc": 0.76735125, + "epoch": 1.2470289738841336, + "grad_norm": 4.125, + "learning_rate": 3.278523407805474e-06, + "loss": 0.83181791, + "memory(GiB)": 147.13, + "step": 53450, + "train_speed(iter/s)": 0.201435 + }, + { + "acc": 0.77343864, + "epoch": 1.2472622814564225, + "grad_norm": 6.125, + "learning_rate": 3.276749916715508e-06, + "loss": 0.81363611, + "memory(GiB)": 147.13, + "step": 53460, + "train_speed(iter/s)": 0.201455 + }, + { + "acc": 0.7868227, + "epoch": 1.2474955890287114, + "grad_norm": 5.28125, + "learning_rate": 3.274976671618992e-06, + "loss": 0.75913687, + "memory(GiB)": 147.13, + "step": 53470, + "train_speed(iter/s)": 0.201475 + }, + { + "acc": 0.7895021, + "epoch": 1.2477288966010003, + "grad_norm": 4.40625, + "learning_rate": 3.2732036727690543e-06, + "loss": 0.75617399, + "memory(GiB)": 147.13, + "step": 53480, + "train_speed(iter/s)": 0.201495 + }, + { + "acc": 0.78445072, + "epoch": 1.2479622041732892, + "grad_norm": 5.25, + "learning_rate": 3.2714309204187905e-06, + "loss": 0.77543073, + "memory(GiB)": 147.13, + "step": 53490, + "train_speed(iter/s)": 0.201513 + }, + { + "acc": 0.76975474, + "epoch": 1.248195511745578, + "grad_norm": 5.25, + "learning_rate": 3.2696584148212606e-06, + "loss": 0.8373764, + "memory(GiB)": 147.13, + "step": 53500, + "train_speed(iter/s)": 0.201533 + }, + { + "epoch": 1.248195511745578, + "eval_acc": 0.744429663131803, + "eval_loss": 0.8053871393203735, + "eval_runtime": 1269.4157, + "eval_samples_per_second": 28.352, + "eval_steps_per_second": 14.177, + "step": 53500 + }, + { + "acc": 0.7716815, + "epoch": 1.248428819317867, + "grad_norm": 4.34375, + "learning_rate": 3.2678861562294916e-06, + "loss": 0.81136675, + "memory(GiB)": 147.13, + "step": 53510, + "train_speed(iter/s)": 0.20058 + }, + { + "acc": 0.77584715, + "epoch": 1.2486621268901559, + "grad_norm": 5.5, + "learning_rate": 3.2661141448964688e-06, + "loss": 0.78764925, + "memory(GiB)": 147.13, + "step": 53520, + "train_speed(iter/s)": 0.200599 + }, + { + "acc": 0.78310165, + "epoch": 1.2488954344624448, + "grad_norm": 4.25, + "learning_rate": 3.2643423810751497e-06, + "loss": 0.76774969, + "memory(GiB)": 147.13, + "step": 53530, + "train_speed(iter/s)": 0.200618 + }, + { + "acc": 0.76813574, + "epoch": 1.2491287420347337, + "grad_norm": 5.78125, + "learning_rate": 3.2625708650184496e-06, + "loss": 0.83579845, + "memory(GiB)": 147.13, + "step": 53540, + "train_speed(iter/s)": 0.200637 + }, + { + "acc": 0.77997684, + "epoch": 1.2493620496070226, + "grad_norm": 6.34375, + "learning_rate": 3.260799596979254e-06, + "loss": 0.79803414, + "memory(GiB)": 147.13, + "step": 53550, + "train_speed(iter/s)": 0.200657 + }, + { + "acc": 0.78893514, + "epoch": 1.2495953571793115, + "grad_norm": 4.4375, + "learning_rate": 3.25902857721041e-06, + "loss": 0.76923342, + "memory(GiB)": 147.13, + "step": 53560, + "train_speed(iter/s)": 0.200676 + }, + { + "acc": 0.76379528, + "epoch": 1.2498286647516004, + "grad_norm": 4.34375, + "learning_rate": 3.257257805964732e-06, + "loss": 0.85577602, + "memory(GiB)": 147.13, + "step": 53570, + "train_speed(iter/s)": 0.200696 + }, + { + "acc": 0.7713376, + "epoch": 1.2500619723238893, + "grad_norm": 5.34375, + "learning_rate": 3.255487283494995e-06, + "loss": 0.83352032, + "memory(GiB)": 147.13, + "step": 53580, + "train_speed(iter/s)": 0.200715 + }, + { + "acc": 0.78212161, + "epoch": 1.2502952798961782, + "grad_norm": 6.6875, + "learning_rate": 3.253717010053943e-06, + "loss": 0.7555759, + "memory(GiB)": 147.13, + "step": 53590, + "train_speed(iter/s)": 0.200734 + }, + { + "acc": 0.7807415, + "epoch": 1.250528587468467, + "grad_norm": 7.53125, + "learning_rate": 3.25194698589428e-06, + "loss": 0.7950223, + "memory(GiB)": 147.13, + "step": 53600, + "train_speed(iter/s)": 0.200754 + }, + { + "acc": 0.76310434, + "epoch": 1.250761895040756, + "grad_norm": 4.8125, + "learning_rate": 3.2501772112686757e-06, + "loss": 0.87415972, + "memory(GiB)": 147.13, + "step": 53610, + "train_speed(iter/s)": 0.200773 + }, + { + "acc": 0.7808527, + "epoch": 1.2509952026130449, + "grad_norm": 5.59375, + "learning_rate": 3.2484076864297687e-06, + "loss": 0.78480973, + "memory(GiB)": 147.13, + "step": 53620, + "train_speed(iter/s)": 0.200793 + }, + { + "acc": 0.77754536, + "epoch": 1.2512285101853338, + "grad_norm": 5.0, + "learning_rate": 3.246638411630154e-06, + "loss": 0.80107412, + "memory(GiB)": 147.13, + "step": 53630, + "train_speed(iter/s)": 0.200812 + }, + { + "acc": 0.76862297, + "epoch": 1.2514618177576227, + "grad_norm": 4.3125, + "learning_rate": 3.2448693871223968e-06, + "loss": 0.85014858, + "memory(GiB)": 147.13, + "step": 53640, + "train_speed(iter/s)": 0.200832 + }, + { + "acc": 0.78168688, + "epoch": 1.2516951253299116, + "grad_norm": 7.71875, + "learning_rate": 3.2431006131590244e-06, + "loss": 0.77133665, + "memory(GiB)": 147.13, + "step": 53650, + "train_speed(iter/s)": 0.200852 + }, + { + "acc": 0.77619853, + "epoch": 1.2519284329022005, + "grad_norm": 4.78125, + "learning_rate": 3.2413320899925287e-06, + "loss": 0.782967, + "memory(GiB)": 147.13, + "step": 53660, + "train_speed(iter/s)": 0.200871 + }, + { + "acc": 0.78052235, + "epoch": 1.2521617404744894, + "grad_norm": 6.5, + "learning_rate": 3.2395638178753673e-06, + "loss": 0.77000246, + "memory(GiB)": 147.13, + "step": 53670, + "train_speed(iter/s)": 0.200891 + }, + { + "acc": 0.76536398, + "epoch": 1.252395048046778, + "grad_norm": 5.96875, + "learning_rate": 3.2377957970599594e-06, + "loss": 0.83446522, + "memory(GiB)": 147.13, + "step": 53680, + "train_speed(iter/s)": 0.20091 + }, + { + "acc": 0.79172969, + "epoch": 1.2526283556190672, + "grad_norm": 6.09375, + "learning_rate": 3.2360280277986887e-06, + "loss": 0.73861761, + "memory(GiB)": 147.13, + "step": 53690, + "train_speed(iter/s)": 0.200929 + }, + { + "acc": 0.76554937, + "epoch": 1.2528616631913558, + "grad_norm": 6.1875, + "learning_rate": 3.234260510343905e-06, + "loss": 0.84061546, + "memory(GiB)": 147.13, + "step": 53700, + "train_speed(iter/s)": 0.200949 + }, + { + "acc": 0.77454815, + "epoch": 1.253094970763645, + "grad_norm": 4.78125, + "learning_rate": 3.23249324494792e-06, + "loss": 0.82709484, + "memory(GiB)": 147.13, + "step": 53710, + "train_speed(iter/s)": 0.200968 + }, + { + "acc": 0.7816751, + "epoch": 1.2533282783359336, + "grad_norm": 5.3125, + "learning_rate": 3.230726231863013e-06, + "loss": 0.78138809, + "memory(GiB)": 147.13, + "step": 53720, + "train_speed(iter/s)": 0.200986 + }, + { + "acc": 0.78555498, + "epoch": 1.2535615859082228, + "grad_norm": 6.78125, + "learning_rate": 3.2289594713414207e-06, + "loss": 0.78290424, + "memory(GiB)": 147.13, + "step": 53730, + "train_speed(iter/s)": 0.201005 + }, + { + "acc": 0.79429498, + "epoch": 1.2537948934805114, + "grad_norm": 5.59375, + "learning_rate": 3.2271929636353494e-06, + "loss": 0.72328415, + "memory(GiB)": 147.13, + "step": 53740, + "train_speed(iter/s)": 0.201025 + }, + { + "acc": 0.78629532, + "epoch": 1.2540282010528006, + "grad_norm": 4.15625, + "learning_rate": 3.2254267089969688e-06, + "loss": 0.76473923, + "memory(GiB)": 147.13, + "step": 53750, + "train_speed(iter/s)": 0.201043 + }, + { + "acc": 0.78774004, + "epoch": 1.2542615086250892, + "grad_norm": 7.0, + "learning_rate": 3.2236607076784086e-06, + "loss": 0.76563559, + "memory(GiB)": 147.13, + "step": 53760, + "train_speed(iter/s)": 0.201062 + }, + { + "acc": 0.78109026, + "epoch": 1.2544948161973781, + "grad_norm": 6.8125, + "learning_rate": 3.2218949599317664e-06, + "loss": 0.79619112, + "memory(GiB)": 147.13, + "step": 53770, + "train_speed(iter/s)": 0.201082 + }, + { + "acc": 0.77077236, + "epoch": 1.254728123769667, + "grad_norm": 5.03125, + "learning_rate": 3.220129466009102e-06, + "loss": 0.83952332, + "memory(GiB)": 147.13, + "step": 53780, + "train_speed(iter/s)": 0.201101 + }, + { + "acc": 0.7884572, + "epoch": 1.254961431341956, + "grad_norm": 5.625, + "learning_rate": 3.2183642261624393e-06, + "loss": 0.75626945, + "memory(GiB)": 147.13, + "step": 53790, + "train_speed(iter/s)": 0.20112 + }, + { + "acc": 0.75669336, + "epoch": 1.2551947389142448, + "grad_norm": 5.0, + "learning_rate": 3.216599240643765e-06, + "loss": 0.88252993, + "memory(GiB)": 147.13, + "step": 53800, + "train_speed(iter/s)": 0.20114 + }, + { + "acc": 0.77265863, + "epoch": 1.2554280464865337, + "grad_norm": 9.5, + "learning_rate": 3.2148345097050332e-06, + "loss": 0.83709316, + "memory(GiB)": 147.13, + "step": 53810, + "train_speed(iter/s)": 0.201159 + }, + { + "acc": 0.76388264, + "epoch": 1.2556613540588226, + "grad_norm": 15.5625, + "learning_rate": 3.213070033598155e-06, + "loss": 0.85438976, + "memory(GiB)": 147.13, + "step": 53820, + "train_speed(iter/s)": 0.20118 + }, + { + "acc": 0.80244446, + "epoch": 1.2558946616311115, + "grad_norm": 5.0625, + "learning_rate": 3.211305812575011e-06, + "loss": 0.70825286, + "memory(GiB)": 147.13, + "step": 53830, + "train_speed(iter/s)": 0.201199 + }, + { + "acc": 0.79701781, + "epoch": 1.2561279692034004, + "grad_norm": 5.4375, + "learning_rate": 3.209541846887442e-06, + "loss": 0.73176861, + "memory(GiB)": 147.13, + "step": 53840, + "train_speed(iter/s)": 0.201219 + }, + { + "acc": 0.79489713, + "epoch": 1.2563612767756893, + "grad_norm": 4.0, + "learning_rate": 3.207778136787256e-06, + "loss": 0.70806804, + "memory(GiB)": 147.13, + "step": 53850, + "train_speed(iter/s)": 0.201238 + }, + { + "acc": 0.78195076, + "epoch": 1.2565945843479782, + "grad_norm": 4.5625, + "learning_rate": 3.2060146825262196e-06, + "loss": 0.79273672, + "memory(GiB)": 147.13, + "step": 53860, + "train_speed(iter/s)": 0.201258 + }, + { + "acc": 0.79850488, + "epoch": 1.2568278919202671, + "grad_norm": 4.46875, + "learning_rate": 3.2042514843560644e-06, + "loss": 0.71020679, + "memory(GiB)": 147.13, + "step": 53870, + "train_speed(iter/s)": 0.201279 + }, + { + "acc": 0.77073278, + "epoch": 1.257061199492556, + "grad_norm": 5.84375, + "learning_rate": 3.2024885425284893e-06, + "loss": 0.81626215, + "memory(GiB)": 147.13, + "step": 53880, + "train_speed(iter/s)": 0.201299 + }, + { + "acc": 0.7916851, + "epoch": 1.257294507064845, + "grad_norm": 4.28125, + "learning_rate": 3.200725857295153e-06, + "loss": 0.73321352, + "memory(GiB)": 147.13, + "step": 53890, + "train_speed(iter/s)": 0.201319 + }, + { + "acc": 0.77081394, + "epoch": 1.2575278146371338, + "grad_norm": 4.78125, + "learning_rate": 3.1989634289076776e-06, + "loss": 0.81450729, + "memory(GiB)": 147.13, + "step": 53900, + "train_speed(iter/s)": 0.201336 + }, + { + "acc": 0.77993765, + "epoch": 1.2577611222094227, + "grad_norm": 4.625, + "learning_rate": 3.197201257617649e-06, + "loss": 0.78307796, + "memory(GiB)": 147.13, + "step": 53910, + "train_speed(iter/s)": 0.201356 + }, + { + "acc": 0.78636661, + "epoch": 1.2579944297817116, + "grad_norm": 5.4375, + "learning_rate": 3.195439343676617e-06, + "loss": 0.77123709, + "memory(GiB)": 147.13, + "step": 53920, + "train_speed(iter/s)": 0.201376 + }, + { + "acc": 0.78531208, + "epoch": 1.2582277373540005, + "grad_norm": 4.5, + "learning_rate": 3.1936776873360947e-06, + "loss": 0.78743391, + "memory(GiB)": 147.13, + "step": 53930, + "train_speed(iter/s)": 0.201395 + }, + { + "acc": 0.78387508, + "epoch": 1.2584610449262894, + "grad_norm": 9.875, + "learning_rate": 3.1919162888475586e-06, + "loss": 0.77252707, + "memory(GiB)": 147.13, + "step": 53940, + "train_speed(iter/s)": 0.201416 + }, + { + "acc": 0.79411707, + "epoch": 1.2586943524985783, + "grad_norm": 5.5, + "learning_rate": 3.190155148462446e-06, + "loss": 0.72856312, + "memory(GiB)": 147.13, + "step": 53950, + "train_speed(iter/s)": 0.201436 + }, + { + "acc": 0.76296749, + "epoch": 1.2589276600708672, + "grad_norm": 4.96875, + "learning_rate": 3.188394266432162e-06, + "loss": 0.8607048, + "memory(GiB)": 147.13, + "step": 53960, + "train_speed(iter/s)": 0.201455 + }, + { + "acc": 0.77531281, + "epoch": 1.259160967643156, + "grad_norm": 5.625, + "learning_rate": 3.186633643008069e-06, + "loss": 0.80578537, + "memory(GiB)": 147.13, + "step": 53970, + "train_speed(iter/s)": 0.201475 + }, + { + "acc": 0.78390398, + "epoch": 1.259394275215445, + "grad_norm": 4.46875, + "learning_rate": 3.1848732784414965e-06, + "loss": 0.75349741, + "memory(GiB)": 147.13, + "step": 53980, + "train_speed(iter/s)": 0.201495 + }, + { + "acc": 0.7925087, + "epoch": 1.259627582787734, + "grad_norm": 5.15625, + "learning_rate": 3.183113172983736e-06, + "loss": 0.75053062, + "memory(GiB)": 147.13, + "step": 53990, + "train_speed(iter/s)": 0.201513 + }, + { + "acc": 0.79274836, + "epoch": 1.2598608903600228, + "grad_norm": 6.0625, + "learning_rate": 3.181353326886042e-06, + "loss": 0.74251266, + "memory(GiB)": 147.13, + "step": 54000, + "train_speed(iter/s)": 0.201533 + }, + { + "epoch": 1.2598608903600228, + "eval_acc": 0.7445457677002215, + "eval_loss": 0.8053033947944641, + "eval_runtime": 1272.3358, + "eval_samples_per_second": 28.287, + "eval_steps_per_second": 14.144, + "step": 54000 + }, + { + "acc": 0.78218327, + "epoch": 1.2600941979323117, + "grad_norm": 6.46875, + "learning_rate": 3.1795937403996324e-06, + "loss": 0.76867433, + "memory(GiB)": 147.13, + "step": 54010, + "train_speed(iter/s)": 0.200585 + }, + { + "acc": 0.78693171, + "epoch": 1.2603275055046006, + "grad_norm": 5.40625, + "learning_rate": 3.1778344137756887e-06, + "loss": 0.75604353, + "memory(GiB)": 147.13, + "step": 54020, + "train_speed(iter/s)": 0.200605 + }, + { + "acc": 0.79023509, + "epoch": 1.2605608130768895, + "grad_norm": 4.84375, + "learning_rate": 3.176075347265352e-06, + "loss": 0.7607203, + "memory(GiB)": 147.13, + "step": 54030, + "train_speed(iter/s)": 0.200623 + }, + { + "acc": 0.77073202, + "epoch": 1.2607941206491784, + "grad_norm": 4.75, + "learning_rate": 3.17431654111973e-06, + "loss": 0.82227421, + "memory(GiB)": 147.13, + "step": 54040, + "train_speed(iter/s)": 0.200642 + }, + { + "acc": 0.78074164, + "epoch": 1.2610274282214673, + "grad_norm": 6.25, + "learning_rate": 3.1725579955898904e-06, + "loss": 0.76902056, + "memory(GiB)": 147.13, + "step": 54050, + "train_speed(iter/s)": 0.200661 + }, + { + "acc": 0.77640028, + "epoch": 1.2612607357937562, + "grad_norm": 6.5625, + "learning_rate": 3.170799710926867e-06, + "loss": 0.80473099, + "memory(GiB)": 147.13, + "step": 54060, + "train_speed(iter/s)": 0.200681 + }, + { + "acc": 0.76053233, + "epoch": 1.2614940433660449, + "grad_norm": 5.53125, + "learning_rate": 3.1690416873816533e-06, + "loss": 0.87321119, + "memory(GiB)": 147.13, + "step": 54070, + "train_speed(iter/s)": 0.200699 + }, + { + "acc": 0.78000383, + "epoch": 1.261727350938334, + "grad_norm": 4.875, + "learning_rate": 3.1672839252052083e-06, + "loss": 0.77255969, + "memory(GiB)": 147.13, + "step": 54080, + "train_speed(iter/s)": 0.200719 + }, + { + "acc": 0.79675488, + "epoch": 1.2619606585106227, + "grad_norm": 5.125, + "learning_rate": 3.165526424648449e-06, + "loss": 0.71965032, + "memory(GiB)": 147.13, + "step": 54090, + "train_speed(iter/s)": 0.200738 + }, + { + "acc": 0.7872025, + "epoch": 1.2621939660829118, + "grad_norm": 5.4375, + "learning_rate": 3.1637691859622612e-06, + "loss": 0.78567123, + "memory(GiB)": 147.13, + "step": 54100, + "train_speed(iter/s)": 0.200757 + }, + { + "acc": 0.7730463, + "epoch": 1.2624272736552005, + "grad_norm": 5.46875, + "learning_rate": 3.1620122093974864e-06, + "loss": 0.83145771, + "memory(GiB)": 147.13, + "step": 54110, + "train_speed(iter/s)": 0.200776 + }, + { + "acc": 0.78709173, + "epoch": 1.2626605812274896, + "grad_norm": 4.8125, + "learning_rate": 3.160255495204936e-06, + "loss": 0.76123199, + "memory(GiB)": 147.13, + "step": 54120, + "train_speed(iter/s)": 0.200795 + }, + { + "acc": 0.77258062, + "epoch": 1.2628938887997783, + "grad_norm": 7.15625, + "learning_rate": 3.158499043635378e-06, + "loss": 0.81969166, + "memory(GiB)": 147.13, + "step": 54130, + "train_speed(iter/s)": 0.200816 + }, + { + "acc": 0.78251991, + "epoch": 1.2631271963720674, + "grad_norm": 7.46875, + "learning_rate": 3.156742854939547e-06, + "loss": 0.79453034, + "memory(GiB)": 147.13, + "step": 54140, + "train_speed(iter/s)": 0.200835 + }, + { + "acc": 0.77685394, + "epoch": 1.263360503944356, + "grad_norm": 7.5625, + "learning_rate": 3.1549869293681385e-06, + "loss": 0.81942472, + "memory(GiB)": 147.13, + "step": 54150, + "train_speed(iter/s)": 0.200855 + }, + { + "acc": 0.77604828, + "epoch": 1.263593811516645, + "grad_norm": 5.0, + "learning_rate": 3.1532312671718102e-06, + "loss": 0.79661341, + "memory(GiB)": 147.13, + "step": 54160, + "train_speed(iter/s)": 0.200875 + }, + { + "acc": 0.77102032, + "epoch": 1.2638271190889339, + "grad_norm": 4.90625, + "learning_rate": 3.1514758686011816e-06, + "loss": 0.84676743, + "memory(GiB)": 147.13, + "step": 54170, + "train_speed(iter/s)": 0.200894 + }, + { + "acc": 0.77398539, + "epoch": 1.2640604266612228, + "grad_norm": 6.25, + "learning_rate": 3.149720733906836e-06, + "loss": 0.82969208, + "memory(GiB)": 147.13, + "step": 54180, + "train_speed(iter/s)": 0.200912 + }, + { + "acc": 0.77549362, + "epoch": 1.2642937342335117, + "grad_norm": 5.96875, + "learning_rate": 3.1479658633393194e-06, + "loss": 0.82915554, + "memory(GiB)": 147.13, + "step": 54190, + "train_speed(iter/s)": 0.200931 + }, + { + "acc": 0.78671007, + "epoch": 1.2645270418058006, + "grad_norm": 5.71875, + "learning_rate": 3.146211257149136e-06, + "loss": 0.77449756, + "memory(GiB)": 147.13, + "step": 54200, + "train_speed(iter/s)": 0.200951 + }, + { + "acc": 0.79366622, + "epoch": 1.2647603493780895, + "grad_norm": 7.75, + "learning_rate": 3.1444569155867573e-06, + "loss": 0.7421545, + "memory(GiB)": 147.13, + "step": 54210, + "train_speed(iter/s)": 0.200969 + }, + { + "acc": 0.77127199, + "epoch": 1.2649936569503784, + "grad_norm": 5.28125, + "learning_rate": 3.1427028389026147e-06, + "loss": 0.83733711, + "memory(GiB)": 147.13, + "step": 54220, + "train_speed(iter/s)": 0.200989 + }, + { + "acc": 0.77401648, + "epoch": 1.2652269645226673, + "grad_norm": 5.03125, + "learning_rate": 3.140949027347102e-06, + "loss": 0.82600069, + "memory(GiB)": 147.13, + "step": 54230, + "train_speed(iter/s)": 0.201007 + }, + { + "acc": 0.78014426, + "epoch": 1.2654602720949562, + "grad_norm": 7.84375, + "learning_rate": 3.139195481170577e-06, + "loss": 0.78453398, + "memory(GiB)": 147.13, + "step": 54240, + "train_speed(iter/s)": 0.201025 + }, + { + "acc": 0.79164677, + "epoch": 1.265693579667245, + "grad_norm": 5.6875, + "learning_rate": 3.1374422006233553e-06, + "loss": 0.72470675, + "memory(GiB)": 147.13, + "step": 54250, + "train_speed(iter/s)": 0.201044 + }, + { + "acc": 0.78896961, + "epoch": 1.265926887239534, + "grad_norm": 5.625, + "learning_rate": 3.1356891859557187e-06, + "loss": 0.76291442, + "memory(GiB)": 147.13, + "step": 54260, + "train_speed(iter/s)": 0.201063 + }, + { + "acc": 0.78453579, + "epoch": 1.2661601948118228, + "grad_norm": 5.46875, + "learning_rate": 3.1339364374179092e-06, + "loss": 0.76488409, + "memory(GiB)": 147.13, + "step": 54270, + "train_speed(iter/s)": 0.201082 + }, + { + "acc": 0.76442003, + "epoch": 1.2663935023841117, + "grad_norm": 7.03125, + "learning_rate": 3.1321839552601308e-06, + "loss": 0.83735924, + "memory(GiB)": 147.13, + "step": 54280, + "train_speed(iter/s)": 0.201102 + }, + { + "acc": 0.7594471, + "epoch": 1.2666268099564006, + "grad_norm": 5.5, + "learning_rate": 3.1304317397325503e-06, + "loss": 0.87646704, + "memory(GiB)": 147.13, + "step": 54290, + "train_speed(iter/s)": 0.201121 + }, + { + "acc": 0.78775072, + "epoch": 1.2668601175286895, + "grad_norm": 6.09375, + "learning_rate": 3.128679791085297e-06, + "loss": 0.74031334, + "memory(GiB)": 147.13, + "step": 54300, + "train_speed(iter/s)": 0.20114 + }, + { + "acc": 0.78536415, + "epoch": 1.2670934251009784, + "grad_norm": 4.5625, + "learning_rate": 3.1269281095684594e-06, + "loss": 0.78184915, + "memory(GiB)": 147.13, + "step": 54310, + "train_speed(iter/s)": 0.20116 + }, + { + "acc": 0.76834879, + "epoch": 1.2673267326732673, + "grad_norm": 5.09375, + "learning_rate": 3.1251766954320906e-06, + "loss": 0.83152952, + "memory(GiB)": 147.13, + "step": 54320, + "train_speed(iter/s)": 0.201178 + }, + { + "acc": 0.78719902, + "epoch": 1.2675600402455562, + "grad_norm": 8.5625, + "learning_rate": 3.123425548926203e-06, + "loss": 0.75794401, + "memory(GiB)": 147.13, + "step": 54330, + "train_speed(iter/s)": 0.201198 + }, + { + "acc": 0.77632055, + "epoch": 1.2677933478178451, + "grad_norm": 7.1875, + "learning_rate": 3.121674670300773e-06, + "loss": 0.81995049, + "memory(GiB)": 147.13, + "step": 54340, + "train_speed(iter/s)": 0.201219 + }, + { + "acc": 0.79015565, + "epoch": 1.268026655390134, + "grad_norm": 3.25, + "learning_rate": 3.1199240598057377e-06, + "loss": 0.7804615, + "memory(GiB)": 147.13, + "step": 54350, + "train_speed(iter/s)": 0.201239 + }, + { + "acc": 0.76458197, + "epoch": 1.268259962962423, + "grad_norm": 5.46875, + "learning_rate": 3.1181737176909967e-06, + "loss": 0.85183592, + "memory(GiB)": 147.13, + "step": 54360, + "train_speed(iter/s)": 0.201259 + }, + { + "acc": 0.76942658, + "epoch": 1.2684932705347118, + "grad_norm": 4.96875, + "learning_rate": 3.116423644206411e-06, + "loss": 0.83050613, + "memory(GiB)": 147.13, + "step": 54370, + "train_speed(iter/s)": 0.201278 + }, + { + "acc": 0.76355877, + "epoch": 1.2687265781070007, + "grad_norm": 7.28125, + "learning_rate": 3.1146738396018043e-06, + "loss": 0.86715822, + "memory(GiB)": 147.13, + "step": 54380, + "train_speed(iter/s)": 0.201298 + }, + { + "acc": 0.77561574, + "epoch": 1.2689598856792896, + "grad_norm": 5.5, + "learning_rate": 3.112924304126958e-06, + "loss": 0.81895523, + "memory(GiB)": 147.13, + "step": 54390, + "train_speed(iter/s)": 0.201318 + }, + { + "acc": 0.79888744, + "epoch": 1.2691931932515785, + "grad_norm": 4.125, + "learning_rate": 3.111175038031619e-06, + "loss": 0.70149899, + "memory(GiB)": 147.13, + "step": 54400, + "train_speed(iter/s)": 0.201337 + }, + { + "acc": 0.78107786, + "epoch": 1.2694265008238674, + "grad_norm": 4.96875, + "learning_rate": 3.1094260415654955e-06, + "loss": 0.79072466, + "memory(GiB)": 147.13, + "step": 54410, + "train_speed(iter/s)": 0.201357 + }, + { + "acc": 0.77758923, + "epoch": 1.2696598083961563, + "grad_norm": 5.75, + "learning_rate": 3.1076773149782557e-06, + "loss": 0.80385323, + "memory(GiB)": 147.13, + "step": 54420, + "train_speed(iter/s)": 0.201377 + }, + { + "acc": 0.79401674, + "epoch": 1.2698931159684452, + "grad_norm": 6.0, + "learning_rate": 3.105928858519529e-06, + "loss": 0.72925024, + "memory(GiB)": 147.13, + "step": 54430, + "train_speed(iter/s)": 0.201397 + }, + { + "acc": 0.79929819, + "epoch": 1.270126423540734, + "grad_norm": 4.5, + "learning_rate": 3.1041806724389067e-06, + "loss": 0.70583544, + "memory(GiB)": 147.13, + "step": 54440, + "train_speed(iter/s)": 0.201414 + }, + { + "acc": 0.78400726, + "epoch": 1.270359731113023, + "grad_norm": 6.46875, + "learning_rate": 3.1024327569859425e-06, + "loss": 0.79743586, + "memory(GiB)": 147.13, + "step": 54450, + "train_speed(iter/s)": 0.201434 + }, + { + "acc": 0.75207291, + "epoch": 1.2705930386853117, + "grad_norm": 5.96875, + "learning_rate": 3.1006851124101524e-06, + "loss": 0.90693626, + "memory(GiB)": 147.13, + "step": 54460, + "train_speed(iter/s)": 0.201453 + }, + { + "acc": 0.77595568, + "epoch": 1.2708263462576008, + "grad_norm": 18.375, + "learning_rate": 3.0989377389610097e-06, + "loss": 0.82628527, + "memory(GiB)": 147.13, + "step": 54470, + "train_speed(iter/s)": 0.201473 + }, + { + "acc": 0.77218075, + "epoch": 1.2710596538298895, + "grad_norm": 4.9375, + "learning_rate": 3.0971906368879524e-06, + "loss": 0.8140975, + "memory(GiB)": 147.13, + "step": 54480, + "train_speed(iter/s)": 0.201491 + }, + { + "acc": 0.80995159, + "epoch": 1.2712929614021786, + "grad_norm": 4.6875, + "learning_rate": 3.095443806440379e-06, + "loss": 0.68186216, + "memory(GiB)": 147.13, + "step": 54490, + "train_speed(iter/s)": 0.20151 + }, + { + "acc": 0.78228736, + "epoch": 1.2715262689744673, + "grad_norm": 5.65625, + "learning_rate": 3.0936972478676493e-06, + "loss": 0.77988653, + "memory(GiB)": 147.13, + "step": 54500, + "train_speed(iter/s)": 0.201529 + }, + { + "epoch": 1.2715262689744673, + "eval_acc": 0.7445822805851863, + "eval_loss": 0.80512934923172, + "eval_runtime": 1270.2737, + "eval_samples_per_second": 28.333, + "eval_steps_per_second": 14.167, + "step": 54500 + }, + { + "acc": 0.78444986, + "epoch": 1.2717595765467564, + "grad_norm": 4.90625, + "learning_rate": 3.0919509614190836e-06, + "loss": 0.77315531, + "memory(GiB)": 147.13, + "step": 54510, + "train_speed(iter/s)": 0.200592 + }, + { + "acc": 0.77153635, + "epoch": 1.271992884119045, + "grad_norm": 4.625, + "learning_rate": 3.0902049473439643e-06, + "loss": 0.80885544, + "memory(GiB)": 147.13, + "step": 54520, + "train_speed(iter/s)": 0.200612 + }, + { + "acc": 0.7847683, + "epoch": 1.2722261916913342, + "grad_norm": 9.9375, + "learning_rate": 3.0884592058915342e-06, + "loss": 0.79745474, + "memory(GiB)": 147.13, + "step": 54530, + "train_speed(iter/s)": 0.200631 + }, + { + "acc": 0.76501255, + "epoch": 1.272459499263623, + "grad_norm": 5.28125, + "learning_rate": 3.0867137373109972e-06, + "loss": 0.86392994, + "memory(GiB)": 147.13, + "step": 54540, + "train_speed(iter/s)": 0.20065 + }, + { + "acc": 0.78538918, + "epoch": 1.2726928068359118, + "grad_norm": 9.4375, + "learning_rate": 3.0849685418515174e-06, + "loss": 0.77651844, + "memory(GiB)": 147.13, + "step": 54550, + "train_speed(iter/s)": 0.20067 + }, + { + "acc": 0.76775522, + "epoch": 1.2729261144082007, + "grad_norm": 5.53125, + "learning_rate": 3.0832236197622223e-06, + "loss": 0.8431736, + "memory(GiB)": 147.13, + "step": 54560, + "train_speed(iter/s)": 0.20069 + }, + { + "acc": 0.79953661, + "epoch": 1.2731594219804896, + "grad_norm": 6.0625, + "learning_rate": 3.0814789712921977e-06, + "loss": 0.70614519, + "memory(GiB)": 147.13, + "step": 54570, + "train_speed(iter/s)": 0.20071 + }, + { + "acc": 0.78566427, + "epoch": 1.2733927295527785, + "grad_norm": 4.34375, + "learning_rate": 3.0797345966904933e-06, + "loss": 0.74611931, + "memory(GiB)": 147.13, + "step": 54580, + "train_speed(iter/s)": 0.200729 + }, + { + "acc": 0.78306842, + "epoch": 1.2736260371250674, + "grad_norm": 5.40625, + "learning_rate": 3.0779904962061173e-06, + "loss": 0.75449877, + "memory(GiB)": 147.13, + "step": 54590, + "train_speed(iter/s)": 0.200746 + }, + { + "acc": 0.78306794, + "epoch": 1.2738593446973563, + "grad_norm": 5.625, + "learning_rate": 3.076246670088041e-06, + "loss": 0.77245879, + "memory(GiB)": 147.13, + "step": 54600, + "train_speed(iter/s)": 0.200765 + }, + { + "acc": 0.77822618, + "epoch": 1.2740926522696452, + "grad_norm": 6.59375, + "learning_rate": 3.074503118585192e-06, + "loss": 0.78750162, + "memory(GiB)": 147.13, + "step": 54610, + "train_speed(iter/s)": 0.200784 + }, + { + "acc": 0.76630201, + "epoch": 1.274325959841934, + "grad_norm": 5.90625, + "learning_rate": 3.072759841946464e-06, + "loss": 0.84698448, + "memory(GiB)": 147.13, + "step": 54620, + "train_speed(iter/s)": 0.200802 + }, + { + "acc": 0.757019, + "epoch": 1.274559267414223, + "grad_norm": 6.5, + "learning_rate": 3.0710168404207086e-06, + "loss": 0.88410797, + "memory(GiB)": 147.13, + "step": 54630, + "train_speed(iter/s)": 0.200821 + }, + { + "acc": 0.7815589, + "epoch": 1.2747925749865119, + "grad_norm": 3.984375, + "learning_rate": 3.0692741142567385e-06, + "loss": 0.78019571, + "memory(GiB)": 147.13, + "step": 54640, + "train_speed(iter/s)": 0.200841 + }, + { + "acc": 0.78222027, + "epoch": 1.2750258825588008, + "grad_norm": 6.3125, + "learning_rate": 3.0675316637033296e-06, + "loss": 0.80412626, + "memory(GiB)": 147.13, + "step": 54650, + "train_speed(iter/s)": 0.20086 + }, + { + "acc": 0.77893991, + "epoch": 1.2752591901310897, + "grad_norm": 4.125, + "learning_rate": 3.0657894890092134e-06, + "loss": 0.79963918, + "memory(GiB)": 147.13, + "step": 54660, + "train_speed(iter/s)": 0.200879 + }, + { + "acc": 0.79304705, + "epoch": 1.2754924977033786, + "grad_norm": 7.5, + "learning_rate": 3.0640475904230848e-06, + "loss": 0.75699606, + "memory(GiB)": 147.13, + "step": 54670, + "train_speed(iter/s)": 0.200896 + }, + { + "acc": 0.78414087, + "epoch": 1.2757258052756675, + "grad_norm": 4.375, + "learning_rate": 3.062305968193601e-06, + "loss": 0.77377367, + "memory(GiB)": 147.13, + "step": 54680, + "train_speed(iter/s)": 0.200915 + }, + { + "acc": 0.80083361, + "epoch": 1.2759591128479564, + "grad_norm": 4.09375, + "learning_rate": 3.060564622569377e-06, + "loss": 0.69888325, + "memory(GiB)": 147.13, + "step": 54690, + "train_speed(iter/s)": 0.200935 + }, + { + "acc": 0.77480173, + "epoch": 1.2761924204202453, + "grad_norm": 6.34375, + "learning_rate": 3.0588235537989897e-06, + "loss": 0.79366236, + "memory(GiB)": 147.13, + "step": 54700, + "train_speed(iter/s)": 0.200954 + }, + { + "acc": 0.75772133, + "epoch": 1.2764257279925342, + "grad_norm": 5.0, + "learning_rate": 3.057082762130976e-06, + "loss": 0.87921181, + "memory(GiB)": 147.13, + "step": 54710, + "train_speed(iter/s)": 0.200974 + }, + { + "acc": 0.80236292, + "epoch": 1.276659035564823, + "grad_norm": 5.875, + "learning_rate": 3.0553422478138333e-06, + "loss": 0.69562078, + "memory(GiB)": 147.13, + "step": 54720, + "train_speed(iter/s)": 0.200992 + }, + { + "acc": 0.76373925, + "epoch": 1.276892343137112, + "grad_norm": 6.59375, + "learning_rate": 3.0536020110960214e-06, + "loss": 0.86020823, + "memory(GiB)": 147.13, + "step": 54730, + "train_speed(iter/s)": 0.201011 + }, + { + "acc": 0.76700792, + "epoch": 1.2771256507094009, + "grad_norm": 4.4375, + "learning_rate": 3.0518620522259557e-06, + "loss": 0.83871212, + "memory(GiB)": 147.13, + "step": 54740, + "train_speed(iter/s)": 0.201031 + }, + { + "acc": 0.78003569, + "epoch": 1.2773589582816898, + "grad_norm": 7.4375, + "learning_rate": 3.0501223714520155e-06, + "loss": 0.78800702, + "memory(GiB)": 147.13, + "step": 54750, + "train_speed(iter/s)": 0.201049 + }, + { + "acc": 0.7942296, + "epoch": 1.2775922658539787, + "grad_norm": 6.28125, + "learning_rate": 3.048382969022543e-06, + "loss": 0.75022469, + "memory(GiB)": 147.13, + "step": 54760, + "train_speed(iter/s)": 0.201069 + }, + { + "acc": 0.77333212, + "epoch": 1.2778255734262676, + "grad_norm": 8.4375, + "learning_rate": 3.0466438451858326e-06, + "loss": 0.81570168, + "memory(GiB)": 147.13, + "step": 54770, + "train_speed(iter/s)": 0.201088 + }, + { + "acc": 0.7831358, + "epoch": 1.2780588809985565, + "grad_norm": 6.625, + "learning_rate": 3.044905000190146e-06, + "loss": 0.77250023, + "memory(GiB)": 147.13, + "step": 54780, + "train_speed(iter/s)": 0.201107 + }, + { + "acc": 0.78039427, + "epoch": 1.2782921885708454, + "grad_norm": 6.53125, + "learning_rate": 3.043166434283703e-06, + "loss": 0.78244643, + "memory(GiB)": 147.13, + "step": 54790, + "train_speed(iter/s)": 0.201127 + }, + { + "acc": 0.7836205, + "epoch": 1.2785254961431343, + "grad_norm": 5.96875, + "learning_rate": 3.0414281477146823e-06, + "loss": 0.79010315, + "memory(GiB)": 147.13, + "step": 54800, + "train_speed(iter/s)": 0.201146 + }, + { + "acc": 0.77984056, + "epoch": 1.2787588037154232, + "grad_norm": 5.625, + "learning_rate": 3.0396901407312263e-06, + "loss": 0.80035505, + "memory(GiB)": 147.13, + "step": 54810, + "train_speed(iter/s)": 0.201166 + }, + { + "acc": 0.78728938, + "epoch": 1.278992111287712, + "grad_norm": 5.15625, + "learning_rate": 3.037952413581431e-06, + "loss": 0.74682913, + "memory(GiB)": 147.13, + "step": 54820, + "train_speed(iter/s)": 0.201186 + }, + { + "acc": 0.7650311, + "epoch": 1.2792254188600007, + "grad_norm": 7.59375, + "learning_rate": 3.03621496651336e-06, + "loss": 0.84308777, + "memory(GiB)": 147.13, + "step": 54830, + "train_speed(iter/s)": 0.201204 + }, + { + "acc": 0.75912447, + "epoch": 1.2794587264322899, + "grad_norm": 6.75, + "learning_rate": 3.0344777997750313e-06, + "loss": 0.8658433, + "memory(GiB)": 147.13, + "step": 54840, + "train_speed(iter/s)": 0.201224 + }, + { + "acc": 0.7930964, + "epoch": 1.2796920340045785, + "grad_norm": 4.125, + "learning_rate": 3.0327409136144257e-06, + "loss": 0.73651738, + "memory(GiB)": 147.13, + "step": 54850, + "train_speed(iter/s)": 0.201243 + }, + { + "acc": 0.7795516, + "epoch": 1.2799253415768677, + "grad_norm": 5.09375, + "learning_rate": 3.031004308279484e-06, + "loss": 0.77885942, + "memory(GiB)": 147.13, + "step": 54860, + "train_speed(iter/s)": 0.201262 + }, + { + "acc": 0.7870728, + "epoch": 1.2801586491491563, + "grad_norm": 4.1875, + "learning_rate": 3.0292679840181048e-06, + "loss": 0.77090311, + "memory(GiB)": 147.13, + "step": 54870, + "train_speed(iter/s)": 0.20128 + }, + { + "acc": 0.79632759, + "epoch": 1.2803919567214455, + "grad_norm": 5.53125, + "learning_rate": 3.02753194107815e-06, + "loss": 0.7337678, + "memory(GiB)": 147.13, + "step": 54880, + "train_speed(iter/s)": 0.201298 + }, + { + "acc": 0.78915548, + "epoch": 1.2806252642937341, + "grad_norm": 4.65625, + "learning_rate": 3.0257961797074353e-06, + "loss": 0.75042048, + "memory(GiB)": 147.13, + "step": 54890, + "train_speed(iter/s)": 0.201317 + }, + { + "acc": 0.78317914, + "epoch": 1.2808585718660233, + "grad_norm": 6.96875, + "learning_rate": 3.0240607001537442e-06, + "loss": 0.78657742, + "memory(GiB)": 147.13, + "step": 54900, + "train_speed(iter/s)": 0.201335 + }, + { + "acc": 0.7978034, + "epoch": 1.281091879438312, + "grad_norm": 6.4375, + "learning_rate": 3.022325502664813e-06, + "loss": 0.71349802, + "memory(GiB)": 147.13, + "step": 54910, + "train_speed(iter/s)": 0.201354 + }, + { + "acc": 0.78730545, + "epoch": 1.2813251870106008, + "grad_norm": 8.125, + "learning_rate": 3.020590587488342e-06, + "loss": 0.7752285, + "memory(GiB)": 147.13, + "step": 54920, + "train_speed(iter/s)": 0.201372 + }, + { + "acc": 0.76834173, + "epoch": 1.2815584945828897, + "grad_norm": 5.875, + "learning_rate": 3.0188559548719888e-06, + "loss": 0.83232594, + "memory(GiB)": 147.13, + "step": 54930, + "train_speed(iter/s)": 0.201391 + }, + { + "acc": 0.78525715, + "epoch": 1.2817918021551786, + "grad_norm": 7.96875, + "learning_rate": 3.0171216050633735e-06, + "loss": 0.7697998, + "memory(GiB)": 147.13, + "step": 54940, + "train_speed(iter/s)": 0.20141 + }, + { + "acc": 0.7783596, + "epoch": 1.2820251097274675, + "grad_norm": 8.375, + "learning_rate": 3.0153875383100732e-06, + "loss": 0.78331108, + "memory(GiB)": 147.13, + "step": 54950, + "train_speed(iter/s)": 0.201429 + }, + { + "acc": 0.77114954, + "epoch": 1.2822584172997564, + "grad_norm": 6.8125, + "learning_rate": 3.0136537548596247e-06, + "loss": 0.83518686, + "memory(GiB)": 147.13, + "step": 54960, + "train_speed(iter/s)": 0.201448 + }, + { + "acc": 0.79357519, + "epoch": 1.2824917248720453, + "grad_norm": 3.984375, + "learning_rate": 3.011920254959526e-06, + "loss": 0.72738924, + "memory(GiB)": 147.13, + "step": 54970, + "train_speed(iter/s)": 0.201468 + }, + { + "acc": 0.7828373, + "epoch": 1.2827250324443342, + "grad_norm": 4.21875, + "learning_rate": 3.010187038857233e-06, + "loss": 0.78417239, + "memory(GiB)": 147.13, + "step": 54980, + "train_speed(iter/s)": 0.201488 + }, + { + "acc": 0.78416605, + "epoch": 1.2829583400166231, + "grad_norm": 5.59375, + "learning_rate": 3.008454106800164e-06, + "loss": 0.77895746, + "memory(GiB)": 147.13, + "step": 54990, + "train_speed(iter/s)": 0.201507 + }, + { + "acc": 0.77086582, + "epoch": 1.283191647588912, + "grad_norm": 5.3125, + "learning_rate": 3.006721459035691e-06, + "loss": 0.81353455, + "memory(GiB)": 147.13, + "step": 55000, + "train_speed(iter/s)": 0.201526 + }, + { + "epoch": 1.283191647588912, + "eval_acc": 0.7444704999110399, + "eval_loss": 0.8050407767295837, + "eval_runtime": 1270.065, + "eval_samples_per_second": 28.338, + "eval_steps_per_second": 14.169, + "step": 55000 + }, + { + "acc": 0.76830349, + "epoch": 1.283424955161201, + "grad_norm": 4.90625, + "learning_rate": 3.0049890958111505e-06, + "loss": 0.82744799, + "memory(GiB)": 147.13, + "step": 55010, + "train_speed(iter/s)": 0.200597 + }, + { + "acc": 0.78124275, + "epoch": 1.2836582627334898, + "grad_norm": 5.25, + "learning_rate": 3.0032570173738367e-06, + "loss": 0.77643175, + "memory(GiB)": 147.13, + "step": 55020, + "train_speed(iter/s)": 0.200615 + }, + { + "acc": 0.77881341, + "epoch": 1.2838915703057787, + "grad_norm": 5.5, + "learning_rate": 3.0015252239710052e-06, + "loss": 0.8051384, + "memory(GiB)": 147.13, + "step": 55030, + "train_speed(iter/s)": 0.200634 + }, + { + "acc": 0.78495493, + "epoch": 1.2841248778780676, + "grad_norm": 6.09375, + "learning_rate": 2.9997937158498657e-06, + "loss": 0.77997541, + "memory(GiB)": 147.13, + "step": 55040, + "train_speed(iter/s)": 0.200653 + }, + { + "acc": 0.76635275, + "epoch": 1.2843581854503565, + "grad_norm": 5.15625, + "learning_rate": 2.998062493257593e-06, + "loss": 0.85216427, + "memory(GiB)": 147.13, + "step": 55050, + "train_speed(iter/s)": 0.200671 + }, + { + "acc": 0.77827249, + "epoch": 1.2845914930226454, + "grad_norm": 4.84375, + "learning_rate": 2.9963315564413174e-06, + "loss": 0.78949223, + "memory(GiB)": 147.13, + "step": 55060, + "train_speed(iter/s)": 0.200689 + }, + { + "acc": 0.7691534, + "epoch": 1.2848248005949343, + "grad_norm": 4.59375, + "learning_rate": 2.994600905648131e-06, + "loss": 0.83677711, + "memory(GiB)": 147.13, + "step": 55070, + "train_speed(iter/s)": 0.200708 + }, + { + "acc": 0.7861763, + "epoch": 1.2850581081672232, + "grad_norm": 6.5625, + "learning_rate": 2.9928705411250813e-06, + "loss": 0.77848387, + "memory(GiB)": 147.13, + "step": 55080, + "train_speed(iter/s)": 0.200726 + }, + { + "acc": 0.766677, + "epoch": 1.285291415739512, + "grad_norm": 4.625, + "learning_rate": 2.9911404631191796e-06, + "loss": 0.84276142, + "memory(GiB)": 147.13, + "step": 55090, + "train_speed(iter/s)": 0.200745 + }, + { + "acc": 0.77530727, + "epoch": 1.285524723311801, + "grad_norm": 5.21875, + "learning_rate": 2.9894106718773936e-06, + "loss": 0.82425938, + "memory(GiB)": 147.13, + "step": 55100, + "train_speed(iter/s)": 0.200763 + }, + { + "acc": 0.78108807, + "epoch": 1.28575803088409, + "grad_norm": 6.34375, + "learning_rate": 2.987681167646652e-06, + "loss": 0.78073077, + "memory(GiB)": 147.13, + "step": 55110, + "train_speed(iter/s)": 0.200782 + }, + { + "acc": 0.76942391, + "epoch": 1.2859913384563788, + "grad_norm": 6.5625, + "learning_rate": 2.985951950673836e-06, + "loss": 0.8330018, + "memory(GiB)": 147.13, + "step": 55120, + "train_speed(iter/s)": 0.200802 + }, + { + "acc": 0.75507021, + "epoch": 1.2862246460286677, + "grad_norm": 6.03125, + "learning_rate": 2.984223021205795e-06, + "loss": 0.89126616, + "memory(GiB)": 147.13, + "step": 55130, + "train_speed(iter/s)": 0.200822 + }, + { + "acc": 0.78265476, + "epoch": 1.2864579536009566, + "grad_norm": 4.75, + "learning_rate": 2.9824943794893312e-06, + "loss": 0.78735905, + "memory(GiB)": 147.13, + "step": 55140, + "train_speed(iter/s)": 0.200841 + }, + { + "acc": 0.78866234, + "epoch": 1.2866912611732455, + "grad_norm": 6.34375, + "learning_rate": 2.9807660257712097e-06, + "loss": 0.75422587, + "memory(GiB)": 147.13, + "step": 55150, + "train_speed(iter/s)": 0.200861 + }, + { + "acc": 0.78413725, + "epoch": 1.2869245687455344, + "grad_norm": 4.40625, + "learning_rate": 2.9790379602981508e-06, + "loss": 0.78166876, + "memory(GiB)": 147.13, + "step": 55160, + "train_speed(iter/s)": 0.200878 + }, + { + "acc": 0.77496414, + "epoch": 1.2871578763178233, + "grad_norm": 4.59375, + "learning_rate": 2.9773101833168374e-06, + "loss": 0.81109505, + "memory(GiB)": 147.13, + "step": 55170, + "train_speed(iter/s)": 0.200897 + }, + { + "acc": 0.77975092, + "epoch": 1.2873911838901122, + "grad_norm": 4.375, + "learning_rate": 2.9755826950739057e-06, + "loss": 0.79754815, + "memory(GiB)": 147.13, + "step": 55180, + "train_speed(iter/s)": 0.200915 + }, + { + "acc": 0.77278199, + "epoch": 1.287624491462401, + "grad_norm": 5.125, + "learning_rate": 2.973855495815957e-06, + "loss": 0.80997, + "memory(GiB)": 147.13, + "step": 55190, + "train_speed(iter/s)": 0.200934 + }, + { + "acc": 0.77235184, + "epoch": 1.28785779903469, + "grad_norm": 6.46875, + "learning_rate": 2.9721285857895475e-06, + "loss": 0.82727203, + "memory(GiB)": 147.13, + "step": 55200, + "train_speed(iter/s)": 0.200953 + }, + { + "acc": 0.78080778, + "epoch": 1.288091106606979, + "grad_norm": 5.6875, + "learning_rate": 2.9704019652411933e-06, + "loss": 0.80346479, + "memory(GiB)": 147.13, + "step": 55210, + "train_speed(iter/s)": 0.200972 + }, + { + "acc": 0.78474374, + "epoch": 1.2883244141792676, + "grad_norm": 5.96875, + "learning_rate": 2.9686756344173712e-06, + "loss": 0.77900343, + "memory(GiB)": 147.13, + "step": 55220, + "train_speed(iter/s)": 0.20099 + }, + { + "acc": 0.77824039, + "epoch": 1.2885577217515567, + "grad_norm": 25.75, + "learning_rate": 2.96694959356451e-06, + "loss": 0.81776686, + "memory(GiB)": 147.13, + "step": 55230, + "train_speed(iter/s)": 0.20101 + }, + { + "acc": 0.78159637, + "epoch": 1.2887910293238454, + "grad_norm": 6.09375, + "learning_rate": 2.9652238429290036e-06, + "loss": 0.79265308, + "memory(GiB)": 147.13, + "step": 55240, + "train_speed(iter/s)": 0.20103 + }, + { + "acc": 0.79291801, + "epoch": 1.2890243368961345, + "grad_norm": 6.40625, + "learning_rate": 2.9634983827572038e-06, + "loss": 0.73101878, + "memory(GiB)": 147.13, + "step": 55250, + "train_speed(iter/s)": 0.20105 + }, + { + "acc": 0.7721282, + "epoch": 1.2892576444684232, + "grad_norm": 4.53125, + "learning_rate": 2.961773213295417e-06, + "loss": 0.82999897, + "memory(GiB)": 147.13, + "step": 55260, + "train_speed(iter/s)": 0.20107 + }, + { + "acc": 0.79432802, + "epoch": 1.2894909520407123, + "grad_norm": 4.78125, + "learning_rate": 2.960048334789912e-06, + "loss": 0.73425961, + "memory(GiB)": 147.13, + "step": 55270, + "train_speed(iter/s)": 0.20109 + }, + { + "acc": 0.78572264, + "epoch": 1.289724259613001, + "grad_norm": 5.625, + "learning_rate": 2.9583237474869143e-06, + "loss": 0.76063013, + "memory(GiB)": 147.13, + "step": 55280, + "train_speed(iter/s)": 0.201108 + }, + { + "acc": 0.77698021, + "epoch": 1.28995756718529, + "grad_norm": 5.40625, + "learning_rate": 2.956599451632609e-06, + "loss": 0.79158425, + "memory(GiB)": 147.13, + "step": 55290, + "train_speed(iter/s)": 0.201127 + }, + { + "acc": 0.76085668, + "epoch": 1.2901908747575788, + "grad_norm": 5.375, + "learning_rate": 2.9548754474731376e-06, + "loss": 0.85578661, + "memory(GiB)": 147.13, + "step": 55300, + "train_speed(iter/s)": 0.201145 + }, + { + "acc": 0.78259788, + "epoch": 1.2904241823298677, + "grad_norm": 5.75, + "learning_rate": 2.953151735254604e-06, + "loss": 0.77954855, + "memory(GiB)": 147.13, + "step": 55310, + "train_speed(iter/s)": 0.201164 + }, + { + "acc": 0.78067522, + "epoch": 1.2906574899021566, + "grad_norm": 5.375, + "learning_rate": 2.9514283152230637e-06, + "loss": 0.78616638, + "memory(GiB)": 147.13, + "step": 55320, + "train_speed(iter/s)": 0.201181 + }, + { + "acc": 0.77021255, + "epoch": 1.2908907974744455, + "grad_norm": 8.5625, + "learning_rate": 2.949705187624539e-06, + "loss": 0.80933561, + "memory(GiB)": 147.13, + "step": 55330, + "train_speed(iter/s)": 0.2012 + }, + { + "acc": 0.777321, + "epoch": 1.2911241050467344, + "grad_norm": 5.53125, + "learning_rate": 2.947982352705001e-06, + "loss": 0.78500934, + "memory(GiB)": 147.13, + "step": 55340, + "train_speed(iter/s)": 0.201219 + }, + { + "acc": 0.78850975, + "epoch": 1.2913574126190233, + "grad_norm": 5.96875, + "learning_rate": 2.9462598107103855e-06, + "loss": 0.7612771, + "memory(GiB)": 147.13, + "step": 55350, + "train_speed(iter/s)": 0.201238 + }, + { + "acc": 0.77932158, + "epoch": 1.2915907201913122, + "grad_norm": 6.78125, + "learning_rate": 2.9445375618865857e-06, + "loss": 0.79323964, + "memory(GiB)": 147.13, + "step": 55360, + "train_speed(iter/s)": 0.201256 + }, + { + "acc": 0.75293274, + "epoch": 1.291824027763601, + "grad_norm": 6.15625, + "learning_rate": 2.942815606479452e-06, + "loss": 0.89559402, + "memory(GiB)": 147.13, + "step": 55370, + "train_speed(iter/s)": 0.201275 + }, + { + "acc": 0.79058542, + "epoch": 1.29205733533589, + "grad_norm": 4.8125, + "learning_rate": 2.941093944734793e-06, + "loss": 0.76939631, + "memory(GiB)": 147.13, + "step": 55380, + "train_speed(iter/s)": 0.201294 + }, + { + "acc": 0.78246756, + "epoch": 1.2922906429081789, + "grad_norm": 5.4375, + "learning_rate": 2.939372576898376e-06, + "loss": 0.78994184, + "memory(GiB)": 147.13, + "step": 55390, + "train_speed(iter/s)": 0.201313 + }, + { + "acc": 0.7745213, + "epoch": 1.2925239504804678, + "grad_norm": 5.84375, + "learning_rate": 2.937651503215924e-06, + "loss": 0.80446148, + "memory(GiB)": 147.13, + "step": 55400, + "train_speed(iter/s)": 0.20133 + }, + { + "acc": 0.77480874, + "epoch": 1.2927572580527567, + "grad_norm": 5.625, + "learning_rate": 2.9359307239331214e-06, + "loss": 0.8080328, + "memory(GiB)": 147.13, + "step": 55410, + "train_speed(iter/s)": 0.20135 + }, + { + "acc": 0.78159122, + "epoch": 1.2929905656250456, + "grad_norm": 5.1875, + "learning_rate": 2.9342102392956075e-06, + "loss": 0.77628255, + "memory(GiB)": 147.13, + "step": 55420, + "train_speed(iter/s)": 0.201368 + }, + { + "acc": 0.77849216, + "epoch": 1.2932238731973345, + "grad_norm": 5.59375, + "learning_rate": 2.932490049548982e-06, + "loss": 0.79256525, + "memory(GiB)": 147.13, + "step": 55430, + "train_speed(iter/s)": 0.201385 + }, + { + "acc": 0.80485229, + "epoch": 1.2934571807696233, + "grad_norm": 3.953125, + "learning_rate": 2.9307701549388025e-06, + "loss": 0.68343935, + "memory(GiB)": 147.13, + "step": 55440, + "train_speed(iter/s)": 0.201402 + }, + { + "acc": 0.77289133, + "epoch": 1.2936904883419122, + "grad_norm": 4.75, + "learning_rate": 2.929050555710582e-06, + "loss": 0.8112318, + "memory(GiB)": 147.13, + "step": 55450, + "train_speed(iter/s)": 0.201421 + }, + { + "acc": 0.7689023, + "epoch": 1.2939237959142011, + "grad_norm": 8.125, + "learning_rate": 2.9273312521097926e-06, + "loss": 0.82355614, + "memory(GiB)": 147.13, + "step": 55460, + "train_speed(iter/s)": 0.201441 + }, + { + "acc": 0.78700824, + "epoch": 1.29415710348649, + "grad_norm": 6.34375, + "learning_rate": 2.9256122443818657e-06, + "loss": 0.75792155, + "memory(GiB)": 147.13, + "step": 55470, + "train_speed(iter/s)": 0.201459 + }, + { + "acc": 0.78975463, + "epoch": 1.294390411058779, + "grad_norm": 5.90625, + "learning_rate": 2.923893532772187e-06, + "loss": 0.77099237, + "memory(GiB)": 147.13, + "step": 55480, + "train_speed(iter/s)": 0.201477 + }, + { + "acc": 0.77440877, + "epoch": 1.2946237186310678, + "grad_norm": 5.6875, + "learning_rate": 2.9221751175261036e-06, + "loss": 0.81504545, + "memory(GiB)": 147.13, + "step": 55490, + "train_speed(iter/s)": 0.201496 + }, + { + "acc": 0.78191586, + "epoch": 1.2948570262033567, + "grad_norm": 7.34375, + "learning_rate": 2.9204569988889186e-06, + "loss": 0.78614321, + "memory(GiB)": 147.13, + "step": 55500, + "train_speed(iter/s)": 0.201513 + }, + { + "epoch": 1.2948570262033567, + "eval_acc": 0.7445686683254407, + "eval_loss": 0.8049291968345642, + "eval_runtime": 1270.1062, + "eval_samples_per_second": 28.337, + "eval_steps_per_second": 14.169, + "step": 55500 + }, + { + "acc": 0.79387369, + "epoch": 1.2950903337756456, + "grad_norm": 6.3125, + "learning_rate": 2.9187391771058938e-06, + "loss": 0.7472805, + "memory(GiB)": 147.13, + "step": 55510, + "train_speed(iter/s)": 0.200593 + }, + { + "acc": 0.77921963, + "epoch": 1.2953236413479345, + "grad_norm": 4.96875, + "learning_rate": 2.9170216524222446e-06, + "loss": 0.78324275, + "memory(GiB)": 147.13, + "step": 55520, + "train_speed(iter/s)": 0.200611 + }, + { + "acc": 0.76559649, + "epoch": 1.2955569489202234, + "grad_norm": 7.96875, + "learning_rate": 2.9153044250831512e-06, + "loss": 0.84542322, + "memory(GiB)": 147.13, + "step": 55530, + "train_speed(iter/s)": 0.20063 + }, + { + "acc": 0.76427712, + "epoch": 1.2957902564925123, + "grad_norm": 6.21875, + "learning_rate": 2.913587495333744e-06, + "loss": 0.83981133, + "memory(GiB)": 147.13, + "step": 55540, + "train_speed(iter/s)": 0.200649 + }, + { + "acc": 0.77819748, + "epoch": 1.2960235640648012, + "grad_norm": 6.46875, + "learning_rate": 2.9118708634191177e-06, + "loss": 0.78465223, + "memory(GiB)": 147.13, + "step": 55550, + "train_speed(iter/s)": 0.200668 + }, + { + "acc": 0.78325009, + "epoch": 1.2962568716370901, + "grad_norm": 5.03125, + "learning_rate": 2.910154529584319e-06, + "loss": 0.78194418, + "memory(GiB)": 147.13, + "step": 55560, + "train_speed(iter/s)": 0.200687 + }, + { + "acc": 0.79050236, + "epoch": 1.296490179209379, + "grad_norm": 4.09375, + "learning_rate": 2.9084384940743543e-06, + "loss": 0.76087713, + "memory(GiB)": 147.13, + "step": 55570, + "train_speed(iter/s)": 0.200705 + }, + { + "acc": 0.78712444, + "epoch": 1.296723486781668, + "grad_norm": 5.21875, + "learning_rate": 2.9067227571341873e-06, + "loss": 0.76762724, + "memory(GiB)": 147.13, + "step": 55580, + "train_speed(iter/s)": 0.200723 + }, + { + "acc": 0.776087, + "epoch": 1.2969567943539568, + "grad_norm": 4.9375, + "learning_rate": 2.905007319008736e-06, + "loss": 0.81573505, + "memory(GiB)": 147.13, + "step": 55590, + "train_speed(iter/s)": 0.200741 + }, + { + "acc": 0.77835026, + "epoch": 1.2971901019262457, + "grad_norm": 4.8125, + "learning_rate": 2.903292179942883e-06, + "loss": 0.79308224, + "memory(GiB)": 147.13, + "step": 55600, + "train_speed(iter/s)": 0.200758 + }, + { + "acc": 0.78184395, + "epoch": 1.2974234094985344, + "grad_norm": 5.0, + "learning_rate": 2.9015773401814606e-06, + "loss": 0.79529061, + "memory(GiB)": 147.13, + "step": 55610, + "train_speed(iter/s)": 0.200777 + }, + { + "acc": 0.78858995, + "epoch": 1.2976567170708235, + "grad_norm": 5.65625, + "learning_rate": 2.899862799969265e-06, + "loss": 0.77276478, + "memory(GiB)": 147.13, + "step": 55620, + "train_speed(iter/s)": 0.200795 + }, + { + "acc": 0.77310009, + "epoch": 1.2978900246431122, + "grad_norm": 5.15625, + "learning_rate": 2.898148559551045e-06, + "loss": 0.81262197, + "memory(GiB)": 147.13, + "step": 55630, + "train_speed(iter/s)": 0.200814 + }, + { + "acc": 0.79125342, + "epoch": 1.2981233322154013, + "grad_norm": 5.15625, + "learning_rate": 2.8964346191715058e-06, + "loss": 0.74343634, + "memory(GiB)": 147.13, + "step": 55640, + "train_speed(iter/s)": 0.200831 + }, + { + "acc": 0.78252659, + "epoch": 1.29835663978769, + "grad_norm": 4.5625, + "learning_rate": 2.894720979075315e-06, + "loss": 0.76678948, + "memory(GiB)": 147.13, + "step": 55650, + "train_speed(iter/s)": 0.20085 + }, + { + "acc": 0.77767839, + "epoch": 1.2985899473599791, + "grad_norm": 4.84375, + "learning_rate": 2.8930076395070915e-06, + "loss": 0.76314602, + "memory(GiB)": 147.13, + "step": 55660, + "train_speed(iter/s)": 0.200869 + }, + { + "acc": 0.76303263, + "epoch": 1.2988232549322678, + "grad_norm": 4.71875, + "learning_rate": 2.8912946007114175e-06, + "loss": 0.85604687, + "memory(GiB)": 147.13, + "step": 55670, + "train_speed(iter/s)": 0.200887 + }, + { + "acc": 0.79654713, + "epoch": 1.299056562504557, + "grad_norm": 4.65625, + "learning_rate": 2.8895818629328254e-06, + "loss": 0.71905622, + "memory(GiB)": 147.13, + "step": 55680, + "train_speed(iter/s)": 0.200905 + }, + { + "acc": 0.76574378, + "epoch": 1.2992898700768456, + "grad_norm": 11.75, + "learning_rate": 2.8878694264158103e-06, + "loss": 0.84654961, + "memory(GiB)": 147.13, + "step": 55690, + "train_speed(iter/s)": 0.200924 + }, + { + "acc": 0.78273115, + "epoch": 1.2995231776491345, + "grad_norm": 5.71875, + "learning_rate": 2.8861572914048184e-06, + "loss": 0.76711287, + "memory(GiB)": 147.13, + "step": 55700, + "train_speed(iter/s)": 0.200943 + }, + { + "acc": 0.77995882, + "epoch": 1.2997564852214234, + "grad_norm": 5.84375, + "learning_rate": 2.8844454581442614e-06, + "loss": 0.77336879, + "memory(GiB)": 147.13, + "step": 55710, + "train_speed(iter/s)": 0.200962 + }, + { + "acc": 0.80577364, + "epoch": 1.2999897927937123, + "grad_norm": 4.90625, + "learning_rate": 2.8827339268785015e-06, + "loss": 0.70034103, + "memory(GiB)": 147.13, + "step": 55720, + "train_speed(iter/s)": 0.20098 + }, + { + "acc": 0.76289902, + "epoch": 1.3002231003660012, + "grad_norm": 6.09375, + "learning_rate": 2.881022697851855e-06, + "loss": 0.84976912, + "memory(GiB)": 147.13, + "step": 55730, + "train_speed(iter/s)": 0.201 + }, + { + "acc": 0.76065493, + "epoch": 1.30045640793829, + "grad_norm": 7.09375, + "learning_rate": 2.879311771308606e-06, + "loss": 0.87251511, + "memory(GiB)": 147.13, + "step": 55740, + "train_speed(iter/s)": 0.201019 + }, + { + "acc": 0.7798687, + "epoch": 1.300689715510579, + "grad_norm": 9.0, + "learning_rate": 2.877601147492983e-06, + "loss": 0.78704138, + "memory(GiB)": 147.13, + "step": 55750, + "train_speed(iter/s)": 0.201038 + }, + { + "acc": 0.76645966, + "epoch": 1.300923023082868, + "grad_norm": 4.5, + "learning_rate": 2.8758908266491815e-06, + "loss": 0.84115009, + "memory(GiB)": 147.13, + "step": 55760, + "train_speed(iter/s)": 0.201054 + }, + { + "acc": 0.77044306, + "epoch": 1.3011563306551568, + "grad_norm": 7.0, + "learning_rate": 2.874180809021348e-06, + "loss": 0.82898483, + "memory(GiB)": 147.13, + "step": 55770, + "train_speed(iter/s)": 0.201074 + }, + { + "acc": 0.78014908, + "epoch": 1.3013896382274457, + "grad_norm": 5.65625, + "learning_rate": 2.872471094853584e-06, + "loss": 0.78329229, + "memory(GiB)": 147.13, + "step": 55780, + "train_speed(iter/s)": 0.201093 + }, + { + "acc": 0.78297763, + "epoch": 1.3016229457997346, + "grad_norm": 6.15625, + "learning_rate": 2.8707616843899554e-06, + "loss": 0.76991262, + "memory(GiB)": 147.13, + "step": 55790, + "train_speed(iter/s)": 0.201112 + }, + { + "acc": 0.78463106, + "epoch": 1.3018562533720235, + "grad_norm": 7.28125, + "learning_rate": 2.8690525778744777e-06, + "loss": 0.81536741, + "memory(GiB)": 147.13, + "step": 55800, + "train_speed(iter/s)": 0.201131 + }, + { + "acc": 0.78778915, + "epoch": 1.3020895609443124, + "grad_norm": 5.1875, + "learning_rate": 2.867343775551126e-06, + "loss": 0.76017818, + "memory(GiB)": 147.13, + "step": 55810, + "train_speed(iter/s)": 0.20115 + }, + { + "acc": 0.76485157, + "epoch": 1.3023228685166013, + "grad_norm": 7.0625, + "learning_rate": 2.8656352776638274e-06, + "loss": 0.84671221, + "memory(GiB)": 147.13, + "step": 55820, + "train_speed(iter/s)": 0.201168 + }, + { + "acc": 0.78802285, + "epoch": 1.3025561760888902, + "grad_norm": 5.03125, + "learning_rate": 2.863927084456476e-06, + "loss": 0.74512553, + "memory(GiB)": 147.13, + "step": 55830, + "train_speed(iter/s)": 0.201188 + }, + { + "acc": 0.78229799, + "epoch": 1.302789483661179, + "grad_norm": 6.0, + "learning_rate": 2.862219196172911e-06, + "loss": 0.78259425, + "memory(GiB)": 147.13, + "step": 55840, + "train_speed(iter/s)": 0.201207 + }, + { + "acc": 0.78135662, + "epoch": 1.303022791233468, + "grad_norm": 5.1875, + "learning_rate": 2.8605116130569355e-06, + "loss": 0.78383131, + "memory(GiB)": 147.13, + "step": 55850, + "train_speed(iter/s)": 0.201226 + }, + { + "acc": 0.77223597, + "epoch": 1.3032560988057569, + "grad_norm": 7.78125, + "learning_rate": 2.8588043353523066e-06, + "loss": 0.83154545, + "memory(GiB)": 147.13, + "step": 55860, + "train_speed(iter/s)": 0.201246 + }, + { + "acc": 0.76503639, + "epoch": 1.3034894063780458, + "grad_norm": 6.0, + "learning_rate": 2.8570973633027342e-06, + "loss": 0.856847, + "memory(GiB)": 147.13, + "step": 55870, + "train_speed(iter/s)": 0.201264 + }, + { + "acc": 0.78612194, + "epoch": 1.3037227139503347, + "grad_norm": 4.59375, + "learning_rate": 2.8553906971518936e-06, + "loss": 0.76604691, + "memory(GiB)": 147.13, + "step": 55880, + "train_speed(iter/s)": 0.201282 + }, + { + "acc": 0.7799561, + "epoch": 1.3039560215226236, + "grad_norm": 8.3125, + "learning_rate": 2.8536843371434054e-06, + "loss": 0.78653412, + "memory(GiB)": 147.13, + "step": 55890, + "train_speed(iter/s)": 0.201301 + }, + { + "acc": 0.77653108, + "epoch": 1.3041893290949125, + "grad_norm": 6.0, + "learning_rate": 2.851978283520859e-06, + "loss": 0.79027066, + "memory(GiB)": 147.13, + "step": 55900, + "train_speed(iter/s)": 0.20132 + }, + { + "acc": 0.78584161, + "epoch": 1.3044226366672014, + "grad_norm": 6.0, + "learning_rate": 2.850272536527784e-06, + "loss": 0.78225594, + "memory(GiB)": 147.13, + "step": 55910, + "train_speed(iter/s)": 0.20134 + }, + { + "acc": 0.77251587, + "epoch": 1.3046559442394903, + "grad_norm": 8.75, + "learning_rate": 2.848567096407682e-06, + "loss": 0.83527536, + "memory(GiB)": 147.13, + "step": 55920, + "train_speed(iter/s)": 0.201358 + }, + { + "acc": 0.77410121, + "epoch": 1.3048892518117792, + "grad_norm": 6.3125, + "learning_rate": 2.8468619634040017e-06, + "loss": 0.820961, + "memory(GiB)": 147.13, + "step": 55930, + "train_speed(iter/s)": 0.201376 + }, + { + "acc": 0.76235828, + "epoch": 1.305122559384068, + "grad_norm": 4.78125, + "learning_rate": 2.8451571377601495e-06, + "loss": 0.85515633, + "memory(GiB)": 147.13, + "step": 55940, + "train_speed(iter/s)": 0.201395 + }, + { + "acc": 0.79035673, + "epoch": 1.305355866956357, + "grad_norm": 5.125, + "learning_rate": 2.8434526197194915e-06, + "loss": 0.73724895, + "memory(GiB)": 147.13, + "step": 55950, + "train_speed(iter/s)": 0.201414 + }, + { + "acc": 0.78144627, + "epoch": 1.3055891745286459, + "grad_norm": 5.4375, + "learning_rate": 2.8417484095253434e-06, + "loss": 0.78020916, + "memory(GiB)": 147.13, + "step": 55960, + "train_speed(iter/s)": 0.201432 + }, + { + "acc": 0.77665806, + "epoch": 1.3058224821009348, + "grad_norm": 5.4375, + "learning_rate": 2.8400445074209852e-06, + "loss": 0.78938322, + "memory(GiB)": 147.13, + "step": 55970, + "train_speed(iter/s)": 0.201451 + }, + { + "acc": 0.76400418, + "epoch": 1.3060557896732234, + "grad_norm": 6.1875, + "learning_rate": 2.8383409136496443e-06, + "loss": 0.86329784, + "memory(GiB)": 147.13, + "step": 55980, + "train_speed(iter/s)": 0.20147 + }, + { + "acc": 0.78050137, + "epoch": 1.3062890972455126, + "grad_norm": 4.59375, + "learning_rate": 2.8366376284545117e-06, + "loss": 0.77180653, + "memory(GiB)": 147.13, + "step": 55990, + "train_speed(iter/s)": 0.201489 + }, + { + "acc": 0.79701538, + "epoch": 1.3065224048178012, + "grad_norm": 9.75, + "learning_rate": 2.8349346520787284e-06, + "loss": 0.72172403, + "memory(GiB)": 147.13, + "step": 56000, + "train_speed(iter/s)": 0.201506 + }, + { + "epoch": 1.3065224048178012, + "eval_acc": 0.7445838820275092, + "eval_loss": 0.8049860000610352, + "eval_runtime": 1271.539, + "eval_samples_per_second": 28.305, + "eval_steps_per_second": 14.153, + "step": 56000 + }, + { + "acc": 0.7685607, + "epoch": 1.3067557123900904, + "grad_norm": 4.6875, + "learning_rate": 2.833231984765393e-06, + "loss": 0.81922808, + "memory(GiB)": 147.13, + "step": 56010, + "train_speed(iter/s)": 0.200593 + }, + { + "acc": 0.79894352, + "epoch": 1.306989019962379, + "grad_norm": 5.125, + "learning_rate": 2.8315296267575672e-06, + "loss": 0.68040066, + "memory(GiB)": 147.13, + "step": 56020, + "train_speed(iter/s)": 0.200612 + }, + { + "acc": 0.77997193, + "epoch": 1.3072223275346682, + "grad_norm": 4.6875, + "learning_rate": 2.8298275782982525e-06, + "loss": 0.81127281, + "memory(GiB)": 147.13, + "step": 56030, + "train_speed(iter/s)": 0.200631 + }, + { + "acc": 0.77440147, + "epoch": 1.3074556351069568, + "grad_norm": 4.625, + "learning_rate": 2.8281258396304224e-06, + "loss": 0.8060461, + "memory(GiB)": 147.13, + "step": 56040, + "train_speed(iter/s)": 0.20065 + }, + { + "acc": 0.7795207, + "epoch": 1.307688942679246, + "grad_norm": 5.03125, + "learning_rate": 2.8264244109969963e-06, + "loss": 0.82625637, + "memory(GiB)": 147.13, + "step": 56050, + "train_speed(iter/s)": 0.200669 + }, + { + "acc": 0.77957468, + "epoch": 1.3079222502515346, + "grad_norm": 6.59375, + "learning_rate": 2.824723292640856e-06, + "loss": 0.77296095, + "memory(GiB)": 147.13, + "step": 56060, + "train_speed(iter/s)": 0.200685 + }, + { + "acc": 0.76619778, + "epoch": 1.3081555578238238, + "grad_norm": 5.8125, + "learning_rate": 2.823022484804834e-06, + "loss": 0.82944441, + "memory(GiB)": 147.13, + "step": 56070, + "train_speed(iter/s)": 0.200702 + }, + { + "acc": 0.77313166, + "epoch": 1.3083888653961124, + "grad_norm": 7.875, + "learning_rate": 2.8213219877317164e-06, + "loss": 0.82456112, + "memory(GiB)": 147.13, + "step": 56080, + "train_speed(iter/s)": 0.200721 + }, + { + "acc": 0.76039181, + "epoch": 1.3086221729684013, + "grad_norm": 5.71875, + "learning_rate": 2.819621801664256e-06, + "loss": 0.85574379, + "memory(GiB)": 147.13, + "step": 56090, + "train_speed(iter/s)": 0.200741 + }, + { + "acc": 0.78070183, + "epoch": 1.3088554805406902, + "grad_norm": 4.78125, + "learning_rate": 2.817921926845147e-06, + "loss": 0.81434116, + "memory(GiB)": 147.13, + "step": 56100, + "train_speed(iter/s)": 0.20076 + }, + { + "acc": 0.78396792, + "epoch": 1.3090887881129791, + "grad_norm": 5.1875, + "learning_rate": 2.8162223635170515e-06, + "loss": 0.76618247, + "memory(GiB)": 147.13, + "step": 56110, + "train_speed(iter/s)": 0.200777 + }, + { + "acc": 0.79208441, + "epoch": 1.309322095685268, + "grad_norm": 6.53125, + "learning_rate": 2.814523111922577e-06, + "loss": 0.74408183, + "memory(GiB)": 147.13, + "step": 56120, + "train_speed(iter/s)": 0.200796 + }, + { + "acc": 0.79604063, + "epoch": 1.309555403257557, + "grad_norm": 4.53125, + "learning_rate": 2.812824172304297e-06, + "loss": 0.75110483, + "memory(GiB)": 147.13, + "step": 56130, + "train_speed(iter/s)": 0.200814 + }, + { + "acc": 0.77951555, + "epoch": 1.3097887108298458, + "grad_norm": 5.75, + "learning_rate": 2.8111255449047277e-06, + "loss": 0.76333828, + "memory(GiB)": 147.13, + "step": 56140, + "train_speed(iter/s)": 0.200833 + }, + { + "acc": 0.79114752, + "epoch": 1.3100220184021347, + "grad_norm": 5.21875, + "learning_rate": 2.809427229966353e-06, + "loss": 0.73384008, + "memory(GiB)": 147.13, + "step": 56150, + "train_speed(iter/s)": 0.200851 + }, + { + "acc": 0.77196131, + "epoch": 1.3102553259744236, + "grad_norm": 8.3125, + "learning_rate": 2.8077292277316036e-06, + "loss": 0.80360489, + "memory(GiB)": 147.13, + "step": 56160, + "train_speed(iter/s)": 0.200869 + }, + { + "acc": 0.78124895, + "epoch": 1.3104886335467125, + "grad_norm": 5.375, + "learning_rate": 2.8060315384428692e-06, + "loss": 0.79064064, + "memory(GiB)": 147.13, + "step": 56170, + "train_speed(iter/s)": 0.200889 + }, + { + "acc": 0.77550502, + "epoch": 1.3107219411190014, + "grad_norm": 8.8125, + "learning_rate": 2.8043341623424974e-06, + "loss": 0.82532978, + "memory(GiB)": 147.13, + "step": 56180, + "train_speed(iter/s)": 0.200907 + }, + { + "acc": 0.80522442, + "epoch": 1.3109552486912903, + "grad_norm": 5.15625, + "learning_rate": 2.8026370996727835e-06, + "loss": 0.68403687, + "memory(GiB)": 147.13, + "step": 56190, + "train_speed(iter/s)": 0.200925 + }, + { + "acc": 0.79306288, + "epoch": 1.3111885562635792, + "grad_norm": 7.21875, + "learning_rate": 2.800940350675988e-06, + "loss": 0.7425447, + "memory(GiB)": 147.13, + "step": 56200, + "train_speed(iter/s)": 0.200944 + }, + { + "acc": 0.76478882, + "epoch": 1.3114218638358681, + "grad_norm": 5.40625, + "learning_rate": 2.7992439155943185e-06, + "loss": 0.87208843, + "memory(GiB)": 147.13, + "step": 56210, + "train_speed(iter/s)": 0.200963 + }, + { + "acc": 0.7814106, + "epoch": 1.311655171408157, + "grad_norm": 4.34375, + "learning_rate": 2.797547794669938e-06, + "loss": 0.7701395, + "memory(GiB)": 147.13, + "step": 56220, + "train_speed(iter/s)": 0.200982 + }, + { + "acc": 0.76465588, + "epoch": 1.311888478980446, + "grad_norm": 6.15625, + "learning_rate": 2.7958519881449723e-06, + "loss": 0.860077, + "memory(GiB)": 147.13, + "step": 56230, + "train_speed(iter/s)": 0.201001 + }, + { + "acc": 0.78399577, + "epoch": 1.3121217865527348, + "grad_norm": 4.40625, + "learning_rate": 2.794156496261493e-06, + "loss": 0.75324211, + "memory(GiB)": 147.13, + "step": 56240, + "train_speed(iter/s)": 0.201021 + }, + { + "acc": 0.77755909, + "epoch": 1.3123550941250237, + "grad_norm": 5.78125, + "learning_rate": 2.792461319261538e-06, + "loss": 0.79980249, + "memory(GiB)": 147.13, + "step": 56250, + "train_speed(iter/s)": 0.201039 + }, + { + "acc": 0.77361746, + "epoch": 1.3125884016973126, + "grad_norm": 4.96875, + "learning_rate": 2.790766457387083e-06, + "loss": 0.81588135, + "memory(GiB)": 147.13, + "step": 56260, + "train_speed(iter/s)": 0.201056 + }, + { + "acc": 0.78619328, + "epoch": 1.3128217092696015, + "grad_norm": 4.25, + "learning_rate": 2.7890719108800766e-06, + "loss": 0.75692482, + "memory(GiB)": 147.13, + "step": 56270, + "train_speed(iter/s)": 0.201074 + }, + { + "acc": 0.77736025, + "epoch": 1.3130550168418904, + "grad_norm": 6.0, + "learning_rate": 2.7873776799824115e-06, + "loss": 0.79954443, + "memory(GiB)": 147.13, + "step": 56280, + "train_speed(iter/s)": 0.201092 + }, + { + "acc": 0.8001195, + "epoch": 1.3132883244141793, + "grad_norm": 5.0, + "learning_rate": 2.7856837649359416e-06, + "loss": 0.7024128, + "memory(GiB)": 147.13, + "step": 56290, + "train_speed(iter/s)": 0.201111 + }, + { + "acc": 0.78101416, + "epoch": 1.3135216319864682, + "grad_norm": 6.75, + "learning_rate": 2.7839901659824707e-06, + "loss": 0.80548544, + "memory(GiB)": 147.13, + "step": 56300, + "train_speed(iter/s)": 0.201128 + }, + { + "acc": 0.7754415, + "epoch": 1.313754939558757, + "grad_norm": 5.25, + "learning_rate": 2.7822968833637577e-06, + "loss": 0.79988079, + "memory(GiB)": 147.13, + "step": 56310, + "train_speed(iter/s)": 0.201147 + }, + { + "acc": 0.7806459, + "epoch": 1.313988247131046, + "grad_norm": 4.78125, + "learning_rate": 2.7806039173215225e-06, + "loss": 0.78693638, + "memory(GiB)": 147.13, + "step": 56320, + "train_speed(iter/s)": 0.201165 + }, + { + "acc": 0.78040733, + "epoch": 1.314221554703335, + "grad_norm": 6.5625, + "learning_rate": 2.7789112680974316e-06, + "loss": 0.78004808, + "memory(GiB)": 147.13, + "step": 56330, + "train_speed(iter/s)": 0.201183 + }, + { + "acc": 0.80179815, + "epoch": 1.3144548622756238, + "grad_norm": 3.546875, + "learning_rate": 2.7772189359331136e-06, + "loss": 0.68882008, + "memory(GiB)": 147.13, + "step": 56340, + "train_speed(iter/s)": 0.2012 + }, + { + "acc": 0.7934073, + "epoch": 1.3146881698479127, + "grad_norm": 6.09375, + "learning_rate": 2.7755269210701475e-06, + "loss": 0.76322498, + "memory(GiB)": 147.13, + "step": 56350, + "train_speed(iter/s)": 0.201217 + }, + { + "acc": 0.78926401, + "epoch": 1.3149214774202016, + "grad_norm": 6.03125, + "learning_rate": 2.7738352237500667e-06, + "loss": 0.74684887, + "memory(GiB)": 147.13, + "step": 56360, + "train_speed(iter/s)": 0.201235 + }, + { + "acc": 0.76026993, + "epoch": 1.3151547849924903, + "grad_norm": 4.59375, + "learning_rate": 2.7721438442143607e-06, + "loss": 0.85534048, + "memory(GiB)": 147.13, + "step": 56370, + "train_speed(iter/s)": 0.201253 + }, + { + "acc": 0.77141409, + "epoch": 1.3153880925647794, + "grad_norm": 5.125, + "learning_rate": 2.7704527827044714e-06, + "loss": 0.81075706, + "memory(GiB)": 147.13, + "step": 56380, + "train_speed(iter/s)": 0.201271 + }, + { + "acc": 0.79955773, + "epoch": 1.315621400137068, + "grad_norm": 3.953125, + "learning_rate": 2.7687620394618025e-06, + "loss": 0.71365733, + "memory(GiB)": 147.13, + "step": 56390, + "train_speed(iter/s)": 0.20129 + }, + { + "acc": 0.78430367, + "epoch": 1.3158547077093572, + "grad_norm": 6.34375, + "learning_rate": 2.767071614727702e-06, + "loss": 0.75429764, + "memory(GiB)": 147.13, + "step": 56400, + "train_speed(iter/s)": 0.201309 + }, + { + "acc": 0.76308584, + "epoch": 1.3160880152816459, + "grad_norm": 6.65625, + "learning_rate": 2.765381508743482e-06, + "loss": 0.86523905, + "memory(GiB)": 147.13, + "step": 56410, + "train_speed(iter/s)": 0.201326 + }, + { + "acc": 0.78555737, + "epoch": 1.316321322853935, + "grad_norm": 5.65625, + "learning_rate": 2.7636917217504007e-06, + "loss": 0.77236929, + "memory(GiB)": 147.13, + "step": 56420, + "train_speed(iter/s)": 0.201345 + }, + { + "acc": 0.78901734, + "epoch": 1.3165546304262237, + "grad_norm": 4.21875, + "learning_rate": 2.762002253989678e-06, + "loss": 0.75951052, + "memory(GiB)": 147.13, + "step": 56430, + "train_speed(iter/s)": 0.201364 + }, + { + "acc": 0.76562176, + "epoch": 1.3167879379985128, + "grad_norm": 5.375, + "learning_rate": 2.7603131057024835e-06, + "loss": 0.84810133, + "memory(GiB)": 147.13, + "step": 56440, + "train_speed(iter/s)": 0.201383 + }, + { + "acc": 0.78627138, + "epoch": 1.3170212455708015, + "grad_norm": 5.125, + "learning_rate": 2.7586242771299404e-06, + "loss": 0.76413832, + "memory(GiB)": 147.13, + "step": 56450, + "train_speed(iter/s)": 0.201402 + }, + { + "acc": 0.78318043, + "epoch": 1.3172545531430904, + "grad_norm": 4.15625, + "learning_rate": 2.7569357685131325e-06, + "loss": 0.80174465, + "memory(GiB)": 147.13, + "step": 56460, + "train_speed(iter/s)": 0.201419 + }, + { + "acc": 0.79765291, + "epoch": 1.3174878607153793, + "grad_norm": 6.84375, + "learning_rate": 2.7552475800930907e-06, + "loss": 0.72690916, + "memory(GiB)": 147.13, + "step": 56470, + "train_speed(iter/s)": 0.201438 + }, + { + "acc": 0.79450531, + "epoch": 1.3177211682876682, + "grad_norm": 4.3125, + "learning_rate": 2.753559712110808e-06, + "loss": 0.73781071, + "memory(GiB)": 147.13, + "step": 56480, + "train_speed(iter/s)": 0.201455 + }, + { + "acc": 0.78455567, + "epoch": 1.317954475859957, + "grad_norm": 4.84375, + "learning_rate": 2.75187216480722e-06, + "loss": 0.7775054, + "memory(GiB)": 147.13, + "step": 56490, + "train_speed(iter/s)": 0.201473 + }, + { + "acc": 0.79019089, + "epoch": 1.318187783432246, + "grad_norm": 4.90625, + "learning_rate": 2.75018493842323e-06, + "loss": 0.75038633, + "memory(GiB)": 147.13, + "step": 56500, + "train_speed(iter/s)": 0.201491 + }, + { + "epoch": 1.318187783432246, + "eval_acc": 0.7445172620268719, + "eval_loss": 0.8049691915512085, + "eval_runtime": 1270.3048, + "eval_samples_per_second": 28.333, + "eval_steps_per_second": 14.167, + "step": 56500 + }, + { + "acc": 0.77661104, + "epoch": 1.3184210910045349, + "grad_norm": 4.21875, + "learning_rate": 2.748498033199686e-06, + "loss": 0.79541645, + "memory(GiB)": 147.13, + "step": 56510, + "train_speed(iter/s)": 0.200588 + }, + { + "acc": 0.78313751, + "epoch": 1.3186543985768238, + "grad_norm": 4.90625, + "learning_rate": 2.7468114493773913e-06, + "loss": 0.80430899, + "memory(GiB)": 147.13, + "step": 56520, + "train_speed(iter/s)": 0.200606 + }, + { + "acc": 0.76499681, + "epoch": 1.3188877061491127, + "grad_norm": 5.71875, + "learning_rate": 2.7451251871971103e-06, + "loss": 0.85248184, + "memory(GiB)": 147.13, + "step": 56530, + "train_speed(iter/s)": 0.200625 + }, + { + "acc": 0.78148131, + "epoch": 1.3191210137214016, + "grad_norm": 5.09375, + "learning_rate": 2.743439246899552e-06, + "loss": 0.80053949, + "memory(GiB)": 147.13, + "step": 56540, + "train_speed(iter/s)": 0.200644 + }, + { + "acc": 0.78812108, + "epoch": 1.3193543212936905, + "grad_norm": 4.28125, + "learning_rate": 2.7417536287253864e-06, + "loss": 0.77857547, + "memory(GiB)": 147.13, + "step": 56550, + "train_speed(iter/s)": 0.200661 + }, + { + "acc": 0.78670983, + "epoch": 1.3195876288659794, + "grad_norm": 4.4375, + "learning_rate": 2.7400683329152358e-06, + "loss": 0.77823715, + "memory(GiB)": 147.13, + "step": 56560, + "train_speed(iter/s)": 0.200679 + }, + { + "acc": 0.78537912, + "epoch": 1.3198209364382683, + "grad_norm": 4.15625, + "learning_rate": 2.738383359709671e-06, + "loss": 0.76351776, + "memory(GiB)": 147.13, + "step": 56570, + "train_speed(iter/s)": 0.200698 + }, + { + "acc": 0.77593765, + "epoch": 1.3200542440105572, + "grad_norm": 9.25, + "learning_rate": 2.736698709349227e-06, + "loss": 0.80890398, + "memory(GiB)": 147.13, + "step": 56580, + "train_speed(iter/s)": 0.200715 + }, + { + "acc": 0.7956008, + "epoch": 1.320287551582846, + "grad_norm": 5.5, + "learning_rate": 2.7350143820743847e-06, + "loss": 0.7315671, + "memory(GiB)": 147.13, + "step": 56590, + "train_speed(iter/s)": 0.200732 + }, + { + "acc": 0.77545271, + "epoch": 1.320520859155135, + "grad_norm": 24.25, + "learning_rate": 2.7333303781255816e-06, + "loss": 0.78528709, + "memory(GiB)": 147.13, + "step": 56600, + "train_speed(iter/s)": 0.20075 + }, + { + "acc": 0.76972427, + "epoch": 1.3207541667274239, + "grad_norm": 9.625, + "learning_rate": 2.7316466977432067e-06, + "loss": 0.80668039, + "memory(GiB)": 147.13, + "step": 56610, + "train_speed(iter/s)": 0.200768 + }, + { + "acc": 0.77437797, + "epoch": 1.3209874742997127, + "grad_norm": 5.65625, + "learning_rate": 2.729963341167608e-06, + "loss": 0.82345848, + "memory(GiB)": 147.13, + "step": 56620, + "train_speed(iter/s)": 0.200787 + }, + { + "acc": 0.79564867, + "epoch": 1.3212207818720016, + "grad_norm": 4.9375, + "learning_rate": 2.728280308639081e-06, + "loss": 0.72023449, + "memory(GiB)": 147.13, + "step": 56630, + "train_speed(iter/s)": 0.200806 + }, + { + "acc": 0.77554989, + "epoch": 1.3214540894442905, + "grad_norm": 5.40625, + "learning_rate": 2.7265976003978828e-06, + "loss": 0.79516258, + "memory(GiB)": 147.13, + "step": 56640, + "train_speed(iter/s)": 0.200824 + }, + { + "acc": 0.76518278, + "epoch": 1.3216873970165794, + "grad_norm": 6.0625, + "learning_rate": 2.7249152166842164e-06, + "loss": 0.8493721, + "memory(GiB)": 147.13, + "step": 56650, + "train_speed(iter/s)": 0.200843 + }, + { + "acc": 0.78103828, + "epoch": 1.3219207045888683, + "grad_norm": 5.9375, + "learning_rate": 2.72323315773824e-06, + "loss": 0.78014269, + "memory(GiB)": 147.13, + "step": 56660, + "train_speed(iter/s)": 0.200862 + }, + { + "acc": 0.77679877, + "epoch": 1.3221540121611572, + "grad_norm": 4.03125, + "learning_rate": 2.72155142380007e-06, + "loss": 0.80789299, + "memory(GiB)": 147.13, + "step": 56670, + "train_speed(iter/s)": 0.200881 + }, + { + "acc": 0.79273858, + "epoch": 1.3223873197334461, + "grad_norm": 6.65625, + "learning_rate": 2.7198700151097714e-06, + "loss": 0.74300385, + "memory(GiB)": 147.13, + "step": 56680, + "train_speed(iter/s)": 0.2009 + }, + { + "acc": 0.76670465, + "epoch": 1.322620627305735, + "grad_norm": 4.90625, + "learning_rate": 2.7181889319073674e-06, + "loss": 0.83968229, + "memory(GiB)": 147.13, + "step": 56690, + "train_speed(iter/s)": 0.200917 + }, + { + "acc": 0.80488224, + "epoch": 1.322853934878024, + "grad_norm": 6.09375, + "learning_rate": 2.7165081744328304e-06, + "loss": 0.69187484, + "memory(GiB)": 147.13, + "step": 56700, + "train_speed(iter/s)": 0.200936 + }, + { + "acc": 0.78700209, + "epoch": 1.3230872424503128, + "grad_norm": 5.125, + "learning_rate": 2.714827742926088e-06, + "loss": 0.75780425, + "memory(GiB)": 147.13, + "step": 56710, + "train_speed(iter/s)": 0.200954 + }, + { + "acc": 0.80247622, + "epoch": 1.3233205500226017, + "grad_norm": 5.03125, + "learning_rate": 2.7131476376270215e-06, + "loss": 0.71583662, + "memory(GiB)": 147.13, + "step": 56720, + "train_speed(iter/s)": 0.200971 + }, + { + "acc": 0.78960152, + "epoch": 1.3235538575948906, + "grad_norm": 4.75, + "learning_rate": 2.711467858775464e-06, + "loss": 0.75800562, + "memory(GiB)": 147.13, + "step": 56730, + "train_speed(iter/s)": 0.20099 + }, + { + "acc": 0.77212892, + "epoch": 1.3237871651671795, + "grad_norm": 4.625, + "learning_rate": 2.7097884066112062e-06, + "loss": 0.81948633, + "memory(GiB)": 147.13, + "step": 56740, + "train_speed(iter/s)": 0.201009 + }, + { + "acc": 0.77226896, + "epoch": 1.3240204727394684, + "grad_norm": 4.96875, + "learning_rate": 2.7081092813739863e-06, + "loss": 0.81515799, + "memory(GiB)": 147.13, + "step": 56750, + "train_speed(iter/s)": 0.201028 + }, + { + "acc": 0.7703146, + "epoch": 1.3242537803117571, + "grad_norm": 9.4375, + "learning_rate": 2.7064304833035027e-06, + "loss": 0.82806835, + "memory(GiB)": 147.13, + "step": 56760, + "train_speed(iter/s)": 0.201046 + }, + { + "acc": 0.75851698, + "epoch": 1.3244870878840462, + "grad_norm": 9.625, + "learning_rate": 2.704752012639399e-06, + "loss": 0.87062492, + "memory(GiB)": 147.13, + "step": 56770, + "train_speed(iter/s)": 0.201063 + }, + { + "acc": 0.78109846, + "epoch": 1.324720395456335, + "grad_norm": 7.53125, + "learning_rate": 2.703073869621281e-06, + "loss": 0.75848188, + "memory(GiB)": 147.13, + "step": 56780, + "train_speed(iter/s)": 0.201083 + }, + { + "acc": 0.78177109, + "epoch": 1.324953703028624, + "grad_norm": 5.78125, + "learning_rate": 2.7013960544887007e-06, + "loss": 0.77262831, + "memory(GiB)": 147.13, + "step": 56790, + "train_speed(iter/s)": 0.201101 + }, + { + "acc": 0.79652271, + "epoch": 1.3251870106009127, + "grad_norm": 3.84375, + "learning_rate": 2.699718567481164e-06, + "loss": 0.72971768, + "memory(GiB)": 147.13, + "step": 56800, + "train_speed(iter/s)": 0.20112 + }, + { + "acc": 0.76938486, + "epoch": 1.3254203181732018, + "grad_norm": 5.4375, + "learning_rate": 2.698041408838136e-06, + "loss": 0.81897049, + "memory(GiB)": 147.13, + "step": 56810, + "train_speed(iter/s)": 0.201139 + }, + { + "acc": 0.796984, + "epoch": 1.3256536257454905, + "grad_norm": 4.21875, + "learning_rate": 2.696364578799028e-06, + "loss": 0.69904175, + "memory(GiB)": 147.13, + "step": 56820, + "train_speed(iter/s)": 0.201158 + }, + { + "acc": 0.79061108, + "epoch": 1.3258869333177796, + "grad_norm": 5.03125, + "learning_rate": 2.694688077603207e-06, + "loss": 0.75381279, + "memory(GiB)": 147.13, + "step": 56830, + "train_speed(iter/s)": 0.201177 + }, + { + "acc": 0.77822804, + "epoch": 1.3261202408900683, + "grad_norm": 4.75, + "learning_rate": 2.6930119054899905e-06, + "loss": 0.78712869, + "memory(GiB)": 147.13, + "step": 56840, + "train_speed(iter/s)": 0.201195 + }, + { + "acc": 0.79944668, + "epoch": 1.3263535484623572, + "grad_norm": 5.59375, + "learning_rate": 2.6913360626986575e-06, + "loss": 0.72601347, + "memory(GiB)": 147.13, + "step": 56850, + "train_speed(iter/s)": 0.201214 + }, + { + "acc": 0.78989477, + "epoch": 1.326586856034646, + "grad_norm": 5.34375, + "learning_rate": 2.68966054946843e-06, + "loss": 0.72735806, + "memory(GiB)": 147.13, + "step": 56860, + "train_speed(iter/s)": 0.201232 + }, + { + "acc": 0.78181467, + "epoch": 1.326820163606935, + "grad_norm": 4.96875, + "learning_rate": 2.687985366038486e-06, + "loss": 0.79019494, + "memory(GiB)": 147.13, + "step": 56870, + "train_speed(iter/s)": 0.201251 + }, + { + "acc": 0.76266632, + "epoch": 1.327053471179224, + "grad_norm": 9.0, + "learning_rate": 2.6863105126479616e-06, + "loss": 0.85104504, + "memory(GiB)": 147.13, + "step": 56880, + "train_speed(iter/s)": 0.20127 + }, + { + "acc": 0.77862854, + "epoch": 1.3272867787515128, + "grad_norm": 4.03125, + "learning_rate": 2.6846359895359373e-06, + "loss": 0.79643354, + "memory(GiB)": 147.13, + "step": 56890, + "train_speed(iter/s)": 0.201288 + }, + { + "acc": 0.77985401, + "epoch": 1.3275200863238017, + "grad_norm": 4.9375, + "learning_rate": 2.682961796941456e-06, + "loss": 0.78767076, + "memory(GiB)": 147.13, + "step": 56900, + "train_speed(iter/s)": 0.201306 + }, + { + "acc": 0.7925395, + "epoch": 1.3277533938960906, + "grad_norm": 5.0, + "learning_rate": 2.6812879351035015e-06, + "loss": 0.71750937, + "memory(GiB)": 147.13, + "step": 56910, + "train_speed(iter/s)": 0.201325 + }, + { + "acc": 0.80067816, + "epoch": 1.3279867014683795, + "grad_norm": 4.53125, + "learning_rate": 2.679614404261023e-06, + "loss": 0.70950232, + "memory(GiB)": 147.13, + "step": 56920, + "train_speed(iter/s)": 0.201343 + }, + { + "acc": 0.80294123, + "epoch": 1.3282200090406684, + "grad_norm": 9.4375, + "learning_rate": 2.677941204652914e-06, + "loss": 0.70604153, + "memory(GiB)": 147.13, + "step": 56930, + "train_speed(iter/s)": 0.201362 + }, + { + "acc": 0.78235593, + "epoch": 1.3284533166129573, + "grad_norm": 5.65625, + "learning_rate": 2.676268336518024e-06, + "loss": 0.77650847, + "memory(GiB)": 147.13, + "step": 56940, + "train_speed(iter/s)": 0.201381 + }, + { + "acc": 0.78522987, + "epoch": 1.3286866241852462, + "grad_norm": 4.34375, + "learning_rate": 2.6745958000951546e-06, + "loss": 0.76834965, + "memory(GiB)": 147.13, + "step": 56950, + "train_speed(iter/s)": 0.2014 + }, + { + "acc": 0.77212758, + "epoch": 1.328919931757535, + "grad_norm": 5.8125, + "learning_rate": 2.672923595623056e-06, + "loss": 0.78489156, + "memory(GiB)": 147.13, + "step": 56960, + "train_speed(iter/s)": 0.201417 + }, + { + "acc": 0.78244338, + "epoch": 1.329153239329824, + "grad_norm": 4.46875, + "learning_rate": 2.67125172334044e-06, + "loss": 0.77883863, + "memory(GiB)": 147.13, + "step": 56970, + "train_speed(iter/s)": 0.201435 + }, + { + "acc": 0.76625214, + "epoch": 1.3293865469021129, + "grad_norm": 5.3125, + "learning_rate": 2.669580183485963e-06, + "loss": 0.8450387, + "memory(GiB)": 147.13, + "step": 56980, + "train_speed(iter/s)": 0.201453 + }, + { + "acc": 0.76922102, + "epoch": 1.3296198544744018, + "grad_norm": 5.09375, + "learning_rate": 2.667908976298239e-06, + "loss": 0.81567163, + "memory(GiB)": 147.13, + "step": 56990, + "train_speed(iter/s)": 0.201472 + }, + { + "acc": 0.76809058, + "epoch": 1.3298531620466907, + "grad_norm": 9.375, + "learning_rate": 2.666238102015832e-06, + "loss": 0.83717842, + "memory(GiB)": 147.13, + "step": 57000, + "train_speed(iter/s)": 0.201488 + }, + { + "epoch": 1.3298531620466907, + "eval_acc": 0.7444829911611593, + "eval_loss": 0.8050174117088318, + "eval_runtime": 1269.7274, + "eval_samples_per_second": 28.345, + "eval_steps_per_second": 14.173, + "step": 57000 + }, + { + "acc": 0.79596758, + "epoch": 1.3300864696189796, + "grad_norm": 6.75, + "learning_rate": 2.6645675608772554e-06, + "loss": 0.73099294, + "memory(GiB)": 147.13, + "step": 57010, + "train_speed(iter/s)": 0.200594 + }, + { + "acc": 0.76844263, + "epoch": 1.3303197771912685, + "grad_norm": 6.03125, + "learning_rate": 2.662897353120983e-06, + "loss": 0.82072315, + "memory(GiB)": 147.13, + "step": 57020, + "train_speed(iter/s)": 0.200613 + }, + { + "acc": 0.77861929, + "epoch": 1.3305530847635574, + "grad_norm": 8.375, + "learning_rate": 2.6612274789854326e-06, + "loss": 0.75851078, + "memory(GiB)": 147.13, + "step": 57030, + "train_speed(iter/s)": 0.200631 + }, + { + "acc": 0.77308245, + "epoch": 1.3307863923358463, + "grad_norm": 6.5, + "learning_rate": 2.659557938708982e-06, + "loss": 0.81746635, + "memory(GiB)": 147.13, + "step": 57040, + "train_speed(iter/s)": 0.20065 + }, + { + "acc": 0.79103942, + "epoch": 1.3310196999081352, + "grad_norm": 6.78125, + "learning_rate": 2.657888732529956e-06, + "loss": 0.75392189, + "memory(GiB)": 147.13, + "step": 57050, + "train_speed(iter/s)": 0.200667 + }, + { + "acc": 0.76411114, + "epoch": 1.331253007480424, + "grad_norm": 5.1875, + "learning_rate": 2.656219860686633e-06, + "loss": 0.86522112, + "memory(GiB)": 147.13, + "step": 57060, + "train_speed(iter/s)": 0.200687 + }, + { + "acc": 0.77968445, + "epoch": 1.331486315052713, + "grad_norm": 5.9375, + "learning_rate": 2.6545513234172413e-06, + "loss": 0.79840469, + "memory(GiB)": 147.13, + "step": 57070, + "train_speed(iter/s)": 0.200705 + }, + { + "acc": 0.79253349, + "epoch": 1.3317196226250019, + "grad_norm": 4.875, + "learning_rate": 2.65288312095997e-06, + "loss": 0.7571218, + "memory(GiB)": 147.13, + "step": 57080, + "train_speed(iter/s)": 0.200724 + }, + { + "acc": 0.78285084, + "epoch": 1.3319529301972908, + "grad_norm": 3.703125, + "learning_rate": 2.651215253552951e-06, + "loss": 0.77423506, + "memory(GiB)": 147.13, + "step": 57090, + "train_speed(iter/s)": 0.200742 + }, + { + "acc": 0.78827653, + "epoch": 1.3321862377695797, + "grad_norm": 5.8125, + "learning_rate": 2.6495477214342704e-06, + "loss": 0.74848604, + "memory(GiB)": 147.13, + "step": 57100, + "train_speed(iter/s)": 0.20076 + }, + { + "acc": 0.77849426, + "epoch": 1.3324195453418686, + "grad_norm": 5.21875, + "learning_rate": 2.647880524841971e-06, + "loss": 0.79774141, + "memory(GiB)": 147.13, + "step": 57110, + "train_speed(iter/s)": 0.200778 + }, + { + "acc": 0.78932242, + "epoch": 1.3326528529141575, + "grad_norm": 6.125, + "learning_rate": 2.646213664014042e-06, + "loss": 0.74397902, + "memory(GiB)": 147.13, + "step": 57120, + "train_speed(iter/s)": 0.200797 + }, + { + "acc": 0.78925648, + "epoch": 1.3328861604864464, + "grad_norm": 4.65625, + "learning_rate": 2.6445471391884304e-06, + "loss": 0.76678162, + "memory(GiB)": 147.13, + "step": 57130, + "train_speed(iter/s)": 0.200815 + }, + { + "acc": 0.78613939, + "epoch": 1.3331194680587353, + "grad_norm": 5.0625, + "learning_rate": 2.6428809506030306e-06, + "loss": 0.78517361, + "memory(GiB)": 147.13, + "step": 57140, + "train_speed(iter/s)": 0.200833 + }, + { + "acc": 0.77853289, + "epoch": 1.333352775631024, + "grad_norm": 5.75, + "learning_rate": 2.641215098495688e-06, + "loss": 0.80418797, + "memory(GiB)": 147.13, + "step": 57150, + "train_speed(iter/s)": 0.20085 + }, + { + "acc": 0.7710371, + "epoch": 1.333586083203313, + "grad_norm": 7.1875, + "learning_rate": 2.639549583104209e-06, + "loss": 0.82036028, + "memory(GiB)": 147.13, + "step": 57160, + "train_speed(iter/s)": 0.200869 + }, + { + "acc": 0.7829278, + "epoch": 1.3338193907756017, + "grad_norm": 6.40625, + "learning_rate": 2.6378844046663375e-06, + "loss": 0.78318968, + "memory(GiB)": 147.13, + "step": 57170, + "train_speed(iter/s)": 0.200887 + }, + { + "acc": 0.77414856, + "epoch": 1.3340526983478909, + "grad_norm": 4.59375, + "learning_rate": 2.636219563419783e-06, + "loss": 0.79386172, + "memory(GiB)": 147.13, + "step": 57180, + "train_speed(iter/s)": 0.200905 + }, + { + "acc": 0.78854723, + "epoch": 1.3342860059201795, + "grad_norm": 6.0625, + "learning_rate": 2.6345550596021967e-06, + "loss": 0.76046, + "memory(GiB)": 147.13, + "step": 57190, + "train_speed(iter/s)": 0.200924 + }, + { + "acc": 0.77748866, + "epoch": 1.3345193134924687, + "grad_norm": 5.9375, + "learning_rate": 2.632890893451191e-06, + "loss": 0.80022764, + "memory(GiB)": 147.13, + "step": 57200, + "train_speed(iter/s)": 0.200942 + }, + { + "acc": 0.78416834, + "epoch": 1.3347526210647573, + "grad_norm": 4.5, + "learning_rate": 2.63122706520432e-06, + "loss": 0.79278574, + "memory(GiB)": 147.13, + "step": 57210, + "train_speed(iter/s)": 0.200961 + }, + { + "acc": 0.7817687, + "epoch": 1.3349859286370465, + "grad_norm": 5.59375, + "learning_rate": 2.6295635750990998e-06, + "loss": 0.80557747, + "memory(GiB)": 147.13, + "step": 57220, + "train_speed(iter/s)": 0.200979 + }, + { + "acc": 0.7696763, + "epoch": 1.3352192362093351, + "grad_norm": 5.90625, + "learning_rate": 2.627900423372991e-06, + "loss": 0.81974831, + "memory(GiB)": 147.13, + "step": 57230, + "train_speed(iter/s)": 0.200998 + }, + { + "acc": 0.79114909, + "epoch": 1.335452543781624, + "grad_norm": 4.28125, + "learning_rate": 2.626237610263406e-06, + "loss": 0.74017973, + "memory(GiB)": 147.13, + "step": 57240, + "train_speed(iter/s)": 0.201017 + }, + { + "acc": 0.7918015, + "epoch": 1.335685851353913, + "grad_norm": 7.0625, + "learning_rate": 2.6245751360077133e-06, + "loss": 0.72015886, + "memory(GiB)": 147.13, + "step": 57250, + "train_speed(iter/s)": 0.201035 + }, + { + "acc": 0.775524, + "epoch": 1.3359191589262018, + "grad_norm": 7.34375, + "learning_rate": 2.622913000843228e-06, + "loss": 0.79100475, + "memory(GiB)": 147.13, + "step": 57260, + "train_speed(iter/s)": 0.201053 + }, + { + "acc": 0.7828311, + "epoch": 1.3361524664984907, + "grad_norm": 5.15625, + "learning_rate": 2.6212512050072236e-06, + "loss": 0.77825603, + "memory(GiB)": 147.13, + "step": 57270, + "train_speed(iter/s)": 0.201071 + }, + { + "acc": 0.79701376, + "epoch": 1.3363857740707796, + "grad_norm": 5.15625, + "learning_rate": 2.6195897487369195e-06, + "loss": 0.71078596, + "memory(GiB)": 147.13, + "step": 57280, + "train_speed(iter/s)": 0.201089 + }, + { + "acc": 0.77222824, + "epoch": 1.3366190816430685, + "grad_norm": 6.03125, + "learning_rate": 2.6179286322694866e-06, + "loss": 0.80330944, + "memory(GiB)": 147.13, + "step": 57290, + "train_speed(iter/s)": 0.201107 + }, + { + "acc": 0.7781415, + "epoch": 1.3368523892153574, + "grad_norm": 6.4375, + "learning_rate": 2.6162678558420484e-06, + "loss": 0.79362383, + "memory(GiB)": 147.13, + "step": 57300, + "train_speed(iter/s)": 0.201126 + }, + { + "acc": 0.76344051, + "epoch": 1.3370856967876463, + "grad_norm": 6.40625, + "learning_rate": 2.6146074196916806e-06, + "loss": 0.84555454, + "memory(GiB)": 147.13, + "step": 57310, + "train_speed(iter/s)": 0.201146 + }, + { + "acc": 0.79169607, + "epoch": 1.3373190043599352, + "grad_norm": 5.03125, + "learning_rate": 2.6129473240554126e-06, + "loss": 0.7247673, + "memory(GiB)": 147.13, + "step": 57320, + "train_speed(iter/s)": 0.201165 + }, + { + "acc": 0.77501116, + "epoch": 1.3375523119322241, + "grad_norm": 5.1875, + "learning_rate": 2.6112875691702176e-06, + "loss": 0.79619384, + "memory(GiB)": 147.13, + "step": 57330, + "train_speed(iter/s)": 0.201184 + }, + { + "acc": 0.79097958, + "epoch": 1.337785619504513, + "grad_norm": 10.5, + "learning_rate": 2.609628155273032e-06, + "loss": 0.75640821, + "memory(GiB)": 147.13, + "step": 57340, + "train_speed(iter/s)": 0.201202 + }, + { + "acc": 0.77276196, + "epoch": 1.338018927076802, + "grad_norm": 4.96875, + "learning_rate": 2.6079690826007307e-06, + "loss": 0.83116341, + "memory(GiB)": 147.13, + "step": 57350, + "train_speed(iter/s)": 0.20122 + }, + { + "acc": 0.77244496, + "epoch": 1.3382522346490908, + "grad_norm": 5.125, + "learning_rate": 2.606310351390148e-06, + "loss": 0.79508491, + "memory(GiB)": 147.13, + "step": 57360, + "train_speed(iter/s)": 0.201239 + }, + { + "acc": 0.78902702, + "epoch": 1.3384855422213797, + "grad_norm": 5.125, + "learning_rate": 2.6046519618780673e-06, + "loss": 0.73824391, + "memory(GiB)": 147.13, + "step": 57370, + "train_speed(iter/s)": 0.201256 + }, + { + "acc": 0.78787184, + "epoch": 1.3387188497936686, + "grad_norm": 6.4375, + "learning_rate": 2.6029939143012228e-06, + "loss": 0.75440736, + "memory(GiB)": 147.13, + "step": 57380, + "train_speed(iter/s)": 0.201273 + }, + { + "acc": 0.80366573, + "epoch": 1.3389521573659575, + "grad_norm": 8.125, + "learning_rate": 2.601336208896304e-06, + "loss": 0.69952521, + "memory(GiB)": 147.13, + "step": 57390, + "train_speed(iter/s)": 0.201292 + }, + { + "acc": 0.77663946, + "epoch": 1.3391854649382464, + "grad_norm": 5.5625, + "learning_rate": 2.5996788458999404e-06, + "loss": 0.80009689, + "memory(GiB)": 147.13, + "step": 57400, + "train_speed(iter/s)": 0.20131 + }, + { + "acc": 0.77313061, + "epoch": 1.3394187725105353, + "grad_norm": 5.3125, + "learning_rate": 2.598021825548727e-06, + "loss": 0.81944647, + "memory(GiB)": 147.13, + "step": 57410, + "train_speed(iter/s)": 0.201328 + }, + { + "acc": 0.78171992, + "epoch": 1.3396520800828242, + "grad_norm": 5.46875, + "learning_rate": 2.596365148079197e-06, + "loss": 0.77206979, + "memory(GiB)": 147.13, + "step": 57420, + "train_speed(iter/s)": 0.201347 + }, + { + "acc": 0.78008041, + "epoch": 1.3398853876551131, + "grad_norm": 4.46875, + "learning_rate": 2.594708813727847e-06, + "loss": 0.79095898, + "memory(GiB)": 147.13, + "step": 57430, + "train_speed(iter/s)": 0.201365 + }, + { + "acc": 0.79991326, + "epoch": 1.340118695227402, + "grad_norm": 5.78125, + "learning_rate": 2.5930528227311148e-06, + "loss": 0.69914985, + "memory(GiB)": 147.13, + "step": 57440, + "train_speed(iter/s)": 0.201382 + }, + { + "acc": 0.79570603, + "epoch": 1.340352002799691, + "grad_norm": 5.1875, + "learning_rate": 2.591397175325391e-06, + "loss": 0.71980028, + "memory(GiB)": 147.13, + "step": 57450, + "train_speed(iter/s)": 0.201399 + }, + { + "acc": 0.78314619, + "epoch": 1.3405853103719798, + "grad_norm": 5.53125, + "learning_rate": 2.5897418717470224e-06, + "loss": 0.77125511, + "memory(GiB)": 147.13, + "step": 57460, + "train_speed(iter/s)": 0.201417 + }, + { + "acc": 0.77202096, + "epoch": 1.3408186179442687, + "grad_norm": 6.9375, + "learning_rate": 2.5880869122322994e-06, + "loss": 0.82745008, + "memory(GiB)": 147.13, + "step": 57470, + "train_speed(iter/s)": 0.201435 + }, + { + "acc": 0.77319994, + "epoch": 1.3410519255165576, + "grad_norm": 5.0, + "learning_rate": 2.5864322970174714e-06, + "loss": 0.79846964, + "memory(GiB)": 147.13, + "step": 57480, + "train_speed(iter/s)": 0.201454 + }, + { + "acc": 0.79231257, + "epoch": 1.3412852330888465, + "grad_norm": 5.125, + "learning_rate": 2.5847780263387314e-06, + "loss": 0.73147917, + "memory(GiB)": 147.13, + "step": 57490, + "train_speed(iter/s)": 0.201472 + }, + { + "acc": 0.77735405, + "epoch": 1.3415185406611354, + "grad_norm": 6.53125, + "learning_rate": 2.583124100432227e-06, + "loss": 0.78960352, + "memory(GiB)": 147.13, + "step": 57500, + "train_speed(iter/s)": 0.201491 + }, + { + "epoch": 1.3415185406611354, + "eval_acc": 0.7445643444311685, + "eval_loss": 0.8049200177192688, + "eval_runtime": 1269.9367, + "eval_samples_per_second": 28.341, + "eval_steps_per_second": 14.171, + "step": 57500 + }, + { + "acc": 0.77931428, + "epoch": 1.3417518482334243, + "grad_norm": 5.0625, + "learning_rate": 2.5814705195340527e-06, + "loss": 0.79522343, + "memory(GiB)": 147.13, + "step": 57510, + "train_speed(iter/s)": 0.200603 + }, + { + "acc": 0.77775626, + "epoch": 1.341985155805713, + "grad_norm": 5.3125, + "learning_rate": 2.5798172838802616e-06, + "loss": 0.79539862, + "memory(GiB)": 147.13, + "step": 57520, + "train_speed(iter/s)": 0.20062 + }, + { + "acc": 0.7882154, + "epoch": 1.342218463378002, + "grad_norm": 5.75, + "learning_rate": 2.5781643937068495e-06, + "loss": 0.75900679, + "memory(GiB)": 147.13, + "step": 57530, + "train_speed(iter/s)": 0.200638 + }, + { + "acc": 0.78802242, + "epoch": 1.3424517709502908, + "grad_norm": 4.3125, + "learning_rate": 2.5765118492497654e-06, + "loss": 0.75461683, + "memory(GiB)": 147.13, + "step": 57540, + "train_speed(iter/s)": 0.200656 + }, + { + "acc": 0.79044104, + "epoch": 1.34268507852258, + "grad_norm": 5.40625, + "learning_rate": 2.5748596507449118e-06, + "loss": 0.75172663, + "memory(GiB)": 147.13, + "step": 57550, + "train_speed(iter/s)": 0.200674 + }, + { + "acc": 0.78111725, + "epoch": 1.3429183860948686, + "grad_norm": 6.03125, + "learning_rate": 2.5732077984281378e-06, + "loss": 0.76782961, + "memory(GiB)": 147.13, + "step": 57560, + "train_speed(iter/s)": 0.200691 + }, + { + "acc": 0.78649011, + "epoch": 1.3431516936671577, + "grad_norm": 6.0, + "learning_rate": 2.571556292535247e-06, + "loss": 0.77026129, + "memory(GiB)": 147.13, + "step": 57570, + "train_speed(iter/s)": 0.20071 + }, + { + "acc": 0.79622579, + "epoch": 1.3433850012394464, + "grad_norm": 4.0625, + "learning_rate": 2.5699051333019897e-06, + "loss": 0.74895239, + "memory(GiB)": 147.13, + "step": 57580, + "train_speed(iter/s)": 0.200728 + }, + { + "acc": 0.77864823, + "epoch": 1.3436183088117355, + "grad_norm": 5.0, + "learning_rate": 2.568254320964067e-06, + "loss": 0.78803215, + "memory(GiB)": 147.13, + "step": 57590, + "train_speed(iter/s)": 0.200746 + }, + { + "acc": 0.776053, + "epoch": 1.3438516163840242, + "grad_norm": 6.46875, + "learning_rate": 2.5666038557571355e-06, + "loss": 0.81020985, + "memory(GiB)": 147.13, + "step": 57600, + "train_speed(iter/s)": 0.200764 + }, + { + "acc": 0.78326931, + "epoch": 1.3440849239563133, + "grad_norm": 3.53125, + "learning_rate": 2.5649537379167944e-06, + "loss": 0.78707523, + "memory(GiB)": 147.13, + "step": 57610, + "train_speed(iter/s)": 0.200781 + }, + { + "acc": 0.76431599, + "epoch": 1.344318231528602, + "grad_norm": 5.5, + "learning_rate": 2.5633039676786044e-06, + "loss": 0.8433094, + "memory(GiB)": 147.13, + "step": 57620, + "train_speed(iter/s)": 0.2008 + }, + { + "acc": 0.78314161, + "epoch": 1.3445515391008909, + "grad_norm": 4.71875, + "learning_rate": 2.5616545452780607e-06, + "loss": 0.76809273, + "memory(GiB)": 147.13, + "step": 57630, + "train_speed(iter/s)": 0.200819 + }, + { + "acc": 0.78058529, + "epoch": 1.3447848466731798, + "grad_norm": 5.28125, + "learning_rate": 2.5600054709506244e-06, + "loss": 0.78540258, + "memory(GiB)": 147.13, + "step": 57640, + "train_speed(iter/s)": 0.200837 + }, + { + "acc": 0.77517385, + "epoch": 1.3450181542454687, + "grad_norm": 6.75, + "learning_rate": 2.5583567449316983e-06, + "loss": 0.7901629, + "memory(GiB)": 147.13, + "step": 57650, + "train_speed(iter/s)": 0.200854 + }, + { + "acc": 0.75654125, + "epoch": 1.3452514618177576, + "grad_norm": 5.28125, + "learning_rate": 2.5567083674566363e-06, + "loss": 0.87824078, + "memory(GiB)": 147.13, + "step": 57660, + "train_speed(iter/s)": 0.200873 + }, + { + "acc": 0.79099903, + "epoch": 1.3454847693900465, + "grad_norm": 4.71875, + "learning_rate": 2.555060338760746e-06, + "loss": 0.74126825, + "memory(GiB)": 147.13, + "step": 57670, + "train_speed(iter/s)": 0.200891 + }, + { + "acc": 0.77133732, + "epoch": 1.3457180769623354, + "grad_norm": 5.5, + "learning_rate": 2.553412659079281e-06, + "loss": 0.83065491, + "memory(GiB)": 147.13, + "step": 57680, + "train_speed(iter/s)": 0.200907 + }, + { + "acc": 0.77394104, + "epoch": 1.3459513845346243, + "grad_norm": 4.5625, + "learning_rate": 2.5517653286474486e-06, + "loss": 0.7871973, + "memory(GiB)": 147.13, + "step": 57690, + "train_speed(iter/s)": 0.200926 + }, + { + "acc": 0.77455368, + "epoch": 1.3461846921069132, + "grad_norm": 7.375, + "learning_rate": 2.5501183477004036e-06, + "loss": 0.81267767, + "memory(GiB)": 147.13, + "step": 57700, + "train_speed(iter/s)": 0.200944 + }, + { + "acc": 0.78425789, + "epoch": 1.346417999679202, + "grad_norm": 6.65625, + "learning_rate": 2.548471716473255e-06, + "loss": 0.75778885, + "memory(GiB)": 147.13, + "step": 57710, + "train_speed(iter/s)": 0.200963 + }, + { + "acc": 0.77946496, + "epoch": 1.346651307251491, + "grad_norm": 5.5625, + "learning_rate": 2.546825435201056e-06, + "loss": 0.79724245, + "memory(GiB)": 147.13, + "step": 57720, + "train_speed(iter/s)": 0.200981 + }, + { + "acc": 0.76264691, + "epoch": 1.3468846148237799, + "grad_norm": 6.46875, + "learning_rate": 2.5451795041188137e-06, + "loss": 0.84872093, + "memory(GiB)": 147.13, + "step": 57730, + "train_speed(iter/s)": 0.200998 + }, + { + "acc": 0.78879805, + "epoch": 1.3471179223960688, + "grad_norm": 6.75, + "learning_rate": 2.543533923461484e-06, + "loss": 0.76141896, + "memory(GiB)": 147.13, + "step": 57740, + "train_speed(iter/s)": 0.201016 + }, + { + "acc": 0.78948154, + "epoch": 1.3473512299683577, + "grad_norm": 4.09375, + "learning_rate": 2.541888693463971e-06, + "loss": 0.73227172, + "memory(GiB)": 147.13, + "step": 57750, + "train_speed(iter/s)": 0.201033 + }, + { + "acc": 0.77823572, + "epoch": 1.3475845375406466, + "grad_norm": 4.9375, + "learning_rate": 2.540243814361135e-06, + "loss": 0.80870161, + "memory(GiB)": 147.13, + "step": 57760, + "train_speed(iter/s)": 0.201051 + }, + { + "acc": 0.77902465, + "epoch": 1.3478178451129355, + "grad_norm": 7.4375, + "learning_rate": 2.5385992863877783e-06, + "loss": 0.80094547, + "memory(GiB)": 147.13, + "step": 57770, + "train_speed(iter/s)": 0.201069 + }, + { + "acc": 0.77547317, + "epoch": 1.3480511526852244, + "grad_norm": 7.125, + "learning_rate": 2.5369551097786606e-06, + "loss": 0.79117107, + "memory(GiB)": 147.13, + "step": 57780, + "train_speed(iter/s)": 0.201086 + }, + { + "acc": 0.78636332, + "epoch": 1.3482844602575133, + "grad_norm": 4.75, + "learning_rate": 2.5353112847684846e-06, + "loss": 0.77924938, + "memory(GiB)": 147.13, + "step": 57790, + "train_speed(iter/s)": 0.201103 + }, + { + "acc": 0.79167924, + "epoch": 1.3485177678298021, + "grad_norm": 6.71875, + "learning_rate": 2.5336678115919056e-06, + "loss": 0.7614068, + "memory(GiB)": 147.13, + "step": 57800, + "train_speed(iter/s)": 0.201122 + }, + { + "acc": 0.773736, + "epoch": 1.348751075402091, + "grad_norm": 5.40625, + "learning_rate": 2.532024690483531e-06, + "loss": 0.81560125, + "memory(GiB)": 147.13, + "step": 57810, + "train_speed(iter/s)": 0.20114 + }, + { + "acc": 0.78321466, + "epoch": 1.34898438297438, + "grad_norm": 6.5, + "learning_rate": 2.5303819216779134e-06, + "loss": 0.76348276, + "memory(GiB)": 147.13, + "step": 57820, + "train_speed(iter/s)": 0.201158 + }, + { + "acc": 0.76863384, + "epoch": 1.3492176905466688, + "grad_norm": 5.15625, + "learning_rate": 2.528739505409561e-06, + "loss": 0.84750042, + "memory(GiB)": 147.13, + "step": 57830, + "train_speed(iter/s)": 0.201177 + }, + { + "acc": 0.79967451, + "epoch": 1.3494509981189577, + "grad_norm": 4.59375, + "learning_rate": 2.5270974419129248e-06, + "loss": 0.71103725, + "memory(GiB)": 147.13, + "step": 57840, + "train_speed(iter/s)": 0.201194 + }, + { + "acc": 0.77464585, + "epoch": 1.3496843056912466, + "grad_norm": 9.125, + "learning_rate": 2.525455731422414e-06, + "loss": 0.8018692, + "memory(GiB)": 147.13, + "step": 57850, + "train_speed(iter/s)": 0.201213 + }, + { + "acc": 0.79444895, + "epoch": 1.3499176132635355, + "grad_norm": 5.0, + "learning_rate": 2.5238143741723743e-06, + "loss": 0.72763662, + "memory(GiB)": 147.13, + "step": 57860, + "train_speed(iter/s)": 0.201231 + }, + { + "acc": 0.78565588, + "epoch": 1.3501509208358244, + "grad_norm": 6.0, + "learning_rate": 2.5221733703971165e-06, + "loss": 0.76235085, + "memory(GiB)": 147.13, + "step": 57870, + "train_speed(iter/s)": 0.201249 + }, + { + "acc": 0.78204556, + "epoch": 1.3503842284081133, + "grad_norm": 7.15625, + "learning_rate": 2.5205327203308887e-06, + "loss": 0.76153746, + "memory(GiB)": 147.13, + "step": 57880, + "train_speed(iter/s)": 0.201268 + }, + { + "acc": 0.78484373, + "epoch": 1.3506175359804022, + "grad_norm": 5.15625, + "learning_rate": 2.518892424207894e-06, + "loss": 0.77373028, + "memory(GiB)": 147.13, + "step": 57890, + "train_speed(iter/s)": 0.201285 + }, + { + "acc": 0.7609642, + "epoch": 1.3508508435526911, + "grad_norm": 6.65625, + "learning_rate": 2.517252482262286e-06, + "loss": 0.85898237, + "memory(GiB)": 147.13, + "step": 57900, + "train_speed(iter/s)": 0.201304 + }, + { + "acc": 0.78211231, + "epoch": 1.3510841511249798, + "grad_norm": 4.5625, + "learning_rate": 2.515612894728164e-06, + "loss": 0.78129263, + "memory(GiB)": 147.13, + "step": 57910, + "train_speed(iter/s)": 0.201321 + }, + { + "acc": 0.77921953, + "epoch": 1.351317458697269, + "grad_norm": 6.5, + "learning_rate": 2.5139736618395804e-06, + "loss": 0.7952693, + "memory(GiB)": 147.13, + "step": 57920, + "train_speed(iter/s)": 0.20134 + }, + { + "acc": 0.78653154, + "epoch": 1.3515507662695576, + "grad_norm": 6.09375, + "learning_rate": 2.5123347838305354e-06, + "loss": 0.75747843, + "memory(GiB)": 147.13, + "step": 57930, + "train_speed(iter/s)": 0.201359 + }, + { + "acc": 0.77528124, + "epoch": 1.3517840738418467, + "grad_norm": 6.96875, + "learning_rate": 2.510696260934975e-06, + "loss": 0.80986633, + "memory(GiB)": 147.13, + "step": 57940, + "train_speed(iter/s)": 0.201375 + }, + { + "acc": 0.77857499, + "epoch": 1.3520173814141354, + "grad_norm": 6.21875, + "learning_rate": 2.509058093386802e-06, + "loss": 0.79531021, + "memory(GiB)": 147.13, + "step": 57950, + "train_speed(iter/s)": 0.201393 + }, + { + "acc": 0.75173092, + "epoch": 1.3522506889864245, + "grad_norm": 5.0625, + "learning_rate": 2.507420281419862e-06, + "loss": 0.89647017, + "memory(GiB)": 147.13, + "step": 57960, + "train_speed(iter/s)": 0.201411 + }, + { + "acc": 0.77753296, + "epoch": 1.3524839965587132, + "grad_norm": 4.90625, + "learning_rate": 2.505782825267954e-06, + "loss": 0.79882941, + "memory(GiB)": 147.13, + "step": 57970, + "train_speed(iter/s)": 0.201428 + }, + { + "acc": 0.77016959, + "epoch": 1.3527173041310023, + "grad_norm": 4.5625, + "learning_rate": 2.5041457251648204e-06, + "loss": 0.84093132, + "memory(GiB)": 147.13, + "step": 57980, + "train_speed(iter/s)": 0.201445 + }, + { + "acc": 0.78002801, + "epoch": 1.352950611703291, + "grad_norm": 5.0, + "learning_rate": 2.502508981344162e-06, + "loss": 0.78813906, + "memory(GiB)": 147.13, + "step": 57990, + "train_speed(iter/s)": 0.201463 + }, + { + "acc": 0.78480358, + "epoch": 1.35318391927558, + "grad_norm": 5.65625, + "learning_rate": 2.5008725940396182e-06, + "loss": 0.76581554, + "memory(GiB)": 147.13, + "step": 58000, + "train_speed(iter/s)": 0.20148 + }, + { + "epoch": 1.35318391927558, + "eval_acc": 0.7446276014029275, + "eval_loss": 0.8048146963119507, + "eval_runtime": 1270.2313, + "eval_samples_per_second": 28.334, + "eval_steps_per_second": 14.167, + "step": 58000 + }, + { + "acc": 0.7872107, + "epoch": 1.3534172268478688, + "grad_norm": 6.875, + "learning_rate": 2.499236563484788e-06, + "loss": 0.75764198, + "memory(GiB)": 147.13, + "step": 58010, + "train_speed(iter/s)": 0.2006 + }, + { + "acc": 0.79398293, + "epoch": 1.3536505344201577, + "grad_norm": 4.78125, + "learning_rate": 2.4976008899132122e-06, + "loss": 0.73490696, + "memory(GiB)": 147.13, + "step": 58020, + "train_speed(iter/s)": 0.200618 + }, + { + "acc": 0.78950262, + "epoch": 1.3538838419924466, + "grad_norm": 4.96875, + "learning_rate": 2.49596557355838e-06, + "loss": 0.73174076, + "memory(GiB)": 147.13, + "step": 58030, + "train_speed(iter/s)": 0.200636 + }, + { + "acc": 0.78074455, + "epoch": 1.3541171495647355, + "grad_norm": 6.6875, + "learning_rate": 2.4943306146537365e-06, + "loss": 0.79538136, + "memory(GiB)": 147.13, + "step": 58040, + "train_speed(iter/s)": 0.200655 + }, + { + "acc": 0.77637463, + "epoch": 1.3543504571370244, + "grad_norm": 8.75, + "learning_rate": 2.4926960134326684e-06, + "loss": 0.79754944, + "memory(GiB)": 147.13, + "step": 58050, + "train_speed(iter/s)": 0.200673 + }, + { + "acc": 0.78122358, + "epoch": 1.3545837647093133, + "grad_norm": 6.0625, + "learning_rate": 2.491061770128518e-06, + "loss": 0.77927294, + "memory(GiB)": 147.13, + "step": 58060, + "train_speed(iter/s)": 0.20069 + }, + { + "acc": 0.77235141, + "epoch": 1.3548170722816022, + "grad_norm": 5.09375, + "learning_rate": 2.4894278849745705e-06, + "loss": 0.81898794, + "memory(GiB)": 147.13, + "step": 58070, + "train_speed(iter/s)": 0.200707 + }, + { + "acc": 0.77147989, + "epoch": 1.355050379853891, + "grad_norm": 7.71875, + "learning_rate": 2.4877943582040636e-06, + "loss": 0.81207123, + "memory(GiB)": 147.13, + "step": 58080, + "train_speed(iter/s)": 0.200726 + }, + { + "acc": 0.79436178, + "epoch": 1.35528368742618, + "grad_norm": 4.4375, + "learning_rate": 2.486161190050182e-06, + "loss": 0.73931255, + "memory(GiB)": 147.13, + "step": 58090, + "train_speed(iter/s)": 0.200743 + }, + { + "acc": 0.78587637, + "epoch": 1.355516994998469, + "grad_norm": 5.8125, + "learning_rate": 2.4845283807460587e-06, + "loss": 0.76380043, + "memory(GiB)": 147.13, + "step": 58100, + "train_speed(iter/s)": 0.200762 + }, + { + "acc": 0.79440575, + "epoch": 1.3557503025707578, + "grad_norm": 5.21875, + "learning_rate": 2.4828959305247795e-06, + "loss": 0.74594207, + "memory(GiB)": 147.13, + "step": 58110, + "train_speed(iter/s)": 0.20078 + }, + { + "acc": 0.79074926, + "epoch": 1.3559836101430467, + "grad_norm": 5.0625, + "learning_rate": 2.4812638396193734e-06, + "loss": 0.76517062, + "memory(GiB)": 147.13, + "step": 58120, + "train_speed(iter/s)": 0.200798 + }, + { + "acc": 0.7802947, + "epoch": 1.3562169177153356, + "grad_norm": 4.6875, + "learning_rate": 2.479632108262825e-06, + "loss": 0.77691612, + "memory(GiB)": 147.13, + "step": 58130, + "train_speed(iter/s)": 0.200816 + }, + { + "acc": 0.78983259, + "epoch": 1.3564502252876245, + "grad_norm": 4.90625, + "learning_rate": 2.4780007366880584e-06, + "loss": 0.75796638, + "memory(GiB)": 147.13, + "step": 58140, + "train_speed(iter/s)": 0.200832 + }, + { + "acc": 0.77917747, + "epoch": 1.3566835328599134, + "grad_norm": 3.953125, + "learning_rate": 2.476369725127956e-06, + "loss": 0.79284611, + "memory(GiB)": 147.13, + "step": 58150, + "train_speed(iter/s)": 0.20085 + }, + { + "acc": 0.76637774, + "epoch": 1.3569168404322023, + "grad_norm": 5.34375, + "learning_rate": 2.474739073815342e-06, + "loss": 0.84874744, + "memory(GiB)": 147.13, + "step": 58160, + "train_speed(iter/s)": 0.200868 + }, + { + "acc": 0.78275561, + "epoch": 1.3571501480044912, + "grad_norm": 4.59375, + "learning_rate": 2.47310878298299e-06, + "loss": 0.79091311, + "memory(GiB)": 147.13, + "step": 58170, + "train_speed(iter/s)": 0.200886 + }, + { + "acc": 0.78089781, + "epoch": 1.35738345557678, + "grad_norm": 5.5, + "learning_rate": 2.4714788528636275e-06, + "loss": 0.77343168, + "memory(GiB)": 147.13, + "step": 58180, + "train_speed(iter/s)": 0.200903 + }, + { + "acc": 0.77175679, + "epoch": 1.357616763149069, + "grad_norm": 4.625, + "learning_rate": 2.4698492836899234e-06, + "loss": 0.80777512, + "memory(GiB)": 147.13, + "step": 58190, + "train_speed(iter/s)": 0.20092 + }, + { + "acc": 0.78164663, + "epoch": 1.3578500707213579, + "grad_norm": 4.8125, + "learning_rate": 2.4682200756944997e-06, + "loss": 0.7826436, + "memory(GiB)": 147.13, + "step": 58200, + "train_speed(iter/s)": 0.200938 + }, + { + "acc": 0.79077911, + "epoch": 1.3580833782936468, + "grad_norm": 6.03125, + "learning_rate": 2.4665912291099225e-06, + "loss": 0.74198914, + "memory(GiB)": 147.13, + "step": 58210, + "train_speed(iter/s)": 0.200956 + }, + { + "acc": 0.78291769, + "epoch": 1.3583166858659357, + "grad_norm": 5.59375, + "learning_rate": 2.4649627441687134e-06, + "loss": 0.79139719, + "memory(GiB)": 147.13, + "step": 58220, + "train_speed(iter/s)": 0.200973 + }, + { + "acc": 0.77830348, + "epoch": 1.3585499934382246, + "grad_norm": 5.40625, + "learning_rate": 2.463334621103336e-06, + "loss": 0.7960639, + "memory(GiB)": 147.13, + "step": 58230, + "train_speed(iter/s)": 0.200991 + }, + { + "acc": 0.77357435, + "epoch": 1.3587833010105135, + "grad_norm": 6.53125, + "learning_rate": 2.461706860146203e-06, + "loss": 0.81305132, + "memory(GiB)": 147.13, + "step": 58240, + "train_speed(iter/s)": 0.201008 + }, + { + "acc": 0.76329145, + "epoch": 1.3590166085828024, + "grad_norm": 6.3125, + "learning_rate": 2.4600794615296797e-06, + "loss": 0.85375662, + "memory(GiB)": 147.13, + "step": 58250, + "train_speed(iter/s)": 0.201026 + }, + { + "acc": 0.78082247, + "epoch": 1.3592499161550913, + "grad_norm": 17.125, + "learning_rate": 2.4584524254860736e-06, + "loss": 0.78417535, + "memory(GiB)": 147.13, + "step": 58260, + "train_speed(iter/s)": 0.201043 + }, + { + "acc": 0.77918062, + "epoch": 1.3594832237273802, + "grad_norm": 5.0, + "learning_rate": 2.4568257522476476e-06, + "loss": 0.80585155, + "memory(GiB)": 147.13, + "step": 58270, + "train_speed(iter/s)": 0.201061 + }, + { + "acc": 0.76542234, + "epoch": 1.359716531299669, + "grad_norm": 4.34375, + "learning_rate": 2.455199442046607e-06, + "loss": 0.83563366, + "memory(GiB)": 147.13, + "step": 58280, + "train_speed(iter/s)": 0.20108 + }, + { + "acc": 0.80455227, + "epoch": 1.359949838871958, + "grad_norm": 5.25, + "learning_rate": 2.453573495115104e-06, + "loss": 0.68476171, + "memory(GiB)": 147.13, + "step": 58290, + "train_speed(iter/s)": 0.201099 + }, + { + "acc": 0.80062113, + "epoch": 1.3601831464442466, + "grad_norm": 4.21875, + "learning_rate": 2.4519479116852476e-06, + "loss": 0.7182889, + "memory(GiB)": 147.13, + "step": 58300, + "train_speed(iter/s)": 0.201116 + }, + { + "acc": 0.78591204, + "epoch": 1.3604164540165358, + "grad_norm": 5.71875, + "learning_rate": 2.450322691989086e-06, + "loss": 0.7478971, + "memory(GiB)": 147.13, + "step": 58310, + "train_speed(iter/s)": 0.201134 + }, + { + "acc": 0.78090954, + "epoch": 1.3606497615888244, + "grad_norm": 6.5625, + "learning_rate": 2.4486978362586196e-06, + "loss": 0.79978738, + "memory(GiB)": 147.13, + "step": 58320, + "train_speed(iter/s)": 0.201151 + }, + { + "acc": 0.77255669, + "epoch": 1.3608830691611136, + "grad_norm": 4.96875, + "learning_rate": 2.447073344725794e-06, + "loss": 0.80740366, + "memory(GiB)": 147.13, + "step": 58330, + "train_speed(iter/s)": 0.201168 + }, + { + "acc": 0.77630148, + "epoch": 1.3611163767334022, + "grad_norm": 6.34375, + "learning_rate": 2.4454492176225087e-06, + "loss": 0.79302568, + "memory(GiB)": 147.13, + "step": 58340, + "train_speed(iter/s)": 0.201186 + }, + { + "acc": 0.78025756, + "epoch": 1.3613496843056914, + "grad_norm": 5.34375, + "learning_rate": 2.4438254551806034e-06, + "loss": 0.77262883, + "memory(GiB)": 147.13, + "step": 58350, + "train_speed(iter/s)": 0.201204 + }, + { + "acc": 0.78731136, + "epoch": 1.36158299187798, + "grad_norm": 4.71875, + "learning_rate": 2.4422020576318737e-06, + "loss": 0.75996304, + "memory(GiB)": 147.13, + "step": 58360, + "train_speed(iter/s)": 0.201221 + }, + { + "acc": 0.75691671, + "epoch": 1.3618162994502692, + "grad_norm": 6.125, + "learning_rate": 2.4405790252080576e-06, + "loss": 0.90783367, + "memory(GiB)": 147.13, + "step": 58370, + "train_speed(iter/s)": 0.201239 + }, + { + "acc": 0.78348799, + "epoch": 1.3620496070225578, + "grad_norm": 6.125, + "learning_rate": 2.4389563581408397e-06, + "loss": 0.79257278, + "memory(GiB)": 147.13, + "step": 58380, + "train_speed(iter/s)": 0.201255 + }, + { + "acc": 0.76404047, + "epoch": 1.3622829145948467, + "grad_norm": 7.3125, + "learning_rate": 2.4373340566618603e-06, + "loss": 0.85965233, + "memory(GiB)": 147.13, + "step": 58390, + "train_speed(iter/s)": 0.201273 + }, + { + "acc": 0.77306118, + "epoch": 1.3625162221671356, + "grad_norm": 5.21875, + "learning_rate": 2.435712121002698e-06, + "loss": 0.82709122, + "memory(GiB)": 147.13, + "step": 58400, + "train_speed(iter/s)": 0.20129 + }, + { + "acc": 0.78636055, + "epoch": 1.3627495297394245, + "grad_norm": 4.59375, + "learning_rate": 2.4340905513948866e-06, + "loss": 0.76273184, + "memory(GiB)": 147.13, + "step": 58410, + "train_speed(iter/s)": 0.201308 + }, + { + "acc": 0.79748254, + "epoch": 1.3629828373117134, + "grad_norm": 4.34375, + "learning_rate": 2.432469348069904e-06, + "loss": 0.73208895, + "memory(GiB)": 147.13, + "step": 58420, + "train_speed(iter/s)": 0.201323 + }, + { + "acc": 0.79371405, + "epoch": 1.3632161448840023, + "grad_norm": 5.9375, + "learning_rate": 2.4308485112591764e-06, + "loss": 0.73580298, + "memory(GiB)": 147.13, + "step": 58430, + "train_speed(iter/s)": 0.20134 + }, + { + "acc": 0.79239931, + "epoch": 1.3634494524562912, + "grad_norm": 6.84375, + "learning_rate": 2.429228041194077e-06, + "loss": 0.74616742, + "memory(GiB)": 147.13, + "step": 58440, + "train_speed(iter/s)": 0.201358 + }, + { + "acc": 0.75693569, + "epoch": 1.3636827600285801, + "grad_norm": 4.5, + "learning_rate": 2.4276079381059258e-06, + "loss": 0.89039459, + "memory(GiB)": 147.13, + "step": 58450, + "train_speed(iter/s)": 0.201376 + }, + { + "acc": 0.76860232, + "epoch": 1.363916067600869, + "grad_norm": 7.15625, + "learning_rate": 2.4259882022259968e-06, + "loss": 0.82961311, + "memory(GiB)": 147.13, + "step": 58460, + "train_speed(iter/s)": 0.201394 + }, + { + "acc": 0.79684687, + "epoch": 1.364149375173158, + "grad_norm": 5.25, + "learning_rate": 2.424368833785502e-06, + "loss": 0.72636261, + "memory(GiB)": 147.13, + "step": 58470, + "train_speed(iter/s)": 0.201411 + }, + { + "acc": 0.77321219, + "epoch": 1.3643826827454468, + "grad_norm": 5.125, + "learning_rate": 2.4227498330156095e-06, + "loss": 0.82248058, + "memory(GiB)": 147.13, + "step": 58480, + "train_speed(iter/s)": 0.20143 + }, + { + "acc": 0.77851667, + "epoch": 1.3646159903177357, + "grad_norm": 5.4375, + "learning_rate": 2.421131200147428e-06, + "loss": 0.81334505, + "memory(GiB)": 147.13, + "step": 58490, + "train_speed(iter/s)": 0.201447 + }, + { + "acc": 0.78208332, + "epoch": 1.3648492978900246, + "grad_norm": 5.15625, + "learning_rate": 2.4195129354120204e-06, + "loss": 0.78902369, + "memory(GiB)": 147.13, + "step": 58500, + "train_speed(iter/s)": 0.201465 + }, + { + "epoch": 1.3648492978900246, + "eval_acc": 0.7446186333259187, + "eval_loss": 0.8047336339950562, + "eval_runtime": 1268.6994, + "eval_samples_per_second": 28.368, + "eval_steps_per_second": 14.185, + "step": 58500 + }, + { + "acc": 0.78909907, + "epoch": 1.3650826054623135, + "grad_norm": 4.84375, + "learning_rate": 2.4178950390403917e-06, + "loss": 0.74475222, + "memory(GiB)": 147.13, + "step": 58510, + "train_speed(iter/s)": 0.200594 + }, + { + "acc": 0.75878892, + "epoch": 1.3653159130346024, + "grad_norm": 7.4375, + "learning_rate": 2.416277511263494e-06, + "loss": 0.86468029, + "memory(GiB)": 147.13, + "step": 58520, + "train_speed(iter/s)": 0.200611 + }, + { + "acc": 0.77081804, + "epoch": 1.3655492206068913, + "grad_norm": 4.6875, + "learning_rate": 2.4146603523122347e-06, + "loss": 0.83188496, + "memory(GiB)": 147.13, + "step": 58530, + "train_speed(iter/s)": 0.200629 + }, + { + "acc": 0.78759122, + "epoch": 1.3657825281791802, + "grad_norm": 6.4375, + "learning_rate": 2.413043562417456e-06, + "loss": 0.7472733, + "memory(GiB)": 147.13, + "step": 58540, + "train_speed(iter/s)": 0.200647 + }, + { + "acc": 0.79054499, + "epoch": 1.3660158357514691, + "grad_norm": 6.25, + "learning_rate": 2.4114271418099583e-06, + "loss": 0.77312899, + "memory(GiB)": 147.13, + "step": 58550, + "train_speed(iter/s)": 0.200666 + }, + { + "acc": 0.76001344, + "epoch": 1.366249143323758, + "grad_norm": 5.6875, + "learning_rate": 2.4098110907204824e-06, + "loss": 0.86084118, + "memory(GiB)": 147.13, + "step": 58560, + "train_speed(iter/s)": 0.200683 + }, + { + "acc": 0.7808567, + "epoch": 1.366482450896047, + "grad_norm": 5.0, + "learning_rate": 2.4081954093797234e-06, + "loss": 0.81140366, + "memory(GiB)": 147.13, + "step": 58570, + "train_speed(iter/s)": 0.200701 + }, + { + "acc": 0.78830757, + "epoch": 1.3667157584683358, + "grad_norm": 5.375, + "learning_rate": 2.406580098018316e-06, + "loss": 0.74808269, + "memory(GiB)": 147.13, + "step": 58580, + "train_speed(iter/s)": 0.200719 + }, + { + "acc": 0.78679028, + "epoch": 1.3669490660406247, + "grad_norm": 5.84375, + "learning_rate": 2.4049651568668447e-06, + "loss": 0.76290827, + "memory(GiB)": 147.13, + "step": 58590, + "train_speed(iter/s)": 0.200738 + }, + { + "acc": 0.77562418, + "epoch": 1.3671823736129136, + "grad_norm": 5.9375, + "learning_rate": 2.403350586155845e-06, + "loss": 0.82148495, + "memory(GiB)": 147.13, + "step": 58600, + "train_speed(iter/s)": 0.200756 + }, + { + "acc": 0.76451578, + "epoch": 1.3674156811852025, + "grad_norm": 5.46875, + "learning_rate": 2.4017363861157927e-06, + "loss": 0.85645256, + "memory(GiB)": 147.13, + "step": 58610, + "train_speed(iter/s)": 0.200775 + }, + { + "acc": 0.77976284, + "epoch": 1.3676489887574914, + "grad_norm": 5.96875, + "learning_rate": 2.400122556977119e-06, + "loss": 0.79148855, + "memory(GiB)": 147.13, + "step": 58620, + "train_speed(iter/s)": 0.200793 + }, + { + "acc": 0.77969866, + "epoch": 1.3678822963297803, + "grad_norm": 3.96875, + "learning_rate": 2.398509098970193e-06, + "loss": 0.77951374, + "memory(GiB)": 147.13, + "step": 58630, + "train_speed(iter/s)": 0.200811 + }, + { + "acc": 0.78442101, + "epoch": 1.3681156039020692, + "grad_norm": 4.0625, + "learning_rate": 2.3968960123253392e-06, + "loss": 0.76123886, + "memory(GiB)": 147.13, + "step": 58640, + "train_speed(iter/s)": 0.20083 + }, + { + "acc": 0.7682898, + "epoch": 1.368348911474358, + "grad_norm": 7.375, + "learning_rate": 2.3952832972728234e-06, + "loss": 0.83347187, + "memory(GiB)": 147.13, + "step": 58650, + "train_speed(iter/s)": 0.200847 + }, + { + "acc": 0.78368082, + "epoch": 1.368582219046647, + "grad_norm": 5.5625, + "learning_rate": 2.39367095404286e-06, + "loss": 0.78032255, + "memory(GiB)": 147.13, + "step": 58660, + "train_speed(iter/s)": 0.200866 + }, + { + "acc": 0.7843792, + "epoch": 1.368815526618936, + "grad_norm": 4.53125, + "learning_rate": 2.392058982865611e-06, + "loss": 0.76798601, + "memory(GiB)": 147.13, + "step": 58670, + "train_speed(iter/s)": 0.200883 + }, + { + "acc": 0.77215185, + "epoch": 1.3690488341912248, + "grad_norm": 7.34375, + "learning_rate": 2.3904473839711826e-06, + "loss": 0.81567993, + "memory(GiB)": 147.13, + "step": 58680, + "train_speed(iter/s)": 0.200902 + }, + { + "acc": 0.79290199, + "epoch": 1.3692821417635135, + "grad_norm": 5.96875, + "learning_rate": 2.388836157589634e-06, + "loss": 0.75185156, + "memory(GiB)": 147.13, + "step": 58690, + "train_speed(iter/s)": 0.20092 + }, + { + "acc": 0.78097115, + "epoch": 1.3695154493358026, + "grad_norm": 5.5, + "learning_rate": 2.3872253039509637e-06, + "loss": 0.78168497, + "memory(GiB)": 147.13, + "step": 58700, + "train_speed(iter/s)": 0.200937 + }, + { + "acc": 0.76524758, + "epoch": 1.3697487569080913, + "grad_norm": 6.3125, + "learning_rate": 2.3856148232851237e-06, + "loss": 0.90382605, + "memory(GiB)": 147.13, + "step": 58710, + "train_speed(iter/s)": 0.200955 + }, + { + "acc": 0.78163342, + "epoch": 1.3699820644803804, + "grad_norm": 5.53125, + "learning_rate": 2.384004715822009e-06, + "loss": 0.78530493, + "memory(GiB)": 147.13, + "step": 58720, + "train_speed(iter/s)": 0.200972 + }, + { + "acc": 0.78916397, + "epoch": 1.370215372052669, + "grad_norm": 9.8125, + "learning_rate": 2.3823949817914584e-06, + "loss": 0.75205698, + "memory(GiB)": 147.13, + "step": 58730, + "train_speed(iter/s)": 0.20099 + }, + { + "acc": 0.79455929, + "epoch": 1.3704486796249582, + "grad_norm": 6.84375, + "learning_rate": 2.380785621423266e-06, + "loss": 0.72274475, + "memory(GiB)": 147.13, + "step": 58740, + "train_speed(iter/s)": 0.201007 + }, + { + "acc": 0.78989534, + "epoch": 1.3706819871972469, + "grad_norm": 6.0625, + "learning_rate": 2.379176634947163e-06, + "loss": 0.7328239, + "memory(GiB)": 147.13, + "step": 58750, + "train_speed(iter/s)": 0.201025 + }, + { + "acc": 0.77720261, + "epoch": 1.370915294769536, + "grad_norm": 5.75, + "learning_rate": 2.377568022592838e-06, + "loss": 0.79559016, + "memory(GiB)": 147.13, + "step": 58760, + "train_speed(iter/s)": 0.201042 + }, + { + "acc": 0.77461166, + "epoch": 1.3711486023418247, + "grad_norm": 4.15625, + "learning_rate": 2.3759597845899123e-06, + "loss": 0.79362354, + "memory(GiB)": 147.13, + "step": 58770, + "train_speed(iter/s)": 0.201059 + }, + { + "acc": 0.78339438, + "epoch": 1.3713819099141136, + "grad_norm": 9.4375, + "learning_rate": 2.374351921167967e-06, + "loss": 0.74716663, + "memory(GiB)": 147.13, + "step": 58780, + "train_speed(iter/s)": 0.201077 + }, + { + "acc": 0.78279791, + "epoch": 1.3716152174864025, + "grad_norm": 5.34375, + "learning_rate": 2.37274443255652e-06, + "loss": 0.80144253, + "memory(GiB)": 147.13, + "step": 58790, + "train_speed(iter/s)": 0.201094 + }, + { + "acc": 0.78304019, + "epoch": 1.3718485250586914, + "grad_norm": 4.53125, + "learning_rate": 2.3711373189850444e-06, + "loss": 0.76427855, + "memory(GiB)": 147.13, + "step": 58800, + "train_speed(iter/s)": 0.201111 + }, + { + "acc": 0.76471519, + "epoch": 1.3720818326309803, + "grad_norm": 5.25, + "learning_rate": 2.369530580682953e-06, + "loss": 0.83371811, + "memory(GiB)": 147.13, + "step": 58810, + "train_speed(iter/s)": 0.201129 + }, + { + "acc": 0.7774107, + "epoch": 1.3723151402032692, + "grad_norm": 5.28125, + "learning_rate": 2.367924217879604e-06, + "loss": 0.80156479, + "memory(GiB)": 147.13, + "step": 58820, + "train_speed(iter/s)": 0.201146 + }, + { + "acc": 0.78052721, + "epoch": 1.372548447775558, + "grad_norm": 5.0625, + "learning_rate": 2.3663182308043115e-06, + "loss": 0.79353514, + "memory(GiB)": 147.13, + "step": 58830, + "train_speed(iter/s)": 0.201163 + }, + { + "acc": 0.77628222, + "epoch": 1.372781755347847, + "grad_norm": 6.34375, + "learning_rate": 2.3647126196863234e-06, + "loss": 0.78026848, + "memory(GiB)": 147.13, + "step": 58840, + "train_speed(iter/s)": 0.20118 + }, + { + "acc": 0.7791482, + "epoch": 1.3730150629201359, + "grad_norm": 6.0625, + "learning_rate": 2.3631073847548457e-06, + "loss": 0.80607452, + "memory(GiB)": 147.13, + "step": 58850, + "train_speed(iter/s)": 0.201197 + }, + { + "acc": 0.76585202, + "epoch": 1.3732483704924248, + "grad_norm": 4.46875, + "learning_rate": 2.3615025262390228e-06, + "loss": 0.8779665, + "memory(GiB)": 147.13, + "step": 58860, + "train_speed(iter/s)": 0.201214 + }, + { + "acc": 0.77455015, + "epoch": 1.3734816780647137, + "grad_norm": 5.59375, + "learning_rate": 2.3598980443679483e-06, + "loss": 0.80353298, + "memory(GiB)": 147.13, + "step": 58870, + "train_speed(iter/s)": 0.201232 + }, + { + "acc": 0.77604198, + "epoch": 1.3737149856370026, + "grad_norm": 5.625, + "learning_rate": 2.3582939393706604e-06, + "loss": 0.79859333, + "memory(GiB)": 147.13, + "step": 58880, + "train_speed(iter/s)": 0.201249 + }, + { + "acc": 0.79776068, + "epoch": 1.3739482932092915, + "grad_norm": 8.8125, + "learning_rate": 2.3566902114761435e-06, + "loss": 0.71753788, + "memory(GiB)": 147.13, + "step": 58890, + "train_speed(iter/s)": 0.201267 + }, + { + "acc": 0.78484306, + "epoch": 1.3741816007815804, + "grad_norm": 4.40625, + "learning_rate": 2.3550868609133326e-06, + "loss": 0.76917105, + "memory(GiB)": 147.13, + "step": 58900, + "train_speed(iter/s)": 0.201285 + }, + { + "acc": 0.78789654, + "epoch": 1.3744149083538693, + "grad_norm": 5.65625, + "learning_rate": 2.3534838879111026e-06, + "loss": 0.74949794, + "memory(GiB)": 147.13, + "step": 58910, + "train_speed(iter/s)": 0.201303 + }, + { + "acc": 0.7702579, + "epoch": 1.3746482159261582, + "grad_norm": 4.1875, + "learning_rate": 2.35188129269828e-06, + "loss": 0.82006226, + "memory(GiB)": 147.13, + "step": 58920, + "train_speed(iter/s)": 0.201321 + }, + { + "acc": 0.77914472, + "epoch": 1.374881523498447, + "grad_norm": 6.875, + "learning_rate": 2.3502790755036324e-06, + "loss": 0.80082273, + "memory(GiB)": 147.13, + "step": 58930, + "train_speed(iter/s)": 0.201338 + }, + { + "acc": 0.78248625, + "epoch": 1.375114831070736, + "grad_norm": 6.71875, + "learning_rate": 2.3486772365558786e-06, + "loss": 0.80548611, + "memory(GiB)": 147.13, + "step": 58940, + "train_speed(iter/s)": 0.201356 + }, + { + "acc": 0.7839232, + "epoch": 1.3753481386430249, + "grad_norm": 6.125, + "learning_rate": 2.3470757760836794e-06, + "loss": 0.75650473, + "memory(GiB)": 147.13, + "step": 58950, + "train_speed(iter/s)": 0.201373 + }, + { + "acc": 0.77035446, + "epoch": 1.3755814462153138, + "grad_norm": 4.40625, + "learning_rate": 2.34547469431564e-06, + "loss": 0.84554367, + "memory(GiB)": 147.13, + "step": 58960, + "train_speed(iter/s)": 0.201391 + }, + { + "acc": 0.77048821, + "epoch": 1.3758147537876027, + "grad_norm": 5.0, + "learning_rate": 2.3438739914803193e-06, + "loss": 0.82956161, + "memory(GiB)": 147.13, + "step": 58970, + "train_speed(iter/s)": 0.201409 + }, + { + "acc": 0.76621504, + "epoch": 1.3760480613598916, + "grad_norm": 5.0, + "learning_rate": 2.3422736678062126e-06, + "loss": 0.85359306, + "memory(GiB)": 147.13, + "step": 58980, + "train_speed(iter/s)": 0.201427 + }, + { + "acc": 0.8010005, + "epoch": 1.3762813689321804, + "grad_norm": 4.375, + "learning_rate": 2.3406737235217714e-06, + "loss": 0.69008727, + "memory(GiB)": 147.13, + "step": 58990, + "train_speed(iter/s)": 0.201445 + }, + { + "acc": 0.78054056, + "epoch": 1.3765146765044693, + "grad_norm": 5.65625, + "learning_rate": 2.33907415885538e-06, + "loss": 0.75995293, + "memory(GiB)": 147.13, + "step": 59000, + "train_speed(iter/s)": 0.201462 + }, + { + "epoch": 1.3765146765044693, + "eval_acc": 0.7445920493833567, + "eval_loss": 0.8047448992729187, + "eval_runtime": 1269.8518, + "eval_samples_per_second": 28.343, + "eval_steps_per_second": 14.172, + "step": 59000 + }, + { + "acc": 0.78174009, + "epoch": 1.3767479840767582, + "grad_norm": 4.40625, + "learning_rate": 2.3374749740353815e-06, + "loss": 0.77599964, + "memory(GiB)": 147.13, + "step": 59010, + "train_speed(iter/s)": 0.200599 + }, + { + "acc": 0.78817472, + "epoch": 1.3769812916490471, + "grad_norm": 4.34375, + "learning_rate": 2.335876169290056e-06, + "loss": 0.76314001, + "memory(GiB)": 147.13, + "step": 59020, + "train_speed(iter/s)": 0.200615 + }, + { + "acc": 0.78163285, + "epoch": 1.377214599221336, + "grad_norm": 8.5625, + "learning_rate": 2.3342777448476326e-06, + "loss": 0.76157379, + "memory(GiB)": 147.13, + "step": 59030, + "train_speed(iter/s)": 0.200632 + }, + { + "acc": 0.7947587, + "epoch": 1.377447906793625, + "grad_norm": 5.71875, + "learning_rate": 2.3326797009362884e-06, + "loss": 0.72174067, + "memory(GiB)": 147.13, + "step": 59040, + "train_speed(iter/s)": 0.20065 + }, + { + "acc": 0.78288536, + "epoch": 1.3776812143659138, + "grad_norm": 5.0625, + "learning_rate": 2.33108203778414e-06, + "loss": 0.78767552, + "memory(GiB)": 147.13, + "step": 59050, + "train_speed(iter/s)": 0.200667 + }, + { + "acc": 0.76853189, + "epoch": 1.3779145219382025, + "grad_norm": 4.5625, + "learning_rate": 2.3294847556192575e-06, + "loss": 0.83004322, + "memory(GiB)": 147.13, + "step": 59060, + "train_speed(iter/s)": 0.200685 + }, + { + "acc": 0.76078501, + "epoch": 1.3781478295104916, + "grad_norm": 6.40625, + "learning_rate": 2.32788785466965e-06, + "loss": 0.86728783, + "memory(GiB)": 147.13, + "step": 59070, + "train_speed(iter/s)": 0.200703 + }, + { + "acc": 0.77539091, + "epoch": 1.3783811370827803, + "grad_norm": 5.625, + "learning_rate": 2.3262913351632725e-06, + "loss": 0.79250236, + "memory(GiB)": 147.13, + "step": 59080, + "train_speed(iter/s)": 0.200721 + }, + { + "acc": 0.77549553, + "epoch": 1.3786144446550694, + "grad_norm": 4.65625, + "learning_rate": 2.3246951973280328e-06, + "loss": 0.80941544, + "memory(GiB)": 147.13, + "step": 59090, + "train_speed(iter/s)": 0.200739 + }, + { + "acc": 0.78177032, + "epoch": 1.3788477522273581, + "grad_norm": 6.3125, + "learning_rate": 2.3230994413917767e-06, + "loss": 0.77314987, + "memory(GiB)": 147.13, + "step": 59100, + "train_speed(iter/s)": 0.200757 + }, + { + "acc": 0.77774806, + "epoch": 1.3790810597996472, + "grad_norm": 4.28125, + "learning_rate": 2.3215040675822976e-06, + "loss": 0.79984112, + "memory(GiB)": 147.13, + "step": 59110, + "train_speed(iter/s)": 0.200775 + }, + { + "acc": 0.78535266, + "epoch": 1.379314367371936, + "grad_norm": 5.0, + "learning_rate": 2.319909076127333e-06, + "loss": 0.76313772, + "memory(GiB)": 147.13, + "step": 59120, + "train_speed(iter/s)": 0.200793 + }, + { + "acc": 0.78793802, + "epoch": 1.379547674944225, + "grad_norm": 4.53125, + "learning_rate": 2.3183144672545706e-06, + "loss": 0.77101927, + "memory(GiB)": 147.13, + "step": 59130, + "train_speed(iter/s)": 0.200811 + }, + { + "acc": 0.79543657, + "epoch": 1.3797809825165137, + "grad_norm": 6.125, + "learning_rate": 2.3167202411916372e-06, + "loss": 0.71729631, + "memory(GiB)": 147.13, + "step": 59140, + "train_speed(iter/s)": 0.200829 + }, + { + "acc": 0.79009199, + "epoch": 1.3800142900888028, + "grad_norm": 7.875, + "learning_rate": 2.315126398166112e-06, + "loss": 0.75708284, + "memory(GiB)": 147.13, + "step": 59150, + "train_speed(iter/s)": 0.200845 + }, + { + "acc": 0.78260612, + "epoch": 1.3802475976610915, + "grad_norm": 6.625, + "learning_rate": 2.3135329384055134e-06, + "loss": 0.78136015, + "memory(GiB)": 147.13, + "step": 59160, + "train_speed(iter/s)": 0.200861 + }, + { + "acc": 0.75776453, + "epoch": 1.3804809052333804, + "grad_norm": 4.375, + "learning_rate": 2.3119398621373055e-06, + "loss": 0.87936306, + "memory(GiB)": 147.13, + "step": 59170, + "train_speed(iter/s)": 0.200879 + }, + { + "acc": 0.78566179, + "epoch": 1.3807142128056693, + "grad_norm": 6.46875, + "learning_rate": 2.3103471695889035e-06, + "loss": 0.75471678, + "memory(GiB)": 147.13, + "step": 59180, + "train_speed(iter/s)": 0.200896 + }, + { + "acc": 0.77722082, + "epoch": 1.3809475203779582, + "grad_norm": 3.84375, + "learning_rate": 2.308754860987659e-06, + "loss": 0.78902168, + "memory(GiB)": 147.13, + "step": 59190, + "train_speed(iter/s)": 0.20091 + }, + { + "acc": 0.78038301, + "epoch": 1.381180827950247, + "grad_norm": 5.34375, + "learning_rate": 2.3071629365608793e-06, + "loss": 0.7852129, + "memory(GiB)": 147.13, + "step": 59200, + "train_speed(iter/s)": 0.200929 + }, + { + "acc": 0.76663785, + "epoch": 1.381414135522536, + "grad_norm": 6.8125, + "learning_rate": 2.305571396535807e-06, + "loss": 0.84909973, + "memory(GiB)": 147.13, + "step": 59210, + "train_speed(iter/s)": 0.200946 + }, + { + "acc": 0.76717291, + "epoch": 1.381647443094825, + "grad_norm": 5.84375, + "learning_rate": 2.303980241139636e-06, + "loss": 0.84376945, + "memory(GiB)": 147.13, + "step": 59220, + "train_speed(iter/s)": 0.200964 + }, + { + "acc": 0.75790958, + "epoch": 1.3818807506671138, + "grad_norm": 9.0, + "learning_rate": 2.3023894705995e-06, + "loss": 0.85827541, + "memory(GiB)": 147.13, + "step": 59230, + "train_speed(iter/s)": 0.200981 + }, + { + "acc": 0.7841033, + "epoch": 1.3821140582394027, + "grad_norm": 10.6875, + "learning_rate": 2.3007990851424862e-06, + "loss": 0.79800854, + "memory(GiB)": 147.13, + "step": 59240, + "train_speed(iter/s)": 0.200999 + }, + { + "acc": 0.78057899, + "epoch": 1.3823473658116916, + "grad_norm": 4.15625, + "learning_rate": 2.2992090849956176e-06, + "loss": 0.8046237, + "memory(GiB)": 147.13, + "step": 59250, + "train_speed(iter/s)": 0.201017 + }, + { + "acc": 0.77489777, + "epoch": 1.3825806733839805, + "grad_norm": 6.46875, + "learning_rate": 2.2976194703858666e-06, + "loss": 0.80679083, + "memory(GiB)": 147.13, + "step": 59260, + "train_speed(iter/s)": 0.201034 + }, + { + "acc": 0.77184649, + "epoch": 1.3828139809562694, + "grad_norm": 6.0625, + "learning_rate": 2.2960302415401525e-06, + "loss": 0.84999313, + "memory(GiB)": 147.13, + "step": 59270, + "train_speed(iter/s)": 0.201052 + }, + { + "acc": 0.79643536, + "epoch": 1.3830472885285583, + "grad_norm": 6.625, + "learning_rate": 2.2944413986853344e-06, + "loss": 0.75381327, + "memory(GiB)": 147.13, + "step": 59280, + "train_speed(iter/s)": 0.201069 + }, + { + "acc": 0.77682009, + "epoch": 1.3832805961008472, + "grad_norm": 4.0, + "learning_rate": 2.292852942048222e-06, + "loss": 0.78831792, + "memory(GiB)": 147.13, + "step": 59290, + "train_speed(iter/s)": 0.201088 + }, + { + "acc": 0.78009706, + "epoch": 1.383513903673136, + "grad_norm": 5.375, + "learning_rate": 2.2912648718555665e-06, + "loss": 0.79728909, + "memory(GiB)": 147.13, + "step": 59300, + "train_speed(iter/s)": 0.201106 + }, + { + "acc": 0.77483168, + "epoch": 1.383747211245425, + "grad_norm": 5.40625, + "learning_rate": 2.2896771883340614e-06, + "loss": 0.81179886, + "memory(GiB)": 147.13, + "step": 59310, + "train_speed(iter/s)": 0.201124 + }, + { + "acc": 0.77034178, + "epoch": 1.383980518817714, + "grad_norm": 4.8125, + "learning_rate": 2.2880898917103515e-06, + "loss": 0.84435978, + "memory(GiB)": 147.13, + "step": 59320, + "train_speed(iter/s)": 0.201142 + }, + { + "acc": 0.78930626, + "epoch": 1.3842138263900028, + "grad_norm": 8.9375, + "learning_rate": 2.2865029822110222e-06, + "loss": 0.73878508, + "memory(GiB)": 147.13, + "step": 59330, + "train_speed(iter/s)": 0.201159 + }, + { + "acc": 0.78097978, + "epoch": 1.3844471339622917, + "grad_norm": 4.59375, + "learning_rate": 2.2849164600626045e-06, + "loss": 0.77706594, + "memory(GiB)": 147.13, + "step": 59340, + "train_speed(iter/s)": 0.201178 + }, + { + "acc": 0.77245169, + "epoch": 1.3846804415345806, + "grad_norm": 5.25, + "learning_rate": 2.2833303254915713e-06, + "loss": 0.80866556, + "memory(GiB)": 147.13, + "step": 59350, + "train_speed(iter/s)": 0.201195 + }, + { + "acc": 0.80767746, + "epoch": 1.3849137491068695, + "grad_norm": 5.71875, + "learning_rate": 2.2817445787243464e-06, + "loss": 0.6845746, + "memory(GiB)": 147.13, + "step": 59360, + "train_speed(iter/s)": 0.201214 + }, + { + "acc": 0.78989286, + "epoch": 1.3851470566791584, + "grad_norm": 7.21875, + "learning_rate": 2.280159219987293e-06, + "loss": 0.73846169, + "memory(GiB)": 147.13, + "step": 59370, + "train_speed(iter/s)": 0.201231 + }, + { + "acc": 0.76815262, + "epoch": 1.3853803642514473, + "grad_norm": 8.6875, + "learning_rate": 2.27857424950672e-06, + "loss": 0.82266207, + "memory(GiB)": 147.13, + "step": 59380, + "train_speed(iter/s)": 0.201249 + }, + { + "acc": 0.76027946, + "epoch": 1.3856136718237362, + "grad_norm": 6.03125, + "learning_rate": 2.2769896675088833e-06, + "loss": 0.87801542, + "memory(GiB)": 147.13, + "step": 59390, + "train_speed(iter/s)": 0.201267 + }, + { + "acc": 0.78674126, + "epoch": 1.385846979396025, + "grad_norm": 4.96875, + "learning_rate": 2.2754054742199787e-06, + "loss": 0.78030553, + "memory(GiB)": 147.13, + "step": 59400, + "train_speed(iter/s)": 0.201285 + }, + { + "acc": 0.78899231, + "epoch": 1.386080286968314, + "grad_norm": 4.625, + "learning_rate": 2.273821669866153e-06, + "loss": 0.73768158, + "memory(GiB)": 147.13, + "step": 59410, + "train_speed(iter/s)": 0.201301 + }, + { + "acc": 0.77118654, + "epoch": 1.3863135945406029, + "grad_norm": 6.15625, + "learning_rate": 2.2722382546734904e-06, + "loss": 0.83121977, + "memory(GiB)": 147.13, + "step": 59420, + "train_speed(iter/s)": 0.201319 + }, + { + "acc": 0.76945696, + "epoch": 1.3865469021128918, + "grad_norm": 5.78125, + "learning_rate": 2.270655228868026e-06, + "loss": 0.84560337, + "memory(GiB)": 147.13, + "step": 59430, + "train_speed(iter/s)": 0.201336 + }, + { + "acc": 0.79610276, + "epoch": 1.3867802096851807, + "grad_norm": 5.25, + "learning_rate": 2.2690725926757355e-06, + "loss": 0.73223934, + "memory(GiB)": 147.13, + "step": 59440, + "train_speed(iter/s)": 0.201354 + }, + { + "acc": 0.76559458, + "epoch": 1.3870135172574694, + "grad_norm": 5.875, + "learning_rate": 2.267490346322539e-06, + "loss": 0.86651058, + "memory(GiB)": 147.13, + "step": 59450, + "train_speed(iter/s)": 0.20137 + }, + { + "acc": 0.80258989, + "epoch": 1.3872468248297585, + "grad_norm": 10.0, + "learning_rate": 2.265908490034301e-06, + "loss": 0.71374941, + "memory(GiB)": 147.13, + "step": 59460, + "train_speed(iter/s)": 0.201388 + }, + { + "acc": 0.77058544, + "epoch": 1.3874801324020472, + "grad_norm": 5.21875, + "learning_rate": 2.2643270240368305e-06, + "loss": 0.83985233, + "memory(GiB)": 147.13, + "step": 59470, + "train_speed(iter/s)": 0.201406 + }, + { + "acc": 0.77997141, + "epoch": 1.3877134399743363, + "grad_norm": 5.53125, + "learning_rate": 2.2627459485558846e-06, + "loss": 0.77843761, + "memory(GiB)": 147.13, + "step": 59480, + "train_speed(iter/s)": 0.201423 + }, + { + "acc": 0.76730309, + "epoch": 1.387946747546625, + "grad_norm": 7.03125, + "learning_rate": 2.2611652638171568e-06, + "loss": 0.82932968, + "memory(GiB)": 147.13, + "step": 59490, + "train_speed(iter/s)": 0.20144 + }, + { + "acc": 0.78150969, + "epoch": 1.388180055118914, + "grad_norm": 5.5, + "learning_rate": 2.259584970046294e-06, + "loss": 0.7819231, + "memory(GiB)": 147.13, + "step": 59500, + "train_speed(iter/s)": 0.201457 + }, + { + "epoch": 1.388180055118914, + "eval_acc": 0.7445906080852659, + "eval_loss": 0.8047671914100647, + "eval_runtime": 1269.924, + "eval_samples_per_second": 28.341, + "eval_steps_per_second": 14.171, + "step": 59500 + }, + { + "acc": 0.74836121, + "epoch": 1.3884133626912027, + "grad_norm": 6.53125, + "learning_rate": 2.2580050674688815e-06, + "loss": 0.9253665, + "memory(GiB)": 147.13, + "step": 59510, + "train_speed(iter/s)": 0.200601 + }, + { + "acc": 0.77755685, + "epoch": 1.3886466702634919, + "grad_norm": 5.25, + "learning_rate": 2.2564255563104465e-06, + "loss": 0.79394531, + "memory(GiB)": 147.13, + "step": 59520, + "train_speed(iter/s)": 0.200617 + }, + { + "acc": 0.77132187, + "epoch": 1.3888799778357805, + "grad_norm": 5.90625, + "learning_rate": 2.254846436796468e-06, + "loss": 0.82167749, + "memory(GiB)": 147.13, + "step": 59530, + "train_speed(iter/s)": 0.200635 + }, + { + "acc": 0.76772861, + "epoch": 1.3891132854080694, + "grad_norm": 5.75, + "learning_rate": 2.2532677091523615e-06, + "loss": 0.84193869, + "memory(GiB)": 147.13, + "step": 59540, + "train_speed(iter/s)": 0.200654 + }, + { + "acc": 0.79113283, + "epoch": 1.3893465929803583, + "grad_norm": 6.0, + "learning_rate": 2.2516893736034935e-06, + "loss": 0.74261956, + "memory(GiB)": 147.13, + "step": 59550, + "train_speed(iter/s)": 0.20067 + }, + { + "acc": 0.76318316, + "epoch": 1.3895799005526472, + "grad_norm": 5.46875, + "learning_rate": 2.250111430375169e-06, + "loss": 0.8669055, + "memory(GiB)": 147.13, + "step": 59560, + "train_speed(iter/s)": 0.200688 + }, + { + "acc": 0.780229, + "epoch": 1.3898132081249361, + "grad_norm": 6.5, + "learning_rate": 2.248533879692639e-06, + "loss": 0.77526116, + "memory(GiB)": 147.13, + "step": 59570, + "train_speed(iter/s)": 0.200705 + }, + { + "acc": 0.7953208, + "epoch": 1.390046515697225, + "grad_norm": 4.5625, + "learning_rate": 2.246956721781097e-06, + "loss": 0.74956465, + "memory(GiB)": 147.13, + "step": 59580, + "train_speed(iter/s)": 0.200722 + }, + { + "acc": 0.79553127, + "epoch": 1.390279823269514, + "grad_norm": 4.78125, + "learning_rate": 2.245379956865684e-06, + "loss": 0.74106674, + "memory(GiB)": 147.13, + "step": 59590, + "train_speed(iter/s)": 0.200741 + }, + { + "acc": 0.74994259, + "epoch": 1.3905131308418028, + "grad_norm": 5.28125, + "learning_rate": 2.243803585171483e-06, + "loss": 0.89359455, + "memory(GiB)": 147.13, + "step": 59600, + "train_speed(iter/s)": 0.200758 + }, + { + "acc": 0.76904812, + "epoch": 1.3907464384140917, + "grad_norm": 5.53125, + "learning_rate": 2.2422276069235174e-06, + "loss": 0.82504959, + "memory(GiB)": 147.13, + "step": 59610, + "train_speed(iter/s)": 0.200776 + }, + { + "acc": 0.77541676, + "epoch": 1.3909797459863806, + "grad_norm": 7.625, + "learning_rate": 2.240652022346761e-06, + "loss": 0.80968227, + "memory(GiB)": 147.13, + "step": 59620, + "train_speed(iter/s)": 0.200793 + }, + { + "acc": 0.7667109, + "epoch": 1.3912130535586695, + "grad_norm": 5.34375, + "learning_rate": 2.2390768316661256e-06, + "loss": 0.82808847, + "memory(GiB)": 147.13, + "step": 59630, + "train_speed(iter/s)": 0.20081 + }, + { + "acc": 0.79110146, + "epoch": 1.3914463611309584, + "grad_norm": 4.71875, + "learning_rate": 2.237502035106472e-06, + "loss": 0.75989141, + "memory(GiB)": 147.13, + "step": 59640, + "train_speed(iter/s)": 0.200828 + }, + { + "acc": 0.76692033, + "epoch": 1.3916796687032473, + "grad_norm": 16.375, + "learning_rate": 2.2359276328926007e-06, + "loss": 0.83675632, + "memory(GiB)": 147.13, + "step": 59650, + "train_speed(iter/s)": 0.200846 + }, + { + "acc": 0.77610579, + "epoch": 1.3919129762755362, + "grad_norm": 5.25, + "learning_rate": 2.2343536252492542e-06, + "loss": 0.80217104, + "memory(GiB)": 147.13, + "step": 59660, + "train_speed(iter/s)": 0.200863 + }, + { + "acc": 0.77709975, + "epoch": 1.3921462838478251, + "grad_norm": 5.96875, + "learning_rate": 2.2327800124011285e-06, + "loss": 0.81001654, + "memory(GiB)": 147.13, + "step": 59670, + "train_speed(iter/s)": 0.200881 + }, + { + "acc": 0.77643595, + "epoch": 1.392379591420114, + "grad_norm": 5.4375, + "learning_rate": 2.231206794572848e-06, + "loss": 0.80721416, + "memory(GiB)": 147.13, + "step": 59680, + "train_speed(iter/s)": 0.200898 + }, + { + "acc": 0.77637901, + "epoch": 1.392612898992403, + "grad_norm": 5.34375, + "learning_rate": 2.229633971988996e-06, + "loss": 0.80078964, + "memory(GiB)": 147.13, + "step": 59690, + "train_speed(iter/s)": 0.200915 + }, + { + "acc": 0.78074894, + "epoch": 1.3928462065646918, + "grad_norm": 8.0625, + "learning_rate": 2.2280615448740873e-06, + "loss": 0.79391489, + "memory(GiB)": 147.13, + "step": 59700, + "train_speed(iter/s)": 0.200933 + }, + { + "acc": 0.78368201, + "epoch": 1.3930795141369807, + "grad_norm": 9.0625, + "learning_rate": 2.2264895134525898e-06, + "loss": 0.77110548, + "memory(GiB)": 147.13, + "step": 59710, + "train_speed(iter/s)": 0.200951 + }, + { + "acc": 0.79938531, + "epoch": 1.3933128217092696, + "grad_norm": 4.40625, + "learning_rate": 2.2249178779489065e-06, + "loss": 0.71430826, + "memory(GiB)": 147.13, + "step": 59720, + "train_speed(iter/s)": 0.200968 + }, + { + "acc": 0.78940001, + "epoch": 1.3935461292815585, + "grad_norm": 4.625, + "learning_rate": 2.223346638587392e-06, + "loss": 0.76045151, + "memory(GiB)": 147.13, + "step": 59730, + "train_speed(iter/s)": 0.200984 + }, + { + "acc": 0.79823899, + "epoch": 1.3937794368538474, + "grad_norm": 6.46875, + "learning_rate": 2.2217757955923386e-06, + "loss": 0.72001886, + "memory(GiB)": 147.13, + "step": 59740, + "train_speed(iter/s)": 0.201001 + }, + { + "acc": 0.79044533, + "epoch": 1.3940127444261363, + "grad_norm": 4.5, + "learning_rate": 2.220205349187981e-06, + "loss": 0.74306707, + "memory(GiB)": 147.13, + "step": 59750, + "train_speed(iter/s)": 0.201018 + }, + { + "acc": 0.78812528, + "epoch": 1.3942460519984252, + "grad_norm": 8.375, + "learning_rate": 2.218635299598504e-06, + "loss": 0.76315918, + "memory(GiB)": 147.13, + "step": 59760, + "train_speed(iter/s)": 0.201036 + }, + { + "acc": 0.78087091, + "epoch": 1.3944793595707141, + "grad_norm": 5.9375, + "learning_rate": 2.2170656470480284e-06, + "loss": 0.79584026, + "memory(GiB)": 147.13, + "step": 59770, + "train_speed(iter/s)": 0.201053 + }, + { + "acc": 0.77213335, + "epoch": 1.394712667143003, + "grad_norm": 5.96875, + "learning_rate": 2.215496391760625e-06, + "loss": 0.83666573, + "memory(GiB)": 147.13, + "step": 59780, + "train_speed(iter/s)": 0.201071 + }, + { + "acc": 0.79270315, + "epoch": 1.394945974715292, + "grad_norm": 5.65625, + "learning_rate": 2.2139275339603023e-06, + "loss": 0.7484087, + "memory(GiB)": 147.13, + "step": 59790, + "train_speed(iter/s)": 0.201088 + }, + { + "acc": 0.80184422, + "epoch": 1.3951792822875808, + "grad_norm": 4.8125, + "learning_rate": 2.2123590738710153e-06, + "loss": 0.71304121, + "memory(GiB)": 147.13, + "step": 59800, + "train_speed(iter/s)": 0.201105 + }, + { + "acc": 0.793783, + "epoch": 1.3954125898598697, + "grad_norm": 5.03125, + "learning_rate": 2.2107910117166608e-06, + "loss": 0.73716478, + "memory(GiB)": 147.13, + "step": 59810, + "train_speed(iter/s)": 0.201123 + }, + { + "acc": 0.77772017, + "epoch": 1.3956458974321586, + "grad_norm": 4.75, + "learning_rate": 2.2092233477210767e-06, + "loss": 0.78334846, + "memory(GiB)": 147.13, + "step": 59820, + "train_speed(iter/s)": 0.201141 + }, + { + "acc": 0.78544092, + "epoch": 1.3958792050044475, + "grad_norm": 5.9375, + "learning_rate": 2.2076560821080515e-06, + "loss": 0.77470407, + "memory(GiB)": 147.13, + "step": 59830, + "train_speed(iter/s)": 0.201158 + }, + { + "acc": 0.77739944, + "epoch": 1.3961125125767362, + "grad_norm": 4.875, + "learning_rate": 2.2060892151013067e-06, + "loss": 0.807547, + "memory(GiB)": 147.13, + "step": 59840, + "train_speed(iter/s)": 0.201175 + }, + { + "acc": 0.7652626, + "epoch": 1.3963458201490253, + "grad_norm": 4.75, + "learning_rate": 2.2045227469245178e-06, + "loss": 0.83140392, + "memory(GiB)": 147.13, + "step": 59850, + "train_speed(iter/s)": 0.201194 + }, + { + "acc": 0.78730431, + "epoch": 1.396579127721314, + "grad_norm": 4.9375, + "learning_rate": 2.202956677801292e-06, + "loss": 0.74223843, + "memory(GiB)": 147.13, + "step": 59860, + "train_speed(iter/s)": 0.201211 + }, + { + "acc": 0.77190733, + "epoch": 1.396812435293603, + "grad_norm": 5.8125, + "learning_rate": 2.2013910079551905e-06, + "loss": 0.82516975, + "memory(GiB)": 147.13, + "step": 59870, + "train_speed(iter/s)": 0.201227 + }, + { + "acc": 0.77773724, + "epoch": 1.3970457428658918, + "grad_norm": 6.1875, + "learning_rate": 2.199825737609709e-06, + "loss": 0.78944416, + "memory(GiB)": 147.13, + "step": 59880, + "train_speed(iter/s)": 0.201245 + }, + { + "acc": 0.77316818, + "epoch": 1.397279050438181, + "grad_norm": 6.125, + "learning_rate": 2.198260866988288e-06, + "loss": 0.81869354, + "memory(GiB)": 147.13, + "step": 59890, + "train_speed(iter/s)": 0.201262 + }, + { + "acc": 0.79717088, + "epoch": 1.3975123580104696, + "grad_norm": 4.75, + "learning_rate": 2.1966963963143184e-06, + "loss": 0.72420158, + "memory(GiB)": 147.13, + "step": 59900, + "train_speed(iter/s)": 0.201279 + }, + { + "acc": 0.7967824, + "epoch": 1.3977456655827587, + "grad_norm": 4.21875, + "learning_rate": 2.1951323258111194e-06, + "loss": 0.71405821, + "memory(GiB)": 147.13, + "step": 59910, + "train_speed(iter/s)": 0.201295 + }, + { + "acc": 0.77692866, + "epoch": 1.3979789731550474, + "grad_norm": 5.125, + "learning_rate": 2.193568655701969e-06, + "loss": 0.80145493, + "memory(GiB)": 147.13, + "step": 59920, + "train_speed(iter/s)": 0.201313 + }, + { + "acc": 0.77152061, + "epoch": 1.3982122807273363, + "grad_norm": 4.125, + "learning_rate": 2.1920053862100754e-06, + "loss": 0.82826338, + "memory(GiB)": 147.13, + "step": 59930, + "train_speed(iter/s)": 0.201331 + }, + { + "acc": 0.78626838, + "epoch": 1.3984455882996252, + "grad_norm": 22.0, + "learning_rate": 2.190442517558599e-06, + "loss": 0.76881523, + "memory(GiB)": 147.13, + "step": 59940, + "train_speed(iter/s)": 0.201348 + }, + { + "acc": 0.77308369, + "epoch": 1.398678895871914, + "grad_norm": 5.4375, + "learning_rate": 2.188880049970637e-06, + "loss": 0.82151947, + "memory(GiB)": 147.13, + "step": 59950, + "train_speed(iter/s)": 0.201365 + }, + { + "acc": 0.76596837, + "epoch": 1.398912203444203, + "grad_norm": 5.5625, + "learning_rate": 2.18731798366923e-06, + "loss": 0.8408473, + "memory(GiB)": 147.13, + "step": 59960, + "train_speed(iter/s)": 0.201384 + }, + { + "acc": 0.7691186, + "epoch": 1.3991455110164919, + "grad_norm": 6.09375, + "learning_rate": 2.1857563188773644e-06, + "loss": 0.84743719, + "memory(GiB)": 147.13, + "step": 59970, + "train_speed(iter/s)": 0.201402 + }, + { + "acc": 0.77092209, + "epoch": 1.3993788185887808, + "grad_norm": 8.4375, + "learning_rate": 2.184195055817966e-06, + "loss": 0.84977627, + "memory(GiB)": 147.13, + "step": 59980, + "train_speed(iter/s)": 0.201418 + }, + { + "acc": 0.77478681, + "epoch": 1.3996121261610697, + "grad_norm": 6.90625, + "learning_rate": 2.1826341947139067e-06, + "loss": 0.80044651, + "memory(GiB)": 147.13, + "step": 59990, + "train_speed(iter/s)": 0.201435 + }, + { + "acc": 0.7726613, + "epoch": 1.3998454337333586, + "grad_norm": 3.703125, + "learning_rate": 2.181073735787998e-06, + "loss": 0.83067513, + "memory(GiB)": 147.13, + "step": 60000, + "train_speed(iter/s)": 0.201452 + }, + { + "epoch": 1.3998454337333586, + "eval_acc": 0.7445992558738102, + "eval_loss": 0.8046594858169556, + "eval_runtime": 1271.0218, + "eval_samples_per_second": 28.317, + "eval_steps_per_second": 14.159, + "step": 60000 + }, + { + "acc": 0.78201704, + "epoch": 1.4000787413056475, + "grad_norm": 4.84375, + "learning_rate": 2.179513679262992e-06, + "loss": 0.78135967, + "memory(GiB)": 147.13, + "step": 60010, + "train_speed(iter/s)": 0.200603 + }, + { + "acc": 0.7830049, + "epoch": 1.4003120488779364, + "grad_norm": 5.03125, + "learning_rate": 2.1779540253615917e-06, + "loss": 0.78097987, + "memory(GiB)": 147.13, + "step": 60020, + "train_speed(iter/s)": 0.200621 + }, + { + "acc": 0.80212574, + "epoch": 1.4005453564502253, + "grad_norm": 5.46875, + "learning_rate": 2.176394774306434e-06, + "loss": 0.71014204, + "memory(GiB)": 147.13, + "step": 60030, + "train_speed(iter/s)": 0.200636 + }, + { + "acc": 0.79081697, + "epoch": 1.4007786640225142, + "grad_norm": 6.875, + "learning_rate": 2.174835926320102e-06, + "loss": 0.75322866, + "memory(GiB)": 147.13, + "step": 60040, + "train_speed(iter/s)": 0.200655 + }, + { + "acc": 0.77095056, + "epoch": 1.401011971594803, + "grad_norm": 5.375, + "learning_rate": 2.173277481625119e-06, + "loss": 0.83664217, + "memory(GiB)": 147.13, + "step": 60050, + "train_speed(iter/s)": 0.200671 + }, + { + "acc": 0.76814718, + "epoch": 1.401245279167092, + "grad_norm": 5.375, + "learning_rate": 2.1717194404439563e-06, + "loss": 0.84824924, + "memory(GiB)": 147.13, + "step": 60060, + "train_speed(iter/s)": 0.200687 + }, + { + "acc": 0.77537007, + "epoch": 1.4014785867393809, + "grad_norm": 4.8125, + "learning_rate": 2.17016180299902e-06, + "loss": 0.79466648, + "memory(GiB)": 147.13, + "step": 60070, + "train_speed(iter/s)": 0.200704 + }, + { + "acc": 0.78756018, + "epoch": 1.4017118943116698, + "grad_norm": 4.5, + "learning_rate": 2.168604569512666e-06, + "loss": 0.75703373, + "memory(GiB)": 147.13, + "step": 60080, + "train_speed(iter/s)": 0.200721 + }, + { + "acc": 0.76781497, + "epoch": 1.4019452018839587, + "grad_norm": 7.03125, + "learning_rate": 2.167047740207187e-06, + "loss": 0.84280319, + "memory(GiB)": 147.13, + "step": 60090, + "train_speed(iter/s)": 0.200739 + }, + { + "acc": 0.76985674, + "epoch": 1.4021785094562476, + "grad_norm": 5.3125, + "learning_rate": 2.1654913153048186e-06, + "loss": 0.8241436, + "memory(GiB)": 147.13, + "step": 60100, + "train_speed(iter/s)": 0.200756 + }, + { + "acc": 0.79031215, + "epoch": 1.4024118170285365, + "grad_norm": 3.828125, + "learning_rate": 2.1639352950277433e-06, + "loss": 0.74541531, + "memory(GiB)": 147.13, + "step": 60110, + "train_speed(iter/s)": 0.200775 + }, + { + "acc": 0.7930346, + "epoch": 1.4026451246008254, + "grad_norm": 7.75, + "learning_rate": 2.162379679598079e-06, + "loss": 0.73493719, + "memory(GiB)": 147.13, + "step": 60120, + "train_speed(iter/s)": 0.20079 + }, + { + "acc": 0.77408686, + "epoch": 1.4028784321731143, + "grad_norm": 4.75, + "learning_rate": 2.1608244692378946e-06, + "loss": 0.83344536, + "memory(GiB)": 147.13, + "step": 60130, + "train_speed(iter/s)": 0.200806 + }, + { + "acc": 0.78932433, + "epoch": 1.4031117397454032, + "grad_norm": 5.84375, + "learning_rate": 2.1592696641691884e-06, + "loss": 0.744315, + "memory(GiB)": 147.13, + "step": 60140, + "train_speed(iter/s)": 0.200824 + }, + { + "acc": 0.78552046, + "epoch": 1.403345047317692, + "grad_norm": 5.40625, + "learning_rate": 2.157715264613915e-06, + "loss": 0.78410473, + "memory(GiB)": 147.13, + "step": 60150, + "train_speed(iter/s)": 0.200841 + }, + { + "acc": 0.76167541, + "epoch": 1.403578354889981, + "grad_norm": 6.46875, + "learning_rate": 2.156161270793961e-06, + "loss": 0.8703126, + "memory(GiB)": 147.13, + "step": 60160, + "train_speed(iter/s)": 0.200858 + }, + { + "acc": 0.78156233, + "epoch": 1.4038116624622698, + "grad_norm": 4.875, + "learning_rate": 2.1546076829311584e-06, + "loss": 0.7798255, + "memory(GiB)": 147.13, + "step": 60170, + "train_speed(iter/s)": 0.200876 + }, + { + "acc": 0.79834514, + "epoch": 1.4040449700345587, + "grad_norm": 5.5625, + "learning_rate": 2.153054501247284e-06, + "loss": 0.73483548, + "memory(GiB)": 147.13, + "step": 60180, + "train_speed(iter/s)": 0.200893 + }, + { + "acc": 0.77490888, + "epoch": 1.4042782776068476, + "grad_norm": 5.4375, + "learning_rate": 2.151501725964051e-06, + "loss": 0.79429326, + "memory(GiB)": 147.13, + "step": 60190, + "train_speed(iter/s)": 0.200911 + }, + { + "acc": 0.79103308, + "epoch": 1.4045115851791365, + "grad_norm": 3.796875, + "learning_rate": 2.14994935730312e-06, + "loss": 0.74287324, + "memory(GiB)": 147.13, + "step": 60200, + "train_speed(iter/s)": 0.200927 + }, + { + "acc": 0.76605501, + "epoch": 1.4047448927514254, + "grad_norm": 8.1875, + "learning_rate": 2.1483973954860894e-06, + "loss": 0.86529417, + "memory(GiB)": 147.13, + "step": 60210, + "train_speed(iter/s)": 0.200943 + }, + { + "acc": 0.7755722, + "epoch": 1.4049782003237143, + "grad_norm": 4.9375, + "learning_rate": 2.146845840734504e-06, + "loss": 0.79309669, + "memory(GiB)": 147.13, + "step": 60220, + "train_speed(iter/s)": 0.200961 + }, + { + "acc": 0.7734189, + "epoch": 1.405211507896003, + "grad_norm": 8.75, + "learning_rate": 2.1452946932698454e-06, + "loss": 0.84152288, + "memory(GiB)": 147.13, + "step": 60230, + "train_speed(iter/s)": 0.200978 + }, + { + "acc": 0.78702106, + "epoch": 1.4054448154682921, + "grad_norm": 4.5625, + "learning_rate": 2.1437439533135386e-06, + "loss": 0.77411451, + "memory(GiB)": 147.13, + "step": 60240, + "train_speed(iter/s)": 0.200993 + }, + { + "acc": 0.78498821, + "epoch": 1.4056781230405808, + "grad_norm": 4.96875, + "learning_rate": 2.142193621086956e-06, + "loss": 0.75319977, + "memory(GiB)": 147.13, + "step": 60250, + "train_speed(iter/s)": 0.201011 + }, + { + "acc": 0.7821517, + "epoch": 1.40591143061287, + "grad_norm": 4.46875, + "learning_rate": 2.140643696811401e-06, + "loss": 0.80938702, + "memory(GiB)": 147.13, + "step": 60260, + "train_speed(iter/s)": 0.201027 + }, + { + "acc": 0.77948647, + "epoch": 1.4061447381851586, + "grad_norm": 7.25, + "learning_rate": 2.1390941807081285e-06, + "loss": 0.79588933, + "memory(GiB)": 147.13, + "step": 60270, + "train_speed(iter/s)": 0.201044 + }, + { + "acc": 0.79571724, + "epoch": 1.4063780457574477, + "grad_norm": 7.15625, + "learning_rate": 2.1375450729983294e-06, + "loss": 0.72755728, + "memory(GiB)": 147.13, + "step": 60280, + "train_speed(iter/s)": 0.201061 + }, + { + "acc": 0.79610963, + "epoch": 1.4066113533297364, + "grad_norm": 4.90625, + "learning_rate": 2.1359963739031407e-06, + "loss": 0.728936, + "memory(GiB)": 147.13, + "step": 60290, + "train_speed(iter/s)": 0.201079 + }, + { + "acc": 0.7886919, + "epoch": 1.4068446609020255, + "grad_norm": 5.21875, + "learning_rate": 2.134448083643638e-06, + "loss": 0.76339221, + "memory(GiB)": 147.13, + "step": 60300, + "train_speed(iter/s)": 0.201095 + }, + { + "acc": 0.76663504, + "epoch": 1.4070779684743142, + "grad_norm": 5.59375, + "learning_rate": 2.1329002024408375e-06, + "loss": 0.84437771, + "memory(GiB)": 147.13, + "step": 60310, + "train_speed(iter/s)": 0.201113 + }, + { + "acc": 0.77732272, + "epoch": 1.407311276046603, + "grad_norm": 5.125, + "learning_rate": 2.1313527305157015e-06, + "loss": 0.78522806, + "memory(GiB)": 147.13, + "step": 60320, + "train_speed(iter/s)": 0.201128 + }, + { + "acc": 0.78382611, + "epoch": 1.407544583618892, + "grad_norm": 4.0625, + "learning_rate": 2.1298056680891288e-06, + "loss": 0.78302422, + "memory(GiB)": 147.13, + "step": 60330, + "train_speed(iter/s)": 0.201145 + }, + { + "acc": 0.77463741, + "epoch": 1.407777891191181, + "grad_norm": 6.0, + "learning_rate": 2.1282590153819645e-06, + "loss": 0.80812149, + "memory(GiB)": 147.13, + "step": 60340, + "train_speed(iter/s)": 0.201161 + }, + { + "acc": 0.79031725, + "epoch": 1.4080111987634698, + "grad_norm": 4.03125, + "learning_rate": 2.1267127726149896e-06, + "loss": 0.76469936, + "memory(GiB)": 147.13, + "step": 60350, + "train_speed(iter/s)": 0.201179 + }, + { + "acc": 0.76453934, + "epoch": 1.4082445063357587, + "grad_norm": 5.78125, + "learning_rate": 2.1251669400089353e-06, + "loss": 0.84970903, + "memory(GiB)": 147.13, + "step": 60360, + "train_speed(iter/s)": 0.201195 + }, + { + "acc": 0.79993305, + "epoch": 1.4084778139080476, + "grad_norm": 4.84375, + "learning_rate": 2.1236215177844617e-06, + "loss": 0.70226107, + "memory(GiB)": 147.13, + "step": 60370, + "train_speed(iter/s)": 0.201212 + }, + { + "acc": 0.80778503, + "epoch": 1.4087111214803365, + "grad_norm": 7.0625, + "learning_rate": 2.1220765061621828e-06, + "loss": 0.67411156, + "memory(GiB)": 147.13, + "step": 60380, + "train_speed(iter/s)": 0.201229 + }, + { + "acc": 0.77227788, + "epoch": 1.4089444290526254, + "grad_norm": 4.15625, + "learning_rate": 2.120531905362646e-06, + "loss": 0.81891384, + "memory(GiB)": 147.13, + "step": 60390, + "train_speed(iter/s)": 0.201246 + }, + { + "acc": 0.79898548, + "epoch": 1.4091777366249143, + "grad_norm": 4.8125, + "learning_rate": 2.118987715606342e-06, + "loss": 0.72313986, + "memory(GiB)": 147.13, + "step": 60400, + "train_speed(iter/s)": 0.201261 + }, + { + "acc": 0.78145227, + "epoch": 1.4094110441972032, + "grad_norm": 5.71875, + "learning_rate": 2.1174439371137064e-06, + "loss": 0.78863721, + "memory(GiB)": 147.13, + "step": 60410, + "train_speed(iter/s)": 0.201278 + }, + { + "acc": 0.78333859, + "epoch": 1.409644351769492, + "grad_norm": 5.875, + "learning_rate": 2.1159005701051093e-06, + "loss": 0.77123041, + "memory(GiB)": 147.13, + "step": 60420, + "train_speed(iter/s)": 0.201296 + }, + { + "acc": 0.79827003, + "epoch": 1.409877659341781, + "grad_norm": 5.375, + "learning_rate": 2.11435761480087e-06, + "loss": 0.71006422, + "memory(GiB)": 147.13, + "step": 60430, + "train_speed(iter/s)": 0.201312 + }, + { + "acc": 0.78789725, + "epoch": 1.41011096691407, + "grad_norm": 4.40625, + "learning_rate": 2.112815071421243e-06, + "loss": 0.7574213, + "memory(GiB)": 147.13, + "step": 60440, + "train_speed(iter/s)": 0.201329 + }, + { + "acc": 0.77921619, + "epoch": 1.4103442744863588, + "grad_norm": 4.3125, + "learning_rate": 2.111272940186424e-06, + "loss": 0.78693404, + "memory(GiB)": 147.13, + "step": 60450, + "train_speed(iter/s)": 0.201347 + }, + { + "acc": 0.76266708, + "epoch": 1.4105775820586477, + "grad_norm": 5.84375, + "learning_rate": 2.109731221316555e-06, + "loss": 0.87478523, + "memory(GiB)": 147.13, + "step": 60460, + "train_speed(iter/s)": 0.201363 + }, + { + "acc": 0.76366029, + "epoch": 1.4108108896309366, + "grad_norm": 7.03125, + "learning_rate": 2.108189915031715e-06, + "loss": 0.84197912, + "memory(GiB)": 147.13, + "step": 60470, + "train_speed(iter/s)": 0.201381 + }, + { + "acc": 0.78174658, + "epoch": 1.4110441972032255, + "grad_norm": 5.65625, + "learning_rate": 2.1066490215519243e-06, + "loss": 0.78470864, + "memory(GiB)": 147.13, + "step": 60480, + "train_speed(iter/s)": 0.201398 + }, + { + "acc": 0.80206575, + "epoch": 1.4112775047755144, + "grad_norm": 6.53125, + "learning_rate": 2.105108541097143e-06, + "loss": 0.71396837, + "memory(GiB)": 147.13, + "step": 60490, + "train_speed(iter/s)": 0.201416 + }, + { + "acc": 0.77739525, + "epoch": 1.4115108123478033, + "grad_norm": 6.40625, + "learning_rate": 2.1035684738872792e-06, + "loss": 0.78567958, + "memory(GiB)": 147.13, + "step": 60500, + "train_speed(iter/s)": 0.201433 + }, + { + "epoch": 1.4115108123478033, + "eval_acc": 0.7446263202490692, + "eval_loss": 0.8046724200248718, + "eval_runtime": 1269.8878, + "eval_samples_per_second": 28.342, + "eval_steps_per_second": 14.171, + "step": 60500 + }, + { + "acc": 0.78144083, + "epoch": 1.4117441199200922, + "grad_norm": 4.96875, + "learning_rate": 2.1020288201421722e-06, + "loss": 0.7848321, + "memory(GiB)": 147.13, + "step": 60510, + "train_speed(iter/s)": 0.200589 + }, + { + "acc": 0.78034506, + "epoch": 1.411977427492381, + "grad_norm": 4.96875, + "learning_rate": 2.100489580081611e-06, + "loss": 0.79630418, + "memory(GiB)": 147.13, + "step": 60520, + "train_speed(iter/s)": 0.200608 + }, + { + "acc": 0.76285105, + "epoch": 1.41221073506467, + "grad_norm": 6.125, + "learning_rate": 2.09895075392532e-06, + "loss": 0.84258804, + "memory(GiB)": 147.13, + "step": 60530, + "train_speed(iter/s)": 0.200625 + }, + { + "acc": 0.7814455, + "epoch": 1.4124440426369589, + "grad_norm": 5.15625, + "learning_rate": 2.0974123418929644e-06, + "loss": 0.78600588, + "memory(GiB)": 147.13, + "step": 60540, + "train_speed(iter/s)": 0.200644 + }, + { + "acc": 0.79303546, + "epoch": 1.4126773502092478, + "grad_norm": 5.15625, + "learning_rate": 2.095874344204155e-06, + "loss": 0.73297596, + "memory(GiB)": 147.13, + "step": 60550, + "train_speed(iter/s)": 0.200662 + }, + { + "acc": 0.79392943, + "epoch": 1.4129106577815367, + "grad_norm": 6.0, + "learning_rate": 2.094336761078438e-06, + "loss": 0.73787174, + "memory(GiB)": 147.13, + "step": 60560, + "train_speed(iter/s)": 0.200679 + }, + { + "acc": 0.78089685, + "epoch": 1.4131439653538256, + "grad_norm": 5.9375, + "learning_rate": 2.0927995927353062e-06, + "loss": 0.79184189, + "memory(GiB)": 147.13, + "step": 60570, + "train_speed(iter/s)": 0.200696 + }, + { + "acc": 0.78633385, + "epoch": 1.4133772729261145, + "grad_norm": 5.0, + "learning_rate": 2.091262839394188e-06, + "loss": 0.76935081, + "memory(GiB)": 147.13, + "step": 60580, + "train_speed(iter/s)": 0.200712 + }, + { + "acc": 0.7927001, + "epoch": 1.4136105804984034, + "grad_norm": 6.125, + "learning_rate": 2.0897265012744543e-06, + "loss": 0.75596743, + "memory(GiB)": 147.13, + "step": 60590, + "train_speed(iter/s)": 0.200729 + }, + { + "acc": 0.77544589, + "epoch": 1.413843888070692, + "grad_norm": 5.15625, + "learning_rate": 2.0881905785954172e-06, + "loss": 0.79783726, + "memory(GiB)": 147.13, + "step": 60600, + "train_speed(iter/s)": 0.200747 + }, + { + "acc": 0.76404762, + "epoch": 1.4140771956429812, + "grad_norm": 6.25, + "learning_rate": 2.086655071576327e-06, + "loss": 0.83482218, + "memory(GiB)": 147.13, + "step": 60610, + "train_speed(iter/s)": 0.200763 + }, + { + "acc": 0.77434025, + "epoch": 1.4143105032152699, + "grad_norm": 5.15625, + "learning_rate": 2.085119980436381e-06, + "loss": 0.79602251, + "memory(GiB)": 147.13, + "step": 60620, + "train_speed(iter/s)": 0.200781 + }, + { + "acc": 0.78777227, + "epoch": 1.414543810787559, + "grad_norm": 4.03125, + "learning_rate": 2.083585305394709e-06, + "loss": 0.75320654, + "memory(GiB)": 147.13, + "step": 60630, + "train_speed(iter/s)": 0.200798 + }, + { + "acc": 0.77788224, + "epoch": 1.4147771183598477, + "grad_norm": 6.0, + "learning_rate": 2.0820510466703898e-06, + "loss": 0.77969913, + "memory(GiB)": 147.13, + "step": 60640, + "train_speed(iter/s)": 0.200815 + }, + { + "acc": 0.79586306, + "epoch": 1.4150104259321368, + "grad_norm": 5.21875, + "learning_rate": 2.080517204482434e-06, + "loss": 0.73031406, + "memory(GiB)": 147.13, + "step": 60650, + "train_speed(iter/s)": 0.200832 + }, + { + "acc": 0.79818316, + "epoch": 1.4152437335044254, + "grad_norm": 4.09375, + "learning_rate": 2.078983779049801e-06, + "loss": 0.73085876, + "memory(GiB)": 147.13, + "step": 60660, + "train_speed(iter/s)": 0.200849 + }, + { + "acc": 0.78288202, + "epoch": 1.4154770410767146, + "grad_norm": 5.40625, + "learning_rate": 2.0774507705913844e-06, + "loss": 0.77484465, + "memory(GiB)": 147.13, + "step": 60670, + "train_speed(iter/s)": 0.200865 + }, + { + "acc": 0.77833548, + "epoch": 1.4157103486490032, + "grad_norm": 5.40625, + "learning_rate": 2.07591817932602e-06, + "loss": 0.78327904, + "memory(GiB)": 147.13, + "step": 60680, + "train_speed(iter/s)": 0.200881 + }, + { + "acc": 0.78331342, + "epoch": 1.4159436562212924, + "grad_norm": 10.5625, + "learning_rate": 2.074386005472488e-06, + "loss": 0.77935891, + "memory(GiB)": 147.13, + "step": 60690, + "train_speed(iter/s)": 0.200898 + }, + { + "acc": 0.77536631, + "epoch": 1.416176963793581, + "grad_norm": 4.96875, + "learning_rate": 2.072854249249503e-06, + "loss": 0.79027624, + "memory(GiB)": 147.13, + "step": 60700, + "train_speed(iter/s)": 0.200916 + }, + { + "acc": 0.77584419, + "epoch": 1.41641027136587, + "grad_norm": 4.34375, + "learning_rate": 2.0713229108757244e-06, + "loss": 0.80542765, + "memory(GiB)": 147.13, + "step": 60710, + "train_speed(iter/s)": 0.200932 + }, + { + "acc": 0.78596239, + "epoch": 1.4166435789381588, + "grad_norm": 4.8125, + "learning_rate": 2.0697919905697474e-06, + "loss": 0.77891278, + "memory(GiB)": 147.13, + "step": 60720, + "train_speed(iter/s)": 0.200949 + }, + { + "acc": 0.78150387, + "epoch": 1.4168768865104477, + "grad_norm": 3.5, + "learning_rate": 2.0682614885501147e-06, + "loss": 0.77592754, + "memory(GiB)": 147.13, + "step": 60730, + "train_speed(iter/s)": 0.200964 + }, + { + "acc": 0.7803638, + "epoch": 1.4171101940827366, + "grad_norm": 6.21875, + "learning_rate": 2.066731405035302e-06, + "loss": 0.78199749, + "memory(GiB)": 147.13, + "step": 60740, + "train_speed(iter/s)": 0.20098 + }, + { + "acc": 0.79469795, + "epoch": 1.4173435016550255, + "grad_norm": 5.28125, + "learning_rate": 2.065201740243728e-06, + "loss": 0.72167587, + "memory(GiB)": 147.13, + "step": 60750, + "train_speed(iter/s)": 0.200997 + }, + { + "acc": 0.76823006, + "epoch": 1.4175768092273144, + "grad_norm": 5.53125, + "learning_rate": 2.063672494393755e-06, + "loss": 0.84083357, + "memory(GiB)": 147.13, + "step": 60760, + "train_speed(iter/s)": 0.201015 + }, + { + "acc": 0.79289522, + "epoch": 1.4178101167996033, + "grad_norm": 6.125, + "learning_rate": 2.0621436677036775e-06, + "loss": 0.74563189, + "memory(GiB)": 147.13, + "step": 60770, + "train_speed(iter/s)": 0.201033 + }, + { + "acc": 0.79262247, + "epoch": 1.4180434243718922, + "grad_norm": 5.0625, + "learning_rate": 2.0606152603917406e-06, + "loss": 0.74184866, + "memory(GiB)": 147.13, + "step": 60780, + "train_speed(iter/s)": 0.20105 + }, + { + "acc": 0.78077393, + "epoch": 1.4182767319441811, + "grad_norm": 7.8125, + "learning_rate": 2.0590872726761215e-06, + "loss": 0.77461243, + "memory(GiB)": 147.13, + "step": 60790, + "train_speed(iter/s)": 0.201068 + }, + { + "acc": 0.78589611, + "epoch": 1.41851003951647, + "grad_norm": 5.40625, + "learning_rate": 2.057559704774938e-06, + "loss": 0.78779116, + "memory(GiB)": 147.13, + "step": 60800, + "train_speed(iter/s)": 0.201085 + }, + { + "acc": 0.77445693, + "epoch": 1.418743347088759, + "grad_norm": 6.875, + "learning_rate": 2.0560325569062535e-06, + "loss": 0.80334835, + "memory(GiB)": 147.13, + "step": 60810, + "train_speed(iter/s)": 0.201103 + }, + { + "acc": 0.77701745, + "epoch": 1.4189766546610478, + "grad_norm": 5.21875, + "learning_rate": 2.054505829288066e-06, + "loss": 0.81892672, + "memory(GiB)": 147.13, + "step": 60820, + "train_speed(iter/s)": 0.201121 + }, + { + "acc": 0.79528999, + "epoch": 1.4192099622333367, + "grad_norm": 5.1875, + "learning_rate": 2.0529795221383164e-06, + "loss": 0.72682176, + "memory(GiB)": 147.13, + "step": 60830, + "train_speed(iter/s)": 0.201136 + }, + { + "acc": 0.75750227, + "epoch": 1.4194432698056256, + "grad_norm": 5.96875, + "learning_rate": 2.0514536356748814e-06, + "loss": 0.86790705, + "memory(GiB)": 147.13, + "step": 60840, + "train_speed(iter/s)": 0.201154 + }, + { + "acc": 0.76532421, + "epoch": 1.4196765773779145, + "grad_norm": 4.40625, + "learning_rate": 2.0499281701155852e-06, + "loss": 0.84386234, + "memory(GiB)": 147.13, + "step": 60850, + "train_speed(iter/s)": 0.201171 + }, + { + "acc": 0.7653913, + "epoch": 1.4199098849502034, + "grad_norm": 9.375, + "learning_rate": 2.0484031256781845e-06, + "loss": 0.84208984, + "memory(GiB)": 147.13, + "step": 60860, + "train_speed(iter/s)": 0.201188 + }, + { + "acc": 0.7715621, + "epoch": 1.4201431925224923, + "grad_norm": 5.65625, + "learning_rate": 2.046878502580382e-06, + "loss": 0.8072238, + "memory(GiB)": 147.13, + "step": 60870, + "train_speed(iter/s)": 0.201206 + }, + { + "acc": 0.77248592, + "epoch": 1.4203765000947812, + "grad_norm": 4.625, + "learning_rate": 2.045354301039815e-06, + "loss": 0.81728535, + "memory(GiB)": 147.13, + "step": 60880, + "train_speed(iter/s)": 0.201223 + }, + { + "acc": 0.79007959, + "epoch": 1.4206098076670701, + "grad_norm": 7.90625, + "learning_rate": 2.043830521274061e-06, + "loss": 0.74873753, + "memory(GiB)": 147.13, + "step": 60890, + "train_speed(iter/s)": 0.20124 + }, + { + "acc": 0.7762063, + "epoch": 1.420843115239359, + "grad_norm": 5.0625, + "learning_rate": 2.0423071635006436e-06, + "loss": 0.80805073, + "memory(GiB)": 147.13, + "step": 60900, + "train_speed(iter/s)": 0.201257 + }, + { + "acc": 0.77343888, + "epoch": 1.421076422811648, + "grad_norm": 5.5, + "learning_rate": 2.0407842279370176e-06, + "loss": 0.82168198, + "memory(GiB)": 147.13, + "step": 60910, + "train_speed(iter/s)": 0.201274 + }, + { + "acc": 0.7828186, + "epoch": 1.4213097303839368, + "grad_norm": 4.5, + "learning_rate": 2.039261714800585e-06, + "loss": 0.7554111, + "memory(GiB)": 147.13, + "step": 60920, + "train_speed(iter/s)": 0.201291 + }, + { + "acc": 0.78235793, + "epoch": 1.4215430379562257, + "grad_norm": 5.90625, + "learning_rate": 2.0377396243086827e-06, + "loss": 0.7957407, + "memory(GiB)": 147.13, + "step": 60930, + "train_speed(iter/s)": 0.201309 + }, + { + "acc": 0.77107277, + "epoch": 1.4217763455285146, + "grad_norm": 27.75, + "learning_rate": 2.036217956678588e-06, + "loss": 0.83353214, + "memory(GiB)": 147.13, + "step": 60940, + "train_speed(iter/s)": 0.201325 + }, + { + "acc": 0.78861694, + "epoch": 1.4220096531008035, + "grad_norm": 4.34375, + "learning_rate": 2.034696712127518e-06, + "loss": 0.75524387, + "memory(GiB)": 147.13, + "step": 60950, + "train_speed(iter/s)": 0.201343 + }, + { + "acc": 0.76062737, + "epoch": 1.4222429606730924, + "grad_norm": 6.53125, + "learning_rate": 2.0331758908726323e-06, + "loss": 0.87450056, + "memory(GiB)": 147.13, + "step": 60960, + "train_speed(iter/s)": 0.201359 + }, + { + "acc": 0.77146816, + "epoch": 1.4224762682453813, + "grad_norm": 5.25, + "learning_rate": 2.031655493131026e-06, + "loss": 0.80269995, + "memory(GiB)": 147.13, + "step": 60970, + "train_speed(iter/s)": 0.201377 + }, + { + "acc": 0.76841688, + "epoch": 1.4227095758176702, + "grad_norm": 5.5625, + "learning_rate": 2.030135519119735e-06, + "loss": 0.83026876, + "memory(GiB)": 147.13, + "step": 60980, + "train_speed(iter/s)": 0.201393 + }, + { + "acc": 0.78153791, + "epoch": 1.422942883389959, + "grad_norm": 8.375, + "learning_rate": 2.0286159690557366e-06, + "loss": 0.8033679, + "memory(GiB)": 147.13, + "step": 60990, + "train_speed(iter/s)": 0.20141 + }, + { + "acc": 0.79216089, + "epoch": 1.423176190962248, + "grad_norm": 8.0, + "learning_rate": 2.027096843155944e-06, + "loss": 0.76074972, + "memory(GiB)": 147.13, + "step": 61000, + "train_speed(iter/s)": 0.201427 + }, + { + "epoch": 1.423176190962248, + "eval_acc": 0.744670359912952, + "eval_loss": 0.8046051859855652, + "eval_runtime": 1270.2783, + "eval_samples_per_second": 28.333, + "eval_steps_per_second": 14.167, + "step": 61000 + }, + { + "acc": 0.79125977, + "epoch": 1.4234094985345367, + "grad_norm": 10.5625, + "learning_rate": 2.025578141637215e-06, + "loss": 0.74310904, + "memory(GiB)": 147.13, + "step": 61010, + "train_speed(iter/s)": 0.20059 + }, + { + "acc": 0.76769581, + "epoch": 1.4236428061068258, + "grad_norm": 4.875, + "learning_rate": 2.024059864716343e-06, + "loss": 0.83147717, + "memory(GiB)": 147.13, + "step": 61020, + "train_speed(iter/s)": 0.200608 + }, + { + "acc": 0.77622566, + "epoch": 1.4238761136791145, + "grad_norm": 7.03125, + "learning_rate": 2.022542012610058e-06, + "loss": 0.80299397, + "memory(GiB)": 147.13, + "step": 61030, + "train_speed(iter/s)": 0.200626 + }, + { + "acc": 0.78763971, + "epoch": 1.4241094212514036, + "grad_norm": 4.53125, + "learning_rate": 2.0210245855350397e-06, + "loss": 0.7463274, + "memory(GiB)": 147.13, + "step": 61040, + "train_speed(iter/s)": 0.200643 + }, + { + "acc": 0.78232255, + "epoch": 1.4243427288236923, + "grad_norm": 5.8125, + "learning_rate": 2.019507583707893e-06, + "loss": 0.80083017, + "memory(GiB)": 147.13, + "step": 61050, + "train_speed(iter/s)": 0.200661 + }, + { + "acc": 0.79297686, + "epoch": 1.4245760363959814, + "grad_norm": 4.71875, + "learning_rate": 2.017991007345175e-06, + "loss": 0.73181958, + "memory(GiB)": 147.13, + "step": 61060, + "train_speed(iter/s)": 0.200677 + }, + { + "acc": 0.78155422, + "epoch": 1.42480934396827, + "grad_norm": 5.375, + "learning_rate": 2.016474856663372e-06, + "loss": 0.79105949, + "memory(GiB)": 147.13, + "step": 61070, + "train_speed(iter/s)": 0.200693 + }, + { + "acc": 0.77549515, + "epoch": 1.425042651540559, + "grad_norm": 6.0, + "learning_rate": 2.014959131878918e-06, + "loss": 0.81070833, + "memory(GiB)": 147.13, + "step": 61080, + "train_speed(iter/s)": 0.20071 + }, + { + "acc": 0.78981018, + "epoch": 1.4252759591128479, + "grad_norm": 4.03125, + "learning_rate": 2.0134438332081814e-06, + "loss": 0.76355982, + "memory(GiB)": 147.13, + "step": 61090, + "train_speed(iter/s)": 0.200727 + }, + { + "acc": 0.78231525, + "epoch": 1.4255092666851368, + "grad_norm": 7.0625, + "learning_rate": 2.0119289608674682e-06, + "loss": 0.77640409, + "memory(GiB)": 147.13, + "step": 61100, + "train_speed(iter/s)": 0.200745 + }, + { + "acc": 0.77027392, + "epoch": 1.4257425742574257, + "grad_norm": 6.28125, + "learning_rate": 2.010414515073029e-06, + "loss": 0.8242136, + "memory(GiB)": 147.13, + "step": 61110, + "train_speed(iter/s)": 0.200761 + }, + { + "acc": 0.78219414, + "epoch": 1.4259758818297146, + "grad_norm": 5.03125, + "learning_rate": 2.0089004960410485e-06, + "loss": 0.78226018, + "memory(GiB)": 147.13, + "step": 61120, + "train_speed(iter/s)": 0.200778 + }, + { + "acc": 0.79695072, + "epoch": 1.4262091894020035, + "grad_norm": 4.46875, + "learning_rate": 2.007386903987654e-06, + "loss": 0.72442718, + "memory(GiB)": 147.13, + "step": 61130, + "train_speed(iter/s)": 0.200796 + }, + { + "acc": 0.78832622, + "epoch": 1.4264424969742924, + "grad_norm": 5.15625, + "learning_rate": 2.0058737391289085e-06, + "loss": 0.7365797, + "memory(GiB)": 147.13, + "step": 61140, + "train_speed(iter/s)": 0.200813 + }, + { + "acc": 0.79924631, + "epoch": 1.4266758045465813, + "grad_norm": 10.625, + "learning_rate": 2.0043610016808185e-06, + "loss": 0.71290426, + "memory(GiB)": 147.13, + "step": 61150, + "train_speed(iter/s)": 0.200831 + }, + { + "acc": 0.793398, + "epoch": 1.4269091121188702, + "grad_norm": 5.25, + "learning_rate": 2.0028486918593253e-06, + "loss": 0.74249353, + "memory(GiB)": 147.13, + "step": 61160, + "train_speed(iter/s)": 0.200847 + }, + { + "acc": 0.79817839, + "epoch": 1.427142419691159, + "grad_norm": 7.1875, + "learning_rate": 2.001336809880311e-06, + "loss": 0.71638279, + "memory(GiB)": 147.13, + "step": 61170, + "train_speed(iter/s)": 0.200864 + }, + { + "acc": 0.7944067, + "epoch": 1.427375727263448, + "grad_norm": 7.3125, + "learning_rate": 1.9998253559595952e-06, + "loss": 0.72443352, + "memory(GiB)": 147.13, + "step": 61180, + "train_speed(iter/s)": 0.200882 + }, + { + "acc": 0.77642808, + "epoch": 1.4276090348357369, + "grad_norm": 5.40625, + "learning_rate": 1.9983143303129373e-06, + "loss": 0.81988869, + "memory(GiB)": 147.13, + "step": 61190, + "train_speed(iter/s)": 0.200899 + }, + { + "acc": 0.79813347, + "epoch": 1.4278423424080258, + "grad_norm": 4.875, + "learning_rate": 1.996803733156038e-06, + "loss": 0.72240438, + "memory(GiB)": 147.13, + "step": 61200, + "train_speed(iter/s)": 0.200916 + }, + { + "acc": 0.76945052, + "epoch": 1.4280756499803147, + "grad_norm": 5.96875, + "learning_rate": 1.9952935647045317e-06, + "loss": 0.83810911, + "memory(GiB)": 147.13, + "step": 61210, + "train_speed(iter/s)": 0.200933 + }, + { + "acc": 0.79682202, + "epoch": 1.4283089575526036, + "grad_norm": 7.59375, + "learning_rate": 1.9937838251739983e-06, + "loss": 0.72564445, + "memory(GiB)": 147.13, + "step": 61220, + "train_speed(iter/s)": 0.200949 + }, + { + "acc": 0.79631333, + "epoch": 1.4285422651248925, + "grad_norm": 6.0, + "learning_rate": 1.9922745147799505e-06, + "loss": 0.74118524, + "memory(GiB)": 147.13, + "step": 61230, + "train_speed(iter/s)": 0.200967 + }, + { + "acc": 0.80053787, + "epoch": 1.4287755726971814, + "grad_norm": 3.890625, + "learning_rate": 1.9907656337378396e-06, + "loss": 0.69944177, + "memory(GiB)": 147.13, + "step": 61240, + "train_speed(iter/s)": 0.200983 + }, + { + "acc": 0.76880045, + "epoch": 1.4290088802694703, + "grad_norm": 5.3125, + "learning_rate": 1.9892571822630622e-06, + "loss": 0.8351284, + "memory(GiB)": 147.13, + "step": 61250, + "train_speed(iter/s)": 0.201 + }, + { + "acc": 0.76350574, + "epoch": 1.4292421878417592, + "grad_norm": 4.46875, + "learning_rate": 1.987749160570946e-06, + "loss": 0.85821857, + "memory(GiB)": 147.13, + "step": 61260, + "train_speed(iter/s)": 0.201018 + }, + { + "acc": 0.7792346, + "epoch": 1.429475495414048, + "grad_norm": 5.5, + "learning_rate": 1.9862415688767657e-06, + "loss": 0.78018122, + "memory(GiB)": 147.13, + "step": 61270, + "train_speed(iter/s)": 0.201036 + }, + { + "acc": 0.7726048, + "epoch": 1.429708802986337, + "grad_norm": 5.375, + "learning_rate": 1.984734407395722e-06, + "loss": 0.82185307, + "memory(GiB)": 147.13, + "step": 61280, + "train_speed(iter/s)": 0.201053 + }, + { + "acc": 0.78452177, + "epoch": 1.4299421105586259, + "grad_norm": 4.0625, + "learning_rate": 1.9832276763429674e-06, + "loss": 0.75924959, + "memory(GiB)": 147.13, + "step": 61290, + "train_speed(iter/s)": 0.201071 + }, + { + "acc": 0.77438474, + "epoch": 1.4301754181309148, + "grad_norm": 4.65625, + "learning_rate": 1.9817213759335846e-06, + "loss": 0.84851189, + "memory(GiB)": 147.13, + "step": 61300, + "train_speed(iter/s)": 0.201089 + }, + { + "acc": 0.77766914, + "epoch": 1.4304087257032037, + "grad_norm": 5.15625, + "learning_rate": 1.9802155063825995e-06, + "loss": 0.7944005, + "memory(GiB)": 147.13, + "step": 61310, + "train_speed(iter/s)": 0.201107 + }, + { + "acc": 0.74958205, + "epoch": 1.4306420332754926, + "grad_norm": 5.84375, + "learning_rate": 1.9787100679049742e-06, + "loss": 0.91778793, + "memory(GiB)": 147.13, + "step": 61320, + "train_speed(iter/s)": 0.201124 + }, + { + "acc": 0.79310417, + "epoch": 1.4308753408477815, + "grad_norm": 9.375, + "learning_rate": 1.977205060715607e-06, + "loss": 0.74190741, + "memory(GiB)": 147.13, + "step": 61330, + "train_speed(iter/s)": 0.201141 + }, + { + "acc": 0.77827301, + "epoch": 1.4311086484200704, + "grad_norm": 5.5, + "learning_rate": 1.975700485029341e-06, + "loss": 0.80124111, + "memory(GiB)": 147.13, + "step": 61340, + "train_speed(iter/s)": 0.201157 + }, + { + "acc": 0.78497138, + "epoch": 1.4313419559923592, + "grad_norm": 6.21875, + "learning_rate": 1.9741963410609506e-06, + "loss": 0.75833988, + "memory(GiB)": 147.13, + "step": 61350, + "train_speed(iter/s)": 0.201175 + }, + { + "acc": 0.77764912, + "epoch": 1.4315752635646481, + "grad_norm": 3.65625, + "learning_rate": 1.9726926290251548e-06, + "loss": 0.7967144, + "memory(GiB)": 147.13, + "step": 61360, + "train_speed(iter/s)": 0.201192 + }, + { + "acc": 0.76858315, + "epoch": 1.431808571136937, + "grad_norm": 6.1875, + "learning_rate": 1.971189349136607e-06, + "loss": 0.82962227, + "memory(GiB)": 147.13, + "step": 61370, + "train_speed(iter/s)": 0.201209 + }, + { + "acc": 0.7532167, + "epoch": 1.4320418787092257, + "grad_norm": 4.6875, + "learning_rate": 1.969686501609898e-06, + "loss": 0.89655132, + "memory(GiB)": 147.13, + "step": 61380, + "train_speed(iter/s)": 0.201227 + }, + { + "acc": 0.79799242, + "epoch": 1.4322751862815148, + "grad_norm": 3.984375, + "learning_rate": 1.9681840866595644e-06, + "loss": 0.73923264, + "memory(GiB)": 147.13, + "step": 61390, + "train_speed(iter/s)": 0.201245 + }, + { + "acc": 0.77811699, + "epoch": 1.4325084938538035, + "grad_norm": 6.15625, + "learning_rate": 1.966682104500068e-06, + "loss": 0.79530067, + "memory(GiB)": 147.13, + "step": 61400, + "train_speed(iter/s)": 0.201263 + }, + { + "acc": 0.76311293, + "epoch": 1.4327418014260926, + "grad_norm": 4.8125, + "learning_rate": 1.9651805553458212e-06, + "loss": 0.83997898, + "memory(GiB)": 147.13, + "step": 61410, + "train_speed(iter/s)": 0.201281 + }, + { + "acc": 0.77644825, + "epoch": 1.4329751089983813, + "grad_norm": 5.65625, + "learning_rate": 1.9636794394111676e-06, + "loss": 0.81493549, + "memory(GiB)": 147.13, + "step": 61420, + "train_speed(iter/s)": 0.201298 + }, + { + "acc": 0.7841002, + "epoch": 1.4332084165706704, + "grad_norm": 5.625, + "learning_rate": 1.962178756910393e-06, + "loss": 0.77840614, + "memory(GiB)": 147.13, + "step": 61430, + "train_speed(iter/s)": 0.201315 + }, + { + "acc": 0.77601957, + "epoch": 1.4334417241429591, + "grad_norm": 6.15625, + "learning_rate": 1.9606785080577173e-06, + "loss": 0.80642538, + "memory(GiB)": 147.13, + "step": 61440, + "train_speed(iter/s)": 0.201332 + }, + { + "acc": 0.78321104, + "epoch": 1.4336750317152482, + "grad_norm": 4.4375, + "learning_rate": 1.959178693067303e-06, + "loss": 0.77348614, + "memory(GiB)": 147.13, + "step": 61450, + "train_speed(iter/s)": 0.201349 + }, + { + "acc": 0.77611961, + "epoch": 1.433908339287537, + "grad_norm": 5.53125, + "learning_rate": 1.9576793121532467e-06, + "loss": 0.81834526, + "memory(GiB)": 147.13, + "step": 61460, + "train_speed(iter/s)": 0.201365 + }, + { + "acc": 0.76458263, + "epoch": 1.4341416468598258, + "grad_norm": 5.40625, + "learning_rate": 1.9561803655295835e-06, + "loss": 0.86758423, + "memory(GiB)": 147.13, + "step": 61470, + "train_speed(iter/s)": 0.201381 + }, + { + "acc": 0.78354859, + "epoch": 1.4343749544321147, + "grad_norm": 5.28125, + "learning_rate": 1.9546818534102903e-06, + "loss": 0.77626381, + "memory(GiB)": 147.13, + "step": 61480, + "train_speed(iter/s)": 0.201397 + }, + { + "acc": 0.79747005, + "epoch": 1.4346082620044036, + "grad_norm": 4.6875, + "learning_rate": 1.9531837760092765e-06, + "loss": 0.72363586, + "memory(GiB)": 147.13, + "step": 61490, + "train_speed(iter/s)": 0.201414 + }, + { + "acc": 0.77367868, + "epoch": 1.4348415695766925, + "grad_norm": 4.65625, + "learning_rate": 1.9516861335403963e-06, + "loss": 0.80418158, + "memory(GiB)": 147.13, + "step": 61500, + "train_speed(iter/s)": 0.201431 + }, + { + "epoch": 1.4348415695766925, + "eval_acc": 0.7446573882301356, + "eval_loss": 0.8045687675476074, + "eval_runtime": 1271.1131, + "eval_samples_per_second": 28.315, + "eval_steps_per_second": 14.158, + "step": 61500 + }, + { + "acc": 0.77232461, + "epoch": 1.4350748771489814, + "grad_norm": 5.09375, + "learning_rate": 1.9501889262174323e-06, + "loss": 0.82037907, + "memory(GiB)": 147.13, + "step": 61510, + "train_speed(iter/s)": 0.2006 + }, + { + "acc": 0.77945395, + "epoch": 1.4353081847212703, + "grad_norm": 5.4375, + "learning_rate": 1.9486921542541147e-06, + "loss": 0.79027767, + "memory(GiB)": 147.13, + "step": 61520, + "train_speed(iter/s)": 0.200618 + }, + { + "acc": 0.79523678, + "epoch": 1.4355414922935592, + "grad_norm": 4.375, + "learning_rate": 1.9471958178641055e-06, + "loss": 0.7177619, + "memory(GiB)": 147.13, + "step": 61530, + "train_speed(iter/s)": 0.200635 + }, + { + "acc": 0.79970589, + "epoch": 1.435774799865848, + "grad_norm": 4.25, + "learning_rate": 1.9456999172610046e-06, + "loss": 0.71671066, + "memory(GiB)": 147.13, + "step": 61540, + "train_speed(iter/s)": 0.200652 + }, + { + "acc": 0.76421814, + "epoch": 1.436008107438137, + "grad_norm": 5.625, + "learning_rate": 1.9442044526583555e-06, + "loss": 0.86059494, + "memory(GiB)": 147.13, + "step": 61550, + "train_speed(iter/s)": 0.200669 + }, + { + "acc": 0.77591553, + "epoch": 1.436241415010426, + "grad_norm": 7.0, + "learning_rate": 1.9427094242696304e-06, + "loss": 0.81485806, + "memory(GiB)": 147.13, + "step": 61560, + "train_speed(iter/s)": 0.200686 + }, + { + "acc": 0.78154783, + "epoch": 1.4364747225827148, + "grad_norm": 6.46875, + "learning_rate": 1.941214832308249e-06, + "loss": 0.78967419, + "memory(GiB)": 147.13, + "step": 61570, + "train_speed(iter/s)": 0.200703 + }, + { + "acc": 0.77090359, + "epoch": 1.4367080301550037, + "grad_norm": 5.5, + "learning_rate": 1.9397206769875602e-06, + "loss": 0.83600082, + "memory(GiB)": 147.13, + "step": 61580, + "train_speed(iter/s)": 0.20072 + }, + { + "acc": 0.77267971, + "epoch": 1.4369413377272926, + "grad_norm": 5.46875, + "learning_rate": 1.9382269585208576e-06, + "loss": 0.84045687, + "memory(GiB)": 147.13, + "step": 61590, + "train_speed(iter/s)": 0.200737 + }, + { + "acc": 0.75382733, + "epoch": 1.4371746452995815, + "grad_norm": 4.5625, + "learning_rate": 1.936733677121367e-06, + "loss": 0.87408772, + "memory(GiB)": 147.13, + "step": 61600, + "train_speed(iter/s)": 0.200754 + }, + { + "acc": 0.78201303, + "epoch": 1.4374079528718704, + "grad_norm": 6.21875, + "learning_rate": 1.935240833002252e-06, + "loss": 0.78332329, + "memory(GiB)": 147.13, + "step": 61610, + "train_speed(iter/s)": 0.200771 + }, + { + "acc": 0.8092041, + "epoch": 1.4376412604441593, + "grad_norm": 4.9375, + "learning_rate": 1.933748426376622e-06, + "loss": 0.68223801, + "memory(GiB)": 147.13, + "step": 61620, + "train_speed(iter/s)": 0.200788 + }, + { + "acc": 0.79494247, + "epoch": 1.4378745680164482, + "grad_norm": 5.40625, + "learning_rate": 1.932256457457509e-06, + "loss": 0.72881784, + "memory(GiB)": 147.13, + "step": 61630, + "train_speed(iter/s)": 0.200805 + }, + { + "acc": 0.78494654, + "epoch": 1.438107875588737, + "grad_norm": 4.78125, + "learning_rate": 1.9307649264578982e-06, + "loss": 0.75482392, + "memory(GiB)": 147.13, + "step": 61640, + "train_speed(iter/s)": 0.200822 + }, + { + "acc": 0.7854641, + "epoch": 1.438341183161026, + "grad_norm": 5.9375, + "learning_rate": 1.9292738335907e-06, + "loss": 0.77157822, + "memory(GiB)": 147.13, + "step": 61650, + "train_speed(iter/s)": 0.200839 + }, + { + "acc": 0.7829257, + "epoch": 1.438574490733315, + "grad_norm": 6.53125, + "learning_rate": 1.9277831790687724e-06, + "loss": 0.77883649, + "memory(GiB)": 147.13, + "step": 61660, + "train_speed(iter/s)": 0.200854 + }, + { + "acc": 0.78623981, + "epoch": 1.4388077983056038, + "grad_norm": 5.0625, + "learning_rate": 1.9262929631049034e-06, + "loss": 0.76112614, + "memory(GiB)": 147.13, + "step": 61670, + "train_speed(iter/s)": 0.20087 + }, + { + "acc": 0.7791275, + "epoch": 1.4390411058778927, + "grad_norm": 4.5, + "learning_rate": 1.924803185911819e-06, + "loss": 0.79928355, + "memory(GiB)": 147.13, + "step": 61680, + "train_speed(iter/s)": 0.200888 + }, + { + "acc": 0.79443464, + "epoch": 1.4392744134501816, + "grad_norm": 5.125, + "learning_rate": 1.923313847702188e-06, + "loss": 0.72661982, + "memory(GiB)": 147.13, + "step": 61690, + "train_speed(iter/s)": 0.200903 + }, + { + "acc": 0.78530979, + "epoch": 1.4395077210224705, + "grad_norm": 5.1875, + "learning_rate": 1.9218249486886097e-06, + "loss": 0.76591215, + "memory(GiB)": 147.13, + "step": 61700, + "train_speed(iter/s)": 0.20092 + }, + { + "acc": 0.78451734, + "epoch": 1.4397410285947594, + "grad_norm": 6.0, + "learning_rate": 1.9203364890836277e-06, + "loss": 0.78146687, + "memory(GiB)": 147.13, + "step": 61710, + "train_speed(iter/s)": 0.200936 + }, + { + "acc": 0.77538376, + "epoch": 1.4399743361670483, + "grad_norm": 5.78125, + "learning_rate": 1.918848469099718e-06, + "loss": 0.79503469, + "memory(GiB)": 147.13, + "step": 61720, + "train_speed(iter/s)": 0.200953 + }, + { + "acc": 0.79319973, + "epoch": 1.4402076437393372, + "grad_norm": 4.5, + "learning_rate": 1.9173608889492936e-06, + "loss": 0.7500802, + "memory(GiB)": 147.13, + "step": 61730, + "train_speed(iter/s)": 0.20097 + }, + { + "acc": 0.77077246, + "epoch": 1.440440951311626, + "grad_norm": 5.75, + "learning_rate": 1.915873748844705e-06, + "loss": 0.82878227, + "memory(GiB)": 147.13, + "step": 61740, + "train_speed(iter/s)": 0.200986 + }, + { + "acc": 0.78738832, + "epoch": 1.440674258883915, + "grad_norm": 6.25, + "learning_rate": 1.9143870489982443e-06, + "loss": 0.73900366, + "memory(GiB)": 147.13, + "step": 61750, + "train_speed(iter/s)": 0.201002 + }, + { + "acc": 0.76408119, + "epoch": 1.4409075664562039, + "grad_norm": 5.625, + "learning_rate": 1.9129007896221365e-06, + "loss": 0.88208513, + "memory(GiB)": 147.13, + "step": 61760, + "train_speed(iter/s)": 0.201018 + }, + { + "acc": 0.77571197, + "epoch": 1.4411408740284926, + "grad_norm": 5.4375, + "learning_rate": 1.9114149709285416e-06, + "loss": 0.8385725, + "memory(GiB)": 147.13, + "step": 61770, + "train_speed(iter/s)": 0.201035 + }, + { + "acc": 0.78287396, + "epoch": 1.4413741816007817, + "grad_norm": 5.1875, + "learning_rate": 1.909929593129565e-06, + "loss": 0.76827679, + "memory(GiB)": 147.13, + "step": 61780, + "train_speed(iter/s)": 0.201051 + }, + { + "acc": 0.76898689, + "epoch": 1.4416074891730704, + "grad_norm": 8.125, + "learning_rate": 1.9084446564372393e-06, + "loss": 0.83712349, + "memory(GiB)": 147.13, + "step": 61790, + "train_speed(iter/s)": 0.201068 + }, + { + "acc": 0.79256206, + "epoch": 1.4418407967453595, + "grad_norm": 6.46875, + "learning_rate": 1.9069601610635424e-06, + "loss": 0.74981198, + "memory(GiB)": 147.13, + "step": 61800, + "train_speed(iter/s)": 0.201086 + }, + { + "acc": 0.77438807, + "epoch": 1.4420741043176482, + "grad_norm": 5.6875, + "learning_rate": 1.9054761072203843e-06, + "loss": 0.80913887, + "memory(GiB)": 147.13, + "step": 61810, + "train_speed(iter/s)": 0.201102 + }, + { + "acc": 0.78683462, + "epoch": 1.4423074118899373, + "grad_norm": 4.125, + "learning_rate": 1.9039924951196109e-06, + "loss": 0.74516659, + "memory(GiB)": 147.13, + "step": 61820, + "train_speed(iter/s)": 0.201119 + }, + { + "acc": 0.76727543, + "epoch": 1.442540719462226, + "grad_norm": 4.9375, + "learning_rate": 1.9025093249730108e-06, + "loss": 0.85720892, + "memory(GiB)": 147.13, + "step": 61830, + "train_speed(iter/s)": 0.201137 + }, + { + "acc": 0.78560305, + "epoch": 1.442774027034515, + "grad_norm": 4.03125, + "learning_rate": 1.9010265969923052e-06, + "loss": 0.7749969, + "memory(GiB)": 147.13, + "step": 61840, + "train_speed(iter/s)": 0.201152 + }, + { + "acc": 0.77236147, + "epoch": 1.4430073346068037, + "grad_norm": 7.28125, + "learning_rate": 1.8995443113891527e-06, + "loss": 0.83206615, + "memory(GiB)": 147.13, + "step": 61850, + "train_speed(iter/s)": 0.20117 + }, + { + "acc": 0.77366638, + "epoch": 1.4432406421790926, + "grad_norm": 5.03125, + "learning_rate": 1.898062468375147e-06, + "loss": 0.80375004, + "memory(GiB)": 147.13, + "step": 61860, + "train_speed(iter/s)": 0.201187 + }, + { + "acc": 0.7846086, + "epoch": 1.4434739497513815, + "grad_norm": 6.0625, + "learning_rate": 1.8965810681618251e-06, + "loss": 0.78282852, + "memory(GiB)": 147.13, + "step": 61870, + "train_speed(iter/s)": 0.201204 + }, + { + "acc": 0.76645479, + "epoch": 1.4437072573236704, + "grad_norm": 9.0625, + "learning_rate": 1.8951001109606538e-06, + "loss": 0.84201212, + "memory(GiB)": 147.13, + "step": 61880, + "train_speed(iter/s)": 0.201221 + }, + { + "acc": 0.77387953, + "epoch": 1.4439405648959593, + "grad_norm": 4.15625, + "learning_rate": 1.893619596983038e-06, + "loss": 0.82105818, + "memory(GiB)": 147.13, + "step": 61890, + "train_speed(iter/s)": 0.201238 + }, + { + "acc": 0.7874526, + "epoch": 1.4441738724682482, + "grad_norm": 3.734375, + "learning_rate": 1.8921395264403236e-06, + "loss": 0.76474705, + "memory(GiB)": 147.13, + "step": 61900, + "train_speed(iter/s)": 0.201255 + }, + { + "acc": 0.78465261, + "epoch": 1.4444071800405371, + "grad_norm": 4.3125, + "learning_rate": 1.890659899543788e-06, + "loss": 0.77885513, + "memory(GiB)": 147.13, + "step": 61910, + "train_speed(iter/s)": 0.201272 + }, + { + "acc": 0.77660131, + "epoch": 1.444640487612826, + "grad_norm": 5.78125, + "learning_rate": 1.88918071650465e-06, + "loss": 0.8049757, + "memory(GiB)": 147.13, + "step": 61920, + "train_speed(iter/s)": 0.201289 + }, + { + "acc": 0.77938328, + "epoch": 1.444873795185115, + "grad_norm": 5.125, + "learning_rate": 1.8877019775340587e-06, + "loss": 0.81291084, + "memory(GiB)": 147.13, + "step": 61930, + "train_speed(iter/s)": 0.201307 + }, + { + "acc": 0.78789864, + "epoch": 1.4451071027574038, + "grad_norm": 6.28125, + "learning_rate": 1.8862236828431086e-06, + "loss": 0.76462269, + "memory(GiB)": 147.13, + "step": 61940, + "train_speed(iter/s)": 0.201324 + }, + { + "acc": 0.77977004, + "epoch": 1.4453404103296927, + "grad_norm": 6.0, + "learning_rate": 1.8847458326428226e-06, + "loss": 0.79729452, + "memory(GiB)": 147.13, + "step": 61950, + "train_speed(iter/s)": 0.20134 + }, + { + "acc": 0.78940058, + "epoch": 1.4455737179019816, + "grad_norm": 6.34375, + "learning_rate": 1.8832684271441643e-06, + "loss": 0.74841819, + "memory(GiB)": 147.13, + "step": 61960, + "train_speed(iter/s)": 0.201357 + }, + { + "acc": 0.78960233, + "epoch": 1.4458070254742705, + "grad_norm": 3.703125, + "learning_rate": 1.8817914665580322e-06, + "loss": 0.7424305, + "memory(GiB)": 147.13, + "step": 61970, + "train_speed(iter/s)": 0.201373 + }, + { + "acc": 0.80876637, + "epoch": 1.4460403330465594, + "grad_norm": 7.53125, + "learning_rate": 1.8803149510952613e-06, + "loss": 0.71806173, + "memory(GiB)": 147.13, + "step": 61980, + "train_speed(iter/s)": 0.20139 + }, + { + "acc": 0.77671194, + "epoch": 1.4462736406188483, + "grad_norm": 4.875, + "learning_rate": 1.8788388809666259e-06, + "loss": 0.79790506, + "memory(GiB)": 147.13, + "step": 61990, + "train_speed(iter/s)": 0.201407 + }, + { + "acc": 0.78528218, + "epoch": 1.4465069481911372, + "grad_norm": 5.21875, + "learning_rate": 1.877363256382832e-06, + "loss": 0.7618494, + "memory(GiB)": 147.13, + "step": 62000, + "train_speed(iter/s)": 0.201423 + }, + { + "epoch": 1.4465069481911372, + "eval_acc": 0.7446714809225781, + "eval_loss": 0.8045513033866882, + "eval_runtime": 1270.3592, + "eval_samples_per_second": 28.331, + "eval_steps_per_second": 14.166, + "step": 62000 + }, + { + "acc": 0.78110247, + "epoch": 1.4467402557634261, + "grad_norm": 7.09375, + "learning_rate": 1.8758880775545279e-06, + "loss": 0.77745328, + "memory(GiB)": 147.13, + "step": 62010, + "train_speed(iter/s)": 0.200597 + }, + { + "acc": 0.78310804, + "epoch": 1.446973563335715, + "grad_norm": 4.15625, + "learning_rate": 1.8744133446922935e-06, + "loss": 0.78575201, + "memory(GiB)": 147.13, + "step": 62020, + "train_speed(iter/s)": 0.200613 + }, + { + "acc": 0.77370396, + "epoch": 1.447206870908004, + "grad_norm": 4.6875, + "learning_rate": 1.8729390580066442e-06, + "loss": 0.82233162, + "memory(GiB)": 147.13, + "step": 62030, + "train_speed(iter/s)": 0.200629 + }, + { + "acc": 0.77510133, + "epoch": 1.4474401784802928, + "grad_norm": 6.09375, + "learning_rate": 1.8714652177080377e-06, + "loss": 0.80517254, + "memory(GiB)": 147.13, + "step": 62040, + "train_speed(iter/s)": 0.200645 + }, + { + "acc": 0.78512564, + "epoch": 1.4476734860525817, + "grad_norm": 5.9375, + "learning_rate": 1.869991824006861e-06, + "loss": 0.78300362, + "memory(GiB)": 147.13, + "step": 62050, + "train_speed(iter/s)": 0.200662 + }, + { + "acc": 0.78685694, + "epoch": 1.4479067936248706, + "grad_norm": 20.25, + "learning_rate": 1.8685188771134433e-06, + "loss": 0.77022228, + "memory(GiB)": 147.13, + "step": 62060, + "train_speed(iter/s)": 0.200678 + }, + { + "acc": 0.76881313, + "epoch": 1.4481401011971595, + "grad_norm": 5.09375, + "learning_rate": 1.8670463772380464e-06, + "loss": 0.82338123, + "memory(GiB)": 147.13, + "step": 62070, + "train_speed(iter/s)": 0.200695 + }, + { + "acc": 0.79330425, + "epoch": 1.4483734087694484, + "grad_norm": 6.59375, + "learning_rate": 1.8655743245908692e-06, + "loss": 0.73466892, + "memory(GiB)": 147.13, + "step": 62080, + "train_speed(iter/s)": 0.200711 + }, + { + "acc": 0.79285131, + "epoch": 1.4486067163417373, + "grad_norm": 6.3125, + "learning_rate": 1.864102719382045e-06, + "loss": 0.73613644, + "memory(GiB)": 147.13, + "step": 62090, + "train_speed(iter/s)": 0.200727 + }, + { + "acc": 0.79956055, + "epoch": 1.4488400239140262, + "grad_norm": 4.125, + "learning_rate": 1.8626315618216484e-06, + "loss": 0.72277913, + "memory(GiB)": 147.13, + "step": 62100, + "train_speed(iter/s)": 0.200744 + }, + { + "acc": 0.77584467, + "epoch": 1.4490733314863151, + "grad_norm": 4.75, + "learning_rate": 1.8611608521196844e-06, + "loss": 0.80201588, + "memory(GiB)": 147.13, + "step": 62110, + "train_speed(iter/s)": 0.200761 + }, + { + "acc": 0.78103304, + "epoch": 1.449306639058604, + "grad_norm": 8.125, + "learning_rate": 1.8596905904860956e-06, + "loss": 0.78463149, + "memory(GiB)": 147.13, + "step": 62120, + "train_speed(iter/s)": 0.200778 + }, + { + "acc": 0.78027878, + "epoch": 1.449539946630893, + "grad_norm": 6.65625, + "learning_rate": 1.8582207771307647e-06, + "loss": 0.78982868, + "memory(GiB)": 147.13, + "step": 62130, + "train_speed(iter/s)": 0.200794 + }, + { + "acc": 0.80753584, + "epoch": 1.4497732542031816, + "grad_norm": 6.125, + "learning_rate": 1.8567514122635027e-06, + "loss": 0.68397255, + "memory(GiB)": 147.13, + "step": 62140, + "train_speed(iter/s)": 0.200811 + }, + { + "acc": 0.80782089, + "epoch": 1.4500065617754707, + "grad_norm": 7.21875, + "learning_rate": 1.8552824960940658e-06, + "loss": 0.67650642, + "memory(GiB)": 147.13, + "step": 62150, + "train_speed(iter/s)": 0.200828 + }, + { + "acc": 0.77864218, + "epoch": 1.4502398693477594, + "grad_norm": 7.0, + "learning_rate": 1.8538140288321387e-06, + "loss": 0.80679083, + "memory(GiB)": 147.13, + "step": 62160, + "train_speed(iter/s)": 0.200844 + }, + { + "acc": 0.79958267, + "epoch": 1.4504731769200485, + "grad_norm": 6.15625, + "learning_rate": 1.8523460106873436e-06, + "loss": 0.7120677, + "memory(GiB)": 147.13, + "step": 62170, + "train_speed(iter/s)": 0.20086 + }, + { + "acc": 0.77506552, + "epoch": 1.4507064844923372, + "grad_norm": 6.71875, + "learning_rate": 1.8508784418692428e-06, + "loss": 0.83289452, + "memory(GiB)": 147.13, + "step": 62180, + "train_speed(iter/s)": 0.200876 + }, + { + "acc": 0.80013189, + "epoch": 1.4509397920646263, + "grad_norm": 5.0625, + "learning_rate": 1.8494113225873295e-06, + "loss": 0.70295601, + "memory(GiB)": 147.13, + "step": 62190, + "train_speed(iter/s)": 0.200893 + }, + { + "acc": 0.79650593, + "epoch": 1.451173099636915, + "grad_norm": 8.4375, + "learning_rate": 1.8479446530510348e-06, + "loss": 0.72814693, + "memory(GiB)": 147.13, + "step": 62200, + "train_speed(iter/s)": 0.20091 + }, + { + "acc": 0.77118607, + "epoch": 1.451406407209204, + "grad_norm": 4.09375, + "learning_rate": 1.8464784334697234e-06, + "loss": 0.84197454, + "memory(GiB)": 147.13, + "step": 62210, + "train_speed(iter/s)": 0.200925 + }, + { + "acc": 0.79284525, + "epoch": 1.4516397147814928, + "grad_norm": 4.78125, + "learning_rate": 1.845012664052701e-06, + "loss": 0.74894667, + "memory(GiB)": 147.13, + "step": 62220, + "train_speed(iter/s)": 0.200941 + }, + { + "acc": 0.77012434, + "epoch": 1.451873022353782, + "grad_norm": 5.1875, + "learning_rate": 1.843547345009203e-06, + "loss": 0.81883812, + "memory(GiB)": 147.13, + "step": 62230, + "train_speed(iter/s)": 0.200958 + }, + { + "acc": 0.77381506, + "epoch": 1.4521063299260706, + "grad_norm": 6.71875, + "learning_rate": 1.8420824765484058e-06, + "loss": 0.79767299, + "memory(GiB)": 147.13, + "step": 62240, + "train_speed(iter/s)": 0.200974 + }, + { + "acc": 0.77434549, + "epoch": 1.4523396374983595, + "grad_norm": 4.9375, + "learning_rate": 1.8406180588794176e-06, + "loss": 0.81015167, + "memory(GiB)": 147.13, + "step": 62250, + "train_speed(iter/s)": 0.20099 + }, + { + "acc": 0.79392915, + "epoch": 1.4525729450706484, + "grad_norm": 6.53125, + "learning_rate": 1.8391540922112822e-06, + "loss": 0.72809219, + "memory(GiB)": 147.13, + "step": 62260, + "train_speed(iter/s)": 0.201007 + }, + { + "acc": 0.77831402, + "epoch": 1.4528062526429373, + "grad_norm": 5.59375, + "learning_rate": 1.8376905767529834e-06, + "loss": 0.78499942, + "memory(GiB)": 147.13, + "step": 62270, + "train_speed(iter/s)": 0.201022 + }, + { + "acc": 0.79829054, + "epoch": 1.4530395602152262, + "grad_norm": 6.21875, + "learning_rate": 1.8362275127134348e-06, + "loss": 0.71653814, + "memory(GiB)": 147.13, + "step": 62280, + "train_speed(iter/s)": 0.201037 + }, + { + "acc": 0.75779715, + "epoch": 1.453272867787515, + "grad_norm": 5.25, + "learning_rate": 1.8347649003014911e-06, + "loss": 0.86373796, + "memory(GiB)": 147.13, + "step": 62290, + "train_speed(iter/s)": 0.201054 + }, + { + "acc": 0.76059661, + "epoch": 1.453506175359804, + "grad_norm": 8.1875, + "learning_rate": 1.833302739725939e-06, + "loss": 0.87852764, + "memory(GiB)": 147.13, + "step": 62300, + "train_speed(iter/s)": 0.201069 + }, + { + "acc": 0.77951355, + "epoch": 1.4537394829320929, + "grad_norm": 11.6875, + "learning_rate": 1.8318410311955003e-06, + "loss": 0.7952219, + "memory(GiB)": 147.13, + "step": 62310, + "train_speed(iter/s)": 0.201087 + }, + { + "acc": 0.78401728, + "epoch": 1.4539727905043818, + "grad_norm": 4.6875, + "learning_rate": 1.830379774918834e-06, + "loss": 0.77956657, + "memory(GiB)": 147.13, + "step": 62320, + "train_speed(iter/s)": 0.201102 + }, + { + "acc": 0.78924007, + "epoch": 1.4542060980766707, + "grad_norm": 5.59375, + "learning_rate": 1.8289189711045324e-06, + "loss": 0.75395088, + "memory(GiB)": 147.13, + "step": 62330, + "train_speed(iter/s)": 0.201119 + }, + { + "acc": 0.77698288, + "epoch": 1.4544394056489596, + "grad_norm": 4.71875, + "learning_rate": 1.8274586199611283e-06, + "loss": 0.78425674, + "memory(GiB)": 147.13, + "step": 62340, + "train_speed(iter/s)": 0.201135 + }, + { + "acc": 0.78032331, + "epoch": 1.4546727132212485, + "grad_norm": 6.59375, + "learning_rate": 1.8259987216970826e-06, + "loss": 0.79403481, + "memory(GiB)": 147.13, + "step": 62350, + "train_speed(iter/s)": 0.201152 + }, + { + "acc": 0.7868978, + "epoch": 1.4549060207935374, + "grad_norm": 4.9375, + "learning_rate": 1.8245392765207993e-06, + "loss": 0.75227661, + "memory(GiB)": 147.13, + "step": 62360, + "train_speed(iter/s)": 0.201168 + }, + { + "acc": 0.78523664, + "epoch": 1.4551393283658263, + "grad_norm": 5.40625, + "learning_rate": 1.8230802846406104e-06, + "loss": 0.76713495, + "memory(GiB)": 147.13, + "step": 62370, + "train_speed(iter/s)": 0.201183 + }, + { + "acc": 0.77572908, + "epoch": 1.4553726359381152, + "grad_norm": 5.84375, + "learning_rate": 1.821621746264789e-06, + "loss": 0.80009136, + "memory(GiB)": 147.13, + "step": 62380, + "train_speed(iter/s)": 0.2012 + }, + { + "acc": 0.78314152, + "epoch": 1.455605943510404, + "grad_norm": 7.15625, + "learning_rate": 1.8201636616015405e-06, + "loss": 0.77063742, + "memory(GiB)": 147.13, + "step": 62390, + "train_speed(iter/s)": 0.201216 + }, + { + "acc": 0.77914481, + "epoch": 1.455839251082693, + "grad_norm": 4.53125, + "learning_rate": 1.8187060308590038e-06, + "loss": 0.77920589, + "memory(GiB)": 147.13, + "step": 62400, + "train_speed(iter/s)": 0.201232 + }, + { + "acc": 0.79587164, + "epoch": 1.4560725586549819, + "grad_norm": 5.65625, + "learning_rate": 1.8172488542452583e-06, + "loss": 0.73282957, + "memory(GiB)": 147.13, + "step": 62410, + "train_speed(iter/s)": 0.201249 + }, + { + "acc": 0.79226098, + "epoch": 1.4563058662272708, + "grad_norm": 5.65625, + "learning_rate": 1.8157921319683147e-06, + "loss": 0.73642225, + "memory(GiB)": 147.13, + "step": 62420, + "train_speed(iter/s)": 0.201265 + }, + { + "acc": 0.76984286, + "epoch": 1.4565391737995597, + "grad_norm": 5.0625, + "learning_rate": 1.8143358642361191e-06, + "loss": 0.83457851, + "memory(GiB)": 147.13, + "step": 62430, + "train_speed(iter/s)": 0.201282 + }, + { + "acc": 0.78718529, + "epoch": 1.4567724813718486, + "grad_norm": 5.3125, + "learning_rate": 1.8128800512565514e-06, + "loss": 0.75155096, + "memory(GiB)": 147.13, + "step": 62440, + "train_speed(iter/s)": 0.201299 + }, + { + "acc": 0.78033142, + "epoch": 1.4570057889441375, + "grad_norm": 4.03125, + "learning_rate": 1.811424693237433e-06, + "loss": 0.77643776, + "memory(GiB)": 147.13, + "step": 62450, + "train_speed(iter/s)": 0.201317 + }, + { + "acc": 0.80350819, + "epoch": 1.4572390965164264, + "grad_norm": 4.40625, + "learning_rate": 1.8099697903865127e-06, + "loss": 0.69405346, + "memory(GiB)": 147.13, + "step": 62460, + "train_speed(iter/s)": 0.201333 + }, + { + "acc": 0.7767561, + "epoch": 1.4574724040887153, + "grad_norm": 4.09375, + "learning_rate": 1.8085153429114766e-06, + "loss": 0.80634718, + "memory(GiB)": 147.13, + "step": 62470, + "train_speed(iter/s)": 0.201351 + }, + { + "acc": 0.79806414, + "epoch": 1.4577057116610042, + "grad_norm": 4.71875, + "learning_rate": 1.8070613510199497e-06, + "loss": 0.72472191, + "memory(GiB)": 147.13, + "step": 62480, + "train_speed(iter/s)": 0.201368 + }, + { + "acc": 0.7791873, + "epoch": 1.457939019233293, + "grad_norm": 4.75, + "learning_rate": 1.8056078149194861e-06, + "loss": 0.78082762, + "memory(GiB)": 147.13, + "step": 62490, + "train_speed(iter/s)": 0.201385 + }, + { + "acc": 0.78747339, + "epoch": 1.458172326805582, + "grad_norm": 7.03125, + "learning_rate": 1.8041547348175803e-06, + "loss": 0.76401696, + "memory(GiB)": 147.13, + "step": 62500, + "train_speed(iter/s)": 0.201402 + }, + { + "epoch": 1.458172326805582, + "eval_acc": 0.7446378506337948, + "eval_loss": 0.8045361042022705, + "eval_runtime": 1270.7937, + "eval_samples_per_second": 28.322, + "eval_steps_per_second": 14.161, + "step": 62500 + }, + { + "acc": 0.76177197, + "epoch": 1.4584056343778709, + "grad_norm": 4.9375, + "learning_rate": 1.802702110921658e-06, + "loss": 0.85588417, + "memory(GiB)": 147.13, + "step": 62510, + "train_speed(iter/s)": 0.200583 + }, + { + "acc": 0.7588037, + "epoch": 1.4586389419501598, + "grad_norm": 6.59375, + "learning_rate": 1.8012499434390784e-06, + "loss": 0.8837925, + "memory(GiB)": 147.13, + "step": 62520, + "train_speed(iter/s)": 0.200598 + }, + { + "acc": 0.76614499, + "epoch": 1.4588722495224484, + "grad_norm": 5.90625, + "learning_rate": 1.7997982325771425e-06, + "loss": 0.83949127, + "memory(GiB)": 147.13, + "step": 62530, + "train_speed(iter/s)": 0.200613 + }, + { + "acc": 0.78617134, + "epoch": 1.4591055570947375, + "grad_norm": 6.21875, + "learning_rate": 1.7983469785430785e-06, + "loss": 0.79134507, + "memory(GiB)": 147.13, + "step": 62540, + "train_speed(iter/s)": 0.200629 + }, + { + "acc": 0.79341078, + "epoch": 1.4593388646670262, + "grad_norm": 4.3125, + "learning_rate": 1.7968961815440534e-06, + "loss": 0.76289606, + "memory(GiB)": 147.13, + "step": 62550, + "train_speed(iter/s)": 0.200646 + }, + { + "acc": 0.77591524, + "epoch": 1.4595721722393153, + "grad_norm": 7.96875, + "learning_rate": 1.7954458417871667e-06, + "loss": 0.80238419, + "memory(GiB)": 147.13, + "step": 62560, + "train_speed(iter/s)": 0.200663 + }, + { + "acc": 0.79817991, + "epoch": 1.459805479811604, + "grad_norm": 5.1875, + "learning_rate": 1.7939959594794564e-06, + "loss": 0.71526041, + "memory(GiB)": 147.13, + "step": 62570, + "train_speed(iter/s)": 0.200681 + }, + { + "acc": 0.77496409, + "epoch": 1.4600387873838931, + "grad_norm": 7.46875, + "learning_rate": 1.7925465348278898e-06, + "loss": 0.7962131, + "memory(GiB)": 147.13, + "step": 62580, + "train_speed(iter/s)": 0.200697 + }, + { + "acc": 0.78668814, + "epoch": 1.4602720949561818, + "grad_norm": 6.4375, + "learning_rate": 1.7910975680393756e-06, + "loss": 0.76099, + "memory(GiB)": 147.13, + "step": 62590, + "train_speed(iter/s)": 0.200714 + }, + { + "acc": 0.77080307, + "epoch": 1.460505402528471, + "grad_norm": 5.15625, + "learning_rate": 1.789649059320751e-06, + "loss": 0.82710533, + "memory(GiB)": 147.13, + "step": 62600, + "train_speed(iter/s)": 0.20073 + }, + { + "acc": 0.77813931, + "epoch": 1.4607387101007596, + "grad_norm": 5.8125, + "learning_rate": 1.7882010088787888e-06, + "loss": 0.801579, + "memory(GiB)": 147.13, + "step": 62610, + "train_speed(iter/s)": 0.200749 + }, + { + "acc": 0.77716465, + "epoch": 1.4609720176730487, + "grad_norm": 5.9375, + "learning_rate": 1.7867534169202018e-06, + "loss": 0.80519314, + "memory(GiB)": 147.13, + "step": 62620, + "train_speed(iter/s)": 0.200766 + }, + { + "acc": 0.75499105, + "epoch": 1.4612053252453374, + "grad_norm": 4.65625, + "learning_rate": 1.785306283651629e-06, + "loss": 0.88495131, + "memory(GiB)": 147.13, + "step": 62630, + "train_speed(iter/s)": 0.200784 + }, + { + "acc": 0.77682557, + "epoch": 1.4614386328176263, + "grad_norm": 6.34375, + "learning_rate": 1.783859609279654e-06, + "loss": 0.80335407, + "memory(GiB)": 147.13, + "step": 62640, + "train_speed(iter/s)": 0.200802 + }, + { + "acc": 0.78338518, + "epoch": 1.4616719403899152, + "grad_norm": 5.4375, + "learning_rate": 1.7824133940107818e-06, + "loss": 0.77647243, + "memory(GiB)": 147.13, + "step": 62650, + "train_speed(iter/s)": 0.200818 + }, + { + "acc": 0.77094107, + "epoch": 1.4619052479622041, + "grad_norm": 4.375, + "learning_rate": 1.7809676380514646e-06, + "loss": 0.84018555, + "memory(GiB)": 147.13, + "step": 62660, + "train_speed(iter/s)": 0.200835 + }, + { + "acc": 0.78348899, + "epoch": 1.462138555534493, + "grad_norm": 5.78125, + "learning_rate": 1.7795223416080804e-06, + "loss": 0.7956295, + "memory(GiB)": 147.13, + "step": 62670, + "train_speed(iter/s)": 0.200851 + }, + { + "acc": 0.77315598, + "epoch": 1.462371863106782, + "grad_norm": 5.59375, + "learning_rate": 1.778077504886948e-06, + "loss": 0.81796093, + "memory(GiB)": 147.13, + "step": 62680, + "train_speed(iter/s)": 0.200868 + }, + { + "acc": 0.7750021, + "epoch": 1.4626051706790708, + "grad_norm": 7.0625, + "learning_rate": 1.7766331280943156e-06, + "loss": 0.80663986, + "memory(GiB)": 147.13, + "step": 62690, + "train_speed(iter/s)": 0.200886 + }, + { + "acc": 0.78340206, + "epoch": 1.4628384782513597, + "grad_norm": 5.28125, + "learning_rate": 1.775189211436366e-06, + "loss": 0.78596182, + "memory(GiB)": 147.13, + "step": 62700, + "train_speed(iter/s)": 0.200902 + }, + { + "acc": 0.77824869, + "epoch": 1.4630717858236486, + "grad_norm": 5.25, + "learning_rate": 1.7737457551192221e-06, + "loss": 0.80983734, + "memory(GiB)": 147.13, + "step": 62710, + "train_speed(iter/s)": 0.200918 + }, + { + "acc": 0.77051554, + "epoch": 1.4633050933959375, + "grad_norm": 6.8125, + "learning_rate": 1.7723027593489322e-06, + "loss": 0.82581806, + "memory(GiB)": 147.13, + "step": 62720, + "train_speed(iter/s)": 0.200936 + }, + { + "acc": 0.76511116, + "epoch": 1.4635384009682264, + "grad_norm": 7.53125, + "learning_rate": 1.7708602243314876e-06, + "loss": 0.84343872, + "memory(GiB)": 147.13, + "step": 62730, + "train_speed(iter/s)": 0.200953 + }, + { + "acc": 0.78928823, + "epoch": 1.4637717085405153, + "grad_norm": 4.3125, + "learning_rate": 1.7694181502728074e-06, + "loss": 0.74881258, + "memory(GiB)": 147.13, + "step": 62740, + "train_speed(iter/s)": 0.20097 + }, + { + "acc": 0.77590837, + "epoch": 1.4640050161128042, + "grad_norm": 6.125, + "learning_rate": 1.7679765373787467e-06, + "loss": 0.78589211, + "memory(GiB)": 147.13, + "step": 62750, + "train_speed(iter/s)": 0.200987 + }, + { + "acc": 0.78272462, + "epoch": 1.464238323685093, + "grad_norm": 6.59375, + "learning_rate": 1.7665353858550993e-06, + "loss": 0.75262532, + "memory(GiB)": 147.13, + "step": 62760, + "train_speed(iter/s)": 0.201003 + }, + { + "acc": 0.77953529, + "epoch": 1.464471631257382, + "grad_norm": 7.4375, + "learning_rate": 1.7650946959075833e-06, + "loss": 0.81623087, + "memory(GiB)": 147.13, + "step": 62770, + "train_speed(iter/s)": 0.20102 + }, + { + "acc": 0.77969527, + "epoch": 1.464704938829671, + "grad_norm": 5.53125, + "learning_rate": 1.763654467741861e-06, + "loss": 0.78417301, + "memory(GiB)": 147.13, + "step": 62780, + "train_speed(iter/s)": 0.201037 + }, + { + "acc": 0.79427376, + "epoch": 1.4649382464019598, + "grad_norm": 6.53125, + "learning_rate": 1.7622147015635222e-06, + "loss": 0.75227299, + "memory(GiB)": 147.13, + "step": 62790, + "train_speed(iter/s)": 0.201054 + }, + { + "acc": 0.77105227, + "epoch": 1.4651715539742487, + "grad_norm": 4.75, + "learning_rate": 1.760775397578095e-06, + "loss": 0.83062248, + "memory(GiB)": 147.13, + "step": 62800, + "train_speed(iter/s)": 0.20107 + }, + { + "acc": 0.7859479, + "epoch": 1.4654048615465376, + "grad_norm": 5.375, + "learning_rate": 1.7593365559910397e-06, + "loss": 0.76885614, + "memory(GiB)": 147.13, + "step": 62810, + "train_speed(iter/s)": 0.201086 + }, + { + "acc": 0.76209989, + "epoch": 1.4656381691188265, + "grad_norm": 6.4375, + "learning_rate": 1.7578981770077474e-06, + "loss": 0.84865894, + "memory(GiB)": 147.13, + "step": 62820, + "train_speed(iter/s)": 0.201103 + }, + { + "acc": 0.78125958, + "epoch": 1.4658714766911154, + "grad_norm": 6.0625, + "learning_rate": 1.7564602608335502e-06, + "loss": 0.77670856, + "memory(GiB)": 147.13, + "step": 62830, + "train_speed(iter/s)": 0.20112 + }, + { + "acc": 0.79948587, + "epoch": 1.4661047842634043, + "grad_norm": 7.4375, + "learning_rate": 1.7550228076737069e-06, + "loss": 0.72959423, + "memory(GiB)": 147.13, + "step": 62840, + "train_speed(iter/s)": 0.201137 + }, + { + "acc": 0.75941429, + "epoch": 1.4663380918356932, + "grad_norm": 5.75, + "learning_rate": 1.7535858177334163e-06, + "loss": 0.86305752, + "memory(GiB)": 147.13, + "step": 62850, + "train_speed(iter/s)": 0.201152 + }, + { + "acc": 0.78896446, + "epoch": 1.466571399407982, + "grad_norm": 4.28125, + "learning_rate": 1.7521492912178062e-06, + "loss": 0.75683656, + "memory(GiB)": 147.13, + "step": 62860, + "train_speed(iter/s)": 0.20117 + }, + { + "acc": 0.79165177, + "epoch": 1.466804706980271, + "grad_norm": 5.40625, + "learning_rate": 1.7507132283319445e-06, + "loss": 0.73763084, + "memory(GiB)": 147.13, + "step": 62870, + "train_speed(iter/s)": 0.201185 + }, + { + "acc": 0.76576929, + "epoch": 1.4670380145525599, + "grad_norm": 4.1875, + "learning_rate": 1.7492776292808217e-06, + "loss": 0.84900112, + "memory(GiB)": 147.13, + "step": 62880, + "train_speed(iter/s)": 0.201201 + }, + { + "acc": 0.77859445, + "epoch": 1.4672713221248488, + "grad_norm": 4.78125, + "learning_rate": 1.7478424942693751e-06, + "loss": 0.79272537, + "memory(GiB)": 147.13, + "step": 62890, + "train_speed(iter/s)": 0.201217 + }, + { + "acc": 0.7813118, + "epoch": 1.4675046296971377, + "grad_norm": 6.9375, + "learning_rate": 1.7464078235024678e-06, + "loss": 0.78525038, + "memory(GiB)": 147.13, + "step": 62900, + "train_speed(iter/s)": 0.201233 + }, + { + "acc": 0.78674507, + "epoch": 1.4677379372694266, + "grad_norm": 7.4375, + "learning_rate": 1.7449736171848964e-06, + "loss": 0.76398363, + "memory(GiB)": 147.13, + "step": 62910, + "train_speed(iter/s)": 0.201248 + }, + { + "acc": 0.77175941, + "epoch": 1.4679712448417153, + "grad_norm": 9.0625, + "learning_rate": 1.7435398755213977e-06, + "loss": 0.8071166, + "memory(GiB)": 147.13, + "step": 62920, + "train_speed(iter/s)": 0.201265 + }, + { + "acc": 0.75375805, + "epoch": 1.4682045524140044, + "grad_norm": 6.59375, + "learning_rate": 1.7421065987166335e-06, + "loss": 0.87107162, + "memory(GiB)": 147.13, + "step": 62930, + "train_speed(iter/s)": 0.201281 + }, + { + "acc": 0.80961304, + "epoch": 1.468437859986293, + "grad_norm": 5.34375, + "learning_rate": 1.7406737869752082e-06, + "loss": 0.67591333, + "memory(GiB)": 147.13, + "step": 62940, + "train_speed(iter/s)": 0.201296 + }, + { + "acc": 0.77118597, + "epoch": 1.4686711675585822, + "grad_norm": 5.28125, + "learning_rate": 1.7392414405016527e-06, + "loss": 0.83532896, + "memory(GiB)": 147.13, + "step": 62950, + "train_speed(iter/s)": 0.201312 + }, + { + "acc": 0.76841412, + "epoch": 1.4689044751308709, + "grad_norm": 6.3125, + "learning_rate": 1.7378095595004323e-06, + "loss": 0.84016171, + "memory(GiB)": 147.13, + "step": 62960, + "train_speed(iter/s)": 0.201329 + }, + { + "acc": 0.77326422, + "epoch": 1.46913778270316, + "grad_norm": 6.40625, + "learning_rate": 1.736378144175952e-06, + "loss": 0.83091669, + "memory(GiB)": 147.13, + "step": 62970, + "train_speed(iter/s)": 0.201346 + }, + { + "acc": 0.7910758, + "epoch": 1.4693710902754487, + "grad_norm": 6.0625, + "learning_rate": 1.7349471947325414e-06, + "loss": 0.72511082, + "memory(GiB)": 147.13, + "step": 62980, + "train_speed(iter/s)": 0.201363 + }, + { + "acc": 0.78480339, + "epoch": 1.4696043978477378, + "grad_norm": 4.28125, + "learning_rate": 1.7335167113744732e-06, + "loss": 0.76047554, + "memory(GiB)": 147.13, + "step": 62990, + "train_speed(iter/s)": 0.20138 + }, + { + "acc": 0.78779182, + "epoch": 1.4698377054200265, + "grad_norm": 4.3125, + "learning_rate": 1.7320866943059427e-06, + "loss": 0.75346622, + "memory(GiB)": 147.13, + "step": 63000, + "train_speed(iter/s)": 0.201397 + }, + { + "epoch": 1.4698377054200265, + "eval_acc": 0.7446705200571843, + "eval_loss": 0.8045252561569214, + "eval_runtime": 1270.0191, + "eval_samples_per_second": 28.339, + "eval_steps_per_second": 14.17, + "step": 63000 + }, + { + "acc": 0.79027567, + "epoch": 1.4700710129923154, + "grad_norm": 5.84375, + "learning_rate": 1.7306571437310893e-06, + "loss": 0.75496302, + "memory(GiB)": 147.13, + "step": 63010, + "train_speed(iter/s)": 0.200586 + }, + { + "acc": 0.77779145, + "epoch": 1.4703043205646043, + "grad_norm": 5.15625, + "learning_rate": 1.7292280598539769e-06, + "loss": 0.80010624, + "memory(GiB)": 147.13, + "step": 63020, + "train_speed(iter/s)": 0.200603 + }, + { + "acc": 0.78809614, + "epoch": 1.4705376281368931, + "grad_norm": 5.375, + "learning_rate": 1.72779944287861e-06, + "loss": 0.7661644, + "memory(GiB)": 147.13, + "step": 63030, + "train_speed(iter/s)": 0.20062 + }, + { + "acc": 0.78059311, + "epoch": 1.470770935709182, + "grad_norm": 4.875, + "learning_rate": 1.7263712930089227e-06, + "loss": 0.78118305, + "memory(GiB)": 147.13, + "step": 63040, + "train_speed(iter/s)": 0.200635 + }, + { + "acc": 0.78940554, + "epoch": 1.471004243281471, + "grad_norm": 9.75, + "learning_rate": 1.7249436104487805e-06, + "loss": 0.74943719, + "memory(GiB)": 147.13, + "step": 63050, + "train_speed(iter/s)": 0.200651 + }, + { + "acc": 0.77817397, + "epoch": 1.4712375508537598, + "grad_norm": 3.9375, + "learning_rate": 1.7235163954019878e-06, + "loss": 0.83013039, + "memory(GiB)": 147.13, + "step": 63060, + "train_speed(iter/s)": 0.200667 + }, + { + "acc": 0.7942832, + "epoch": 1.4714708584260487, + "grad_norm": 4.71875, + "learning_rate": 1.7220896480722766e-06, + "loss": 0.71083398, + "memory(GiB)": 147.13, + "step": 63070, + "train_speed(iter/s)": 0.200683 + }, + { + "acc": 0.77893252, + "epoch": 1.4717041659983376, + "grad_norm": 5.28125, + "learning_rate": 1.7206633686633172e-06, + "loss": 0.7933465, + "memory(GiB)": 147.13, + "step": 63080, + "train_speed(iter/s)": 0.200699 + }, + { + "acc": 0.77962651, + "epoch": 1.4719374735706265, + "grad_norm": 3.625, + "learning_rate": 1.719237557378709e-06, + "loss": 0.79189463, + "memory(GiB)": 147.13, + "step": 63090, + "train_speed(iter/s)": 0.200715 + }, + { + "acc": 0.7757308, + "epoch": 1.4721707811429154, + "grad_norm": 4.59375, + "learning_rate": 1.7178122144219873e-06, + "loss": 0.79125972, + "memory(GiB)": 147.13, + "step": 63100, + "train_speed(iter/s)": 0.200732 + }, + { + "acc": 0.78209968, + "epoch": 1.4724040887152043, + "grad_norm": 4.25, + "learning_rate": 1.716387339996618e-06, + "loss": 0.78565693, + "memory(GiB)": 147.13, + "step": 63110, + "train_speed(iter/s)": 0.200747 + }, + { + "acc": 0.76956949, + "epoch": 1.4726373962874932, + "grad_norm": 7.1875, + "learning_rate": 1.7149629343060003e-06, + "loss": 0.82966146, + "memory(GiB)": 147.13, + "step": 63120, + "train_speed(iter/s)": 0.200764 + }, + { + "acc": 0.78773918, + "epoch": 1.4728707038597821, + "grad_norm": 4.8125, + "learning_rate": 1.7135389975534711e-06, + "loss": 0.76110039, + "memory(GiB)": 147.13, + "step": 63130, + "train_speed(iter/s)": 0.200779 + }, + { + "acc": 0.7758707, + "epoch": 1.473104011432071, + "grad_norm": 4.84375, + "learning_rate": 1.7121155299422936e-06, + "loss": 0.81253977, + "memory(GiB)": 147.13, + "step": 63140, + "train_speed(iter/s)": 0.200795 + }, + { + "acc": 0.75602198, + "epoch": 1.47333731900436, + "grad_norm": 5.1875, + "learning_rate": 1.710692531675671e-06, + "loss": 0.90366888, + "memory(GiB)": 147.13, + "step": 63150, + "train_speed(iter/s)": 0.200813 + }, + { + "acc": 0.77753906, + "epoch": 1.4735706265766488, + "grad_norm": 5.40625, + "learning_rate": 1.709270002956732e-06, + "loss": 0.77708693, + "memory(GiB)": 147.13, + "step": 63160, + "train_speed(iter/s)": 0.20083 + }, + { + "acc": 0.77087703, + "epoch": 1.4738039341489377, + "grad_norm": 5.6875, + "learning_rate": 1.7078479439885458e-06, + "loss": 0.82326393, + "memory(GiB)": 147.13, + "step": 63170, + "train_speed(iter/s)": 0.200845 + }, + { + "acc": 0.78759995, + "epoch": 1.4740372417212266, + "grad_norm": 7.46875, + "learning_rate": 1.7064263549741095e-06, + "loss": 0.76839981, + "memory(GiB)": 147.13, + "step": 63180, + "train_speed(iter/s)": 0.200862 + }, + { + "acc": 0.79642954, + "epoch": 1.4742705492935155, + "grad_norm": 5.34375, + "learning_rate": 1.7050052361163522e-06, + "loss": 0.7275032, + "memory(GiB)": 147.13, + "step": 63190, + "train_speed(iter/s)": 0.20088 + }, + { + "acc": 0.79400454, + "epoch": 1.4745038568658044, + "grad_norm": 5.375, + "learning_rate": 1.7035845876181422e-06, + "loss": 0.74391875, + "memory(GiB)": 147.13, + "step": 63200, + "train_speed(iter/s)": 0.200895 + }, + { + "acc": 0.7892807, + "epoch": 1.4747371644380933, + "grad_norm": 12.75, + "learning_rate": 1.7021644096822748e-06, + "loss": 0.76604948, + "memory(GiB)": 147.13, + "step": 63210, + "train_speed(iter/s)": 0.200911 + }, + { + "acc": 0.76153727, + "epoch": 1.4749704720103822, + "grad_norm": 4.125, + "learning_rate": 1.7007447025114798e-06, + "loss": 0.86720562, + "memory(GiB)": 147.13, + "step": 63220, + "train_speed(iter/s)": 0.200928 + }, + { + "acc": 0.77364707, + "epoch": 1.4752037795826711, + "grad_norm": 5.8125, + "learning_rate": 1.699325466308418e-06, + "loss": 0.81828022, + "memory(GiB)": 147.13, + "step": 63230, + "train_speed(iter/s)": 0.200946 + }, + { + "acc": 0.77053213, + "epoch": 1.47543708715496, + "grad_norm": 4.21875, + "learning_rate": 1.6979067012756888e-06, + "loss": 0.83672915, + "memory(GiB)": 147.13, + "step": 63240, + "train_speed(iter/s)": 0.200962 + }, + { + "acc": 0.78983154, + "epoch": 1.475670394727249, + "grad_norm": 5.1875, + "learning_rate": 1.6964884076158194e-06, + "loss": 0.76191545, + "memory(GiB)": 147.13, + "step": 63250, + "train_speed(iter/s)": 0.200978 + }, + { + "acc": 0.75502505, + "epoch": 1.4759037022995378, + "grad_norm": 5.3125, + "learning_rate": 1.6950705855312677e-06, + "loss": 0.89188004, + "memory(GiB)": 147.13, + "step": 63260, + "train_speed(iter/s)": 0.200994 + }, + { + "acc": 0.7857728, + "epoch": 1.4761370098718267, + "grad_norm": 5.21875, + "learning_rate": 1.6936532352244316e-06, + "loss": 0.75499687, + "memory(GiB)": 147.13, + "step": 63270, + "train_speed(iter/s)": 0.201011 + }, + { + "acc": 0.78429489, + "epoch": 1.4763703174441156, + "grad_norm": 5.625, + "learning_rate": 1.6922363568976347e-06, + "loss": 0.78085718, + "memory(GiB)": 147.13, + "step": 63280, + "train_speed(iter/s)": 0.201027 + }, + { + "acc": 0.78143988, + "epoch": 1.4766036250164045, + "grad_norm": 5.75, + "learning_rate": 1.690819950753138e-06, + "loss": 0.77364426, + "memory(GiB)": 147.13, + "step": 63290, + "train_speed(iter/s)": 0.201044 + }, + { + "acc": 0.7872983, + "epoch": 1.4768369325886934, + "grad_norm": 4.6875, + "learning_rate": 1.6894040169931303e-06, + "loss": 0.75831628, + "memory(GiB)": 147.13, + "step": 63300, + "train_speed(iter/s)": 0.201059 + }, + { + "acc": 0.7711751, + "epoch": 1.477070240160982, + "grad_norm": 4.875, + "learning_rate": 1.6879885558197395e-06, + "loss": 0.80848656, + "memory(GiB)": 147.13, + "step": 63310, + "train_speed(iter/s)": 0.201076 + }, + { + "acc": 0.78801699, + "epoch": 1.4773035477332712, + "grad_norm": 4.125, + "learning_rate": 1.6865735674350198e-06, + "loss": 0.74979692, + "memory(GiB)": 147.13, + "step": 63320, + "train_speed(iter/s)": 0.201093 + }, + { + "acc": 0.8035552, + "epoch": 1.47753685530556, + "grad_norm": 4.4375, + "learning_rate": 1.6851590520409611e-06, + "loss": 0.69476547, + "memory(GiB)": 147.13, + "step": 63330, + "train_speed(iter/s)": 0.201108 + }, + { + "acc": 0.78817482, + "epoch": 1.477770162877849, + "grad_norm": 6.46875, + "learning_rate": 1.6837450098394848e-06, + "loss": 0.76696267, + "memory(GiB)": 147.13, + "step": 63340, + "train_speed(iter/s)": 0.201124 + }, + { + "acc": 0.79101725, + "epoch": 1.4780034704501377, + "grad_norm": 5.46875, + "learning_rate": 1.6823314410324426e-06, + "loss": 0.74547434, + "memory(GiB)": 147.13, + "step": 63350, + "train_speed(iter/s)": 0.201141 + }, + { + "acc": 0.78399458, + "epoch": 1.4782367780224268, + "grad_norm": 9.375, + "learning_rate": 1.680918345821626e-06, + "loss": 0.78179579, + "memory(GiB)": 147.13, + "step": 63360, + "train_speed(iter/s)": 0.201158 + }, + { + "acc": 0.80156345, + "epoch": 1.4784700855947155, + "grad_norm": 5.4375, + "learning_rate": 1.6795057244087493e-06, + "loss": 0.70179181, + "memory(GiB)": 147.13, + "step": 63370, + "train_speed(iter/s)": 0.201173 + }, + { + "acc": 0.79969625, + "epoch": 1.4787033931670046, + "grad_norm": 7.09375, + "learning_rate": 1.678093576995467e-06, + "loss": 0.71199322, + "memory(GiB)": 147.13, + "step": 63380, + "train_speed(iter/s)": 0.201189 + }, + { + "acc": 0.77803707, + "epoch": 1.4789367007392933, + "grad_norm": 5.8125, + "learning_rate": 1.676681903783362e-06, + "loss": 0.77450371, + "memory(GiB)": 147.13, + "step": 63390, + "train_speed(iter/s)": 0.201204 + }, + { + "acc": 0.79645801, + "epoch": 1.4791700083115822, + "grad_norm": 5.0, + "learning_rate": 1.6752707049739487e-06, + "loss": 0.72130475, + "memory(GiB)": 147.13, + "step": 63400, + "train_speed(iter/s)": 0.201221 + }, + { + "acc": 0.78377075, + "epoch": 1.479403315883871, + "grad_norm": 5.9375, + "learning_rate": 1.6738599807686774e-06, + "loss": 0.78707418, + "memory(GiB)": 147.13, + "step": 63410, + "train_speed(iter/s)": 0.201237 + }, + { + "acc": 0.7851202, + "epoch": 1.47963662345616, + "grad_norm": 5.53125, + "learning_rate": 1.6724497313689258e-06, + "loss": 0.76640143, + "memory(GiB)": 147.13, + "step": 63420, + "train_speed(iter/s)": 0.201253 + }, + { + "acc": 0.77378364, + "epoch": 1.4798699310284489, + "grad_norm": 4.5, + "learning_rate": 1.6710399569760105e-06, + "loss": 0.81367893, + "memory(GiB)": 147.13, + "step": 63430, + "train_speed(iter/s)": 0.201268 + }, + { + "acc": 0.77930741, + "epoch": 1.4801032386007378, + "grad_norm": 5.65625, + "learning_rate": 1.669630657791174e-06, + "loss": 0.77642231, + "memory(GiB)": 147.13, + "step": 63440, + "train_speed(iter/s)": 0.201284 + }, + { + "acc": 0.773598, + "epoch": 1.4803365461730267, + "grad_norm": 5.5, + "learning_rate": 1.6682218340155936e-06, + "loss": 0.80465155, + "memory(GiB)": 147.13, + "step": 63450, + "train_speed(iter/s)": 0.201302 + }, + { + "acc": 0.76938334, + "epoch": 1.4805698537453156, + "grad_norm": 6.1875, + "learning_rate": 1.666813485850377e-06, + "loss": 0.82184029, + "memory(GiB)": 147.13, + "step": 63460, + "train_speed(iter/s)": 0.201318 + }, + { + "acc": 0.77726431, + "epoch": 1.4808031613176045, + "grad_norm": 6.25, + "learning_rate": 1.665405613496569e-06, + "loss": 0.81066351, + "memory(GiB)": 147.13, + "step": 63470, + "train_speed(iter/s)": 0.201334 + }, + { + "acc": 0.75288639, + "epoch": 1.4810364688898934, + "grad_norm": 6.03125, + "learning_rate": 1.6639982171551405e-06, + "loss": 0.89472895, + "memory(GiB)": 147.13, + "step": 63480, + "train_speed(iter/s)": 0.20135 + }, + { + "acc": 0.77361879, + "epoch": 1.4812697764621823, + "grad_norm": 7.03125, + "learning_rate": 1.6625912970269958e-06, + "loss": 0.80337019, + "memory(GiB)": 147.13, + "step": 63490, + "train_speed(iter/s)": 0.201367 + }, + { + "acc": 0.77668934, + "epoch": 1.4815030840344712, + "grad_norm": 4.65625, + "learning_rate": 1.6611848533129754e-06, + "loss": 0.79210997, + "memory(GiB)": 147.13, + "step": 63500, + "train_speed(iter/s)": 0.201385 + }, + { + "epoch": 1.4815030840344712, + "eval_acc": 0.7447049510671291, + "eval_loss": 0.8045029044151306, + "eval_runtime": 1268.8732, + "eval_samples_per_second": 28.365, + "eval_steps_per_second": 14.183, + "step": 63500 + }, + { + "acc": 0.77970753, + "epoch": 1.48173639160676, + "grad_norm": 3.75, + "learning_rate": 1.6597788862138458e-06, + "loss": 0.78724799, + "memory(GiB)": 147.13, + "step": 63510, + "train_speed(iter/s)": 0.200581 + }, + { + "acc": 0.78490286, + "epoch": 1.481969699179049, + "grad_norm": 4.5625, + "learning_rate": 1.6583733959303116e-06, + "loss": 0.7604619, + "memory(GiB)": 147.13, + "step": 63520, + "train_speed(iter/s)": 0.200596 + }, + { + "acc": 0.77267179, + "epoch": 1.4822030067513379, + "grad_norm": 4.78125, + "learning_rate": 1.6569683826630045e-06, + "loss": 0.80196037, + "memory(GiB)": 147.13, + "step": 63530, + "train_speed(iter/s)": 0.200613 + }, + { + "acc": 0.78124847, + "epoch": 1.4824363143236268, + "grad_norm": 4.84375, + "learning_rate": 1.6555638466124878e-06, + "loss": 0.80149517, + "memory(GiB)": 147.13, + "step": 63540, + "train_speed(iter/s)": 0.200628 + }, + { + "acc": 0.77615232, + "epoch": 1.4826696218959157, + "grad_norm": 5.9375, + "learning_rate": 1.654159787979262e-06, + "loss": 0.81325369, + "memory(GiB)": 147.13, + "step": 63550, + "train_speed(iter/s)": 0.200645 + }, + { + "acc": 0.78834553, + "epoch": 1.4829029294682046, + "grad_norm": 6.03125, + "learning_rate": 1.6527562069637543e-06, + "loss": 0.77705812, + "memory(GiB)": 147.13, + "step": 63560, + "train_speed(iter/s)": 0.200662 + }, + { + "acc": 0.76742916, + "epoch": 1.4831362370404935, + "grad_norm": 7.125, + "learning_rate": 1.6513531037663262e-06, + "loss": 0.83876429, + "memory(GiB)": 147.13, + "step": 63570, + "train_speed(iter/s)": 0.200678 + }, + { + "acc": 0.79001141, + "epoch": 1.4833695446127824, + "grad_norm": 5.1875, + "learning_rate": 1.6499504785872679e-06, + "loss": 0.74909034, + "memory(GiB)": 147.13, + "step": 63580, + "train_speed(iter/s)": 0.200693 + }, + { + "acc": 0.79263821, + "epoch": 1.4836028521850713, + "grad_norm": 6.125, + "learning_rate": 1.648548331626807e-06, + "loss": 0.76054592, + "memory(GiB)": 147.13, + "step": 63590, + "train_speed(iter/s)": 0.200708 + }, + { + "acc": 0.78314829, + "epoch": 1.4838361597573602, + "grad_norm": 6.28125, + "learning_rate": 1.6471466630850985e-06, + "loss": 0.78806133, + "memory(GiB)": 147.13, + "step": 63600, + "train_speed(iter/s)": 0.200724 + }, + { + "acc": 0.7892365, + "epoch": 1.484069467329649, + "grad_norm": 4.6875, + "learning_rate": 1.645745473162228e-06, + "loss": 0.74208422, + "memory(GiB)": 147.13, + "step": 63610, + "train_speed(iter/s)": 0.200741 + }, + { + "acc": 0.80071468, + "epoch": 1.484302774901938, + "grad_norm": 4.84375, + "learning_rate": 1.644344762058218e-06, + "loss": 0.71267734, + "memory(GiB)": 147.13, + "step": 63620, + "train_speed(iter/s)": 0.200757 + }, + { + "acc": 0.77488637, + "epoch": 1.4845360824742269, + "grad_norm": 10.1875, + "learning_rate": 1.6429445299730173e-06, + "loss": 0.82684231, + "memory(GiB)": 147.13, + "step": 63630, + "train_speed(iter/s)": 0.200773 + }, + { + "acc": 0.75984926, + "epoch": 1.4847693900465158, + "grad_norm": 5.3125, + "learning_rate": 1.6415447771065112e-06, + "loss": 0.88017397, + "memory(GiB)": 147.13, + "step": 63640, + "train_speed(iter/s)": 0.200789 + }, + { + "acc": 0.76972971, + "epoch": 1.4850026976188047, + "grad_norm": 5.75, + "learning_rate": 1.6401455036585111e-06, + "loss": 0.82038412, + "memory(GiB)": 147.13, + "step": 63650, + "train_speed(iter/s)": 0.200804 + }, + { + "acc": 0.77063775, + "epoch": 1.4852360051910936, + "grad_norm": 6.125, + "learning_rate": 1.6387467098287656e-06, + "loss": 0.82443304, + "memory(GiB)": 147.13, + "step": 63660, + "train_speed(iter/s)": 0.200821 + }, + { + "acc": 0.79164023, + "epoch": 1.4854693127633825, + "grad_norm": 4.4375, + "learning_rate": 1.637348395816951e-06, + "loss": 0.76277428, + "memory(GiB)": 147.13, + "step": 63670, + "train_speed(iter/s)": 0.200838 + }, + { + "acc": 0.80538321, + "epoch": 1.4857026203356711, + "grad_norm": 4.9375, + "learning_rate": 1.635950561822676e-06, + "loss": 0.68404865, + "memory(GiB)": 147.13, + "step": 63680, + "train_speed(iter/s)": 0.200853 + }, + { + "acc": 0.77760887, + "epoch": 1.4859359279079603, + "grad_norm": 6.625, + "learning_rate": 1.6345532080454813e-06, + "loss": 0.80281401, + "memory(GiB)": 147.13, + "step": 63690, + "train_speed(iter/s)": 0.200871 + }, + { + "acc": 0.7884223, + "epoch": 1.486169235480249, + "grad_norm": 7.5625, + "learning_rate": 1.6331563346848366e-06, + "loss": 0.77055836, + "memory(GiB)": 147.13, + "step": 63700, + "train_speed(iter/s)": 0.200887 + }, + { + "acc": 0.78238873, + "epoch": 1.486402543052538, + "grad_norm": 4.75, + "learning_rate": 1.6317599419401486e-06, + "loss": 0.78170815, + "memory(GiB)": 147.13, + "step": 63710, + "train_speed(iter/s)": 0.200904 + }, + { + "acc": 0.77347994, + "epoch": 1.4866358506248267, + "grad_norm": 8.0, + "learning_rate": 1.6303640300107493e-06, + "loss": 0.79208412, + "memory(GiB)": 147.13, + "step": 63720, + "train_speed(iter/s)": 0.200919 + }, + { + "acc": 0.7714962, + "epoch": 1.4868691581971158, + "grad_norm": 5.5, + "learning_rate": 1.628968599095907e-06, + "loss": 0.81774483, + "memory(GiB)": 147.13, + "step": 63730, + "train_speed(iter/s)": 0.200936 + }, + { + "acc": 0.78385878, + "epoch": 1.4871024657694045, + "grad_norm": 5.875, + "learning_rate": 1.6275736493948174e-06, + "loss": 0.77382607, + "memory(GiB)": 147.13, + "step": 63740, + "train_speed(iter/s)": 0.200951 + }, + { + "acc": 0.77496548, + "epoch": 1.4873357733416936, + "grad_norm": 4.40625, + "learning_rate": 1.626179181106609e-06, + "loss": 0.82866879, + "memory(GiB)": 147.13, + "step": 63750, + "train_speed(iter/s)": 0.200966 + }, + { + "acc": 0.78694777, + "epoch": 1.4875690809139823, + "grad_norm": 7.8125, + "learning_rate": 1.6247851944303433e-06, + "loss": 0.7435267, + "memory(GiB)": 147.13, + "step": 63760, + "train_speed(iter/s)": 0.200983 + }, + { + "acc": 0.7818573, + "epoch": 1.4878023884862714, + "grad_norm": 4.875, + "learning_rate": 1.6233916895650093e-06, + "loss": 0.77421417, + "memory(GiB)": 147.13, + "step": 63770, + "train_speed(iter/s)": 0.200999 + }, + { + "acc": 0.77949824, + "epoch": 1.4880356960585601, + "grad_norm": 5.84375, + "learning_rate": 1.6219986667095323e-06, + "loss": 0.81707954, + "memory(GiB)": 147.13, + "step": 63780, + "train_speed(iter/s)": 0.201017 + }, + { + "acc": 0.77718635, + "epoch": 1.488269003630849, + "grad_norm": 5.8125, + "learning_rate": 1.6206061260627643e-06, + "loss": 0.80593338, + "memory(GiB)": 147.13, + "step": 63790, + "train_speed(iter/s)": 0.201034 + }, + { + "acc": 0.79007106, + "epoch": 1.488502311203138, + "grad_norm": 6.40625, + "learning_rate": 1.6192140678234903e-06, + "loss": 0.75823188, + "memory(GiB)": 147.13, + "step": 63800, + "train_speed(iter/s)": 0.201051 + }, + { + "acc": 0.76932173, + "epoch": 1.4887356187754268, + "grad_norm": 6.09375, + "learning_rate": 1.617822492190424e-06, + "loss": 0.82226, + "memory(GiB)": 147.13, + "step": 63810, + "train_speed(iter/s)": 0.201068 + }, + { + "acc": 0.77304816, + "epoch": 1.4889689263477157, + "grad_norm": 4.78125, + "learning_rate": 1.616431399362216e-06, + "loss": 0.80309229, + "memory(GiB)": 147.13, + "step": 63820, + "train_speed(iter/s)": 0.201084 + }, + { + "acc": 0.79105062, + "epoch": 1.4892022339200046, + "grad_norm": 4.8125, + "learning_rate": 1.615040789537443e-06, + "loss": 0.73891435, + "memory(GiB)": 147.13, + "step": 63830, + "train_speed(iter/s)": 0.201101 + }, + { + "acc": 0.78510084, + "epoch": 1.4894355414922935, + "grad_norm": 6.0625, + "learning_rate": 1.6136506629146125e-06, + "loss": 0.76382818, + "memory(GiB)": 147.13, + "step": 63840, + "train_speed(iter/s)": 0.201118 + }, + { + "acc": 0.78627157, + "epoch": 1.4896688490645824, + "grad_norm": 4.84375, + "learning_rate": 1.6122610196921673e-06, + "loss": 0.75550165, + "memory(GiB)": 147.13, + "step": 63850, + "train_speed(iter/s)": 0.201135 + }, + { + "acc": 0.78514376, + "epoch": 1.4899021566368713, + "grad_norm": 6.1875, + "learning_rate": 1.6108718600684764e-06, + "loss": 0.78006501, + "memory(GiB)": 147.13, + "step": 63860, + "train_speed(iter/s)": 0.201151 + }, + { + "acc": 0.76863089, + "epoch": 1.4901354642091602, + "grad_norm": 4.71875, + "learning_rate": 1.609483184241844e-06, + "loss": 0.81679535, + "memory(GiB)": 147.13, + "step": 63870, + "train_speed(iter/s)": 0.201168 + }, + { + "acc": 0.79790592, + "epoch": 1.490368771781449, + "grad_norm": 5.5, + "learning_rate": 1.6080949924105022e-06, + "loss": 0.71203775, + "memory(GiB)": 147.13, + "step": 63880, + "train_speed(iter/s)": 0.201184 + }, + { + "acc": 0.77048893, + "epoch": 1.490602079353738, + "grad_norm": 6.25, + "learning_rate": 1.6067072847726134e-06, + "loss": 0.81707821, + "memory(GiB)": 147.13, + "step": 63890, + "train_speed(iter/s)": 0.2012 + }, + { + "acc": 0.75077691, + "epoch": 1.490835386926027, + "grad_norm": 4.84375, + "learning_rate": 1.605320061526277e-06, + "loss": 0.89881153, + "memory(GiB)": 147.13, + "step": 63900, + "train_speed(iter/s)": 0.201217 + }, + { + "acc": 0.81723709, + "epoch": 1.4910686944983158, + "grad_norm": 4.75, + "learning_rate": 1.6039333228695132e-06, + "loss": 0.65049648, + "memory(GiB)": 147.13, + "step": 63910, + "train_speed(iter/s)": 0.201232 + }, + { + "acc": 0.78162031, + "epoch": 1.4913020020706047, + "grad_norm": 6.34375, + "learning_rate": 1.6025470690002815e-06, + "loss": 0.77352018, + "memory(GiB)": 147.13, + "step": 63920, + "train_speed(iter/s)": 0.201249 + }, + { + "acc": 0.76750431, + "epoch": 1.4915353096428936, + "grad_norm": 4.40625, + "learning_rate": 1.6011613001164677e-06, + "loss": 0.84007797, + "memory(GiB)": 147.13, + "step": 63930, + "train_speed(iter/s)": 0.201265 + }, + { + "acc": 0.78385334, + "epoch": 1.4917686172151825, + "grad_norm": 6.71875, + "learning_rate": 1.5997760164158927e-06, + "loss": 0.77498088, + "memory(GiB)": 147.13, + "step": 63940, + "train_speed(iter/s)": 0.20128 + }, + { + "acc": 0.78162289, + "epoch": 1.4920019247874714, + "grad_norm": 4.8125, + "learning_rate": 1.5983912180963012e-06, + "loss": 0.76992779, + "memory(GiB)": 147.13, + "step": 63950, + "train_speed(iter/s)": 0.201296 + }, + { + "acc": 0.78982821, + "epoch": 1.4922352323597603, + "grad_norm": 5.03125, + "learning_rate": 1.5970069053553776e-06, + "loss": 0.74374228, + "memory(GiB)": 147.13, + "step": 63960, + "train_speed(iter/s)": 0.201313 + }, + { + "acc": 0.79276485, + "epoch": 1.4924685399320492, + "grad_norm": 5.46875, + "learning_rate": 1.5956230783907294e-06, + "loss": 0.72949905, + "memory(GiB)": 147.13, + "step": 63970, + "train_speed(iter/s)": 0.20133 + }, + { + "acc": 0.77740602, + "epoch": 1.492701847504338, + "grad_norm": 5.75, + "learning_rate": 1.5942397373998959e-06, + "loss": 0.79231834, + "memory(GiB)": 147.13, + "step": 63980, + "train_speed(iter/s)": 0.201346 + }, + { + "acc": 0.78017025, + "epoch": 1.492935155076627, + "grad_norm": 5.4375, + "learning_rate": 1.5928568825803526e-06, + "loss": 0.78005562, + "memory(GiB)": 147.13, + "step": 63990, + "train_speed(iter/s)": 0.201364 + }, + { + "acc": 0.78059478, + "epoch": 1.493168462648916, + "grad_norm": 7.40625, + "learning_rate": 1.5914745141294974e-06, + "loss": 0.76288352, + "memory(GiB)": 147.13, + "step": 64000, + "train_speed(iter/s)": 0.20138 + }, + { + "epoch": 1.493168462648916, + "eval_acc": 0.7446312847202705, + "eval_loss": 0.8045158982276917, + "eval_runtime": 1269.109, + "eval_samples_per_second": 28.359, + "eval_steps_per_second": 14.18, + "step": 64000 + }, + { + "acc": 0.80024071, + "epoch": 1.4934017702212048, + "grad_norm": 6.28125, + "learning_rate": 1.5900926322446686e-06, + "loss": 0.74026098, + "memory(GiB)": 147.13, + "step": 64010, + "train_speed(iter/s)": 0.200581 + }, + { + "acc": 0.78730555, + "epoch": 1.4936350777934937, + "grad_norm": 5.5, + "learning_rate": 1.5887112371231227e-06, + "loss": 0.75264711, + "memory(GiB)": 147.13, + "step": 64020, + "train_speed(iter/s)": 0.200598 + }, + { + "acc": 0.76471243, + "epoch": 1.4938683853657826, + "grad_norm": 5.875, + "learning_rate": 1.5873303289620585e-06, + "loss": 0.85071449, + "memory(GiB)": 147.13, + "step": 64030, + "train_speed(iter/s)": 0.200613 + }, + { + "acc": 0.77108183, + "epoch": 1.4941016929380715, + "grad_norm": 8.1875, + "learning_rate": 1.5859499079585982e-06, + "loss": 0.81153488, + "memory(GiB)": 147.13, + "step": 64040, + "train_speed(iter/s)": 0.20063 + }, + { + "acc": 0.78847384, + "epoch": 1.4943350005103604, + "grad_norm": 5.84375, + "learning_rate": 1.5845699743097953e-06, + "loss": 0.75993419, + "memory(GiB)": 147.13, + "step": 64050, + "train_speed(iter/s)": 0.200644 + }, + { + "acc": 0.78438349, + "epoch": 1.4945683080826493, + "grad_norm": 5.28125, + "learning_rate": 1.583190528212638e-06, + "loss": 0.78886342, + "memory(GiB)": 147.13, + "step": 64060, + "train_speed(iter/s)": 0.200661 + }, + { + "acc": 0.78701124, + "epoch": 1.494801615654938, + "grad_norm": 5.8125, + "learning_rate": 1.5818115698640386e-06, + "loss": 0.77774706, + "memory(GiB)": 147.13, + "step": 64070, + "train_speed(iter/s)": 0.200679 + }, + { + "acc": 0.76535654, + "epoch": 1.495034923227227, + "grad_norm": 6.75, + "learning_rate": 1.5804330994608463e-06, + "loss": 0.84333496, + "memory(GiB)": 147.13, + "step": 64080, + "train_speed(iter/s)": 0.200696 + }, + { + "acc": 0.79128575, + "epoch": 1.4952682307995158, + "grad_norm": 5.46875, + "learning_rate": 1.5790551171998337e-06, + "loss": 0.75358858, + "memory(GiB)": 147.13, + "step": 64090, + "train_speed(iter/s)": 0.200712 + }, + { + "acc": 0.77005005, + "epoch": 1.4955015383718049, + "grad_norm": 7.6875, + "learning_rate": 1.5776776232777114e-06, + "loss": 0.84556427, + "memory(GiB)": 147.13, + "step": 64100, + "train_speed(iter/s)": 0.200728 + }, + { + "acc": 0.77952299, + "epoch": 1.4957348459440936, + "grad_norm": 9.6875, + "learning_rate": 1.5763006178911139e-06, + "loss": 0.80671349, + "memory(GiB)": 147.13, + "step": 64110, + "train_speed(iter/s)": 0.200744 + }, + { + "acc": 0.77896752, + "epoch": 1.4959681535163827, + "grad_norm": 5.4375, + "learning_rate": 1.5749241012366068e-06, + "loss": 0.78669481, + "memory(GiB)": 147.13, + "step": 64120, + "train_speed(iter/s)": 0.20076 + }, + { + "acc": 0.7698513, + "epoch": 1.4962014610886714, + "grad_norm": 5.8125, + "learning_rate": 1.5735480735106927e-06, + "loss": 0.82209358, + "memory(GiB)": 147.13, + "step": 64130, + "train_speed(iter/s)": 0.200776 + }, + { + "acc": 0.78666344, + "epoch": 1.4964347686609605, + "grad_norm": 5.03125, + "learning_rate": 1.5721725349097926e-06, + "loss": 0.74558525, + "memory(GiB)": 147.13, + "step": 64140, + "train_speed(iter/s)": 0.200792 + }, + { + "acc": 0.79377465, + "epoch": 1.4966680762332492, + "grad_norm": 4.40625, + "learning_rate": 1.570797485630269e-06, + "loss": 0.75561595, + "memory(GiB)": 147.13, + "step": 64150, + "train_speed(iter/s)": 0.20081 + }, + { + "acc": 0.77909813, + "epoch": 1.4969013838055383, + "grad_norm": 6.9375, + "learning_rate": 1.5694229258684063e-06, + "loss": 0.77388506, + "memory(GiB)": 147.13, + "step": 64160, + "train_speed(iter/s)": 0.200827 + }, + { + "acc": 0.77309704, + "epoch": 1.497134691377827, + "grad_norm": 4.5625, + "learning_rate": 1.5680488558204259e-06, + "loss": 0.82009621, + "memory(GiB)": 147.13, + "step": 64170, + "train_speed(iter/s)": 0.200841 + }, + { + "acc": 0.77360954, + "epoch": 1.4973679989501159, + "grad_norm": 5.03125, + "learning_rate": 1.566675275682475e-06, + "loss": 0.80817747, + "memory(GiB)": 147.13, + "step": 64180, + "train_speed(iter/s)": 0.200858 + }, + { + "acc": 0.78091269, + "epoch": 1.4976013065224048, + "grad_norm": 4.25, + "learning_rate": 1.565302185650629e-06, + "loss": 0.78140159, + "memory(GiB)": 147.13, + "step": 64190, + "train_speed(iter/s)": 0.200874 + }, + { + "acc": 0.79408712, + "epoch": 1.4978346140946937, + "grad_norm": 5.4375, + "learning_rate": 1.5639295859208998e-06, + "loss": 0.74546127, + "memory(GiB)": 147.13, + "step": 64200, + "train_speed(iter/s)": 0.20089 + }, + { + "acc": 0.79185085, + "epoch": 1.4980679216669825, + "grad_norm": 6.96875, + "learning_rate": 1.562557476689222e-06, + "loss": 0.7497364, + "memory(GiB)": 147.13, + "step": 64210, + "train_speed(iter/s)": 0.200906 + }, + { + "acc": 0.79052382, + "epoch": 1.4983012292392714, + "grad_norm": 4.75, + "learning_rate": 1.5611858581514683e-06, + "loss": 0.75888486, + "memory(GiB)": 147.13, + "step": 64220, + "train_speed(iter/s)": 0.200923 + }, + { + "acc": 0.78845158, + "epoch": 1.4985345368115603, + "grad_norm": 4.125, + "learning_rate": 1.559814730503434e-06, + "loss": 0.7904933, + "memory(GiB)": 147.13, + "step": 64230, + "train_speed(iter/s)": 0.200939 + }, + { + "acc": 0.77261376, + "epoch": 1.4987678443838492, + "grad_norm": 5.4375, + "learning_rate": 1.5584440939408473e-06, + "loss": 0.82468948, + "memory(GiB)": 147.13, + "step": 64240, + "train_speed(iter/s)": 0.200956 + }, + { + "acc": 0.76977367, + "epoch": 1.4990011519561381, + "grad_norm": 5.84375, + "learning_rate": 1.557073948659365e-06, + "loss": 0.83275337, + "memory(GiB)": 147.13, + "step": 64250, + "train_speed(iter/s)": 0.200972 + }, + { + "acc": 0.79217887, + "epoch": 1.499234459528427, + "grad_norm": 5.9375, + "learning_rate": 1.555704294854578e-06, + "loss": 0.74504166, + "memory(GiB)": 147.13, + "step": 64260, + "train_speed(iter/s)": 0.200988 + }, + { + "acc": 0.7652029, + "epoch": 1.499467767100716, + "grad_norm": 5.9375, + "learning_rate": 1.5543351327220025e-06, + "loss": 0.84907303, + "memory(GiB)": 147.13, + "step": 64270, + "train_speed(iter/s)": 0.201003 + }, + { + "acc": 0.79375896, + "epoch": 1.4997010746730048, + "grad_norm": 5.5, + "learning_rate": 1.5529664624570839e-06, + "loss": 0.72161312, + "memory(GiB)": 147.13, + "step": 64280, + "train_speed(iter/s)": 0.20102 + }, + { + "acc": 0.78259649, + "epoch": 1.4999343822452937, + "grad_norm": 6.53125, + "learning_rate": 1.551598284255203e-06, + "loss": 0.79077225, + "memory(GiB)": 147.13, + "step": 64290, + "train_speed(iter/s)": 0.201036 + }, + { + "acc": 0.78028545, + "epoch": 1.5001676898175826, + "grad_norm": 5.375, + "learning_rate": 1.550230598311664e-06, + "loss": 0.81074438, + "memory(GiB)": 147.13, + "step": 64300, + "train_speed(iter/s)": 0.201053 + }, + { + "acc": 0.76743765, + "epoch": 1.5004009973898715, + "grad_norm": 4.21875, + "learning_rate": 1.548863404821706e-06, + "loss": 0.83184929, + "memory(GiB)": 147.13, + "step": 64310, + "train_speed(iter/s)": 0.201069 + }, + { + "acc": 0.80110559, + "epoch": 1.5006343049621604, + "grad_norm": 11.0, + "learning_rate": 1.547496703980495e-06, + "loss": 0.71932917, + "memory(GiB)": 147.13, + "step": 64320, + "train_speed(iter/s)": 0.201085 + }, + { + "acc": 0.77408819, + "epoch": 1.5008676125344493, + "grad_norm": 5.0, + "learning_rate": 1.5461304959831248e-06, + "loss": 0.81594849, + "memory(GiB)": 147.13, + "step": 64330, + "train_speed(iter/s)": 0.201101 + }, + { + "acc": 0.80996132, + "epoch": 1.5011009201067382, + "grad_norm": 5.53125, + "learning_rate": 1.5447647810246241e-06, + "loss": 0.68073368, + "memory(GiB)": 147.13, + "step": 64340, + "train_speed(iter/s)": 0.201116 + }, + { + "acc": 0.77236247, + "epoch": 1.5013342276790271, + "grad_norm": 7.9375, + "learning_rate": 1.5433995592999457e-06, + "loss": 0.79192848, + "memory(GiB)": 147.13, + "step": 64350, + "train_speed(iter/s)": 0.201132 + }, + { + "acc": 0.78151517, + "epoch": 1.501567535251316, + "grad_norm": 4.4375, + "learning_rate": 1.5420348310039796e-06, + "loss": 0.79086199, + "memory(GiB)": 147.13, + "step": 64360, + "train_speed(iter/s)": 0.201149 + }, + { + "acc": 0.77429748, + "epoch": 1.501800842823605, + "grad_norm": 5.78125, + "learning_rate": 1.5406705963315333e-06, + "loss": 0.80469446, + "memory(GiB)": 147.13, + "step": 64370, + "train_speed(iter/s)": 0.201165 + }, + { + "acc": 0.78121309, + "epoch": 1.5020341503958938, + "grad_norm": 5.125, + "learning_rate": 1.539306855477356e-06, + "loss": 0.7872479, + "memory(GiB)": 147.13, + "step": 64380, + "train_speed(iter/s)": 0.201181 + }, + { + "acc": 0.78482914, + "epoch": 1.5022674579681827, + "grad_norm": 4.875, + "learning_rate": 1.5379436086361187e-06, + "loss": 0.77998705, + "memory(GiB)": 147.13, + "step": 64390, + "train_speed(iter/s)": 0.201197 + }, + { + "acc": 0.76260514, + "epoch": 1.5025007655404716, + "grad_norm": 4.03125, + "learning_rate": 1.5365808560024264e-06, + "loss": 0.84974766, + "memory(GiB)": 147.13, + "step": 64400, + "train_speed(iter/s)": 0.201214 + }, + { + "acc": 0.79643979, + "epoch": 1.5027340731127605, + "grad_norm": 8.0625, + "learning_rate": 1.5352185977708112e-06, + "loss": 0.72695284, + "memory(GiB)": 147.13, + "step": 64410, + "train_speed(iter/s)": 0.201229 + }, + { + "acc": 0.80471954, + "epoch": 1.5029673806850492, + "grad_norm": 4.28125, + "learning_rate": 1.533856834135733e-06, + "loss": 0.71714687, + "memory(GiB)": 147.13, + "step": 64420, + "train_speed(iter/s)": 0.201244 + }, + { + "acc": 0.77157187, + "epoch": 1.5032006882573383, + "grad_norm": 6.53125, + "learning_rate": 1.532495565291587e-06, + "loss": 0.81608639, + "memory(GiB)": 147.13, + "step": 64430, + "train_speed(iter/s)": 0.20126 + }, + { + "acc": 0.79674616, + "epoch": 1.503433995829627, + "grad_norm": 6.09375, + "learning_rate": 1.5311347914326891e-06, + "loss": 0.72117443, + "memory(GiB)": 147.13, + "step": 64440, + "train_speed(iter/s)": 0.201276 + }, + { + "acc": 0.77385588, + "epoch": 1.5036673034019161, + "grad_norm": 3.9375, + "learning_rate": 1.5297745127532942e-06, + "loss": 0.83259687, + "memory(GiB)": 147.13, + "step": 64450, + "train_speed(iter/s)": 0.201292 + }, + { + "acc": 0.76449928, + "epoch": 1.5039006109742048, + "grad_norm": 4.71875, + "learning_rate": 1.5284147294475792e-06, + "loss": 0.84181862, + "memory(GiB)": 147.13, + "step": 64460, + "train_speed(iter/s)": 0.201307 + }, + { + "acc": 0.79227133, + "epoch": 1.504133918546494, + "grad_norm": 5.03125, + "learning_rate": 1.5270554417096533e-06, + "loss": 0.75533457, + "memory(GiB)": 147.13, + "step": 64470, + "train_speed(iter/s)": 0.201322 + }, + { + "acc": 0.77802868, + "epoch": 1.5043672261187826, + "grad_norm": 5.6875, + "learning_rate": 1.5256966497335541e-06, + "loss": 0.78939829, + "memory(GiB)": 147.13, + "step": 64480, + "train_speed(iter/s)": 0.201339 + }, + { + "acc": 0.77074051, + "epoch": 1.5046005336910717, + "grad_norm": 5.875, + "learning_rate": 1.5243383537132473e-06, + "loss": 0.83445225, + "memory(GiB)": 147.13, + "step": 64490, + "train_speed(iter/s)": 0.201355 + }, + { + "acc": 0.78632889, + "epoch": 1.5048338412633604, + "grad_norm": 4.84375, + "learning_rate": 1.5229805538426323e-06, + "loss": 0.76857004, + "memory(GiB)": 147.13, + "step": 64500, + "train_speed(iter/s)": 0.201371 + }, + { + "epoch": 1.5048338412633604, + "eval_acc": 0.7446565875089741, + "eval_loss": 0.8044771552085876, + "eval_runtime": 1269.4786, + "eval_samples_per_second": 28.351, + "eval_steps_per_second": 14.176, + "step": 64500 + }, + { + "acc": 0.79818277, + "epoch": 1.5050671488356495, + "grad_norm": 6.90625, + "learning_rate": 1.5216232503155314e-06, + "loss": 0.71413069, + "memory(GiB)": 147.13, + "step": 64510, + "train_speed(iter/s)": 0.200578 + }, + { + "acc": 0.79765759, + "epoch": 1.5053004564079382, + "grad_norm": 4.875, + "learning_rate": 1.520266443325703e-06, + "loss": 0.70702105, + "memory(GiB)": 147.13, + "step": 64520, + "train_speed(iter/s)": 0.200594 + }, + { + "acc": 0.77393379, + "epoch": 1.5055337639802273, + "grad_norm": 6.03125, + "learning_rate": 1.5189101330668288e-06, + "loss": 0.82264185, + "memory(GiB)": 147.13, + "step": 64530, + "train_speed(iter/s)": 0.200609 + }, + { + "acc": 0.78522334, + "epoch": 1.505767071552516, + "grad_norm": 5.59375, + "learning_rate": 1.5175543197325205e-06, + "loss": 0.76188078, + "memory(GiB)": 147.13, + "step": 64540, + "train_speed(iter/s)": 0.200625 + }, + { + "acc": 0.78678837, + "epoch": 1.506000379124805, + "grad_norm": 6.25, + "learning_rate": 1.5161990035163226e-06, + "loss": 0.77836008, + "memory(GiB)": 147.13, + "step": 64550, + "train_speed(iter/s)": 0.20064 + }, + { + "acc": 0.76740236, + "epoch": 1.5062336866970938, + "grad_norm": 5.71875, + "learning_rate": 1.5148441846117035e-06, + "loss": 0.8500349, + "memory(GiB)": 147.13, + "step": 64560, + "train_speed(iter/s)": 0.200657 + }, + { + "acc": 0.77085843, + "epoch": 1.506466994269383, + "grad_norm": 6.15625, + "learning_rate": 1.5134898632120659e-06, + "loss": 0.80457802, + "memory(GiB)": 147.13, + "step": 64570, + "train_speed(iter/s)": 0.200672 + }, + { + "acc": 0.78143206, + "epoch": 1.5067003018416716, + "grad_norm": 6.34375, + "learning_rate": 1.5121360395107366e-06, + "loss": 0.80713491, + "memory(GiB)": 147.13, + "step": 64580, + "train_speed(iter/s)": 0.200689 + }, + { + "acc": 0.7703723, + "epoch": 1.5069336094139607, + "grad_norm": 4.34375, + "learning_rate": 1.5107827137009772e-06, + "loss": 0.83621893, + "memory(GiB)": 147.13, + "step": 64590, + "train_speed(iter/s)": 0.200705 + }, + { + "acc": 0.78048077, + "epoch": 1.5071669169862494, + "grad_norm": 5.03125, + "learning_rate": 1.509429885975968e-06, + "loss": 0.77270036, + "memory(GiB)": 147.13, + "step": 64600, + "train_speed(iter/s)": 0.200722 + }, + { + "acc": 0.78544827, + "epoch": 1.5074002245585383, + "grad_norm": 5.78125, + "learning_rate": 1.5080775565288314e-06, + "loss": 0.76025243, + "memory(GiB)": 147.13, + "step": 64610, + "train_speed(iter/s)": 0.200738 + }, + { + "acc": 0.77681675, + "epoch": 1.5076335321308272, + "grad_norm": 5.21875, + "learning_rate": 1.5067257255526085e-06, + "loss": 0.79332089, + "memory(GiB)": 147.13, + "step": 64620, + "train_speed(iter/s)": 0.200753 + }, + { + "acc": 0.79631433, + "epoch": 1.507866839703116, + "grad_norm": 7.15625, + "learning_rate": 1.505374393240272e-06, + "loss": 0.71731257, + "memory(GiB)": 147.13, + "step": 64630, + "train_speed(iter/s)": 0.200769 + }, + { + "acc": 0.7795043, + "epoch": 1.508100147275405, + "grad_norm": 6.6875, + "learning_rate": 1.5040235597847268e-06, + "loss": 0.79805608, + "memory(GiB)": 147.13, + "step": 64640, + "train_speed(iter/s)": 0.200785 + }, + { + "acc": 0.77450428, + "epoch": 1.5083334548476939, + "grad_norm": 6.15625, + "learning_rate": 1.5026732253788018e-06, + "loss": 0.82841396, + "memory(GiB)": 147.13, + "step": 64650, + "train_speed(iter/s)": 0.2008 + }, + { + "acc": 0.78522072, + "epoch": 1.5085667624199828, + "grad_norm": 5.3125, + "learning_rate": 1.501323390215259e-06, + "loss": 0.77809391, + "memory(GiB)": 147.13, + "step": 64660, + "train_speed(iter/s)": 0.200816 + }, + { + "acc": 0.77315788, + "epoch": 1.5088000699922717, + "grad_norm": 6.84375, + "learning_rate": 1.4999740544867864e-06, + "loss": 0.82334805, + "memory(GiB)": 147.13, + "step": 64670, + "train_speed(iter/s)": 0.200833 + }, + { + "acc": 0.77878227, + "epoch": 1.5090333775645606, + "grad_norm": 5.03125, + "learning_rate": 1.498625218385999e-06, + "loss": 0.80296698, + "memory(GiB)": 147.13, + "step": 64680, + "train_speed(iter/s)": 0.200848 + }, + { + "acc": 0.76942329, + "epoch": 1.5092666851368495, + "grad_norm": 6.125, + "learning_rate": 1.4972768821054461e-06, + "loss": 0.82304773, + "memory(GiB)": 147.13, + "step": 64690, + "train_speed(iter/s)": 0.200864 + }, + { + "acc": 0.78032999, + "epoch": 1.5094999927091384, + "grad_norm": 4.96875, + "learning_rate": 1.4959290458376008e-06, + "loss": 0.78106308, + "memory(GiB)": 147.13, + "step": 64700, + "train_speed(iter/s)": 0.20088 + }, + { + "acc": 0.77347794, + "epoch": 1.5097333002814273, + "grad_norm": 6.5, + "learning_rate": 1.494581709774866e-06, + "loss": 0.82325249, + "memory(GiB)": 147.13, + "step": 64710, + "train_speed(iter/s)": 0.200895 + }, + { + "acc": 0.78451128, + "epoch": 1.5099666078537162, + "grad_norm": 5.65625, + "learning_rate": 1.4932348741095726e-06, + "loss": 0.76474838, + "memory(GiB)": 147.13, + "step": 64720, + "train_speed(iter/s)": 0.200911 + }, + { + "acc": 0.78300848, + "epoch": 1.510199915426005, + "grad_norm": 6.125, + "learning_rate": 1.4918885390339837e-06, + "loss": 0.77643466, + "memory(GiB)": 147.13, + "step": 64730, + "train_speed(iter/s)": 0.200926 + }, + { + "acc": 0.79654002, + "epoch": 1.510433222998294, + "grad_norm": 5.3125, + "learning_rate": 1.4905427047402848e-06, + "loss": 0.72235746, + "memory(GiB)": 147.13, + "step": 64740, + "train_speed(iter/s)": 0.200942 + }, + { + "acc": 0.77989998, + "epoch": 1.5106665305705829, + "grad_norm": 5.75, + "learning_rate": 1.4891973714205971e-06, + "loss": 0.78111591, + "memory(GiB)": 147.13, + "step": 64750, + "train_speed(iter/s)": 0.200959 + }, + { + "acc": 0.75031438, + "epoch": 1.5108998381428718, + "grad_norm": 6.78125, + "learning_rate": 1.4878525392669652e-06, + "loss": 0.91278858, + "memory(GiB)": 147.13, + "step": 64760, + "train_speed(iter/s)": 0.200976 + }, + { + "acc": 0.79479699, + "epoch": 1.5111331457151607, + "grad_norm": 4.46875, + "learning_rate": 1.4865082084713605e-06, + "loss": 0.73966794, + "memory(GiB)": 147.13, + "step": 64770, + "train_speed(iter/s)": 0.200993 + }, + { + "acc": 0.79615707, + "epoch": 1.5113664532874496, + "grad_norm": 5.34375, + "learning_rate": 1.485164379225691e-06, + "loss": 0.72713776, + "memory(GiB)": 147.13, + "step": 64780, + "train_speed(iter/s)": 0.201008 + }, + { + "acc": 0.79181237, + "epoch": 1.5115997608597382, + "grad_norm": 6.25, + "learning_rate": 1.4838210517217827e-06, + "loss": 0.75146961, + "memory(GiB)": 147.13, + "step": 64790, + "train_speed(iter/s)": 0.201024 + }, + { + "acc": 0.7745163, + "epoch": 1.5118330684320274, + "grad_norm": 5.5, + "learning_rate": 1.4824782261513997e-06, + "loss": 0.81358576, + "memory(GiB)": 147.13, + "step": 64800, + "train_speed(iter/s)": 0.201038 + }, + { + "acc": 0.77751856, + "epoch": 1.512066376004316, + "grad_norm": 4.71875, + "learning_rate": 1.4811359027062282e-06, + "loss": 0.79387336, + "memory(GiB)": 147.13, + "step": 64810, + "train_speed(iter/s)": 0.201054 + }, + { + "acc": 0.78064179, + "epoch": 1.5122996835766052, + "grad_norm": 7.125, + "learning_rate": 1.4797940815778849e-06, + "loss": 0.78270473, + "memory(GiB)": 147.13, + "step": 64820, + "train_speed(iter/s)": 0.201069 + }, + { + "acc": 0.77359581, + "epoch": 1.5125329911488938, + "grad_norm": 6.25, + "learning_rate": 1.4784527629579132e-06, + "loss": 0.8067338, + "memory(GiB)": 147.13, + "step": 64830, + "train_speed(iter/s)": 0.201085 + }, + { + "acc": 0.77926979, + "epoch": 1.512766298721183, + "grad_norm": 6.59375, + "learning_rate": 1.4771119470377853e-06, + "loss": 0.78685246, + "memory(GiB)": 147.13, + "step": 64840, + "train_speed(iter/s)": 0.201101 + }, + { + "acc": 0.77433491, + "epoch": 1.5129996062934716, + "grad_norm": 6.75, + "learning_rate": 1.4757716340089046e-06, + "loss": 0.81569548, + "memory(GiB)": 147.13, + "step": 64850, + "train_speed(iter/s)": 0.201116 + }, + { + "acc": 0.77438469, + "epoch": 1.5132329138657608, + "grad_norm": 7.1875, + "learning_rate": 1.4744318240625981e-06, + "loss": 0.79644213, + "memory(GiB)": 147.13, + "step": 64860, + "train_speed(iter/s)": 0.201133 + }, + { + "acc": 0.78981161, + "epoch": 1.5134662214380494, + "grad_norm": 4.375, + "learning_rate": 1.4730925173901262e-06, + "loss": 0.74776039, + "memory(GiB)": 147.13, + "step": 64870, + "train_speed(iter/s)": 0.20115 + }, + { + "acc": 0.76372499, + "epoch": 1.5136995290103386, + "grad_norm": 4.75, + "learning_rate": 1.4717537141826716e-06, + "loss": 0.84560843, + "memory(GiB)": 147.13, + "step": 64880, + "train_speed(iter/s)": 0.201164 + }, + { + "acc": 0.77633152, + "epoch": 1.5139328365826272, + "grad_norm": 4.125, + "learning_rate": 1.4704154146313503e-06, + "loss": 0.80207596, + "memory(GiB)": 147.13, + "step": 64890, + "train_speed(iter/s)": 0.20118 + }, + { + "acc": 0.78325543, + "epoch": 1.5141661441549163, + "grad_norm": 7.15625, + "learning_rate": 1.4690776189272033e-06, + "loss": 0.78658934, + "memory(GiB)": 147.13, + "step": 64900, + "train_speed(iter/s)": 0.201197 + }, + { + "acc": 0.78068142, + "epoch": 1.514399451727205, + "grad_norm": 5.0625, + "learning_rate": 1.467740327261199e-06, + "loss": 0.79103022, + "memory(GiB)": 147.13, + "step": 64910, + "train_speed(iter/s)": 0.201213 + }, + { + "acc": 0.79627781, + "epoch": 1.5146327592994941, + "grad_norm": 5.6875, + "learning_rate": 1.4664035398242387e-06, + "loss": 0.72616043, + "memory(GiB)": 147.13, + "step": 64920, + "train_speed(iter/s)": 0.201228 + }, + { + "acc": 0.77021713, + "epoch": 1.5148660668717828, + "grad_norm": 4.875, + "learning_rate": 1.4650672568071461e-06, + "loss": 0.81304054, + "memory(GiB)": 147.13, + "step": 64930, + "train_speed(iter/s)": 0.201244 + }, + { + "acc": 0.78109665, + "epoch": 1.515099374444072, + "grad_norm": 5.40625, + "learning_rate": 1.4637314784006761e-06, + "loss": 0.77189884, + "memory(GiB)": 147.13, + "step": 64940, + "train_speed(iter/s)": 0.201261 + }, + { + "acc": 0.76999512, + "epoch": 1.5153326820163606, + "grad_norm": 6.84375, + "learning_rate": 1.4623962047955087e-06, + "loss": 0.82561445, + "memory(GiB)": 147.13, + "step": 64950, + "train_speed(iter/s)": 0.201276 + }, + { + "acc": 0.77311716, + "epoch": 1.5155659895886497, + "grad_norm": 5.34375, + "learning_rate": 1.4610614361822567e-06, + "loss": 0.817449, + "memory(GiB)": 147.13, + "step": 64960, + "train_speed(iter/s)": 0.201293 + }, + { + "acc": 0.79791188, + "epoch": 1.5157992971609384, + "grad_norm": 5.0625, + "learning_rate": 1.4597271727514568e-06, + "loss": 0.7242517, + "memory(GiB)": 147.13, + "step": 64970, + "train_speed(iter/s)": 0.201307 + }, + { + "acc": 0.78501654, + "epoch": 1.5160326047332273, + "grad_norm": 5.53125, + "learning_rate": 1.4583934146935725e-06, + "loss": 0.75701132, + "memory(GiB)": 147.13, + "step": 64980, + "train_speed(iter/s)": 0.201323 + }, + { + "acc": 0.76452885, + "epoch": 1.5162659123055162, + "grad_norm": 5.84375, + "learning_rate": 1.4570601621990016e-06, + "loss": 0.86551952, + "memory(GiB)": 147.13, + "step": 64990, + "train_speed(iter/s)": 0.201339 + }, + { + "acc": 0.7722949, + "epoch": 1.5164992198778051, + "grad_norm": 5.625, + "learning_rate": 1.4557274154580614e-06, + "loss": 0.81062641, + "memory(GiB)": 147.13, + "step": 65000, + "train_speed(iter/s)": 0.201353 + }, + { + "epoch": 1.5164992198778051, + "eval_acc": 0.7446660360186799, + "eval_loss": 0.8044557571411133, + "eval_runtime": 1269.9569, + "eval_samples_per_second": 28.34, + "eval_steps_per_second": 14.171, + "step": 65000 + }, + { + "acc": 0.79000292, + "epoch": 1.516732527450094, + "grad_norm": 5.25, + "learning_rate": 1.4543951746610047e-06, + "loss": 0.73190441, + "memory(GiB)": 147.13, + "step": 65010, + "train_speed(iter/s)": 0.200566 + }, + { + "acc": 0.7913847, + "epoch": 1.516965835022383, + "grad_norm": 4.34375, + "learning_rate": 1.4530634399980049e-06, + "loss": 0.75735092, + "memory(GiB)": 147.13, + "step": 65020, + "train_speed(iter/s)": 0.200581 + }, + { + "acc": 0.78307304, + "epoch": 1.5171991425946718, + "grad_norm": 4.25, + "learning_rate": 1.4517322116591709e-06, + "loss": 0.77868781, + "memory(GiB)": 147.13, + "step": 65030, + "train_speed(iter/s)": 0.200596 + }, + { + "acc": 0.79364333, + "epoch": 1.5174324501669607, + "grad_norm": 6.65625, + "learning_rate": 1.4504014898345326e-06, + "loss": 0.73498554, + "memory(GiB)": 147.13, + "step": 65040, + "train_speed(iter/s)": 0.200611 + }, + { + "acc": 0.78174887, + "epoch": 1.5176657577392496, + "grad_norm": 6.53125, + "learning_rate": 1.44907127471405e-06, + "loss": 0.78112001, + "memory(GiB)": 147.13, + "step": 65050, + "train_speed(iter/s)": 0.200625 + }, + { + "acc": 0.79200649, + "epoch": 1.5178990653115385, + "grad_norm": 6.53125, + "learning_rate": 1.447741566487612e-06, + "loss": 0.73226194, + "memory(GiB)": 147.13, + "step": 65060, + "train_speed(iter/s)": 0.200642 + }, + { + "acc": 0.77733145, + "epoch": 1.5181323728838274, + "grad_norm": 5.5625, + "learning_rate": 1.4464123653450318e-06, + "loss": 0.7879797, + "memory(GiB)": 147.13, + "step": 65070, + "train_speed(iter/s)": 0.200658 + }, + { + "acc": 0.76886935, + "epoch": 1.5183656804561163, + "grad_norm": 5.0, + "learning_rate": 1.4450836714760553e-06, + "loss": 0.81756496, + "memory(GiB)": 147.13, + "step": 65080, + "train_speed(iter/s)": 0.200674 + }, + { + "acc": 0.79252605, + "epoch": 1.5185989880284052, + "grad_norm": 7.21875, + "learning_rate": 1.4437554850703506e-06, + "loss": 0.73690176, + "memory(GiB)": 147.13, + "step": 65090, + "train_speed(iter/s)": 0.20069 + }, + { + "acc": 0.79318552, + "epoch": 1.518832295600694, + "grad_norm": 4.625, + "learning_rate": 1.442427806317519e-06, + "loss": 0.73230796, + "memory(GiB)": 147.13, + "step": 65100, + "train_speed(iter/s)": 0.200707 + }, + { + "acc": 0.78320851, + "epoch": 1.519065603172983, + "grad_norm": 6.375, + "learning_rate": 1.4411006354070844e-06, + "loss": 0.76603732, + "memory(GiB)": 147.13, + "step": 65110, + "train_speed(iter/s)": 0.200722 + }, + { + "acc": 0.77989206, + "epoch": 1.519298910745272, + "grad_norm": 4.90625, + "learning_rate": 1.4397739725284988e-06, + "loss": 0.77163534, + "memory(GiB)": 147.13, + "step": 65120, + "train_speed(iter/s)": 0.200738 + }, + { + "acc": 0.77356272, + "epoch": 1.5195322183175608, + "grad_norm": 5.40625, + "learning_rate": 1.4384478178711458e-06, + "loss": 0.7995605, + "memory(GiB)": 147.13, + "step": 65130, + "train_speed(iter/s)": 0.200754 + }, + { + "acc": 0.79077578, + "epoch": 1.5197655258898497, + "grad_norm": 5.46875, + "learning_rate": 1.437122171624331e-06, + "loss": 0.73075666, + "memory(GiB)": 147.13, + "step": 65140, + "train_speed(iter/s)": 0.200771 + }, + { + "acc": 0.77648382, + "epoch": 1.5199988334621386, + "grad_norm": 4.9375, + "learning_rate": 1.4357970339772924e-06, + "loss": 0.79962687, + "memory(GiB)": 147.13, + "step": 65150, + "train_speed(iter/s)": 0.200787 + }, + { + "acc": 0.78098822, + "epoch": 1.5202321410344275, + "grad_norm": 6.6875, + "learning_rate": 1.4344724051191917e-06, + "loss": 0.80490036, + "memory(GiB)": 147.13, + "step": 65160, + "train_speed(iter/s)": 0.200802 + }, + { + "acc": 0.78283033, + "epoch": 1.5204654486067164, + "grad_norm": 6.1875, + "learning_rate": 1.43314828523912e-06, + "loss": 0.77967253, + "memory(GiB)": 147.13, + "step": 65170, + "train_speed(iter/s)": 0.200819 + }, + { + "acc": 0.78426676, + "epoch": 1.520698756179005, + "grad_norm": 10.1875, + "learning_rate": 1.431824674526092e-06, + "loss": 0.7804903, + "memory(GiB)": 147.13, + "step": 65180, + "train_speed(iter/s)": 0.200834 + }, + { + "acc": 0.77625027, + "epoch": 1.5209320637512942, + "grad_norm": 5.59375, + "learning_rate": 1.430501573169057e-06, + "loss": 0.79841747, + "memory(GiB)": 147.13, + "step": 65190, + "train_speed(iter/s)": 0.200848 + }, + { + "acc": 0.79016428, + "epoch": 1.5211653713235829, + "grad_norm": 4.75, + "learning_rate": 1.4291789813568858e-06, + "loss": 0.75621462, + "memory(GiB)": 147.13, + "step": 65200, + "train_speed(iter/s)": 0.200864 + }, + { + "acc": 0.78785486, + "epoch": 1.521398678895872, + "grad_norm": 6.53125, + "learning_rate": 1.4278568992783758e-06, + "loss": 0.76169577, + "memory(GiB)": 147.13, + "step": 65210, + "train_speed(iter/s)": 0.20088 + }, + { + "acc": 0.78738542, + "epoch": 1.5216319864681607, + "grad_norm": 4.625, + "learning_rate": 1.4265353271222577e-06, + "loss": 0.75058613, + "memory(GiB)": 147.13, + "step": 65220, + "train_speed(iter/s)": 0.200898 + }, + { + "acc": 0.78331351, + "epoch": 1.5218652940404498, + "grad_norm": 5.8125, + "learning_rate": 1.4252142650771811e-06, + "loss": 0.78213634, + "memory(GiB)": 147.13, + "step": 65230, + "train_speed(iter/s)": 0.200913 + }, + { + "acc": 0.768677, + "epoch": 1.5220986016127385, + "grad_norm": 4.375, + "learning_rate": 1.4238937133317322e-06, + "loss": 0.81841431, + "memory(GiB)": 147.13, + "step": 65240, + "train_speed(iter/s)": 0.20093 + }, + { + "acc": 0.78544569, + "epoch": 1.5223319091850276, + "grad_norm": 5.4375, + "learning_rate": 1.4225736720744164e-06, + "loss": 0.77090549, + "memory(GiB)": 147.13, + "step": 65250, + "train_speed(iter/s)": 0.200946 + }, + { + "acc": 0.77120905, + "epoch": 1.5225652167573163, + "grad_norm": 5.53125, + "learning_rate": 1.4212541414936682e-06, + "loss": 0.81031246, + "memory(GiB)": 147.13, + "step": 65260, + "train_speed(iter/s)": 0.200962 + }, + { + "acc": 0.77259464, + "epoch": 1.5227985243296054, + "grad_norm": 5.0625, + "learning_rate": 1.4199351217778545e-06, + "loss": 0.80552759, + "memory(GiB)": 147.13, + "step": 65270, + "train_speed(iter/s)": 0.200978 + }, + { + "acc": 0.78281403, + "epoch": 1.523031831901894, + "grad_norm": 6.375, + "learning_rate": 1.4186166131152595e-06, + "loss": 0.76773696, + "memory(GiB)": 147.13, + "step": 65280, + "train_speed(iter/s)": 0.200994 + }, + { + "acc": 0.75774269, + "epoch": 1.5232651394741832, + "grad_norm": 5.59375, + "learning_rate": 1.4172986156941038e-06, + "loss": 0.87673063, + "memory(GiB)": 147.13, + "step": 65290, + "train_speed(iter/s)": 0.20101 + }, + { + "acc": 0.76485977, + "epoch": 1.5234984470464719, + "grad_norm": 4.53125, + "learning_rate": 1.4159811297025284e-06, + "loss": 0.86806583, + "memory(GiB)": 147.13, + "step": 65300, + "train_speed(iter/s)": 0.201025 + }, + { + "acc": 0.80621672, + "epoch": 1.523731754618761, + "grad_norm": 6.125, + "learning_rate": 1.4146641553286071e-06, + "loss": 0.68341751, + "memory(GiB)": 147.13, + "step": 65310, + "train_speed(iter/s)": 0.201041 + }, + { + "acc": 0.78828001, + "epoch": 1.5239650621910497, + "grad_norm": 6.4375, + "learning_rate": 1.4133476927603362e-06, + "loss": 0.74977875, + "memory(GiB)": 147.13, + "step": 65320, + "train_speed(iter/s)": 0.201058 + }, + { + "acc": 0.76758041, + "epoch": 1.5241983697633388, + "grad_norm": 6.1875, + "learning_rate": 1.4120317421856389e-06, + "loss": 0.81543188, + "memory(GiB)": 147.13, + "step": 65330, + "train_speed(iter/s)": 0.201074 + }, + { + "acc": 0.76192322, + "epoch": 1.5244316773356275, + "grad_norm": 5.4375, + "learning_rate": 1.4107163037923693e-06, + "loss": 0.84419556, + "memory(GiB)": 147.13, + "step": 65340, + "train_speed(iter/s)": 0.201091 + }, + { + "acc": 0.77184172, + "epoch": 1.5246649849079166, + "grad_norm": 5.6875, + "learning_rate": 1.409401377768303e-06, + "loss": 0.82547474, + "memory(GiB)": 147.13, + "step": 65350, + "train_speed(iter/s)": 0.201106 + }, + { + "acc": 0.77231297, + "epoch": 1.5248982924802053, + "grad_norm": 4.5625, + "learning_rate": 1.408086964301149e-06, + "loss": 0.83197184, + "memory(GiB)": 147.13, + "step": 65360, + "train_speed(iter/s)": 0.201122 + }, + { + "acc": 0.78483472, + "epoch": 1.5251316000524942, + "grad_norm": 6.21875, + "learning_rate": 1.4067730635785354e-06, + "loss": 0.7758481, + "memory(GiB)": 147.13, + "step": 65370, + "train_speed(iter/s)": 0.201137 + }, + { + "acc": 0.77703638, + "epoch": 1.525364907624783, + "grad_norm": 9.125, + "learning_rate": 1.4054596757880262e-06, + "loss": 0.81458502, + "memory(GiB)": 147.13, + "step": 65380, + "train_speed(iter/s)": 0.201152 + }, + { + "acc": 0.77877493, + "epoch": 1.525598215197072, + "grad_norm": 6.21875, + "learning_rate": 1.4041468011171017e-06, + "loss": 0.81068916, + "memory(GiB)": 147.13, + "step": 65390, + "train_speed(iter/s)": 0.201168 + }, + { + "acc": 0.77528429, + "epoch": 1.5258315227693608, + "grad_norm": 4.8125, + "learning_rate": 1.4028344397531773e-06, + "loss": 0.80191021, + "memory(GiB)": 147.13, + "step": 65400, + "train_speed(iter/s)": 0.201184 + }, + { + "acc": 0.7893209, + "epoch": 1.5260648303416497, + "grad_norm": 9.5625, + "learning_rate": 1.4015225918835923e-06, + "loss": 0.7487329, + "memory(GiB)": 147.13, + "step": 65410, + "train_speed(iter/s)": 0.201199 + }, + { + "acc": 0.78093977, + "epoch": 1.5262981379139386, + "grad_norm": 5.96875, + "learning_rate": 1.4002112576956102e-06, + "loss": 0.78119683, + "memory(GiB)": 147.13, + "step": 65420, + "train_speed(iter/s)": 0.201215 + }, + { + "acc": 0.7891861, + "epoch": 1.5265314454862275, + "grad_norm": 8.75, + "learning_rate": 1.3989004373764264e-06, + "loss": 0.73629584, + "memory(GiB)": 147.13, + "step": 65430, + "train_speed(iter/s)": 0.20123 + }, + { + "acc": 0.76216254, + "epoch": 1.5267647530585164, + "grad_norm": 5.5625, + "learning_rate": 1.3975901311131584e-06, + "loss": 0.85819883, + "memory(GiB)": 147.13, + "step": 65440, + "train_speed(iter/s)": 0.201246 + }, + { + "acc": 0.77317934, + "epoch": 1.5269980606308053, + "grad_norm": 5.75, + "learning_rate": 1.3962803390928537e-06, + "loss": 0.80435085, + "memory(GiB)": 147.13, + "step": 65450, + "train_speed(iter/s)": 0.201262 + }, + { + "acc": 0.78658466, + "epoch": 1.5272313682030942, + "grad_norm": 6.8125, + "learning_rate": 1.3949710615024836e-06, + "loss": 0.76053772, + "memory(GiB)": 147.13, + "step": 65460, + "train_speed(iter/s)": 0.201277 + }, + { + "acc": 0.79117346, + "epoch": 1.5274646757753831, + "grad_norm": 5.125, + "learning_rate": 1.3936622985289462e-06, + "loss": 0.75075483, + "memory(GiB)": 147.13, + "step": 65470, + "train_speed(iter/s)": 0.201292 + }, + { + "acc": 0.77793059, + "epoch": 1.527697983347672, + "grad_norm": 5.34375, + "learning_rate": 1.3923540503590689e-06, + "loss": 0.79492455, + "memory(GiB)": 147.13, + "step": 65480, + "train_speed(iter/s)": 0.201309 + }, + { + "acc": 0.79856424, + "epoch": 1.527931290919961, + "grad_norm": 4.25, + "learning_rate": 1.3910463171796018e-06, + "loss": 0.71239781, + "memory(GiB)": 147.13, + "step": 65490, + "train_speed(iter/s)": 0.201325 + }, + { + "acc": 0.77887478, + "epoch": 1.5281645984922498, + "grad_norm": 8.3125, + "learning_rate": 1.3897390991772269e-06, + "loss": 0.78372169, + "memory(GiB)": 147.13, + "step": 65500, + "train_speed(iter/s)": 0.201341 + }, + { + "epoch": 1.5281645984922498, + "eval_acc": 0.7447174423172486, + "eval_loss": 0.8044983148574829, + "eval_runtime": 1270.2082, + "eval_samples_per_second": 28.335, + "eval_steps_per_second": 14.168, + "step": 65500 + }, + { + "acc": 0.77596331, + "epoch": 1.5283979060645387, + "grad_norm": 5.75, + "learning_rate": 1.3884323965385443e-06, + "loss": 0.80863295, + "memory(GiB)": 147.13, + "step": 65510, + "train_speed(iter/s)": 0.20056 + }, + { + "acc": 0.79642587, + "epoch": 1.5286312136368276, + "grad_norm": 4.53125, + "learning_rate": 1.3871262094500897e-06, + "loss": 0.72295847, + "memory(GiB)": 147.13, + "step": 65520, + "train_speed(iter/s)": 0.200576 + }, + { + "acc": 0.78992147, + "epoch": 1.5288645212091165, + "grad_norm": 4.25, + "learning_rate": 1.3858205380983175e-06, + "loss": 0.7313539, + "memory(GiB)": 147.13, + "step": 65530, + "train_speed(iter/s)": 0.200591 + }, + { + "acc": 0.78537593, + "epoch": 1.5290978287814054, + "grad_norm": 7.5, + "learning_rate": 1.3845153826696144e-06, + "loss": 0.76099243, + "memory(GiB)": 147.13, + "step": 65540, + "train_speed(iter/s)": 0.200607 + }, + { + "acc": 0.76554108, + "epoch": 1.5293311363536943, + "grad_norm": 5.6875, + "learning_rate": 1.3832107433502912e-06, + "loss": 0.85681934, + "memory(GiB)": 147.13, + "step": 65550, + "train_speed(iter/s)": 0.200623 + }, + { + "acc": 0.77308493, + "epoch": 1.5295644439259832, + "grad_norm": 14.5, + "learning_rate": 1.3819066203265813e-06, + "loss": 0.82602463, + "memory(GiB)": 147.13, + "step": 65560, + "train_speed(iter/s)": 0.200639 + }, + { + "acc": 0.76433573, + "epoch": 1.529797751498272, + "grad_norm": 4.375, + "learning_rate": 1.3806030137846521e-06, + "loss": 0.84086971, + "memory(GiB)": 147.13, + "step": 65570, + "train_speed(iter/s)": 0.200654 + }, + { + "acc": 0.79202843, + "epoch": 1.530031059070561, + "grad_norm": 5.09375, + "learning_rate": 1.3792999239105898e-06, + "loss": 0.74395723, + "memory(GiB)": 147.13, + "step": 65580, + "train_speed(iter/s)": 0.20067 + }, + { + "acc": 0.79133291, + "epoch": 1.5302643666428497, + "grad_norm": 6.25, + "learning_rate": 1.3779973508904132e-06, + "loss": 0.74699202, + "memory(GiB)": 147.13, + "step": 65590, + "train_speed(iter/s)": 0.200684 + }, + { + "acc": 0.77008934, + "epoch": 1.5304976742151388, + "grad_norm": 5.5, + "learning_rate": 1.3766952949100625e-06, + "loss": 0.83408842, + "memory(GiB)": 147.13, + "step": 65600, + "train_speed(iter/s)": 0.2007 + }, + { + "acc": 0.77300768, + "epoch": 1.5307309817874275, + "grad_norm": 6.65625, + "learning_rate": 1.3753937561554053e-06, + "loss": 0.81059189, + "memory(GiB)": 147.13, + "step": 65610, + "train_speed(iter/s)": 0.200715 + }, + { + "acc": 0.76012034, + "epoch": 1.5309642893597166, + "grad_norm": 6.8125, + "learning_rate": 1.3740927348122373e-06, + "loss": 0.87639198, + "memory(GiB)": 147.13, + "step": 65620, + "train_speed(iter/s)": 0.200731 + }, + { + "acc": 0.77505989, + "epoch": 1.5311975969320053, + "grad_norm": 6.65625, + "learning_rate": 1.3727922310662762e-06, + "loss": 0.79479666, + "memory(GiB)": 147.13, + "step": 65630, + "train_speed(iter/s)": 0.200746 + }, + { + "acc": 0.78588977, + "epoch": 1.5314309045042944, + "grad_norm": 4.75, + "learning_rate": 1.371492245103172e-06, + "loss": 0.79581938, + "memory(GiB)": 147.13, + "step": 65640, + "train_speed(iter/s)": 0.200761 + }, + { + "acc": 0.78697853, + "epoch": 1.531664212076583, + "grad_norm": 7.75, + "learning_rate": 1.3701927771084939e-06, + "loss": 0.77796259, + "memory(GiB)": 147.13, + "step": 65650, + "train_speed(iter/s)": 0.200777 + }, + { + "acc": 0.7804883, + "epoch": 1.5318975196488722, + "grad_norm": 4.9375, + "learning_rate": 1.3688938272677442e-06, + "loss": 0.78984337, + "memory(GiB)": 147.13, + "step": 65660, + "train_speed(iter/s)": 0.200793 + }, + { + "acc": 0.7731575, + "epoch": 1.532130827221161, + "grad_norm": 7.46875, + "learning_rate": 1.3675953957663441e-06, + "loss": 0.8263773, + "memory(GiB)": 147.13, + "step": 65670, + "train_speed(iter/s)": 0.200808 + }, + { + "acc": 0.81379738, + "epoch": 1.53236413479345, + "grad_norm": 5.09375, + "learning_rate": 1.3662974827896474e-06, + "loss": 0.65503149, + "memory(GiB)": 147.13, + "step": 65680, + "train_speed(iter/s)": 0.200825 + }, + { + "acc": 0.77931385, + "epoch": 1.5325974423657387, + "grad_norm": 5.125, + "learning_rate": 1.3650000885229297e-06, + "loss": 0.80769215, + "memory(GiB)": 147.13, + "step": 65690, + "train_speed(iter/s)": 0.200841 + }, + { + "acc": 0.77855949, + "epoch": 1.5328307499380278, + "grad_norm": 5.6875, + "learning_rate": 1.3637032131513922e-06, + "loss": 0.78796816, + "memory(GiB)": 147.13, + "step": 65700, + "train_speed(iter/s)": 0.200856 + }, + { + "acc": 0.77442112, + "epoch": 1.5330640575103165, + "grad_norm": 6.3125, + "learning_rate": 1.3624068568601655e-06, + "loss": 0.79497447, + "memory(GiB)": 147.13, + "step": 65710, + "train_speed(iter/s)": 0.200872 + }, + { + "acc": 0.77679834, + "epoch": 1.5332973650826056, + "grad_norm": 8.5625, + "learning_rate": 1.3611110198343025e-06, + "loss": 0.81760941, + "memory(GiB)": 147.13, + "step": 65720, + "train_speed(iter/s)": 0.200887 + }, + { + "acc": 0.78379855, + "epoch": 1.5335306726548943, + "grad_norm": 5.09375, + "learning_rate": 1.359815702258787e-06, + "loss": 0.77201395, + "memory(GiB)": 147.13, + "step": 65730, + "train_speed(iter/s)": 0.200902 + }, + { + "acc": 0.78287749, + "epoch": 1.5337639802271834, + "grad_norm": 6.8125, + "learning_rate": 1.3585209043185193e-06, + "loss": 0.76966333, + "memory(GiB)": 147.13, + "step": 65740, + "train_speed(iter/s)": 0.200917 + }, + { + "acc": 0.78400383, + "epoch": 1.533997287799472, + "grad_norm": 6.375, + "learning_rate": 1.3572266261983363e-06, + "loss": 0.77230244, + "memory(GiB)": 147.13, + "step": 65750, + "train_speed(iter/s)": 0.200934 + }, + { + "acc": 0.78760571, + "epoch": 1.534230595371761, + "grad_norm": 5.59375, + "learning_rate": 1.3559328680829942e-06, + "loss": 0.76585751, + "memory(GiB)": 147.13, + "step": 65760, + "train_speed(iter/s)": 0.200949 + }, + { + "acc": 0.79107695, + "epoch": 1.5344639029440499, + "grad_norm": 4.75, + "learning_rate": 1.3546396301571751e-06, + "loss": 0.74846153, + "memory(GiB)": 147.13, + "step": 65770, + "train_speed(iter/s)": 0.200965 + }, + { + "acc": 0.78852139, + "epoch": 1.5346972105163388, + "grad_norm": 5.53125, + "learning_rate": 1.353346912605491e-06, + "loss": 0.77424564, + "memory(GiB)": 147.13, + "step": 65780, + "train_speed(iter/s)": 0.200981 + }, + { + "acc": 0.79274092, + "epoch": 1.5349305180886277, + "grad_norm": 7.1875, + "learning_rate": 1.3520547156124748e-06, + "loss": 0.74635301, + "memory(GiB)": 147.13, + "step": 65790, + "train_speed(iter/s)": 0.200998 + }, + { + "acc": 0.77839308, + "epoch": 1.5351638256609166, + "grad_norm": 4.875, + "learning_rate": 1.3507630393625893e-06, + "loss": 0.80371504, + "memory(GiB)": 147.13, + "step": 65800, + "train_speed(iter/s)": 0.201015 + }, + { + "acc": 0.77611036, + "epoch": 1.5353971332332055, + "grad_norm": 5.25, + "learning_rate": 1.3494718840402188e-06, + "loss": 0.78693495, + "memory(GiB)": 147.13, + "step": 65810, + "train_speed(iter/s)": 0.201031 + }, + { + "acc": 0.77409744, + "epoch": 1.5356304408054944, + "grad_norm": 6.59375, + "learning_rate": 1.348181249829677e-06, + "loss": 0.82686939, + "memory(GiB)": 147.13, + "step": 65820, + "train_speed(iter/s)": 0.201047 + }, + { + "acc": 0.76908383, + "epoch": 1.5358637483777833, + "grad_norm": 4.625, + "learning_rate": 1.3468911369152015e-06, + "loss": 0.83825998, + "memory(GiB)": 147.13, + "step": 65830, + "train_speed(iter/s)": 0.201063 + }, + { + "acc": 0.77742887, + "epoch": 1.5360970559500722, + "grad_norm": 4.6875, + "learning_rate": 1.3456015454809551e-06, + "loss": 0.79348636, + "memory(GiB)": 147.13, + "step": 65840, + "train_speed(iter/s)": 0.20108 + }, + { + "acc": 0.77535996, + "epoch": 1.536330363522361, + "grad_norm": 7.0, + "learning_rate": 1.3443124757110266e-06, + "loss": 0.81135139, + "memory(GiB)": 147.13, + "step": 65850, + "train_speed(iter/s)": 0.201095 + }, + { + "acc": 0.77499065, + "epoch": 1.53656367109465, + "grad_norm": 6.96875, + "learning_rate": 1.3430239277894281e-06, + "loss": 0.82661295, + "memory(GiB)": 147.13, + "step": 65860, + "train_speed(iter/s)": 0.201109 + }, + { + "acc": 0.78962278, + "epoch": 1.5367969786669389, + "grad_norm": 5.09375, + "learning_rate": 1.3417359019001037e-06, + "loss": 0.74881072, + "memory(GiB)": 147.13, + "step": 65870, + "train_speed(iter/s)": 0.201125 + }, + { + "acc": 0.79519157, + "epoch": 1.5370302862392278, + "grad_norm": 9.3125, + "learning_rate": 1.3404483982269145e-06, + "loss": 0.74976549, + "memory(GiB)": 147.13, + "step": 65880, + "train_speed(iter/s)": 0.201141 + }, + { + "acc": 0.7780304, + "epoch": 1.5372635938115167, + "grad_norm": 6.15625, + "learning_rate": 1.339161416953655e-06, + "loss": 0.78960228, + "memory(GiB)": 147.13, + "step": 65890, + "train_speed(iter/s)": 0.201157 + }, + { + "acc": 0.74467463, + "epoch": 1.5374969013838056, + "grad_norm": 6.0, + "learning_rate": 1.3378749582640398e-06, + "loss": 0.95666943, + "memory(GiB)": 147.13, + "step": 65900, + "train_speed(iter/s)": 0.201174 + }, + { + "acc": 0.78630295, + "epoch": 1.5377302089560945, + "grad_norm": 4.125, + "learning_rate": 1.3365890223417088e-06, + "loss": 0.74238262, + "memory(GiB)": 147.13, + "step": 65910, + "train_speed(iter/s)": 0.20119 + }, + { + "acc": 0.7730278, + "epoch": 1.5379635165283834, + "grad_norm": 5.15625, + "learning_rate": 1.3353036093702326e-06, + "loss": 0.7966548, + "memory(GiB)": 147.13, + "step": 65920, + "train_speed(iter/s)": 0.201206 + }, + { + "acc": 0.78292246, + "epoch": 1.5381968241006723, + "grad_norm": 4.625, + "learning_rate": 1.3340187195331e-06, + "loss": 0.7709209, + "memory(GiB)": 147.13, + "step": 65930, + "train_speed(iter/s)": 0.201223 + }, + { + "acc": 0.78873224, + "epoch": 1.5384301316729612, + "grad_norm": 5.96875, + "learning_rate": 1.3327343530137316e-06, + "loss": 0.76129274, + "memory(GiB)": 147.13, + "step": 65940, + "train_speed(iter/s)": 0.201238 + }, + { + "acc": 0.77018852, + "epoch": 1.53866343924525, + "grad_norm": 3.5, + "learning_rate": 1.3314505099954683e-06, + "loss": 0.82236538, + "memory(GiB)": 147.13, + "step": 65950, + "train_speed(iter/s)": 0.201254 + }, + { + "acc": 0.77579556, + "epoch": 1.5388967468175387, + "grad_norm": 6.65625, + "learning_rate": 1.33016719066158e-06, + "loss": 0.7895669, + "memory(GiB)": 147.13, + "step": 65960, + "train_speed(iter/s)": 0.201269 + }, + { + "acc": 0.78516469, + "epoch": 1.5391300543898279, + "grad_norm": 4.78125, + "learning_rate": 1.328884395195257e-06, + "loss": 0.77448907, + "memory(GiB)": 147.13, + "step": 65970, + "train_speed(iter/s)": 0.201284 + }, + { + "acc": 0.77121625, + "epoch": 1.5393633619621165, + "grad_norm": 6.0, + "learning_rate": 1.3276021237796216e-06, + "loss": 0.83493958, + "memory(GiB)": 147.13, + "step": 65980, + "train_speed(iter/s)": 0.201298 + }, + { + "acc": 0.79430099, + "epoch": 1.5395966695344057, + "grad_norm": 4.28125, + "learning_rate": 1.3263203765977168e-06, + "loss": 0.73301487, + "memory(GiB)": 147.13, + "step": 65990, + "train_speed(iter/s)": 0.201314 + }, + { + "acc": 0.78175182, + "epoch": 1.5398299771066943, + "grad_norm": 5.03125, + "learning_rate": 1.3250391538325085e-06, + "loss": 0.75961099, + "memory(GiB)": 147.13, + "step": 66000, + "train_speed(iter/s)": 0.20133 + }, + { + "epoch": 1.5398299771066943, + "eval_acc": 0.7446849330380915, + "eval_loss": 0.8045194149017334, + "eval_runtime": 1270.9323, + "eval_samples_per_second": 28.319, + "eval_steps_per_second": 14.16, + "step": 66000 + }, + { + "acc": 0.79298277, + "epoch": 1.5400632846789835, + "grad_norm": 5.65625, + "learning_rate": 1.3237584556668958e-06, + "loss": 0.74688911, + "memory(GiB)": 147.13, + "step": 66010, + "train_speed(iter/s)": 0.200553 + }, + { + "acc": 0.78268442, + "epoch": 1.5402965922512721, + "grad_norm": 4.0, + "learning_rate": 1.322478282283694e-06, + "loss": 0.77756462, + "memory(GiB)": 147.13, + "step": 66020, + "train_speed(iter/s)": 0.200569 + }, + { + "acc": 0.77548428, + "epoch": 1.5405298998235613, + "grad_norm": 6.875, + "learning_rate": 1.3211986338656503e-06, + "loss": 0.80724964, + "memory(GiB)": 147.13, + "step": 66030, + "train_speed(iter/s)": 0.200586 + }, + { + "acc": 0.77102299, + "epoch": 1.54076320739585, + "grad_norm": 5.03125, + "learning_rate": 1.3199195105954331e-06, + "loss": 0.82279663, + "memory(GiB)": 147.13, + "step": 66040, + "train_speed(iter/s)": 0.200601 + }, + { + "acc": 0.77918787, + "epoch": 1.540996514968139, + "grad_norm": 5.46875, + "learning_rate": 1.318640912655635e-06, + "loss": 0.79191918, + "memory(GiB)": 147.13, + "step": 66050, + "train_speed(iter/s)": 0.200616 + }, + { + "acc": 0.80649509, + "epoch": 1.5412298225404277, + "grad_norm": 4.40625, + "learning_rate": 1.3173628402287785e-06, + "loss": 0.68745565, + "memory(GiB)": 147.13, + "step": 66060, + "train_speed(iter/s)": 0.200632 + }, + { + "acc": 0.77775507, + "epoch": 1.5414631301127169, + "grad_norm": 7.46875, + "learning_rate": 1.3160852934973073e-06, + "loss": 0.79977551, + "memory(GiB)": 147.13, + "step": 66070, + "train_speed(iter/s)": 0.200647 + }, + { + "acc": 0.76136436, + "epoch": 1.5416964376850055, + "grad_norm": 11.75, + "learning_rate": 1.31480827264359e-06, + "loss": 0.86434231, + "memory(GiB)": 147.13, + "step": 66080, + "train_speed(iter/s)": 0.200664 + }, + { + "acc": 0.77416496, + "epoch": 1.5419297452572946, + "grad_norm": 7.0, + "learning_rate": 1.3135317778499196e-06, + "loss": 0.78873339, + "memory(GiB)": 147.13, + "step": 66090, + "train_speed(iter/s)": 0.200681 + }, + { + "acc": 0.76674585, + "epoch": 1.5421630528295833, + "grad_norm": 5.8125, + "learning_rate": 1.3122558092985188e-06, + "loss": 0.83854027, + "memory(GiB)": 147.13, + "step": 66100, + "train_speed(iter/s)": 0.200697 + }, + { + "acc": 0.77215676, + "epoch": 1.5423963604018724, + "grad_norm": 4.375, + "learning_rate": 1.3109803671715283e-06, + "loss": 0.81484003, + "memory(GiB)": 147.13, + "step": 66110, + "train_speed(iter/s)": 0.200713 + }, + { + "acc": 0.79621944, + "epoch": 1.5426296679741611, + "grad_norm": 5.78125, + "learning_rate": 1.30970545165102e-06, + "loss": 0.72149839, + "memory(GiB)": 147.13, + "step": 66120, + "train_speed(iter/s)": 0.200729 + }, + { + "acc": 0.79580064, + "epoch": 1.5428629755464502, + "grad_norm": 12.9375, + "learning_rate": 1.3084310629189868e-06, + "loss": 0.73481159, + "memory(GiB)": 147.13, + "step": 66130, + "train_speed(iter/s)": 0.200743 + }, + { + "acc": 0.77218895, + "epoch": 1.543096283118739, + "grad_norm": 5.0625, + "learning_rate": 1.3071572011573453e-06, + "loss": 0.83505878, + "memory(GiB)": 147.13, + "step": 66140, + "train_speed(iter/s)": 0.200758 + }, + { + "acc": 0.78478584, + "epoch": 1.5433295906910278, + "grad_norm": 3.75, + "learning_rate": 1.3058838665479418e-06, + "loss": 0.79148293, + "memory(GiB)": 147.13, + "step": 66150, + "train_speed(iter/s)": 0.200774 + }, + { + "acc": 0.7856843, + "epoch": 1.5435628982633167, + "grad_norm": 4.21875, + "learning_rate": 1.304611059272542e-06, + "loss": 0.77415447, + "memory(GiB)": 147.13, + "step": 66160, + "train_speed(iter/s)": 0.200788 + }, + { + "acc": 0.77835255, + "epoch": 1.5437962058356056, + "grad_norm": 6.25, + "learning_rate": 1.3033387795128416e-06, + "loss": 0.79931955, + "memory(GiB)": 147.13, + "step": 66170, + "train_speed(iter/s)": 0.200804 + }, + { + "acc": 0.76517811, + "epoch": 1.5440295134078945, + "grad_norm": 5.3125, + "learning_rate": 1.302067027450456e-06, + "loss": 0.84638748, + "memory(GiB)": 147.13, + "step": 66180, + "train_speed(iter/s)": 0.200821 + }, + { + "acc": 0.78675337, + "epoch": 1.5442628209801834, + "grad_norm": 8.125, + "learning_rate": 1.3007958032669283e-06, + "loss": 0.76565952, + "memory(GiB)": 147.13, + "step": 66190, + "train_speed(iter/s)": 0.200836 + }, + { + "acc": 0.78021317, + "epoch": 1.5444961285524723, + "grad_norm": 5.0625, + "learning_rate": 1.2995251071437253e-06, + "loss": 0.78182669, + "memory(GiB)": 147.13, + "step": 66200, + "train_speed(iter/s)": 0.200851 + }, + { + "acc": 0.78736315, + "epoch": 1.5447294361247612, + "grad_norm": 5.75, + "learning_rate": 1.2982549392622362e-06, + "loss": 0.75720816, + "memory(GiB)": 147.13, + "step": 66210, + "train_speed(iter/s)": 0.200866 + }, + { + "acc": 0.76521492, + "epoch": 1.5449627436970501, + "grad_norm": 5.3125, + "learning_rate": 1.2969852998037813e-06, + "loss": 0.8522131, + "memory(GiB)": 147.13, + "step": 66220, + "train_speed(iter/s)": 0.200882 + }, + { + "acc": 0.78474207, + "epoch": 1.545196051269339, + "grad_norm": 4.21875, + "learning_rate": 1.2957161889495972e-06, + "loss": 0.77432103, + "memory(GiB)": 147.13, + "step": 66230, + "train_speed(iter/s)": 0.200897 + }, + { + "acc": 0.77806253, + "epoch": 1.545429358841628, + "grad_norm": 4.59375, + "learning_rate": 1.2944476068808526e-06, + "loss": 0.80501852, + "memory(GiB)": 147.13, + "step": 66240, + "train_speed(iter/s)": 0.200913 + }, + { + "acc": 0.7818716, + "epoch": 1.5456626664139168, + "grad_norm": 4.6875, + "learning_rate": 1.2931795537786357e-06, + "loss": 0.77636614, + "memory(GiB)": 147.13, + "step": 66250, + "train_speed(iter/s)": 0.200929 + }, + { + "acc": 0.77749758, + "epoch": 1.5458959739862057, + "grad_norm": 5.40625, + "learning_rate": 1.2919120298239591e-06, + "loss": 0.80647802, + "memory(GiB)": 147.13, + "step": 66260, + "train_speed(iter/s)": 0.200944 + }, + { + "acc": 0.77590075, + "epoch": 1.5461292815584946, + "grad_norm": 5.75, + "learning_rate": 1.2906450351977646e-06, + "loss": 0.81553898, + "memory(GiB)": 147.13, + "step": 66270, + "train_speed(iter/s)": 0.200959 + }, + { + "acc": 0.78080235, + "epoch": 1.5463625891307835, + "grad_norm": 6.0, + "learning_rate": 1.2893785700809118e-06, + "loss": 0.79015241, + "memory(GiB)": 147.13, + "step": 66280, + "train_speed(iter/s)": 0.200975 + }, + { + "acc": 0.78328552, + "epoch": 1.5465958967030724, + "grad_norm": 4.8125, + "learning_rate": 1.2881126346541922e-06, + "loss": 0.76925888, + "memory(GiB)": 147.13, + "step": 66290, + "train_speed(iter/s)": 0.20099 + }, + { + "acc": 0.77661171, + "epoch": 1.5468292042753613, + "grad_norm": 4.46875, + "learning_rate": 1.2868472290983158e-06, + "loss": 0.80561304, + "memory(GiB)": 147.13, + "step": 66300, + "train_speed(iter/s)": 0.201006 + }, + { + "acc": 0.79191103, + "epoch": 1.5470625118476502, + "grad_norm": 3.734375, + "learning_rate": 1.2855823535939188e-06, + "loss": 0.74329491, + "memory(GiB)": 147.13, + "step": 66310, + "train_speed(iter/s)": 0.20102 + }, + { + "acc": 0.77668848, + "epoch": 1.547295819419939, + "grad_norm": 4.9375, + "learning_rate": 1.28431800832156e-06, + "loss": 0.79781227, + "memory(GiB)": 147.13, + "step": 66320, + "train_speed(iter/s)": 0.201036 + }, + { + "acc": 0.77691069, + "epoch": 1.5475291269922278, + "grad_norm": 5.03125, + "learning_rate": 1.2830541934617274e-06, + "loss": 0.79807272, + "memory(GiB)": 147.13, + "step": 66330, + "train_speed(iter/s)": 0.201051 + }, + { + "acc": 0.78547478, + "epoch": 1.547762434564517, + "grad_norm": 5.1875, + "learning_rate": 1.2817909091948293e-06, + "loss": 0.76715317, + "memory(GiB)": 147.13, + "step": 66340, + "train_speed(iter/s)": 0.201066 + }, + { + "acc": 0.78639536, + "epoch": 1.5479957421368056, + "grad_norm": 5.0625, + "learning_rate": 1.2805281557011972e-06, + "loss": 0.75163832, + "memory(GiB)": 147.13, + "step": 66350, + "train_speed(iter/s)": 0.201082 + }, + { + "acc": 0.79573755, + "epoch": 1.5482290497090947, + "grad_norm": 7.09375, + "learning_rate": 1.2792659331610919e-06, + "loss": 0.72670212, + "memory(GiB)": 147.13, + "step": 66360, + "train_speed(iter/s)": 0.201097 + }, + { + "acc": 0.77826233, + "epoch": 1.5484623572813834, + "grad_norm": 6.21875, + "learning_rate": 1.2780042417546917e-06, + "loss": 0.80692101, + "memory(GiB)": 147.13, + "step": 66370, + "train_speed(iter/s)": 0.201111 + }, + { + "acc": 0.76512957, + "epoch": 1.5486956648536725, + "grad_norm": 5.71875, + "learning_rate": 1.276743081662107e-06, + "loss": 0.85662422, + "memory(GiB)": 147.13, + "step": 66380, + "train_speed(iter/s)": 0.201126 + }, + { + "acc": 0.78918648, + "epoch": 1.5489289724259612, + "grad_norm": 4.78125, + "learning_rate": 1.2754824530633654e-06, + "loss": 0.77457085, + "memory(GiB)": 147.13, + "step": 66390, + "train_speed(iter/s)": 0.201141 + }, + { + "acc": 0.76402283, + "epoch": 1.5491622799982503, + "grad_norm": 4.71875, + "learning_rate": 1.2742223561384204e-06, + "loss": 0.86086817, + "memory(GiB)": 147.13, + "step": 66400, + "train_speed(iter/s)": 0.201157 + }, + { + "acc": 0.77913628, + "epoch": 1.549395587570539, + "grad_norm": 4.9375, + "learning_rate": 1.2729627910671544e-06, + "loss": 0.79207058, + "memory(GiB)": 147.13, + "step": 66410, + "train_speed(iter/s)": 0.201172 + }, + { + "acc": 0.80253363, + "epoch": 1.549628895142828, + "grad_norm": 5.9375, + "learning_rate": 1.271703758029364e-06, + "loss": 0.71166973, + "memory(GiB)": 147.13, + "step": 66420, + "train_speed(iter/s)": 0.201188 + }, + { + "acc": 0.77526503, + "epoch": 1.5498622027151168, + "grad_norm": 5.53125, + "learning_rate": 1.270445257204781e-06, + "loss": 0.78889947, + "memory(GiB)": 147.13, + "step": 66430, + "train_speed(iter/s)": 0.201205 + }, + { + "acc": 0.78784337, + "epoch": 1.5500955102874059, + "grad_norm": 6.625, + "learning_rate": 1.2691872887730528e-06, + "loss": 0.74168453, + "memory(GiB)": 147.13, + "step": 66440, + "train_speed(iter/s)": 0.20122 + }, + { + "acc": 0.79704084, + "epoch": 1.5503288178596946, + "grad_norm": 7.65625, + "learning_rate": 1.2679298529137563e-06, + "loss": 0.75291963, + "memory(GiB)": 147.13, + "step": 66450, + "train_speed(iter/s)": 0.201236 + }, + { + "acc": 0.78185368, + "epoch": 1.5505621254319837, + "grad_norm": 4.8125, + "learning_rate": 1.266672949806388e-06, + "loss": 0.78402071, + "memory(GiB)": 147.13, + "step": 66460, + "train_speed(iter/s)": 0.201252 + }, + { + "acc": 0.78436918, + "epoch": 1.5507954330042724, + "grad_norm": 5.875, + "learning_rate": 1.265416579630373e-06, + "loss": 0.76729245, + "memory(GiB)": 147.13, + "step": 66470, + "train_speed(iter/s)": 0.201267 + }, + { + "acc": 0.77176619, + "epoch": 1.5510287405765615, + "grad_norm": 5.6875, + "learning_rate": 1.2641607425650565e-06, + "loss": 0.8262764, + "memory(GiB)": 147.13, + "step": 66480, + "train_speed(iter/s)": 0.201283 + }, + { + "acc": 0.79059591, + "epoch": 1.5512620481488502, + "grad_norm": 4.125, + "learning_rate": 1.2629054387897066e-06, + "loss": 0.75890532, + "memory(GiB)": 147.13, + "step": 66490, + "train_speed(iter/s)": 0.201299 + }, + { + "acc": 0.78669443, + "epoch": 1.5514953557211393, + "grad_norm": 6.75, + "learning_rate": 1.2616506684835217e-06, + "loss": 0.77961187, + "memory(GiB)": 147.13, + "step": 66500, + "train_speed(iter/s)": 0.201314 + }, + { + "epoch": 1.5514953557211393, + "eval_acc": 0.7446834917400007, + "eval_loss": 0.8045360445976257, + "eval_runtime": 1270.6085, + "eval_samples_per_second": 28.326, + "eval_steps_per_second": 14.163, + "step": 66500 + }, + { + "acc": 0.7779171, + "epoch": 1.551728663293428, + "grad_norm": 3.71875, + "learning_rate": 1.2603964318256167e-06, + "loss": 0.82731237, + "memory(GiB)": 147.13, + "step": 66510, + "train_speed(iter/s)": 0.200544 + }, + { + "acc": 0.76602592, + "epoch": 1.5519619708657169, + "grad_norm": 5.5625, + "learning_rate": 1.2591427289950358e-06, + "loss": 0.85228939, + "memory(GiB)": 147.13, + "step": 66520, + "train_speed(iter/s)": 0.20056 + }, + { + "acc": 0.79903426, + "epoch": 1.5521952784380058, + "grad_norm": 4.40625, + "learning_rate": 1.2578895601707435e-06, + "loss": 0.74350095, + "memory(GiB)": 147.13, + "step": 66530, + "train_speed(iter/s)": 0.200576 + }, + { + "acc": 0.77694921, + "epoch": 1.5524285860102947, + "grad_norm": 5.6875, + "learning_rate": 1.2566369255316296e-06, + "loss": 0.81319304, + "memory(GiB)": 147.13, + "step": 66540, + "train_speed(iter/s)": 0.200592 + }, + { + "acc": 0.77841153, + "epoch": 1.5526618935825836, + "grad_norm": 5.46875, + "learning_rate": 1.255384825256507e-06, + "loss": 0.79920754, + "memory(GiB)": 147.13, + "step": 66550, + "train_speed(iter/s)": 0.200607 + }, + { + "acc": 0.77843218, + "epoch": 1.5528952011548725, + "grad_norm": 4.9375, + "learning_rate": 1.2541332595241112e-06, + "loss": 0.8021946, + "memory(GiB)": 147.13, + "step": 66560, + "train_speed(iter/s)": 0.200622 + }, + { + "acc": 0.77249784, + "epoch": 1.5531285087271613, + "grad_norm": 7.15625, + "learning_rate": 1.2528822285131059e-06, + "loss": 0.8305788, + "memory(GiB)": 147.13, + "step": 66570, + "train_speed(iter/s)": 0.200636 + }, + { + "acc": 0.77560177, + "epoch": 1.5533618162994502, + "grad_norm": 5.5625, + "learning_rate": 1.2516317324020727e-06, + "loss": 0.79391565, + "memory(GiB)": 147.13, + "step": 66580, + "train_speed(iter/s)": 0.200652 + }, + { + "acc": 0.78499255, + "epoch": 1.5535951238717391, + "grad_norm": 4.21875, + "learning_rate": 1.2503817713695221e-06, + "loss": 0.76304283, + "memory(GiB)": 147.13, + "step": 66590, + "train_speed(iter/s)": 0.200668 + }, + { + "acc": 0.79315147, + "epoch": 1.553828431444028, + "grad_norm": 5.0625, + "learning_rate": 1.2491323455938831e-06, + "loss": 0.75981688, + "memory(GiB)": 147.13, + "step": 66600, + "train_speed(iter/s)": 0.200683 + }, + { + "acc": 0.79385643, + "epoch": 1.554061739016317, + "grad_norm": 3.984375, + "learning_rate": 1.2478834552535135e-06, + "loss": 0.72674108, + "memory(GiB)": 147.13, + "step": 66610, + "train_speed(iter/s)": 0.200699 + }, + { + "acc": 0.77791653, + "epoch": 1.5542950465886058, + "grad_norm": 5.71875, + "learning_rate": 1.2466351005266903e-06, + "loss": 0.78302855, + "memory(GiB)": 147.13, + "step": 66620, + "train_speed(iter/s)": 0.200715 + }, + { + "acc": 0.80931139, + "epoch": 1.5545283541608947, + "grad_norm": 3.828125, + "learning_rate": 1.2453872815916151e-06, + "loss": 0.67272167, + "memory(GiB)": 147.13, + "step": 66630, + "train_speed(iter/s)": 0.200732 + }, + { + "acc": 0.76477842, + "epoch": 1.5547616617331836, + "grad_norm": 5.25, + "learning_rate": 1.2441399986264174e-06, + "loss": 0.8524189, + "memory(GiB)": 147.13, + "step": 66640, + "train_speed(iter/s)": 0.200745 + }, + { + "acc": 0.78072309, + "epoch": 1.5549949693054725, + "grad_norm": 6.5, + "learning_rate": 1.2428932518091413e-06, + "loss": 0.79402523, + "memory(GiB)": 147.13, + "step": 66650, + "train_speed(iter/s)": 0.200761 + }, + { + "acc": 0.77371817, + "epoch": 1.5552282768777614, + "grad_norm": 30.25, + "learning_rate": 1.2416470413177633e-06, + "loss": 0.82794876, + "memory(GiB)": 147.13, + "step": 66660, + "train_speed(iter/s)": 0.200777 + }, + { + "acc": 0.7927494, + "epoch": 1.5554615844500503, + "grad_norm": 5.90625, + "learning_rate": 1.2404013673301768e-06, + "loss": 0.73954892, + "memory(GiB)": 147.13, + "step": 66670, + "train_speed(iter/s)": 0.200794 + }, + { + "acc": 0.78622789, + "epoch": 1.5556948920223392, + "grad_norm": 7.71875, + "learning_rate": 1.2391562300242044e-06, + "loss": 0.79023104, + "memory(GiB)": 147.13, + "step": 66680, + "train_speed(iter/s)": 0.200809 + }, + { + "acc": 0.77736959, + "epoch": 1.5559281995946281, + "grad_norm": 4.21875, + "learning_rate": 1.2379116295775877e-06, + "loss": 0.79935203, + "memory(GiB)": 147.13, + "step": 66690, + "train_speed(iter/s)": 0.200825 + }, + { + "acc": 0.77949438, + "epoch": 1.556161507166917, + "grad_norm": 6.78125, + "learning_rate": 1.236667566167991e-06, + "loss": 0.791997, + "memory(GiB)": 147.13, + "step": 66700, + "train_speed(iter/s)": 0.200841 + }, + { + "acc": 0.78041005, + "epoch": 1.556394814739206, + "grad_norm": 4.53125, + "learning_rate": 1.2354240399730083e-06, + "loss": 0.78324399, + "memory(GiB)": 147.13, + "step": 66710, + "train_speed(iter/s)": 0.200856 + }, + { + "acc": 0.77618508, + "epoch": 1.5566281223114946, + "grad_norm": 6.875, + "learning_rate": 1.2341810511701485e-06, + "loss": 0.80764313, + "memory(GiB)": 147.13, + "step": 66720, + "train_speed(iter/s)": 0.20087 + }, + { + "acc": 0.78065491, + "epoch": 1.5568614298837837, + "grad_norm": 5.125, + "learning_rate": 1.2329385999368509e-06, + "loss": 0.80852127, + "memory(GiB)": 147.13, + "step": 66730, + "train_speed(iter/s)": 0.200886 + }, + { + "acc": 0.78683767, + "epoch": 1.5570947374560724, + "grad_norm": 4.65625, + "learning_rate": 1.2316966864504725e-06, + "loss": 0.76271248, + "memory(GiB)": 147.13, + "step": 66740, + "train_speed(iter/s)": 0.200901 + }, + { + "acc": 0.76667438, + "epoch": 1.5573280450283615, + "grad_norm": 6.0625, + "learning_rate": 1.230455310888299e-06, + "loss": 0.85402393, + "memory(GiB)": 147.13, + "step": 66750, + "train_speed(iter/s)": 0.200918 + }, + { + "acc": 0.80291309, + "epoch": 1.5575613526006502, + "grad_norm": 5.21875, + "learning_rate": 1.2292144734275347e-06, + "loss": 0.70809097, + "memory(GiB)": 147.13, + "step": 66760, + "train_speed(iter/s)": 0.200934 + }, + { + "acc": 0.79057379, + "epoch": 1.5577946601729393, + "grad_norm": 4.46875, + "learning_rate": 1.2279741742453088e-06, + "loss": 0.75554724, + "memory(GiB)": 147.13, + "step": 66770, + "train_speed(iter/s)": 0.200949 + }, + { + "acc": 0.79762211, + "epoch": 1.558027967745228, + "grad_norm": 4.1875, + "learning_rate": 1.2267344135186743e-06, + "loss": 0.72075491, + "memory(GiB)": 147.13, + "step": 66780, + "train_speed(iter/s)": 0.200965 + }, + { + "acc": 0.78778682, + "epoch": 1.5582612753175171, + "grad_norm": 6.0, + "learning_rate": 1.2254951914246038e-06, + "loss": 0.75706754, + "memory(GiB)": 147.13, + "step": 66790, + "train_speed(iter/s)": 0.200981 + }, + { + "acc": 0.77799454, + "epoch": 1.5584945828898058, + "grad_norm": 4.5, + "learning_rate": 1.2242565081400004e-06, + "loss": 0.80059433, + "memory(GiB)": 147.13, + "step": 66800, + "train_speed(iter/s)": 0.200996 + }, + { + "acc": 0.78043733, + "epoch": 1.558727890462095, + "grad_norm": 7.78125, + "learning_rate": 1.223018363841682e-06, + "loss": 0.78616896, + "memory(GiB)": 147.13, + "step": 66810, + "train_speed(iter/s)": 0.201012 + }, + { + "acc": 0.78293238, + "epoch": 1.5589611980343836, + "grad_norm": 6.84375, + "learning_rate": 1.2217807587063962e-06, + "loss": 0.77822418, + "memory(GiB)": 147.13, + "step": 66820, + "train_speed(iter/s)": 0.201029 + }, + { + "acc": 0.75709181, + "epoch": 1.5591945056066727, + "grad_norm": 5.3125, + "learning_rate": 1.2205436929108093e-06, + "loss": 0.87782154, + "memory(GiB)": 147.13, + "step": 66830, + "train_speed(iter/s)": 0.201044 + }, + { + "acc": 0.78140383, + "epoch": 1.5594278131789614, + "grad_norm": 5.4375, + "learning_rate": 1.2193071666315114e-06, + "loss": 0.77247772, + "memory(GiB)": 147.13, + "step": 66840, + "train_speed(iter/s)": 0.201061 + }, + { + "acc": 0.77636423, + "epoch": 1.5596611207512505, + "grad_norm": 4.6875, + "learning_rate": 1.2180711800450184e-06, + "loss": 0.79241691, + "memory(GiB)": 147.13, + "step": 66850, + "train_speed(iter/s)": 0.201076 + }, + { + "acc": 0.76072507, + "epoch": 1.5598944283235392, + "grad_norm": 6.34375, + "learning_rate": 1.2168357333277641e-06, + "loss": 0.85695286, + "memory(GiB)": 147.13, + "step": 66860, + "train_speed(iter/s)": 0.201091 + }, + { + "acc": 0.80281935, + "epoch": 1.5601277358958283, + "grad_norm": 4.09375, + "learning_rate": 1.215600826656113e-06, + "loss": 0.72951822, + "memory(GiB)": 147.13, + "step": 66870, + "train_speed(iter/s)": 0.201107 + }, + { + "acc": 0.77838449, + "epoch": 1.560361043468117, + "grad_norm": 4.65625, + "learning_rate": 1.2143664602063415e-06, + "loss": 0.79508176, + "memory(GiB)": 147.13, + "step": 66880, + "train_speed(iter/s)": 0.201123 + }, + { + "acc": 0.78288064, + "epoch": 1.5605943510404061, + "grad_norm": 6.15625, + "learning_rate": 1.2131326341546596e-06, + "loss": 0.78689842, + "memory(GiB)": 147.13, + "step": 66890, + "train_speed(iter/s)": 0.201138 + }, + { + "acc": 0.7747613, + "epoch": 1.5608276586126948, + "grad_norm": 5.5, + "learning_rate": 1.2118993486771924e-06, + "loss": 0.80352983, + "memory(GiB)": 147.13, + "step": 66900, + "train_speed(iter/s)": 0.201153 + }, + { + "acc": 0.78817844, + "epoch": 1.5610609661849837, + "grad_norm": 6.28125, + "learning_rate": 1.2106666039499942e-06, + "loss": 0.77384362, + "memory(GiB)": 147.13, + "step": 66910, + "train_speed(iter/s)": 0.201168 + }, + { + "acc": 0.75901203, + "epoch": 1.5612942737572726, + "grad_norm": 5.40625, + "learning_rate": 1.2094344001490383e-06, + "loss": 0.88302488, + "memory(GiB)": 147.13, + "step": 66920, + "train_speed(iter/s)": 0.201185 + }, + { + "acc": 0.78414526, + "epoch": 1.5615275813295615, + "grad_norm": 6.0625, + "learning_rate": 1.2082027374502181e-06, + "loss": 0.79911737, + "memory(GiB)": 147.13, + "step": 66930, + "train_speed(iter/s)": 0.201201 + }, + { + "acc": 0.77624192, + "epoch": 1.5617608889018504, + "grad_norm": 5.9375, + "learning_rate": 1.2069716160293577e-06, + "loss": 0.80140715, + "memory(GiB)": 147.13, + "step": 66940, + "train_speed(iter/s)": 0.201216 + }, + { + "acc": 0.7758502, + "epoch": 1.5619941964741393, + "grad_norm": 5.0625, + "learning_rate": 1.2057410360621952e-06, + "loss": 0.80642414, + "memory(GiB)": 147.13, + "step": 66950, + "train_speed(iter/s)": 0.201233 + }, + { + "acc": 0.76256318, + "epoch": 1.5622275040464282, + "grad_norm": 10.5625, + "learning_rate": 1.2045109977243996e-06, + "loss": 0.84569578, + "memory(GiB)": 147.13, + "step": 66960, + "train_speed(iter/s)": 0.201249 + }, + { + "acc": 0.79698315, + "epoch": 1.562460811618717, + "grad_norm": 5.59375, + "learning_rate": 1.203281501191556e-06, + "loss": 0.72654285, + "memory(GiB)": 147.13, + "step": 66970, + "train_speed(iter/s)": 0.201265 + }, + { + "acc": 0.78373365, + "epoch": 1.562694119191006, + "grad_norm": 7.03125, + "learning_rate": 1.2020525466391758e-06, + "loss": 0.76392503, + "memory(GiB)": 147.13, + "step": 66980, + "train_speed(iter/s)": 0.20128 + }, + { + "acc": 0.79392319, + "epoch": 1.5629274267632949, + "grad_norm": 7.875, + "learning_rate": 1.2008241342426907e-06, + "loss": 0.71769762, + "memory(GiB)": 147.13, + "step": 66990, + "train_speed(iter/s)": 0.201296 + }, + { + "acc": 0.79626703, + "epoch": 1.5631607343355838, + "grad_norm": 6.46875, + "learning_rate": 1.1995962641774556e-06, + "loss": 0.71886544, + "memory(GiB)": 147.13, + "step": 67000, + "train_speed(iter/s)": 0.201311 + }, + { + "epoch": 1.5631607343355838, + "eval_acc": 0.7446565875089741, + "eval_loss": 0.80445796251297, + "eval_runtime": 1269.6632, + "eval_samples_per_second": 28.347, + "eval_steps_per_second": 14.174, + "step": 67000 + }, + { + "acc": 0.7860218, + "epoch": 1.5633940419078727, + "grad_norm": 5.0, + "learning_rate": 1.1983689366187512e-06, + "loss": 0.77764053, + "memory(GiB)": 147.13, + "step": 67010, + "train_speed(iter/s)": 0.200547 + }, + { + "acc": 0.77029128, + "epoch": 1.5636273494801616, + "grad_norm": 5.8125, + "learning_rate": 1.1971421517417748e-06, + "loss": 0.85604916, + "memory(GiB)": 147.13, + "step": 67020, + "train_speed(iter/s)": 0.200564 + }, + { + "acc": 0.78226252, + "epoch": 1.5638606570524505, + "grad_norm": 5.09375, + "learning_rate": 1.1959159097216533e-06, + "loss": 0.78034925, + "memory(GiB)": 147.13, + "step": 67030, + "train_speed(iter/s)": 0.200579 + }, + { + "acc": 0.78731737, + "epoch": 1.5640939646247394, + "grad_norm": 6.03125, + "learning_rate": 1.1946902107334308e-06, + "loss": 0.7751092, + "memory(GiB)": 147.13, + "step": 67040, + "train_speed(iter/s)": 0.200595 + }, + { + "acc": 0.77418489, + "epoch": 1.5643272721970283, + "grad_norm": 6.65625, + "learning_rate": 1.1934650549520737e-06, + "loss": 0.81517506, + "memory(GiB)": 147.13, + "step": 67050, + "train_speed(iter/s)": 0.200611 + }, + { + "acc": 0.7982223, + "epoch": 1.5645605797693172, + "grad_norm": 5.6875, + "learning_rate": 1.1922404425524753e-06, + "loss": 0.72336531, + "memory(GiB)": 147.13, + "step": 67060, + "train_speed(iter/s)": 0.200626 + }, + { + "acc": 0.78747759, + "epoch": 1.564793887341606, + "grad_norm": 5.03125, + "learning_rate": 1.1910163737094465e-06, + "loss": 0.75430288, + "memory(GiB)": 147.13, + "step": 67070, + "train_speed(iter/s)": 0.200641 + }, + { + "acc": 0.78929777, + "epoch": 1.565027194913895, + "grad_norm": 5.21875, + "learning_rate": 1.189792848597725e-06, + "loss": 0.77023878, + "memory(GiB)": 147.13, + "step": 67080, + "train_speed(iter/s)": 0.200657 + }, + { + "acc": 0.75839481, + "epoch": 1.5652605024861839, + "grad_norm": 6.5, + "learning_rate": 1.1885698673919666e-06, + "loss": 0.86226034, + "memory(GiB)": 147.13, + "step": 67090, + "train_speed(iter/s)": 0.200672 + }, + { + "acc": 0.77864447, + "epoch": 1.5654938100584728, + "grad_norm": 5.59375, + "learning_rate": 1.1873474302667548e-06, + "loss": 0.81831455, + "memory(GiB)": 147.13, + "step": 67100, + "train_speed(iter/s)": 0.200688 + }, + { + "acc": 0.77476091, + "epoch": 1.5657271176307614, + "grad_norm": 5.625, + "learning_rate": 1.186125537396587e-06, + "loss": 0.80706577, + "memory(GiB)": 147.13, + "step": 67110, + "train_speed(iter/s)": 0.200704 + }, + { + "acc": 0.76268578, + "epoch": 1.5659604252030506, + "grad_norm": 4.84375, + "learning_rate": 1.1849041889558922e-06, + "loss": 0.85323725, + "memory(GiB)": 147.13, + "step": 67120, + "train_speed(iter/s)": 0.200718 + }, + { + "acc": 0.78476624, + "epoch": 1.5661937327753392, + "grad_norm": 4.375, + "learning_rate": 1.1836833851190161e-06, + "loss": 0.78540063, + "memory(GiB)": 147.13, + "step": 67130, + "train_speed(iter/s)": 0.200733 + }, + { + "acc": 0.79616303, + "epoch": 1.5664270403476284, + "grad_norm": 6.25, + "learning_rate": 1.1824631260602266e-06, + "loss": 0.71438951, + "memory(GiB)": 147.13, + "step": 67140, + "train_speed(iter/s)": 0.200749 + }, + { + "acc": 0.79416599, + "epoch": 1.566660347919917, + "grad_norm": 4.59375, + "learning_rate": 1.1812434119537187e-06, + "loss": 0.74181514, + "memory(GiB)": 147.13, + "step": 67150, + "train_speed(iter/s)": 0.200765 + }, + { + "acc": 0.78638306, + "epoch": 1.5668936554922062, + "grad_norm": 8.25, + "learning_rate": 1.1800242429736025e-06, + "loss": 0.75746689, + "memory(GiB)": 147.13, + "step": 67160, + "train_speed(iter/s)": 0.200781 + }, + { + "acc": 0.76691604, + "epoch": 1.5671269630644948, + "grad_norm": 4.84375, + "learning_rate": 1.1788056192939173e-06, + "loss": 0.85675087, + "memory(GiB)": 147.13, + "step": 67170, + "train_speed(iter/s)": 0.200796 + }, + { + "acc": 0.78023224, + "epoch": 1.567360270636784, + "grad_norm": 5.75, + "learning_rate": 1.1775875410886206e-06, + "loss": 0.80645647, + "memory(GiB)": 147.13, + "step": 67180, + "train_speed(iter/s)": 0.200811 + }, + { + "acc": 0.80352058, + "epoch": 1.5675935782090726, + "grad_norm": 3.640625, + "learning_rate": 1.17637000853159e-06, + "loss": 0.68619394, + "memory(GiB)": 147.13, + "step": 67190, + "train_speed(iter/s)": 0.200826 + }, + { + "acc": 0.77900271, + "epoch": 1.5678268857813618, + "grad_norm": 4.90625, + "learning_rate": 1.1751530217966312e-06, + "loss": 0.79527159, + "memory(GiB)": 147.13, + "step": 67200, + "train_speed(iter/s)": 0.200842 + }, + { + "acc": 0.79193454, + "epoch": 1.5680601933536504, + "grad_norm": 9.1875, + "learning_rate": 1.1739365810574677e-06, + "loss": 0.74616985, + "memory(GiB)": 147.13, + "step": 67210, + "train_speed(iter/s)": 0.200857 + }, + { + "acc": 0.79721117, + "epoch": 1.5682935009259396, + "grad_norm": 9.0, + "learning_rate": 1.1727206864877456e-06, + "loss": 0.72338243, + "memory(GiB)": 147.13, + "step": 67220, + "train_speed(iter/s)": 0.200872 + }, + { + "acc": 0.77702656, + "epoch": 1.5685268084982282, + "grad_norm": 4.8125, + "learning_rate": 1.1715053382610325e-06, + "loss": 0.79841452, + "memory(GiB)": 147.13, + "step": 67230, + "train_speed(iter/s)": 0.200888 + }, + { + "acc": 0.74713612, + "epoch": 1.5687601160705174, + "grad_norm": 4.5625, + "learning_rate": 1.170290536550821e-06, + "loss": 0.89974098, + "memory(GiB)": 147.13, + "step": 67240, + "train_speed(iter/s)": 0.200905 + }, + { + "acc": 0.77886939, + "epoch": 1.568993423642806, + "grad_norm": 3.78125, + "learning_rate": 1.1690762815305224e-06, + "loss": 0.80204468, + "memory(GiB)": 147.13, + "step": 67250, + "train_speed(iter/s)": 0.20092 + }, + { + "acc": 0.77801399, + "epoch": 1.5692267312150951, + "grad_norm": 5.78125, + "learning_rate": 1.1678625733734722e-06, + "loss": 0.79122267, + "memory(GiB)": 147.13, + "step": 67260, + "train_speed(iter/s)": 0.200936 + }, + { + "acc": 0.77463903, + "epoch": 1.5694600387873838, + "grad_norm": 5.875, + "learning_rate": 1.166649412252927e-06, + "loss": 0.80886192, + "memory(GiB)": 147.13, + "step": 67270, + "train_speed(iter/s)": 0.200952 + }, + { + "acc": 0.81241455, + "epoch": 1.569693346359673, + "grad_norm": 4.8125, + "learning_rate": 1.1654367983420628e-06, + "loss": 0.65604534, + "memory(GiB)": 147.13, + "step": 67280, + "train_speed(iter/s)": 0.200968 + }, + { + "acc": 0.78449936, + "epoch": 1.5699266539319616, + "grad_norm": 5.0, + "learning_rate": 1.1642247318139837e-06, + "loss": 0.77122612, + "memory(GiB)": 147.13, + "step": 67290, + "train_speed(iter/s)": 0.200984 + }, + { + "acc": 0.77493362, + "epoch": 1.5701599615042505, + "grad_norm": 5.46875, + "learning_rate": 1.1630132128417083e-06, + "loss": 0.80600281, + "memory(GiB)": 147.13, + "step": 67300, + "train_speed(iter/s)": 0.200999 + }, + { + "acc": 0.7923779, + "epoch": 1.5703932690765394, + "grad_norm": 6.875, + "learning_rate": 1.1618022415981827e-06, + "loss": 0.7523242, + "memory(GiB)": 147.13, + "step": 67310, + "train_speed(iter/s)": 0.201015 + }, + { + "acc": 0.7808217, + "epoch": 1.5706265766488283, + "grad_norm": 5.9375, + "learning_rate": 1.1605918182562731e-06, + "loss": 0.784408, + "memory(GiB)": 147.13, + "step": 67320, + "train_speed(iter/s)": 0.20103 + }, + { + "acc": 0.77961884, + "epoch": 1.5708598842211172, + "grad_norm": 5.96875, + "learning_rate": 1.1593819429887655e-06, + "loss": 0.78548775, + "memory(GiB)": 147.13, + "step": 67330, + "train_speed(iter/s)": 0.201046 + }, + { + "acc": 0.79387894, + "epoch": 1.5710931917934061, + "grad_norm": 5.25, + "learning_rate": 1.1581726159683698e-06, + "loss": 0.73550348, + "memory(GiB)": 147.13, + "step": 67340, + "train_speed(iter/s)": 0.201061 + }, + { + "acc": 0.77411294, + "epoch": 1.571326499365695, + "grad_norm": 17.125, + "learning_rate": 1.1569638373677162e-06, + "loss": 0.80191069, + "memory(GiB)": 147.13, + "step": 67350, + "train_speed(iter/s)": 0.201076 + }, + { + "acc": 0.77730083, + "epoch": 1.571559806937984, + "grad_norm": 5.03125, + "learning_rate": 1.1557556073593595e-06, + "loss": 0.79704728, + "memory(GiB)": 147.13, + "step": 67360, + "train_speed(iter/s)": 0.201092 + }, + { + "acc": 0.77600565, + "epoch": 1.5717931145102728, + "grad_norm": 5.875, + "learning_rate": 1.1545479261157715e-06, + "loss": 0.80014782, + "memory(GiB)": 147.13, + "step": 67370, + "train_speed(iter/s)": 0.201107 + }, + { + "acc": 0.76335773, + "epoch": 1.5720264220825617, + "grad_norm": 5.9375, + "learning_rate": 1.1533407938093515e-06, + "loss": 0.8461832, + "memory(GiB)": 147.13, + "step": 67380, + "train_speed(iter/s)": 0.201123 + }, + { + "acc": 0.7748148, + "epoch": 1.5722597296548506, + "grad_norm": 5.34375, + "learning_rate": 1.1521342106124145e-06, + "loss": 0.80657759, + "memory(GiB)": 147.13, + "step": 67390, + "train_speed(iter/s)": 0.201138 + }, + { + "acc": 0.80296421, + "epoch": 1.5724930372271395, + "grad_norm": 4.21875, + "learning_rate": 1.1509281766972026e-06, + "loss": 0.6991787, + "memory(GiB)": 147.13, + "step": 67400, + "train_speed(iter/s)": 0.201153 + }, + { + "acc": 0.78165808, + "epoch": 1.5727263447994284, + "grad_norm": 6.0, + "learning_rate": 1.149722692235875e-06, + "loss": 0.79229851, + "memory(GiB)": 147.13, + "step": 67410, + "train_speed(iter/s)": 0.201168 + }, + { + "acc": 0.77584095, + "epoch": 1.5729596523717173, + "grad_norm": 5.46875, + "learning_rate": 1.1485177574005134e-06, + "loss": 0.79919806, + "memory(GiB)": 147.13, + "step": 67420, + "train_speed(iter/s)": 0.201184 + }, + { + "acc": 0.77663813, + "epoch": 1.5731929599440062, + "grad_norm": 6.40625, + "learning_rate": 1.1473133723631241e-06, + "loss": 0.83092327, + "memory(GiB)": 147.13, + "step": 67430, + "train_speed(iter/s)": 0.2012 + }, + { + "acc": 0.761483, + "epoch": 1.573426267516295, + "grad_norm": 4.53125, + "learning_rate": 1.1461095372956322e-06, + "loss": 0.86643829, + "memory(GiB)": 147.13, + "step": 67440, + "train_speed(iter/s)": 0.201216 + }, + { + "acc": 0.76875048, + "epoch": 1.573659575088584, + "grad_norm": 3.765625, + "learning_rate": 1.1449062523698839e-06, + "loss": 0.82976036, + "memory(GiB)": 147.13, + "step": 67450, + "train_speed(iter/s)": 0.201231 + }, + { + "acc": 0.77960482, + "epoch": 1.573892882660873, + "grad_norm": 5.375, + "learning_rate": 1.1437035177576467e-06, + "loss": 0.79715147, + "memory(GiB)": 147.13, + "step": 67460, + "train_speed(iter/s)": 0.201245 + }, + { + "acc": 0.77590551, + "epoch": 1.5741261902331618, + "grad_norm": 4.9375, + "learning_rate": 1.142501333630614e-06, + "loss": 0.79287214, + "memory(GiB)": 147.13, + "step": 67470, + "train_speed(iter/s)": 0.201262 + }, + { + "acc": 0.7959466, + "epoch": 1.5743594978054507, + "grad_norm": 3.984375, + "learning_rate": 1.1412997001603947e-06, + "loss": 0.74863319, + "memory(GiB)": 147.13, + "step": 67480, + "train_speed(iter/s)": 0.201277 + }, + { + "acc": 0.80535631, + "epoch": 1.5745928053777396, + "grad_norm": 35.5, + "learning_rate": 1.1400986175185214e-06, + "loss": 0.70440435, + "memory(GiB)": 147.13, + "step": 67490, + "train_speed(iter/s)": 0.201291 + }, + { + "acc": 0.78224182, + "epoch": 1.5748261129500283, + "grad_norm": 5.3125, + "learning_rate": 1.1388980858764504e-06, + "loss": 0.76127481, + "memory(GiB)": 147.13, + "step": 67500, + "train_speed(iter/s)": 0.201306 + }, + { + "epoch": 1.5748261129500283, + "eval_acc": 0.74470014674016, + "eval_loss": 0.804486095905304, + "eval_runtime": 1270.1666, + "eval_samples_per_second": 28.336, + "eval_steps_per_second": 14.168, + "step": 67500 + }, + { + "acc": 0.78972077, + "epoch": 1.5750594205223174, + "grad_norm": 5.1875, + "learning_rate": 1.1376981054055542e-06, + "loss": 0.75714273, + "memory(GiB)": 147.13, + "step": 67510, + "train_speed(iter/s)": 0.200548 + }, + { + "acc": 0.75635891, + "epoch": 1.575292728094606, + "grad_norm": 5.65625, + "learning_rate": 1.136498676277133e-06, + "loss": 0.8773056, + "memory(GiB)": 147.13, + "step": 67520, + "train_speed(iter/s)": 0.200564 + }, + { + "acc": 0.78398147, + "epoch": 1.5755260356668952, + "grad_norm": 4.59375, + "learning_rate": 1.135299798662402e-06, + "loss": 0.76907773, + "memory(GiB)": 147.13, + "step": 67530, + "train_speed(iter/s)": 0.200579 + }, + { + "acc": 0.77541685, + "epoch": 1.5757593432391839, + "grad_norm": 5.28125, + "learning_rate": 1.1341014727325038e-06, + "loss": 0.83951645, + "memory(GiB)": 147.13, + "step": 67540, + "train_speed(iter/s)": 0.200594 + }, + { + "acc": 0.77365732, + "epoch": 1.575992650811473, + "grad_norm": 4.9375, + "learning_rate": 1.1329036986584968e-06, + "loss": 0.80199337, + "memory(GiB)": 147.13, + "step": 67550, + "train_speed(iter/s)": 0.20061 + }, + { + "acc": 0.77182055, + "epoch": 1.5762259583837617, + "grad_norm": 10.0, + "learning_rate": 1.131706476611364e-06, + "loss": 0.84103651, + "memory(GiB)": 147.13, + "step": 67560, + "train_speed(iter/s)": 0.200624 + }, + { + "acc": 0.78937941, + "epoch": 1.5764592659560508, + "grad_norm": 5.375, + "learning_rate": 1.1305098067620074e-06, + "loss": 0.74375906, + "memory(GiB)": 147.13, + "step": 67570, + "train_speed(iter/s)": 0.200638 + }, + { + "acc": 0.78117437, + "epoch": 1.5766925735283395, + "grad_norm": 5.3125, + "learning_rate": 1.1293136892812507e-06, + "loss": 0.79897671, + "memory(GiB)": 147.13, + "step": 67580, + "train_speed(iter/s)": 0.200653 + }, + { + "acc": 0.78500509, + "epoch": 1.5769258811006286, + "grad_norm": 4.625, + "learning_rate": 1.1281181243398414e-06, + "loss": 0.76446457, + "memory(GiB)": 147.13, + "step": 67590, + "train_speed(iter/s)": 0.200668 + }, + { + "acc": 0.76707001, + "epoch": 1.5771591886729173, + "grad_norm": 4.9375, + "learning_rate": 1.1269231121084439e-06, + "loss": 0.83135767, + "memory(GiB)": 147.13, + "step": 67600, + "train_speed(iter/s)": 0.200683 + }, + { + "acc": 0.78357925, + "epoch": 1.5773924962452064, + "grad_norm": 5.6875, + "learning_rate": 1.1257286527576488e-06, + "loss": 0.77646589, + "memory(GiB)": 147.13, + "step": 67610, + "train_speed(iter/s)": 0.200698 + }, + { + "acc": 0.76686993, + "epoch": 1.577625803817495, + "grad_norm": 6.46875, + "learning_rate": 1.1245347464579626e-06, + "loss": 0.83330984, + "memory(GiB)": 147.13, + "step": 67620, + "train_speed(iter/s)": 0.200714 + }, + { + "acc": 0.77705078, + "epoch": 1.5778591113897842, + "grad_norm": 5.1875, + "learning_rate": 1.1233413933798143e-06, + "loss": 0.80012417, + "memory(GiB)": 147.13, + "step": 67630, + "train_speed(iter/s)": 0.200729 + }, + { + "acc": 0.78694525, + "epoch": 1.5780924189620729, + "grad_norm": 6.53125, + "learning_rate": 1.1221485936935571e-06, + "loss": 0.75556831, + "memory(GiB)": 147.13, + "step": 67640, + "train_speed(iter/s)": 0.200742 + }, + { + "acc": 0.7862113, + "epoch": 1.578325726534362, + "grad_norm": 4.59375, + "learning_rate": 1.1209563475694608e-06, + "loss": 0.77506971, + "memory(GiB)": 147.13, + "step": 67650, + "train_speed(iter/s)": 0.200755 + }, + { + "acc": 0.76903219, + "epoch": 1.5785590341066507, + "grad_norm": 6.15625, + "learning_rate": 1.1197646551777196e-06, + "loss": 0.83937769, + "memory(GiB)": 147.13, + "step": 67660, + "train_speed(iter/s)": 0.20077 + }, + { + "acc": 0.78912687, + "epoch": 1.5787923416789398, + "grad_norm": 4.53125, + "learning_rate": 1.118573516688447e-06, + "loss": 0.75033379, + "memory(GiB)": 147.13, + "step": 67670, + "train_speed(iter/s)": 0.200786 + }, + { + "acc": 0.7752059, + "epoch": 1.5790256492512285, + "grad_norm": 4.625, + "learning_rate": 1.1173829322716774e-06, + "loss": 0.80030231, + "memory(GiB)": 147.13, + "step": 67680, + "train_speed(iter/s)": 0.200801 + }, + { + "acc": 0.77555189, + "epoch": 1.5792589568235174, + "grad_norm": 4.8125, + "learning_rate": 1.116192902097365e-06, + "loss": 0.79320483, + "memory(GiB)": 147.13, + "step": 67690, + "train_speed(iter/s)": 0.200816 + }, + { + "acc": 0.77827835, + "epoch": 1.5794922643958063, + "grad_norm": 8.25, + "learning_rate": 1.1150034263353887e-06, + "loss": 0.78702092, + "memory(GiB)": 147.13, + "step": 67700, + "train_speed(iter/s)": 0.200832 + }, + { + "acc": 0.78876238, + "epoch": 1.5797255719680952, + "grad_norm": 4.40625, + "learning_rate": 1.113814505155545e-06, + "loss": 0.75948009, + "memory(GiB)": 147.13, + "step": 67710, + "train_speed(iter/s)": 0.200848 + }, + { + "acc": 0.78561726, + "epoch": 1.579958879540384, + "grad_norm": 4.3125, + "learning_rate": 1.11262613872755e-06, + "loss": 0.78061628, + "memory(GiB)": 147.13, + "step": 67720, + "train_speed(iter/s)": 0.200863 + }, + { + "acc": 0.76869831, + "epoch": 1.580192187112673, + "grad_norm": 4.6875, + "learning_rate": 1.111438327221046e-06, + "loss": 0.84357386, + "memory(GiB)": 147.13, + "step": 67730, + "train_speed(iter/s)": 0.200878 + }, + { + "acc": 0.79987097, + "epoch": 1.5804254946849619, + "grad_norm": 4.78125, + "learning_rate": 1.1102510708055897e-06, + "loss": 0.71598945, + "memory(GiB)": 147.13, + "step": 67740, + "train_speed(iter/s)": 0.200893 + }, + { + "acc": 0.76315088, + "epoch": 1.5806588022572508, + "grad_norm": 6.78125, + "learning_rate": 1.1090643696506648e-06, + "loss": 0.87976942, + "memory(GiB)": 147.13, + "step": 67750, + "train_speed(iter/s)": 0.200909 + }, + { + "acc": 0.79254284, + "epoch": 1.5808921098295396, + "grad_norm": 4.34375, + "learning_rate": 1.1078782239256707e-06, + "loss": 0.71899223, + "memory(GiB)": 147.13, + "step": 67760, + "train_speed(iter/s)": 0.200924 + }, + { + "acc": 0.78601794, + "epoch": 1.5811254174018285, + "grad_norm": 4.875, + "learning_rate": 1.106692633799928e-06, + "loss": 0.75329552, + "memory(GiB)": 147.13, + "step": 67770, + "train_speed(iter/s)": 0.20094 + }, + { + "acc": 0.77427597, + "epoch": 1.5813587249741174, + "grad_norm": 6.0625, + "learning_rate": 1.1055075994426833e-06, + "loss": 0.818151, + "memory(GiB)": 147.13, + "step": 67780, + "train_speed(iter/s)": 0.200955 + }, + { + "acc": 0.79199877, + "epoch": 1.5815920325464063, + "grad_norm": 6.875, + "learning_rate": 1.1043231210230949e-06, + "loss": 0.73157892, + "memory(GiB)": 147.13, + "step": 67790, + "train_speed(iter/s)": 0.20097 + }, + { + "acc": 0.77825437, + "epoch": 1.5818253401186952, + "grad_norm": 6.09375, + "learning_rate": 1.1031391987102502e-06, + "loss": 0.78899622, + "memory(GiB)": 147.13, + "step": 67800, + "train_speed(iter/s)": 0.200985 + }, + { + "acc": 0.7796669, + "epoch": 1.5820586476909841, + "grad_norm": 7.0625, + "learning_rate": 1.1019558326731522e-06, + "loss": 0.78667364, + "memory(GiB)": 147.13, + "step": 67810, + "train_speed(iter/s)": 0.201 + }, + { + "acc": 0.76001234, + "epoch": 1.582291955263273, + "grad_norm": 6.125, + "learning_rate": 1.100773023080728e-06, + "loss": 0.8563343, + "memory(GiB)": 147.13, + "step": 67820, + "train_speed(iter/s)": 0.201016 + }, + { + "acc": 0.76715779, + "epoch": 1.582525262835562, + "grad_norm": 6.3125, + "learning_rate": 1.09959077010182e-06, + "loss": 0.83880901, + "memory(GiB)": 147.13, + "step": 67830, + "train_speed(iter/s)": 0.201032 + }, + { + "acc": 0.77683773, + "epoch": 1.5827585704078508, + "grad_norm": 4.09375, + "learning_rate": 1.0984090739051984e-06, + "loss": 0.79647036, + "memory(GiB)": 147.13, + "step": 67840, + "train_speed(iter/s)": 0.201048 + }, + { + "acc": 0.77974472, + "epoch": 1.5829918779801397, + "grad_norm": 4.6875, + "learning_rate": 1.0972279346595477e-06, + "loss": 0.78166647, + "memory(GiB)": 147.13, + "step": 67850, + "train_speed(iter/s)": 0.201063 + }, + { + "acc": 0.78405037, + "epoch": 1.5832251855524286, + "grad_norm": 5.03125, + "learning_rate": 1.0960473525334747e-06, + "loss": 0.76982818, + "memory(GiB)": 147.13, + "step": 67860, + "train_speed(iter/s)": 0.201077 + }, + { + "acc": 0.76590328, + "epoch": 1.5834584931247173, + "grad_norm": 4.3125, + "learning_rate": 1.094867327695509e-06, + "loss": 0.85223007, + "memory(GiB)": 147.13, + "step": 67870, + "train_speed(iter/s)": 0.201092 + }, + { + "acc": 0.79253726, + "epoch": 1.5836918006970064, + "grad_norm": 4.71875, + "learning_rate": 1.0936878603140966e-06, + "loss": 0.72532487, + "memory(GiB)": 147.13, + "step": 67880, + "train_speed(iter/s)": 0.201108 + }, + { + "acc": 0.79152641, + "epoch": 1.5839251082692951, + "grad_norm": 5.0625, + "learning_rate": 1.0925089505576085e-06, + "loss": 0.75220585, + "memory(GiB)": 147.13, + "step": 67890, + "train_speed(iter/s)": 0.201123 + }, + { + "acc": 0.77057228, + "epoch": 1.5841584158415842, + "grad_norm": 5.9375, + "learning_rate": 1.0913305985943328e-06, + "loss": 0.83439999, + "memory(GiB)": 147.13, + "step": 67900, + "train_speed(iter/s)": 0.201138 + }, + { + "acc": 0.78468051, + "epoch": 1.584391723413873, + "grad_norm": 7.0, + "learning_rate": 1.0901528045924786e-06, + "loss": 0.75254793, + "memory(GiB)": 147.13, + "step": 67910, + "train_speed(iter/s)": 0.201152 + }, + { + "acc": 0.77283545, + "epoch": 1.584625030986162, + "grad_norm": 4.5, + "learning_rate": 1.0889755687201758e-06, + "loss": 0.82278538, + "memory(GiB)": 147.13, + "step": 67920, + "train_speed(iter/s)": 0.201168 + }, + { + "acc": 0.78898163, + "epoch": 1.5848583385584507, + "grad_norm": 5.46875, + "learning_rate": 1.087798891145473e-06, + "loss": 0.75745392, + "memory(GiB)": 147.13, + "step": 67930, + "train_speed(iter/s)": 0.201184 + }, + { + "acc": 0.75619049, + "epoch": 1.5850916461307398, + "grad_norm": 6.5, + "learning_rate": 1.0866227720363431e-06, + "loss": 0.86945744, + "memory(GiB)": 147.13, + "step": 67940, + "train_speed(iter/s)": 0.2012 + }, + { + "acc": 0.78170376, + "epoch": 1.5853249537030285, + "grad_norm": 5.8125, + "learning_rate": 1.0854472115606745e-06, + "loss": 0.79862347, + "memory(GiB)": 147.13, + "step": 67950, + "train_speed(iter/s)": 0.201216 + }, + { + "acc": 0.80022554, + "epoch": 1.5855582612753176, + "grad_norm": 5.4375, + "learning_rate": 1.0842722098862813e-06, + "loss": 0.70132508, + "memory(GiB)": 147.13, + "step": 67960, + "train_speed(iter/s)": 0.201231 + }, + { + "acc": 0.79483395, + "epoch": 1.5857915688476063, + "grad_norm": 4.8125, + "learning_rate": 1.0830977671808918e-06, + "loss": 0.74464898, + "memory(GiB)": 147.13, + "step": 67970, + "train_speed(iter/s)": 0.201246 + }, + { + "acc": 0.7618938, + "epoch": 1.5860248764198954, + "grad_norm": 5.9375, + "learning_rate": 1.081923883612157e-06, + "loss": 0.85374565, + "memory(GiB)": 147.13, + "step": 67980, + "train_speed(iter/s)": 0.201261 + }, + { + "acc": 0.77546587, + "epoch": 1.586258183992184, + "grad_norm": 5.1875, + "learning_rate": 1.080750559347651e-06, + "loss": 0.79738269, + "memory(GiB)": 147.13, + "step": 67990, + "train_speed(iter/s)": 0.201277 + }, + { + "acc": 0.78763037, + "epoch": 1.5864914915644732, + "grad_norm": 4.5625, + "learning_rate": 1.0795777945548624e-06, + "loss": 0.78351851, + "memory(GiB)": 147.13, + "step": 68000, + "train_speed(iter/s)": 0.201292 + }, + { + "epoch": 1.5864914915644732, + "eval_acc": 0.7446918192400804, + "eval_loss": 0.8044730424880981, + "eval_runtime": 1269.9877, + "eval_samples_per_second": 28.34, + "eval_steps_per_second": 14.17, + "step": 68000 + }, + { + "acc": 0.78458452, + "epoch": 1.586724799136762, + "grad_norm": 5.96875, + "learning_rate": 1.078405589401208e-06, + "loss": 0.77951698, + "memory(GiB)": 147.13, + "step": 68010, + "train_speed(iter/s)": 0.200538 + }, + { + "acc": 0.77816787, + "epoch": 1.586958106709051, + "grad_norm": 7.375, + "learning_rate": 1.0772339440540135e-06, + "loss": 0.77889547, + "memory(GiB)": 147.13, + "step": 68020, + "train_speed(iter/s)": 0.200553 + }, + { + "acc": 0.77970071, + "epoch": 1.5871914142813397, + "grad_norm": 5.6875, + "learning_rate": 1.076062858680535e-06, + "loss": 0.78750424, + "memory(GiB)": 147.13, + "step": 68030, + "train_speed(iter/s)": 0.200567 + }, + { + "acc": 0.79411736, + "epoch": 1.5874247218536288, + "grad_norm": 3.90625, + "learning_rate": 1.0748923334479427e-06, + "loss": 0.73421063, + "memory(GiB)": 147.13, + "step": 68040, + "train_speed(iter/s)": 0.200582 + }, + { + "acc": 0.78528967, + "epoch": 1.5876580294259175, + "grad_norm": 5.4375, + "learning_rate": 1.0737223685233306e-06, + "loss": 0.76053562, + "memory(GiB)": 147.13, + "step": 68050, + "train_speed(iter/s)": 0.200597 + }, + { + "acc": 0.77646742, + "epoch": 1.5878913369982064, + "grad_norm": 4.75, + "learning_rate": 1.0725529640737098e-06, + "loss": 0.80411501, + "memory(GiB)": 147.13, + "step": 68060, + "train_speed(iter/s)": 0.200612 + }, + { + "acc": 0.76218376, + "epoch": 1.5881246445704953, + "grad_norm": 5.15625, + "learning_rate": 1.0713841202660114e-06, + "loss": 0.86731377, + "memory(GiB)": 147.13, + "step": 68070, + "train_speed(iter/s)": 0.200625 + }, + { + "acc": 0.77366896, + "epoch": 1.5883579521427842, + "grad_norm": 5.0, + "learning_rate": 1.0702158372670895e-06, + "loss": 0.80369129, + "memory(GiB)": 147.13, + "step": 68080, + "train_speed(iter/s)": 0.200641 + }, + { + "acc": 0.77165804, + "epoch": 1.588591259715073, + "grad_norm": 5.0, + "learning_rate": 1.0690481152437138e-06, + "loss": 0.80997429, + "memory(GiB)": 147.13, + "step": 68090, + "train_speed(iter/s)": 0.200655 + }, + { + "acc": 0.7878973, + "epoch": 1.588824567287362, + "grad_norm": 6.15625, + "learning_rate": 1.0678809543625796e-06, + "loss": 0.76152925, + "memory(GiB)": 147.13, + "step": 68100, + "train_speed(iter/s)": 0.200671 + }, + { + "acc": 0.77761049, + "epoch": 1.5890578748596509, + "grad_norm": 6.40625, + "learning_rate": 1.0667143547902964e-06, + "loss": 0.80845604, + "memory(GiB)": 147.13, + "step": 68110, + "train_speed(iter/s)": 0.200686 + }, + { + "acc": 0.78582354, + "epoch": 1.5892911824319398, + "grad_norm": 4.0, + "learning_rate": 1.065548316693395e-06, + "loss": 0.75975571, + "memory(GiB)": 147.13, + "step": 68120, + "train_speed(iter/s)": 0.200702 + }, + { + "acc": 0.78570542, + "epoch": 1.5895244900042287, + "grad_norm": 4.6875, + "learning_rate": 1.0643828402383317e-06, + "loss": 0.7738452, + "memory(GiB)": 147.13, + "step": 68130, + "train_speed(iter/s)": 0.200716 + }, + { + "acc": 0.78605022, + "epoch": 1.5897577975765176, + "grad_norm": 5.3125, + "learning_rate": 1.0632179255914716e-06, + "loss": 0.76639547, + "memory(GiB)": 147.13, + "step": 68140, + "train_speed(iter/s)": 0.200731 + }, + { + "acc": 0.77231016, + "epoch": 1.5899911051488065, + "grad_norm": 5.5625, + "learning_rate": 1.06205357291911e-06, + "loss": 0.79767914, + "memory(GiB)": 147.13, + "step": 68150, + "train_speed(iter/s)": 0.200746 + }, + { + "acc": 0.78238726, + "epoch": 1.5902244127210954, + "grad_norm": 8.875, + "learning_rate": 1.0608897823874565e-06, + "loss": 0.76666489, + "memory(GiB)": 147.13, + "step": 68160, + "train_speed(iter/s)": 0.200759 + }, + { + "acc": 0.78485689, + "epoch": 1.5904577202933843, + "grad_norm": 5.875, + "learning_rate": 1.0597265541626428e-06, + "loss": 0.77945495, + "memory(GiB)": 147.13, + "step": 68170, + "train_speed(iter/s)": 0.200774 + }, + { + "acc": 0.789116, + "epoch": 1.5906910278656732, + "grad_norm": 4.625, + "learning_rate": 1.0585638884107174e-06, + "loss": 0.76332469, + "memory(GiB)": 147.13, + "step": 68180, + "train_speed(iter/s)": 0.200789 + }, + { + "acc": 0.77295685, + "epoch": 1.590924335437962, + "grad_norm": 5.96875, + "learning_rate": 1.0574017852976538e-06, + "loss": 0.80912657, + "memory(GiB)": 147.13, + "step": 68190, + "train_speed(iter/s)": 0.200803 + }, + { + "acc": 0.78879251, + "epoch": 1.591157643010251, + "grad_norm": 6.28125, + "learning_rate": 1.0562402449893394e-06, + "loss": 0.75802283, + "memory(GiB)": 147.13, + "step": 68200, + "train_speed(iter/s)": 0.20082 + }, + { + "acc": 0.77317877, + "epoch": 1.5913909505825399, + "grad_norm": 6.1875, + "learning_rate": 1.0550792676515836e-06, + "loss": 0.81081858, + "memory(GiB)": 147.13, + "step": 68210, + "train_speed(iter/s)": 0.200836 + }, + { + "acc": 0.79010024, + "epoch": 1.5916242581548288, + "grad_norm": 7.875, + "learning_rate": 1.0539188534501176e-06, + "loss": 0.75586977, + "memory(GiB)": 147.13, + "step": 68220, + "train_speed(iter/s)": 0.200851 + }, + { + "acc": 0.80472164, + "epoch": 1.5918575657271177, + "grad_norm": 7.15625, + "learning_rate": 1.0527590025505873e-06, + "loss": 0.71593409, + "memory(GiB)": 147.13, + "step": 68230, + "train_speed(iter/s)": 0.200866 + }, + { + "acc": 0.78382373, + "epoch": 1.5920908732994066, + "grad_norm": 5.28125, + "learning_rate": 1.051599715118566e-06, + "loss": 0.78045077, + "memory(GiB)": 147.13, + "step": 68240, + "train_speed(iter/s)": 0.200881 + }, + { + "acc": 0.79803543, + "epoch": 1.5923241808716955, + "grad_norm": 6.09375, + "learning_rate": 1.0504409913195346e-06, + "loss": 0.71405649, + "memory(GiB)": 147.13, + "step": 68250, + "train_speed(iter/s)": 0.200896 + }, + { + "acc": 0.79116182, + "epoch": 1.5925574884439841, + "grad_norm": 5.09375, + "learning_rate": 1.0492828313189064e-06, + "loss": 0.73879242, + "memory(GiB)": 147.13, + "step": 68260, + "train_speed(iter/s)": 0.200911 + }, + { + "acc": 0.76173649, + "epoch": 1.5927907960162733, + "grad_norm": 7.75, + "learning_rate": 1.0481252352820064e-06, + "loss": 0.85937166, + "memory(GiB)": 147.13, + "step": 68270, + "train_speed(iter/s)": 0.200926 + }, + { + "acc": 0.78311167, + "epoch": 1.593024103588562, + "grad_norm": 4.4375, + "learning_rate": 1.046968203374079e-06, + "loss": 0.78482008, + "memory(GiB)": 147.13, + "step": 68280, + "train_speed(iter/s)": 0.20094 + }, + { + "acc": 0.75329967, + "epoch": 1.593257411160851, + "grad_norm": 7.40625, + "learning_rate": 1.0458117357602944e-06, + "loss": 0.89326878, + "memory(GiB)": 147.13, + "step": 68290, + "train_speed(iter/s)": 0.200956 + }, + { + "acc": 0.77799082, + "epoch": 1.5934907187331397, + "grad_norm": 4.40625, + "learning_rate": 1.0446558326057342e-06, + "loss": 0.79334536, + "memory(GiB)": 147.13, + "step": 68300, + "train_speed(iter/s)": 0.200972 + }, + { + "acc": 0.75523701, + "epoch": 1.5937240263054289, + "grad_norm": 6.28125, + "learning_rate": 1.0435004940754062e-06, + "loss": 0.90748978, + "memory(GiB)": 147.13, + "step": 68310, + "train_speed(iter/s)": 0.200987 + }, + { + "acc": 0.80369549, + "epoch": 1.5939573338777175, + "grad_norm": 6.0, + "learning_rate": 1.0423457203342318e-06, + "loss": 0.68501949, + "memory(GiB)": 147.13, + "step": 68320, + "train_speed(iter/s)": 0.201001 + }, + { + "acc": 0.78252659, + "epoch": 1.5941906414500067, + "grad_norm": 5.8125, + "learning_rate": 1.0411915115470578e-06, + "loss": 0.772118, + "memory(GiB)": 147.13, + "step": 68330, + "train_speed(iter/s)": 0.201017 + }, + { + "acc": 0.78786469, + "epoch": 1.5944239490222953, + "grad_norm": 6.15625, + "learning_rate": 1.0400378678786449e-06, + "loss": 0.75693922, + "memory(GiB)": 147.13, + "step": 68340, + "train_speed(iter/s)": 0.201033 + }, + { + "acc": 0.77552619, + "epoch": 1.5946572565945845, + "grad_norm": 4.3125, + "learning_rate": 1.0388847894936765e-06, + "loss": 0.80412102, + "memory(GiB)": 147.13, + "step": 68350, + "train_speed(iter/s)": 0.201048 + }, + { + "acc": 0.7692029, + "epoch": 1.5948905641668731, + "grad_norm": 7.34375, + "learning_rate": 1.0377322765567533e-06, + "loss": 0.85158634, + "memory(GiB)": 147.13, + "step": 68360, + "train_speed(iter/s)": 0.201064 + }, + { + "acc": 0.79861155, + "epoch": 1.5951238717391623, + "grad_norm": 5.21875, + "learning_rate": 1.0365803292323956e-06, + "loss": 0.68969359, + "memory(GiB)": 147.13, + "step": 68370, + "train_speed(iter/s)": 0.201079 + }, + { + "acc": 0.80761919, + "epoch": 1.595357179311451, + "grad_norm": 4.5, + "learning_rate": 1.0354289476850459e-06, + "loss": 0.67954183, + "memory(GiB)": 147.13, + "step": 68380, + "train_speed(iter/s)": 0.201094 + }, + { + "acc": 0.79081321, + "epoch": 1.59559048688374, + "grad_norm": 9.625, + "learning_rate": 1.0342781320790606e-06, + "loss": 0.74034171, + "memory(GiB)": 147.13, + "step": 68390, + "train_speed(iter/s)": 0.201108 + }, + { + "acc": 0.79297962, + "epoch": 1.5958237944560287, + "grad_norm": 4.03125, + "learning_rate": 1.0331278825787211e-06, + "loss": 0.75901136, + "memory(GiB)": 147.13, + "step": 68400, + "train_speed(iter/s)": 0.201124 + }, + { + "acc": 0.78131299, + "epoch": 1.5960571020283179, + "grad_norm": 6.625, + "learning_rate": 1.0319781993482242e-06, + "loss": 0.77496767, + "memory(GiB)": 147.13, + "step": 68410, + "train_speed(iter/s)": 0.20114 + }, + { + "acc": 0.77413187, + "epoch": 1.5962904096006065, + "grad_norm": 7.40625, + "learning_rate": 1.0308290825516852e-06, + "loss": 0.80257893, + "memory(GiB)": 147.13, + "step": 68420, + "train_speed(iter/s)": 0.201154 + }, + { + "acc": 0.78656511, + "epoch": 1.5965237171728957, + "grad_norm": 5.96875, + "learning_rate": 1.0296805323531435e-06, + "loss": 0.79238644, + "memory(GiB)": 147.13, + "step": 68430, + "train_speed(iter/s)": 0.20117 + }, + { + "acc": 0.7843667, + "epoch": 1.5967570247451843, + "grad_norm": 5.9375, + "learning_rate": 1.0285325489165503e-06, + "loss": 0.78407121, + "memory(GiB)": 147.13, + "step": 68440, + "train_speed(iter/s)": 0.201187 + }, + { + "acc": 0.81157207, + "epoch": 1.5969903323174732, + "grad_norm": 5.90625, + "learning_rate": 1.0273851324057838e-06, + "loss": 0.67407074, + "memory(GiB)": 147.13, + "step": 68450, + "train_speed(iter/s)": 0.201202 + }, + { + "acc": 0.79256735, + "epoch": 1.5972236398897621, + "grad_norm": 6.03125, + "learning_rate": 1.026238282984634e-06, + "loss": 0.74679794, + "memory(GiB)": 147.13, + "step": 68460, + "train_speed(iter/s)": 0.201217 + }, + { + "acc": 0.79176388, + "epoch": 1.597456947462051, + "grad_norm": 9.5625, + "learning_rate": 1.025092000816818e-06, + "loss": 0.74075713, + "memory(GiB)": 147.13, + "step": 68470, + "train_speed(iter/s)": 0.201231 + }, + { + "acc": 0.78975224, + "epoch": 1.59769025503434, + "grad_norm": 5.5625, + "learning_rate": 1.023946286065961e-06, + "loss": 0.74442635, + "memory(GiB)": 147.13, + "step": 68480, + "train_speed(iter/s)": 0.201247 + }, + { + "acc": 0.77964411, + "epoch": 1.5979235626066288, + "grad_norm": 4.78125, + "learning_rate": 1.0228011388956182e-06, + "loss": 0.7978488, + "memory(GiB)": 147.13, + "step": 68490, + "train_speed(iter/s)": 0.201262 + }, + { + "acc": 0.78166089, + "epoch": 1.5981568701789177, + "grad_norm": 5.09375, + "learning_rate": 1.0216565594692573e-06, + "loss": 0.78926048, + "memory(GiB)": 147.13, + "step": 68500, + "train_speed(iter/s)": 0.201277 + }, + { + "epoch": 1.5981568701789177, + "eval_acc": 0.7447232075096114, + "eval_loss": 0.8043951988220215, + "eval_runtime": 1270.2348, + "eval_samples_per_second": 28.334, + "eval_steps_per_second": 14.167, + "step": 68500 + }, + { + "acc": 0.79227681, + "epoch": 1.5983901777512066, + "grad_norm": 6.90625, + "learning_rate": 1.0205125479502658e-06, + "loss": 0.74688244, + "memory(GiB)": 147.13, + "step": 68510, + "train_speed(iter/s)": 0.200531 + }, + { + "acc": 0.78717947, + "epoch": 1.5986234853234955, + "grad_norm": 5.5625, + "learning_rate": 1.0193691045019533e-06, + "loss": 0.7674963, + "memory(GiB)": 147.13, + "step": 68520, + "train_speed(iter/s)": 0.200547 + }, + { + "acc": 0.77596292, + "epoch": 1.5988567928957844, + "grad_norm": 4.5, + "learning_rate": 1.0182262292875427e-06, + "loss": 0.80983753, + "memory(GiB)": 147.13, + "step": 68530, + "train_speed(iter/s)": 0.200563 + }, + { + "acc": 0.77494078, + "epoch": 1.5990901004680733, + "grad_norm": 5.6875, + "learning_rate": 1.0170839224701834e-06, + "loss": 0.81508121, + "memory(GiB)": 147.13, + "step": 68540, + "train_speed(iter/s)": 0.200578 + }, + { + "acc": 0.78155966, + "epoch": 1.5993234080403622, + "grad_norm": 7.4375, + "learning_rate": 1.015942184212937e-06, + "loss": 0.81253033, + "memory(GiB)": 147.13, + "step": 68550, + "train_speed(iter/s)": 0.200593 + }, + { + "acc": 0.79896936, + "epoch": 1.5995567156126511, + "grad_norm": 7.65625, + "learning_rate": 1.0148010146787845e-06, + "loss": 0.70341702, + "memory(GiB)": 148.85, + "step": 68560, + "train_speed(iter/s)": 0.200606 + }, + { + "acc": 0.77949715, + "epoch": 1.59979002318494, + "grad_norm": 3.375, + "learning_rate": 1.0136604140306312e-06, + "loss": 0.7729764, + "memory(GiB)": 138.1, + "step": 68570, + "train_speed(iter/s)": 0.200621 + }, + { + "acc": 0.7776124, + "epoch": 1.600023330757229, + "grad_norm": 5.875, + "learning_rate": 1.0125203824312957e-06, + "loss": 0.78736258, + "memory(GiB)": 138.1, + "step": 68580, + "train_speed(iter/s)": 0.200636 + }, + { + "acc": 0.77759681, + "epoch": 1.6002566383295178, + "grad_norm": 5.875, + "learning_rate": 1.0113809200435176e-06, + "loss": 0.81777782, + "memory(GiB)": 138.1, + "step": 68590, + "train_speed(iter/s)": 0.200651 + }, + { + "acc": 0.78296614, + "epoch": 1.6004899459018067, + "grad_norm": 5.5, + "learning_rate": 1.010242027029953e-06, + "loss": 0.77656908, + "memory(GiB)": 138.1, + "step": 68600, + "train_speed(iter/s)": 0.200667 + }, + { + "acc": 0.77765617, + "epoch": 1.6007232534740956, + "grad_norm": 4.5, + "learning_rate": 1.009103703553181e-06, + "loss": 0.81307631, + "memory(GiB)": 138.1, + "step": 68610, + "train_speed(iter/s)": 0.200682 + }, + { + "acc": 0.78029776, + "epoch": 1.6009565610463845, + "grad_norm": 5.15625, + "learning_rate": 1.0079659497756943e-06, + "loss": 0.77876968, + "memory(GiB)": 138.1, + "step": 68620, + "train_speed(iter/s)": 0.200697 + }, + { + "acc": 0.77350893, + "epoch": 1.6011898686186734, + "grad_norm": 4.28125, + "learning_rate": 1.0068287658599107e-06, + "loss": 0.81584587, + "memory(GiB)": 138.1, + "step": 68630, + "train_speed(iter/s)": 0.200711 + }, + { + "acc": 0.77707477, + "epoch": 1.6014231761909623, + "grad_norm": 7.65625, + "learning_rate": 1.0056921519681605e-06, + "loss": 0.79458685, + "memory(GiB)": 138.1, + "step": 68640, + "train_speed(iter/s)": 0.200726 + }, + { + "acc": 0.80541954, + "epoch": 1.601656483763251, + "grad_norm": 6.09375, + "learning_rate": 1.0045561082626936e-06, + "loss": 0.68734684, + "memory(GiB)": 138.1, + "step": 68650, + "train_speed(iter/s)": 0.200741 + }, + { + "acc": 0.7977459, + "epoch": 1.60188979133554, + "grad_norm": 4.1875, + "learning_rate": 1.0034206349056829e-06, + "loss": 0.723417, + "memory(GiB)": 138.1, + "step": 68660, + "train_speed(iter/s)": 0.200757 + }, + { + "acc": 0.78180084, + "epoch": 1.6021230989078288, + "grad_norm": 6.5, + "learning_rate": 1.002285732059215e-06, + "loss": 0.79331684, + "memory(GiB)": 138.1, + "step": 68670, + "train_speed(iter/s)": 0.200771 + }, + { + "acc": 0.79122467, + "epoch": 1.602356406480118, + "grad_norm": 5.84375, + "learning_rate": 1.001151399885298e-06, + "loss": 0.74690819, + "memory(GiB)": 138.1, + "step": 68680, + "train_speed(iter/s)": 0.200785 + }, + { + "acc": 0.78143425, + "epoch": 1.6025897140524066, + "grad_norm": 6.125, + "learning_rate": 1.000017638545857e-06, + "loss": 0.77264366, + "memory(GiB)": 138.1, + "step": 68690, + "train_speed(iter/s)": 0.200801 + }, + { + "acc": 0.79719687, + "epoch": 1.6028230216246957, + "grad_norm": 5.90625, + "learning_rate": 9.988844482027365e-07, + "loss": 0.71532197, + "memory(GiB)": 138.1, + "step": 68700, + "train_speed(iter/s)": 0.200816 + }, + { + "acc": 0.77920446, + "epoch": 1.6030563291969844, + "grad_norm": 5.6875, + "learning_rate": 9.97751829017699e-07, + "loss": 0.77345262, + "memory(GiB)": 138.1, + "step": 68710, + "train_speed(iter/s)": 0.200832 + }, + { + "acc": 0.76342015, + "epoch": 1.6032896367692735, + "grad_norm": 6.75, + "learning_rate": 9.966197811524231e-07, + "loss": 0.85860195, + "memory(GiB)": 138.1, + "step": 68720, + "train_speed(iter/s)": 0.200846 + }, + { + "acc": 0.77934628, + "epoch": 1.6035229443415622, + "grad_norm": 6.21875, + "learning_rate": 9.954883047685121e-07, + "loss": 0.79521642, + "memory(GiB)": 138.1, + "step": 68730, + "train_speed(iter/s)": 0.200862 + }, + { + "acc": 0.781073, + "epoch": 1.6037562519138513, + "grad_norm": 5.1875, + "learning_rate": 9.943574000274814e-07, + "loss": 0.78763037, + "memory(GiB)": 138.1, + "step": 68740, + "train_speed(iter/s)": 0.200878 + }, + { + "acc": 0.77881775, + "epoch": 1.60398955948614, + "grad_norm": 5.59375, + "learning_rate": 9.93227067090769e-07, + "loss": 0.81242599, + "memory(GiB)": 138.1, + "step": 68750, + "train_speed(iter/s)": 0.200894 + }, + { + "acc": 0.77240114, + "epoch": 1.604222867058429, + "grad_norm": 5.75, + "learning_rate": 9.920973061197291e-07, + "loss": 0.80986681, + "memory(GiB)": 138.1, + "step": 68760, + "train_speed(iter/s)": 0.200908 + }, + { + "acc": 0.77040758, + "epoch": 1.6044561746307178, + "grad_norm": 5.53125, + "learning_rate": 9.90968117275633e-07, + "loss": 0.82993565, + "memory(GiB)": 138.1, + "step": 68770, + "train_speed(iter/s)": 0.200923 + }, + { + "acc": 0.76394424, + "epoch": 1.604689482203007, + "grad_norm": 5.625, + "learning_rate": 9.898395007196747e-07, + "loss": 0.84758568, + "memory(GiB)": 138.1, + "step": 68780, + "train_speed(iter/s)": 0.200937 + }, + { + "acc": 0.78739591, + "epoch": 1.6049227897752956, + "grad_norm": 6.0, + "learning_rate": 9.887114566129613e-07, + "loss": 0.75285759, + "memory(GiB)": 138.1, + "step": 68790, + "train_speed(iter/s)": 0.200951 + }, + { + "acc": 0.7844842, + "epoch": 1.6051560973475847, + "grad_norm": 8.5625, + "learning_rate": 9.875839851165237e-07, + "loss": 0.77806282, + "memory(GiB)": 138.1, + "step": 68800, + "train_speed(iter/s)": 0.200966 + }, + { + "acc": 0.77970557, + "epoch": 1.6053894049198734, + "grad_norm": 7.4375, + "learning_rate": 9.86457086391307e-07, + "loss": 0.8352437, + "memory(GiB)": 138.1, + "step": 68810, + "train_speed(iter/s)": 0.20098 + }, + { + "acc": 0.77098522, + "epoch": 1.6056227124921625, + "grad_norm": 7.4375, + "learning_rate": 9.85330760598175e-07, + "loss": 0.82055664, + "memory(GiB)": 138.1, + "step": 68820, + "train_speed(iter/s)": 0.200995 + }, + { + "acc": 0.77886381, + "epoch": 1.6058560200644512, + "grad_norm": 6.5, + "learning_rate": 9.842050078979088e-07, + "loss": 0.78484612, + "memory(GiB)": 138.1, + "step": 68830, + "train_speed(iter/s)": 0.20101 + }, + { + "acc": 0.79053154, + "epoch": 1.60608932763674, + "grad_norm": 3.765625, + "learning_rate": 9.830798284512132e-07, + "loss": 0.74266052, + "memory(GiB)": 138.1, + "step": 68840, + "train_speed(iter/s)": 0.201025 + }, + { + "acc": 0.79033446, + "epoch": 1.606322635209029, + "grad_norm": 5.15625, + "learning_rate": 9.819552224187046e-07, + "loss": 0.7502409, + "memory(GiB)": 138.1, + "step": 68850, + "train_speed(iter/s)": 0.20104 + }, + { + "acc": 0.78153863, + "epoch": 1.6065559427813179, + "grad_norm": 5.4375, + "learning_rate": 9.808311899609197e-07, + "loss": 0.76911235, + "memory(GiB)": 138.1, + "step": 68860, + "train_speed(iter/s)": 0.201055 + }, + { + "acc": 0.79210978, + "epoch": 1.6067892503536068, + "grad_norm": 5.03125, + "learning_rate": 9.797077312383162e-07, + "loss": 0.74023113, + "memory(GiB)": 138.1, + "step": 68870, + "train_speed(iter/s)": 0.20107 + }, + { + "acc": 0.79266319, + "epoch": 1.6070225579258957, + "grad_norm": 6.46875, + "learning_rate": 9.785848464112647e-07, + "loss": 0.76112881, + "memory(GiB)": 138.1, + "step": 68880, + "train_speed(iter/s)": 0.201084 + }, + { + "acc": 0.79358058, + "epoch": 1.6072558654981846, + "grad_norm": 3.65625, + "learning_rate": 9.774625356400597e-07, + "loss": 0.7407238, + "memory(GiB)": 138.1, + "step": 68890, + "train_speed(iter/s)": 0.2011 + }, + { + "acc": 0.78830051, + "epoch": 1.6074891730704735, + "grad_norm": 4.5625, + "learning_rate": 9.763407990849089e-07, + "loss": 0.75743866, + "memory(GiB)": 138.1, + "step": 68900, + "train_speed(iter/s)": 0.201115 + }, + { + "acc": 0.76708155, + "epoch": 1.6077224806427624, + "grad_norm": 5.34375, + "learning_rate": 9.75219636905939e-07, + "loss": 0.81055851, + "memory(GiB)": 138.1, + "step": 68910, + "train_speed(iter/s)": 0.20113 + }, + { + "acc": 0.78732371, + "epoch": 1.6079557882150513, + "grad_norm": 7.53125, + "learning_rate": 9.74099049263198e-07, + "loss": 0.76581669, + "memory(GiB)": 138.1, + "step": 68920, + "train_speed(iter/s)": 0.201144 + }, + { + "acc": 0.77004409, + "epoch": 1.6081890957873402, + "grad_norm": 4.6875, + "learning_rate": 9.729790363166487e-07, + "loss": 0.82656031, + "memory(GiB)": 138.1, + "step": 68930, + "train_speed(iter/s)": 0.201159 + }, + { + "acc": 0.79495058, + "epoch": 1.608422403359629, + "grad_norm": 4.1875, + "learning_rate": 9.718595982261713e-07, + "loss": 0.74895706, + "memory(GiB)": 138.1, + "step": 68940, + "train_speed(iter/s)": 0.201172 + }, + { + "acc": 0.78100176, + "epoch": 1.608655710931918, + "grad_norm": 6.09375, + "learning_rate": 9.707407351515653e-07, + "loss": 0.78046789, + "memory(GiB)": 138.1, + "step": 68950, + "train_speed(iter/s)": 0.201188 + }, + { + "acc": 0.78246336, + "epoch": 1.6088890185042068, + "grad_norm": 5.28125, + "learning_rate": 9.696224472525494e-07, + "loss": 0.78533907, + "memory(GiB)": 138.1, + "step": 68960, + "train_speed(iter/s)": 0.201204 + }, + { + "acc": 0.78976364, + "epoch": 1.6091223260764957, + "grad_norm": 7.0625, + "learning_rate": 9.685047346887578e-07, + "loss": 0.75304751, + "memory(GiB)": 138.1, + "step": 68970, + "train_speed(iter/s)": 0.201219 + }, + { + "acc": 0.78010139, + "epoch": 1.6093556336487846, + "grad_norm": 5.3125, + "learning_rate": 9.673875976197455e-07, + "loss": 0.79901323, + "memory(GiB)": 138.1, + "step": 68980, + "train_speed(iter/s)": 0.201234 + }, + { + "acc": 0.7747858, + "epoch": 1.6095889412210735, + "grad_norm": 5.03125, + "learning_rate": 9.662710362049815e-07, + "loss": 0.82582674, + "memory(GiB)": 138.1, + "step": 68990, + "train_speed(iter/s)": 0.201248 + }, + { + "acc": 0.78229184, + "epoch": 1.6098222487933624, + "grad_norm": 6.09375, + "learning_rate": 9.651550506038543e-07, + "loss": 0.78168783, + "memory(GiB)": 138.1, + "step": 69000, + "train_speed(iter/s)": 0.201264 + }, + { + "epoch": 1.6098222487933624, + "eval_acc": 0.7446647548648214, + "eval_loss": 0.8044345378875732, + "eval_runtime": 1269.3193, + "eval_samples_per_second": 28.355, + "eval_steps_per_second": 14.178, + "step": 69000 + }, + { + "acc": 0.77631769, + "epoch": 1.6100555563656513, + "grad_norm": 4.40625, + "learning_rate": 9.640396409756731e-07, + "loss": 0.83473434, + "memory(GiB)": 138.1, + "step": 69010, + "train_speed(iter/s)": 0.200524 + }, + { + "acc": 0.79392805, + "epoch": 1.6102888639379402, + "grad_norm": 5.75, + "learning_rate": 9.629248074796593e-07, + "loss": 0.7308218, + "memory(GiB)": 138.1, + "step": 69020, + "train_speed(iter/s)": 0.20054 + }, + { + "acc": 0.78255825, + "epoch": 1.6105221715102291, + "grad_norm": 7.625, + "learning_rate": 9.618105502749575e-07, + "loss": 0.78719893, + "memory(GiB)": 138.1, + "step": 69030, + "train_speed(iter/s)": 0.200555 + }, + { + "acc": 0.80137396, + "epoch": 1.6107554790825178, + "grad_norm": 10.5625, + "learning_rate": 9.606968695206264e-07, + "loss": 0.69361744, + "memory(GiB)": 138.1, + "step": 69040, + "train_speed(iter/s)": 0.200571 + }, + { + "acc": 0.7979476, + "epoch": 1.610988786654807, + "grad_norm": 6.5625, + "learning_rate": 9.59583765375644e-07, + "loss": 0.72113686, + "memory(GiB)": 138.1, + "step": 69050, + "train_speed(iter/s)": 0.200587 + }, + { + "acc": 0.7878149, + "epoch": 1.6112220942270956, + "grad_norm": 8.5625, + "learning_rate": 9.58471237998906e-07, + "loss": 0.77528825, + "memory(GiB)": 138.1, + "step": 69060, + "train_speed(iter/s)": 0.200602 + }, + { + "acc": 0.77063704, + "epoch": 1.6114554017993847, + "grad_norm": 5.71875, + "learning_rate": 9.57359287549222e-07, + "loss": 0.84115477, + "memory(GiB)": 138.1, + "step": 69070, + "train_speed(iter/s)": 0.200617 + }, + { + "acc": 0.77148275, + "epoch": 1.6116887093716734, + "grad_norm": 5.3125, + "learning_rate": 9.562479141853276e-07, + "loss": 0.80791435, + "memory(GiB)": 138.1, + "step": 69080, + "train_speed(iter/s)": 0.200631 + }, + { + "acc": 0.77265015, + "epoch": 1.6119220169439625, + "grad_norm": 5.3125, + "learning_rate": 9.551371180658675e-07, + "loss": 0.79873414, + "memory(GiB)": 138.1, + "step": 69090, + "train_speed(iter/s)": 0.200646 + }, + { + "acc": 0.79981709, + "epoch": 1.6121553245162512, + "grad_norm": 4.8125, + "learning_rate": 9.540268993494095e-07, + "loss": 0.70313296, + "memory(GiB)": 138.1, + "step": 69100, + "train_speed(iter/s)": 0.200661 + }, + { + "acc": 0.77894669, + "epoch": 1.6123886320885403, + "grad_norm": 6.6875, + "learning_rate": 9.529172581944352e-07, + "loss": 0.81366968, + "memory(GiB)": 138.1, + "step": 69110, + "train_speed(iter/s)": 0.200676 + }, + { + "acc": 0.77654457, + "epoch": 1.612621939660829, + "grad_norm": 7.15625, + "learning_rate": 9.518081947593477e-07, + "loss": 0.81639051, + "memory(GiB)": 138.1, + "step": 69120, + "train_speed(iter/s)": 0.200692 + }, + { + "acc": 0.76891427, + "epoch": 1.6128552472331181, + "grad_norm": 4.8125, + "learning_rate": 9.50699709202465e-07, + "loss": 0.85439548, + "memory(GiB)": 138.1, + "step": 69130, + "train_speed(iter/s)": 0.200706 + }, + { + "acc": 0.78814936, + "epoch": 1.6130885548054068, + "grad_norm": 5.71875, + "learning_rate": 9.495918016820204e-07, + "loss": 0.7638484, + "memory(GiB)": 138.1, + "step": 69140, + "train_speed(iter/s)": 0.200722 + }, + { + "acc": 0.79379234, + "epoch": 1.613321862377696, + "grad_norm": 5.1875, + "learning_rate": 9.484844723561726e-07, + "loss": 0.7099072, + "memory(GiB)": 138.1, + "step": 69150, + "train_speed(iter/s)": 0.200737 + }, + { + "acc": 0.77234783, + "epoch": 1.6135551699499846, + "grad_norm": 6.90625, + "learning_rate": 9.473777213829866e-07, + "loss": 0.81089096, + "memory(GiB)": 138.1, + "step": 69160, + "train_speed(iter/s)": 0.200752 + }, + { + "acc": 0.77259874, + "epoch": 1.6137884775222737, + "grad_norm": 4.90625, + "learning_rate": 9.462715489204549e-07, + "loss": 0.81169949, + "memory(GiB)": 138.1, + "step": 69170, + "train_speed(iter/s)": 0.200768 + }, + { + "acc": 0.78828964, + "epoch": 1.6140217850945624, + "grad_norm": 7.78125, + "learning_rate": 9.451659551264808e-07, + "loss": 0.75277605, + "memory(GiB)": 138.1, + "step": 69180, + "train_speed(iter/s)": 0.200783 + }, + { + "acc": 0.79725246, + "epoch": 1.6142550926668515, + "grad_norm": 6.59375, + "learning_rate": 9.440609401588901e-07, + "loss": 0.72743855, + "memory(GiB)": 138.1, + "step": 69190, + "train_speed(iter/s)": 0.200799 + }, + { + "acc": 0.79967861, + "epoch": 1.6144884002391402, + "grad_norm": 4.21875, + "learning_rate": 9.429565041754218e-07, + "loss": 0.70557318, + "memory(GiB)": 138.1, + "step": 69200, + "train_speed(iter/s)": 0.200813 + }, + { + "acc": 0.77489271, + "epoch": 1.6147217078114293, + "grad_norm": 3.90625, + "learning_rate": 9.418526473337325e-07, + "loss": 0.81767092, + "memory(GiB)": 138.1, + "step": 69210, + "train_speed(iter/s)": 0.200828 + }, + { + "acc": 0.77260404, + "epoch": 1.614955015383718, + "grad_norm": 4.9375, + "learning_rate": 9.407493697913999e-07, + "loss": 0.82012691, + "memory(GiB)": 138.1, + "step": 69220, + "train_speed(iter/s)": 0.200844 + }, + { + "acc": 0.79442325, + "epoch": 1.615188322956007, + "grad_norm": 8.5, + "learning_rate": 9.396466717059149e-07, + "loss": 0.72898545, + "memory(GiB)": 138.1, + "step": 69230, + "train_speed(iter/s)": 0.200859 + }, + { + "acc": 0.78504844, + "epoch": 1.6154216305282958, + "grad_norm": 5.4375, + "learning_rate": 9.385445532346887e-07, + "loss": 0.76235442, + "memory(GiB)": 138.1, + "step": 69240, + "train_speed(iter/s)": 0.200874 + }, + { + "acc": 0.79455013, + "epoch": 1.6156549381005847, + "grad_norm": 4.40625, + "learning_rate": 9.374430145350466e-07, + "loss": 0.74329395, + "memory(GiB)": 138.1, + "step": 69250, + "train_speed(iter/s)": 0.200888 + }, + { + "acc": 0.7682219, + "epoch": 1.6158882456728736, + "grad_norm": 5.46875, + "learning_rate": 9.363420557642355e-07, + "loss": 0.84918575, + "memory(GiB)": 138.1, + "step": 69260, + "train_speed(iter/s)": 0.200903 + }, + { + "acc": 0.78665934, + "epoch": 1.6161215532451625, + "grad_norm": 5.25, + "learning_rate": 9.352416770794154e-07, + "loss": 0.76795254, + "memory(GiB)": 138.1, + "step": 69270, + "train_speed(iter/s)": 0.200917 + }, + { + "acc": 0.7946744, + "epoch": 1.6163548608174514, + "grad_norm": 4.71875, + "learning_rate": 9.341418786376649e-07, + "loss": 0.71400213, + "memory(GiB)": 138.1, + "step": 69280, + "train_speed(iter/s)": 0.200932 + }, + { + "acc": 0.77530904, + "epoch": 1.6165881683897403, + "grad_norm": 4.625, + "learning_rate": 9.330426605959803e-07, + "loss": 0.82718925, + "memory(GiB)": 138.1, + "step": 69290, + "train_speed(iter/s)": 0.200947 + }, + { + "acc": 0.7725615, + "epoch": 1.6168214759620292, + "grad_norm": 5.78125, + "learning_rate": 9.319440231112725e-07, + "loss": 0.81718159, + "memory(GiB)": 138.1, + "step": 69300, + "train_speed(iter/s)": 0.200963 + }, + { + "acc": 0.79436731, + "epoch": 1.617054783534318, + "grad_norm": 5.15625, + "learning_rate": 9.308459663403757e-07, + "loss": 0.75845556, + "memory(GiB)": 138.1, + "step": 69310, + "train_speed(iter/s)": 0.200977 + }, + { + "acc": 0.78687463, + "epoch": 1.617288091106607, + "grad_norm": 4.71875, + "learning_rate": 9.297484904400333e-07, + "loss": 0.7621346, + "memory(GiB)": 138.1, + "step": 69320, + "train_speed(iter/s)": 0.200992 + }, + { + "acc": 0.78693705, + "epoch": 1.6175213986788959, + "grad_norm": 4.90625, + "learning_rate": 9.286515955669134e-07, + "loss": 0.74903417, + "memory(GiB)": 138.1, + "step": 69330, + "train_speed(iter/s)": 0.201007 + }, + { + "acc": 0.78143864, + "epoch": 1.6177547062511848, + "grad_norm": 5.53125, + "learning_rate": 9.275552818775945e-07, + "loss": 0.79808311, + "memory(GiB)": 138.1, + "step": 69340, + "train_speed(iter/s)": 0.201023 + }, + { + "acc": 0.7710865, + "epoch": 1.6179880138234737, + "grad_norm": 6.0625, + "learning_rate": 9.264595495285755e-07, + "loss": 0.83482437, + "memory(GiB)": 138.1, + "step": 69350, + "train_speed(iter/s)": 0.201037 + }, + { + "acc": 0.76026926, + "epoch": 1.6182213213957626, + "grad_norm": 8.1875, + "learning_rate": 9.25364398676274e-07, + "loss": 0.85732269, + "memory(GiB)": 138.1, + "step": 69360, + "train_speed(iter/s)": 0.201052 + }, + { + "acc": 0.78260975, + "epoch": 1.6184546289680515, + "grad_norm": 7.125, + "learning_rate": 9.242698294770191e-07, + "loss": 0.77640896, + "memory(GiB)": 138.1, + "step": 69370, + "train_speed(iter/s)": 0.201067 + }, + { + "acc": 0.79006805, + "epoch": 1.6186879365403404, + "grad_norm": 3.921875, + "learning_rate": 9.231758420870645e-07, + "loss": 0.75500803, + "memory(GiB)": 138.1, + "step": 69380, + "train_speed(iter/s)": 0.201082 + }, + { + "acc": 0.7738595, + "epoch": 1.6189212441126293, + "grad_norm": 4.4375, + "learning_rate": 9.220824366625719e-07, + "loss": 0.8050436, + "memory(GiB)": 138.1, + "step": 69390, + "train_speed(iter/s)": 0.201097 + }, + { + "acc": 0.77326818, + "epoch": 1.6191545516849182, + "grad_norm": 4.84375, + "learning_rate": 9.20989613359628e-07, + "loss": 0.83086958, + "memory(GiB)": 138.1, + "step": 69400, + "train_speed(iter/s)": 0.201111 + }, + { + "acc": 0.77323809, + "epoch": 1.6193878592572069, + "grad_norm": 5.90625, + "learning_rate": 9.198973723342303e-07, + "loss": 0.81775436, + "memory(GiB)": 138.1, + "step": 69410, + "train_speed(iter/s)": 0.201125 + }, + { + "acc": 0.76896219, + "epoch": 1.619621166829496, + "grad_norm": 4.84375, + "learning_rate": 9.18805713742299e-07, + "loss": 0.82727232, + "memory(GiB)": 138.1, + "step": 69420, + "train_speed(iter/s)": 0.20114 + }, + { + "acc": 0.77858291, + "epoch": 1.6198544744017846, + "grad_norm": 5.5, + "learning_rate": 9.177146377396662e-07, + "loss": 0.80608435, + "memory(GiB)": 138.1, + "step": 69430, + "train_speed(iter/s)": 0.201156 + }, + { + "acc": 0.78011804, + "epoch": 1.6200877819740738, + "grad_norm": 6.125, + "learning_rate": 9.166241444820817e-07, + "loss": 0.76758566, + "memory(GiB)": 138.1, + "step": 69440, + "train_speed(iter/s)": 0.201171 + }, + { + "acc": 0.78035154, + "epoch": 1.6203210895463624, + "grad_norm": 10.625, + "learning_rate": 9.15534234125216e-07, + "loss": 0.78598452, + "memory(GiB)": 138.1, + "step": 69450, + "train_speed(iter/s)": 0.201186 + }, + { + "acc": 0.80147018, + "epoch": 1.6205543971186516, + "grad_norm": 5.25, + "learning_rate": 9.144449068246502e-07, + "loss": 0.70722756, + "memory(GiB)": 138.1, + "step": 69460, + "train_speed(iter/s)": 0.201202 + }, + { + "acc": 0.75699992, + "epoch": 1.6207877046909402, + "grad_norm": 4.59375, + "learning_rate": 9.133561627358884e-07, + "loss": 0.86237135, + "memory(GiB)": 138.1, + "step": 69470, + "train_speed(iter/s)": 0.201218 + }, + { + "acc": 0.79911742, + "epoch": 1.6210210122632294, + "grad_norm": 8.875, + "learning_rate": 9.122680020143476e-07, + "loss": 0.70699673, + "memory(GiB)": 138.1, + "step": 69480, + "train_speed(iter/s)": 0.201231 + }, + { + "acc": 0.80146561, + "epoch": 1.621254319835518, + "grad_norm": 4.90625, + "learning_rate": 9.111804248153605e-07, + "loss": 0.70860691, + "memory(GiB)": 138.1, + "step": 69490, + "train_speed(iter/s)": 0.201246 + }, + { + "acc": 0.78607807, + "epoch": 1.6214876274078072, + "grad_norm": 5.40625, + "learning_rate": 9.100934312941822e-07, + "loss": 0.76731787, + "memory(GiB)": 138.1, + "step": 69500, + "train_speed(iter/s)": 0.20126 + }, + { + "epoch": 1.6214876274078072, + "eval_acc": 0.744685733759253, + "eval_loss": 0.8044191002845764, + "eval_runtime": 1269.1743, + "eval_samples_per_second": 28.358, + "eval_steps_per_second": 14.179, + "step": 69500 + }, + { + "acc": 0.77421312, + "epoch": 1.6217209349800958, + "grad_norm": 6.46875, + "learning_rate": 9.09007021605976e-07, + "loss": 0.79150124, + "memory(GiB)": 138.1, + "step": 69510, + "train_speed(iter/s)": 0.200524 + }, + { + "acc": 0.76822271, + "epoch": 1.621954242552385, + "grad_norm": 10.875, + "learning_rate": 9.079211959058304e-07, + "loss": 0.8320261, + "memory(GiB)": 138.1, + "step": 69520, + "train_speed(iter/s)": 0.200539 + }, + { + "acc": 0.79691601, + "epoch": 1.6221875501246736, + "grad_norm": 8.0, + "learning_rate": 9.068359543487442e-07, + "loss": 0.73356771, + "memory(GiB)": 138.1, + "step": 69530, + "train_speed(iter/s)": 0.200553 + }, + { + "acc": 0.76938868, + "epoch": 1.6224208576969628, + "grad_norm": 4.40625, + "learning_rate": 9.057512970896376e-07, + "loss": 0.81400595, + "memory(GiB)": 138.1, + "step": 69540, + "train_speed(iter/s)": 0.200568 + }, + { + "acc": 0.77159843, + "epoch": 1.6226541652692514, + "grad_norm": 6.1875, + "learning_rate": 9.046672242833427e-07, + "loss": 0.82162743, + "memory(GiB)": 138.1, + "step": 69550, + "train_speed(iter/s)": 0.200583 + }, + { + "acc": 0.79073334, + "epoch": 1.6228874728415406, + "grad_norm": 5.71875, + "learning_rate": 9.035837360846134e-07, + "loss": 0.75625534, + "memory(GiB)": 138.1, + "step": 69560, + "train_speed(iter/s)": 0.200599 + }, + { + "acc": 0.77172422, + "epoch": 1.6231207804138292, + "grad_norm": 6.71875, + "learning_rate": 9.02500832648115e-07, + "loss": 0.82089863, + "memory(GiB)": 138.1, + "step": 69570, + "train_speed(iter/s)": 0.200614 + }, + { + "acc": 0.75787716, + "epoch": 1.6233540879861184, + "grad_norm": 5.8125, + "learning_rate": 9.014185141284315e-07, + "loss": 0.87999249, + "memory(GiB)": 138.1, + "step": 69580, + "train_speed(iter/s)": 0.200629 + }, + { + "acc": 0.78990684, + "epoch": 1.623587395558407, + "grad_norm": 3.5, + "learning_rate": 9.003367806800661e-07, + "loss": 0.76870031, + "memory(GiB)": 138.1, + "step": 69590, + "train_speed(iter/s)": 0.200644 + }, + { + "acc": 0.77759619, + "epoch": 1.623820703130696, + "grad_norm": 5.0, + "learning_rate": 8.992556324574325e-07, + "loss": 0.77327042, + "memory(GiB)": 138.1, + "step": 69600, + "train_speed(iter/s)": 0.200658 + }, + { + "acc": 0.77967892, + "epoch": 1.6240540107029848, + "grad_norm": 5.28125, + "learning_rate": 8.981750696148689e-07, + "loss": 0.78824787, + "memory(GiB)": 138.1, + "step": 69610, + "train_speed(iter/s)": 0.200674 + }, + { + "acc": 0.77583332, + "epoch": 1.6242873182752737, + "grad_norm": 5.21875, + "learning_rate": 8.970950923066201e-07, + "loss": 0.81284409, + "memory(GiB)": 138.1, + "step": 69620, + "train_speed(iter/s)": 0.200689 + }, + { + "acc": 0.79084225, + "epoch": 1.6245206258475626, + "grad_norm": 6.9375, + "learning_rate": 8.960157006868564e-07, + "loss": 0.76385317, + "memory(GiB)": 138.1, + "step": 69630, + "train_speed(iter/s)": 0.200703 + }, + { + "acc": 0.77975607, + "epoch": 1.6247539334198515, + "grad_norm": 5.0, + "learning_rate": 8.949368949096588e-07, + "loss": 0.79590893, + "memory(GiB)": 138.1, + "step": 69640, + "train_speed(iter/s)": 0.200719 + }, + { + "acc": 0.76001158, + "epoch": 1.6249872409921404, + "grad_norm": 6.125, + "learning_rate": 8.938586751290257e-07, + "loss": 0.85949221, + "memory(GiB)": 138.1, + "step": 69650, + "train_speed(iter/s)": 0.200734 + }, + { + "acc": 0.78397055, + "epoch": 1.6252205485644293, + "grad_norm": 8.75, + "learning_rate": 8.927810414988752e-07, + "loss": 0.78344545, + "memory(GiB)": 138.1, + "step": 69660, + "train_speed(iter/s)": 0.200749 + }, + { + "acc": 0.7823205, + "epoch": 1.6254538561367182, + "grad_norm": 5.8125, + "learning_rate": 8.917039941730365e-07, + "loss": 0.77132821, + "memory(GiB)": 138.1, + "step": 69670, + "train_speed(iter/s)": 0.200763 + }, + { + "acc": 0.77989521, + "epoch": 1.6256871637090071, + "grad_norm": 5.21875, + "learning_rate": 8.906275333052605e-07, + "loss": 0.76878123, + "memory(GiB)": 138.1, + "step": 69680, + "train_speed(iter/s)": 0.200778 + }, + { + "acc": 0.78908534, + "epoch": 1.625920471281296, + "grad_norm": 4.59375, + "learning_rate": 8.895516590492104e-07, + "loss": 0.74415193, + "memory(GiB)": 138.1, + "step": 69690, + "train_speed(iter/s)": 0.200793 + }, + { + "acc": 0.80243616, + "epoch": 1.626153778853585, + "grad_norm": 4.84375, + "learning_rate": 8.88476371558466e-07, + "loss": 0.70059543, + "memory(GiB)": 138.1, + "step": 69700, + "train_speed(iter/s)": 0.200808 + }, + { + "acc": 0.76366444, + "epoch": 1.6263870864258738, + "grad_norm": 4.34375, + "learning_rate": 8.874016709865257e-07, + "loss": 0.83497429, + "memory(GiB)": 138.1, + "step": 69710, + "train_speed(iter/s)": 0.200822 + }, + { + "acc": 0.79546161, + "epoch": 1.6266203939981627, + "grad_norm": 5.59375, + "learning_rate": 8.863275574868014e-07, + "loss": 0.73149471, + "memory(GiB)": 138.1, + "step": 69720, + "train_speed(iter/s)": 0.200836 + }, + { + "acc": 0.77597876, + "epoch": 1.6268537015704516, + "grad_norm": 3.890625, + "learning_rate": 8.852540312126256e-07, + "loss": 0.79382687, + "memory(GiB)": 138.1, + "step": 69730, + "train_speed(iter/s)": 0.200852 + }, + { + "acc": 0.77241516, + "epoch": 1.6270870091427405, + "grad_norm": 6.09375, + "learning_rate": 8.841810923172389e-07, + "loss": 0.82993584, + "memory(GiB)": 138.1, + "step": 69740, + "train_speed(iter/s)": 0.200867 + }, + { + "acc": 0.77992134, + "epoch": 1.6273203167150294, + "grad_norm": 5.125, + "learning_rate": 8.83108740953807e-07, + "loss": 0.77837553, + "memory(GiB)": 138.1, + "step": 69750, + "train_speed(iter/s)": 0.200882 + }, + { + "acc": 0.79285231, + "epoch": 1.6275536242873183, + "grad_norm": 4.8125, + "learning_rate": 8.82036977275405e-07, + "loss": 0.74223328, + "memory(GiB)": 138.1, + "step": 69760, + "train_speed(iter/s)": 0.200897 + }, + { + "acc": 0.76947985, + "epoch": 1.6277869318596072, + "grad_norm": 4.75, + "learning_rate": 8.809658014350297e-07, + "loss": 0.82958355, + "memory(GiB)": 138.1, + "step": 69770, + "train_speed(iter/s)": 0.200913 + }, + { + "acc": 0.78920531, + "epoch": 1.628020239431896, + "grad_norm": 4.15625, + "learning_rate": 8.798952135855893e-07, + "loss": 0.76306615, + "memory(GiB)": 138.1, + "step": 69780, + "train_speed(iter/s)": 0.200928 + }, + { + "acc": 0.77562838, + "epoch": 1.628253547004185, + "grad_norm": 8.25, + "learning_rate": 8.788252138799092e-07, + "loss": 0.80673237, + "memory(GiB)": 138.1, + "step": 69790, + "train_speed(iter/s)": 0.200942 + }, + { + "acc": 0.77968626, + "epoch": 1.6284868545764737, + "grad_norm": 5.1875, + "learning_rate": 8.777558024707339e-07, + "loss": 0.78458438, + "memory(GiB)": 138.1, + "step": 69800, + "train_speed(iter/s)": 0.200957 + }, + { + "acc": 0.78913994, + "epoch": 1.6287201621487628, + "grad_norm": 4.375, + "learning_rate": 8.766869795107191e-07, + "loss": 0.75235834, + "memory(GiB)": 138.1, + "step": 69810, + "train_speed(iter/s)": 0.200971 + }, + { + "acc": 0.78043246, + "epoch": 1.6289534697210515, + "grad_norm": 6.1875, + "learning_rate": 8.756187451524412e-07, + "loss": 0.77849183, + "memory(GiB)": 138.1, + "step": 69820, + "train_speed(iter/s)": 0.200986 + }, + { + "acc": 0.77848892, + "epoch": 1.6291867772933406, + "grad_norm": 4.34375, + "learning_rate": 8.745510995483892e-07, + "loss": 0.80239801, + "memory(GiB)": 138.1, + "step": 69830, + "train_speed(iter/s)": 0.201001 + }, + { + "acc": 0.79218383, + "epoch": 1.6294200848656293, + "grad_norm": 5.28125, + "learning_rate": 8.734840428509694e-07, + "loss": 0.74597349, + "memory(GiB)": 138.1, + "step": 69840, + "train_speed(iter/s)": 0.201017 + }, + { + "acc": 0.77905416, + "epoch": 1.6296533924379184, + "grad_norm": 5.5, + "learning_rate": 8.724175752125042e-07, + "loss": 0.78995914, + "memory(GiB)": 138.1, + "step": 69850, + "train_speed(iter/s)": 0.201032 + }, + { + "acc": 0.78956504, + "epoch": 1.629886700010207, + "grad_norm": 5.84375, + "learning_rate": 8.713516967852292e-07, + "loss": 0.76276588, + "memory(GiB)": 138.1, + "step": 69860, + "train_speed(iter/s)": 0.201048 + }, + { + "acc": 0.77725039, + "epoch": 1.6301200075824962, + "grad_norm": 5.53125, + "learning_rate": 8.702864077213014e-07, + "loss": 0.80270042, + "memory(GiB)": 138.1, + "step": 69870, + "train_speed(iter/s)": 0.201063 + }, + { + "acc": 0.77587008, + "epoch": 1.6303533151547849, + "grad_norm": 4.65625, + "learning_rate": 8.692217081727883e-07, + "loss": 0.81218023, + "memory(GiB)": 138.1, + "step": 69880, + "train_speed(iter/s)": 0.201077 + }, + { + "acc": 0.76779122, + "epoch": 1.630586622727074, + "grad_norm": 5.625, + "learning_rate": 8.681575982916773e-07, + "loss": 0.85469055, + "memory(GiB)": 138.1, + "step": 69890, + "train_speed(iter/s)": 0.201091 + }, + { + "acc": 0.78068151, + "epoch": 1.6308199302993627, + "grad_norm": 5.65625, + "learning_rate": 8.670940782298675e-07, + "loss": 0.79066305, + "memory(GiB)": 138.1, + "step": 69900, + "train_speed(iter/s)": 0.201106 + }, + { + "acc": 0.76384864, + "epoch": 1.6310532378716518, + "grad_norm": 4.25, + "learning_rate": 8.660311481391792e-07, + "loss": 0.85888195, + "memory(GiB)": 138.1, + "step": 69910, + "train_speed(iter/s)": 0.20112 + }, + { + "acc": 0.79069233, + "epoch": 1.6312865454439405, + "grad_norm": 5.34375, + "learning_rate": 8.649688081713431e-07, + "loss": 0.74954491, + "memory(GiB)": 138.1, + "step": 69920, + "train_speed(iter/s)": 0.201135 + }, + { + "acc": 0.78507986, + "epoch": 1.6315198530162296, + "grad_norm": 5.0625, + "learning_rate": 8.639070584780074e-07, + "loss": 0.79639606, + "memory(GiB)": 138.1, + "step": 69930, + "train_speed(iter/s)": 0.20115 + }, + { + "acc": 0.79327669, + "epoch": 1.6317531605885183, + "grad_norm": 5.53125, + "learning_rate": 8.628458992107386e-07, + "loss": 0.74162989, + "memory(GiB)": 138.1, + "step": 69940, + "train_speed(iter/s)": 0.201165 + }, + { + "acc": 0.7880043, + "epoch": 1.6319864681608074, + "grad_norm": 8.3125, + "learning_rate": 8.617853305210161e-07, + "loss": 0.75817766, + "memory(GiB)": 138.1, + "step": 69950, + "train_speed(iter/s)": 0.20118 + }, + { + "acc": 0.77378907, + "epoch": 1.632219775733096, + "grad_norm": 7.6875, + "learning_rate": 8.607253525602355e-07, + "loss": 0.80225611, + "memory(GiB)": 138.1, + "step": 69960, + "train_speed(iter/s)": 0.201195 + }, + { + "acc": 0.77905831, + "epoch": 1.6324530833053852, + "grad_norm": 5.1875, + "learning_rate": 8.596659654797068e-07, + "loss": 0.79952292, + "memory(GiB)": 138.1, + "step": 69970, + "train_speed(iter/s)": 0.201209 + }, + { + "acc": 0.77380133, + "epoch": 1.6326863908776739, + "grad_norm": 4.75, + "learning_rate": 8.586071694306602e-07, + "loss": 0.80798283, + "memory(GiB)": 138.1, + "step": 69980, + "train_speed(iter/s)": 0.201225 + }, + { + "acc": 0.79780202, + "epoch": 1.6329196984499628, + "grad_norm": 4.625, + "learning_rate": 8.575489645642371e-07, + "loss": 0.71656194, + "memory(GiB)": 138.1, + "step": 69990, + "train_speed(iter/s)": 0.201239 + }, + { + "acc": 0.77291632, + "epoch": 1.6331530060222517, + "grad_norm": 11.8125, + "learning_rate": 8.564913510314943e-07, + "loss": 0.80805435, + "memory(GiB)": 138.1, + "step": 70000, + "train_speed(iter/s)": 0.201254 + }, + { + "epoch": 1.6331530060222517, + "eval_acc": 0.744727211115419, + "eval_loss": 0.8044537901878357, + "eval_runtime": 1270.0228, + "eval_samples_per_second": 28.339, + "eval_steps_per_second": 14.17, + "step": 70000 + }, + { + "acc": 0.78201637, + "epoch": 1.6333863135945406, + "grad_norm": 5.75, + "learning_rate": 8.554343289834094e-07, + "loss": 0.80393801, + "memory(GiB)": 138.1, + "step": 70010, + "train_speed(iter/s)": 0.200523 + }, + { + "acc": 0.79293175, + "epoch": 1.6336196211668295, + "grad_norm": 5.0625, + "learning_rate": 8.543778985708683e-07, + "loss": 0.74324322, + "memory(GiB)": 138.1, + "step": 70020, + "train_speed(iter/s)": 0.200537 + }, + { + "acc": 0.78512459, + "epoch": 1.6338529287391184, + "grad_norm": 5.65625, + "learning_rate": 8.533220599446789e-07, + "loss": 0.78935528, + "memory(GiB)": 138.1, + "step": 70030, + "train_speed(iter/s)": 0.200552 + }, + { + "acc": 0.77615108, + "epoch": 1.6340862363114073, + "grad_norm": 6.84375, + "learning_rate": 8.522668132555601e-07, + "loss": 0.82629166, + "memory(GiB)": 138.1, + "step": 70040, + "train_speed(iter/s)": 0.200566 + }, + { + "acc": 0.77868662, + "epoch": 1.6343195438836962, + "grad_norm": 8.9375, + "learning_rate": 8.512121586541499e-07, + "loss": 0.79097457, + "memory(GiB)": 138.1, + "step": 70050, + "train_speed(iter/s)": 0.20058 + }, + { + "acc": 0.79038181, + "epoch": 1.634552851455985, + "grad_norm": 5.4375, + "learning_rate": 8.501580962909989e-07, + "loss": 0.73661346, + "memory(GiB)": 138.1, + "step": 70060, + "train_speed(iter/s)": 0.200594 + }, + { + "acc": 0.7649703, + "epoch": 1.634786159028274, + "grad_norm": 6.75, + "learning_rate": 8.491046263165737e-07, + "loss": 0.85525341, + "memory(GiB)": 138.1, + "step": 70070, + "train_speed(iter/s)": 0.20061 + }, + { + "acc": 0.7755065, + "epoch": 1.6350194666005629, + "grad_norm": 4.71875, + "learning_rate": 8.480517488812578e-07, + "loss": 0.79242172, + "memory(GiB)": 138.1, + "step": 70080, + "train_speed(iter/s)": 0.200625 + }, + { + "acc": 0.77949743, + "epoch": 1.6352527741728518, + "grad_norm": 5.40625, + "learning_rate": 8.469994641353468e-07, + "loss": 0.77577677, + "memory(GiB)": 138.1, + "step": 70090, + "train_speed(iter/s)": 0.200639 + }, + { + "acc": 0.77214479, + "epoch": 1.6354860817451407, + "grad_norm": 4.96875, + "learning_rate": 8.459477722290577e-07, + "loss": 0.82063198, + "memory(GiB)": 138.1, + "step": 70100, + "train_speed(iter/s)": 0.200653 + }, + { + "acc": 0.77816544, + "epoch": 1.6357193893174296, + "grad_norm": 14.4375, + "learning_rate": 8.448966733125152e-07, + "loss": 0.81221743, + "memory(GiB)": 138.1, + "step": 70110, + "train_speed(iter/s)": 0.200668 + }, + { + "acc": 0.7740953, + "epoch": 1.6359526968897184, + "grad_norm": 7.09375, + "learning_rate": 8.438461675357679e-07, + "loss": 0.81897221, + "memory(GiB)": 138.1, + "step": 70120, + "train_speed(iter/s)": 0.200681 + }, + { + "acc": 0.78639469, + "epoch": 1.6361860044620073, + "grad_norm": 5.15625, + "learning_rate": 8.427962550487717e-07, + "loss": 0.7703733, + "memory(GiB)": 138.1, + "step": 70130, + "train_speed(iter/s)": 0.200696 + }, + { + "acc": 0.77399602, + "epoch": 1.6364193120342962, + "grad_norm": 6.71875, + "learning_rate": 8.417469360014019e-07, + "loss": 0.81631184, + "memory(GiB)": 138.1, + "step": 70140, + "train_speed(iter/s)": 0.20071 + }, + { + "acc": 0.78081784, + "epoch": 1.6366526196065851, + "grad_norm": 5.8125, + "learning_rate": 8.406982105434502e-07, + "loss": 0.79938803, + "memory(GiB)": 138.1, + "step": 70150, + "train_speed(iter/s)": 0.200725 + }, + { + "acc": 0.78800888, + "epoch": 1.636885927178874, + "grad_norm": 5.5, + "learning_rate": 8.396500788246192e-07, + "loss": 0.75535965, + "memory(GiB)": 138.1, + "step": 70160, + "train_speed(iter/s)": 0.200739 + }, + { + "acc": 0.80056391, + "epoch": 1.637119234751163, + "grad_norm": 6.46875, + "learning_rate": 8.38602540994532e-07, + "loss": 0.69680567, + "memory(GiB)": 138.1, + "step": 70170, + "train_speed(iter/s)": 0.200755 + }, + { + "acc": 0.7816555, + "epoch": 1.6373525423234518, + "grad_norm": 4.625, + "learning_rate": 8.375555972027233e-07, + "loss": 0.76161251, + "memory(GiB)": 138.1, + "step": 70180, + "train_speed(iter/s)": 0.20077 + }, + { + "acc": 0.78333101, + "epoch": 1.6375858498957405, + "grad_norm": 4.40625, + "learning_rate": 8.365092475986442e-07, + "loss": 0.77230682, + "memory(GiB)": 138.1, + "step": 70190, + "train_speed(iter/s)": 0.200786 + }, + { + "acc": 0.75599384, + "epoch": 1.6378191574680296, + "grad_norm": 5.15625, + "learning_rate": 8.35463492331659e-07, + "loss": 0.91006165, + "memory(GiB)": 138.1, + "step": 70200, + "train_speed(iter/s)": 0.200801 + }, + { + "acc": 0.7760139, + "epoch": 1.6380524650403183, + "grad_norm": 5.5, + "learning_rate": 8.344183315510518e-07, + "loss": 0.807833, + "memory(GiB)": 138.1, + "step": 70210, + "train_speed(iter/s)": 0.200816 + }, + { + "acc": 0.77669868, + "epoch": 1.6382857726126074, + "grad_norm": 4.75, + "learning_rate": 8.333737654060176e-07, + "loss": 0.78656607, + "memory(GiB)": 138.1, + "step": 70220, + "train_speed(iter/s)": 0.200831 + }, + { + "acc": 0.77367878, + "epoch": 1.6385190801848961, + "grad_norm": 5.03125, + "learning_rate": 8.323297940456665e-07, + "loss": 0.82135525, + "memory(GiB)": 138.1, + "step": 70230, + "train_speed(iter/s)": 0.200846 + }, + { + "acc": 0.77797899, + "epoch": 1.6387523877571852, + "grad_norm": 4.78125, + "learning_rate": 8.312864176190282e-07, + "loss": 0.80228634, + "memory(GiB)": 138.1, + "step": 70240, + "train_speed(iter/s)": 0.200861 + }, + { + "acc": 0.7765986, + "epoch": 1.638985695329474, + "grad_norm": 6.8125, + "learning_rate": 8.302436362750416e-07, + "loss": 0.80377598, + "memory(GiB)": 138.1, + "step": 70250, + "train_speed(iter/s)": 0.200877 + }, + { + "acc": 0.79614129, + "epoch": 1.639219002901763, + "grad_norm": 4.375, + "learning_rate": 8.292014501625656e-07, + "loss": 0.70680809, + "memory(GiB)": 138.1, + "step": 70260, + "train_speed(iter/s)": 0.200891 + }, + { + "acc": 0.79076648, + "epoch": 1.6394523104740517, + "grad_norm": 4.375, + "learning_rate": 8.281598594303708e-07, + "loss": 0.74855566, + "memory(GiB)": 138.1, + "step": 70270, + "train_speed(iter/s)": 0.200905 + }, + { + "acc": 0.7758749, + "epoch": 1.6396856180463408, + "grad_norm": 3.984375, + "learning_rate": 8.271188642271432e-07, + "loss": 0.81647902, + "memory(GiB)": 138.1, + "step": 70280, + "train_speed(iter/s)": 0.200919 + }, + { + "acc": 0.79907188, + "epoch": 1.6399189256186295, + "grad_norm": 5.03125, + "learning_rate": 8.260784647014864e-07, + "loss": 0.7401226, + "memory(GiB)": 138.1, + "step": 70290, + "train_speed(iter/s)": 0.200934 + }, + { + "acc": 0.77903986, + "epoch": 1.6401522331909186, + "grad_norm": 5.40625, + "learning_rate": 8.250386610019167e-07, + "loss": 0.80523014, + "memory(GiB)": 138.1, + "step": 70300, + "train_speed(iter/s)": 0.200949 + }, + { + "acc": 0.78396397, + "epoch": 1.6403855407632073, + "grad_norm": 5.46875, + "learning_rate": 8.239994532768647e-07, + "loss": 0.78131781, + "memory(GiB)": 138.1, + "step": 70310, + "train_speed(iter/s)": 0.200964 + }, + { + "acc": 0.78332367, + "epoch": 1.6406188483354964, + "grad_norm": 5.0, + "learning_rate": 8.22960841674677e-07, + "loss": 0.77818413, + "memory(GiB)": 138.1, + "step": 70320, + "train_speed(iter/s)": 0.200979 + }, + { + "acc": 0.77349386, + "epoch": 1.640852155907785, + "grad_norm": 4.96875, + "learning_rate": 8.219228263436168e-07, + "loss": 0.82582359, + "memory(GiB)": 138.1, + "step": 70330, + "train_speed(iter/s)": 0.200993 + }, + { + "acc": 0.78453584, + "epoch": 1.6410854634800742, + "grad_norm": 4.875, + "learning_rate": 8.208854074318579e-07, + "loss": 0.76499305, + "memory(GiB)": 138.1, + "step": 70340, + "train_speed(iter/s)": 0.201008 + }, + { + "acc": 0.78075652, + "epoch": 1.641318771052363, + "grad_norm": 9.5, + "learning_rate": 8.198485850874943e-07, + "loss": 0.78641806, + "memory(GiB)": 138.1, + "step": 70350, + "train_speed(iter/s)": 0.201022 + }, + { + "acc": 0.77025485, + "epoch": 1.641552078624652, + "grad_norm": 5.6875, + "learning_rate": 8.188123594585312e-07, + "loss": 0.81662521, + "memory(GiB)": 138.1, + "step": 70360, + "train_speed(iter/s)": 0.201037 + }, + { + "acc": 0.78271184, + "epoch": 1.6417853861969407, + "grad_norm": 4.0, + "learning_rate": 8.177767306928875e-07, + "loss": 0.76894827, + "memory(GiB)": 138.1, + "step": 70370, + "train_speed(iter/s)": 0.201051 + }, + { + "acc": 0.77495737, + "epoch": 1.6420186937692296, + "grad_norm": 6.25, + "learning_rate": 8.16741698938402e-07, + "loss": 0.80135536, + "memory(GiB)": 138.1, + "step": 70380, + "train_speed(iter/s)": 0.201067 + }, + { + "acc": 0.77556715, + "epoch": 1.6422520013415185, + "grad_norm": 3.484375, + "learning_rate": 8.157072643428227e-07, + "loss": 0.80013609, + "memory(GiB)": 138.1, + "step": 70390, + "train_speed(iter/s)": 0.201081 + }, + { + "acc": 0.78865757, + "epoch": 1.6424853089138074, + "grad_norm": 6.46875, + "learning_rate": 8.14673427053817e-07, + "loss": 0.7638164, + "memory(GiB)": 138.1, + "step": 70400, + "train_speed(iter/s)": 0.201096 + }, + { + "acc": 0.78071489, + "epoch": 1.6427186164860963, + "grad_norm": 4.875, + "learning_rate": 8.136401872189631e-07, + "loss": 0.7829968, + "memory(GiB)": 138.1, + "step": 70410, + "train_speed(iter/s)": 0.201111 + }, + { + "acc": 0.79857535, + "epoch": 1.6429519240583852, + "grad_norm": 5.96875, + "learning_rate": 8.126075449857574e-07, + "loss": 0.71098719, + "memory(GiB)": 138.1, + "step": 70420, + "train_speed(iter/s)": 0.201126 + }, + { + "acc": 0.76003633, + "epoch": 1.643185231630674, + "grad_norm": 5.09375, + "learning_rate": 8.115755005016074e-07, + "loss": 0.87234459, + "memory(GiB)": 138.1, + "step": 70430, + "train_speed(iter/s)": 0.201141 + }, + { + "acc": 0.76972141, + "epoch": 1.643418539202963, + "grad_norm": 10.25, + "learning_rate": 8.105440539138371e-07, + "loss": 0.82762814, + "memory(GiB)": 138.1, + "step": 70440, + "train_speed(iter/s)": 0.201155 + }, + { + "acc": 0.78295097, + "epoch": 1.643651846775252, + "grad_norm": 4.65625, + "learning_rate": 8.095132053696869e-07, + "loss": 0.79033747, + "memory(GiB)": 138.1, + "step": 70450, + "train_speed(iter/s)": 0.201169 + }, + { + "acc": 0.77744741, + "epoch": 1.6438851543475408, + "grad_norm": 4.90625, + "learning_rate": 8.084829550163087e-07, + "loss": 0.8104229, + "memory(GiB)": 138.1, + "step": 70460, + "train_speed(iter/s)": 0.201184 + }, + { + "acc": 0.79442644, + "epoch": 1.6441184619198297, + "grad_norm": 5.09375, + "learning_rate": 8.074533030007714e-07, + "loss": 0.7583622, + "memory(GiB)": 138.1, + "step": 70470, + "train_speed(iter/s)": 0.2012 + }, + { + "acc": 0.77418184, + "epoch": 1.6443517694921186, + "grad_norm": 4.25, + "learning_rate": 8.064242494700581e-07, + "loss": 0.79210043, + "memory(GiB)": 138.1, + "step": 70480, + "train_speed(iter/s)": 0.201215 + }, + { + "acc": 0.7854991, + "epoch": 1.6445850770644075, + "grad_norm": 9.3125, + "learning_rate": 8.053957945710633e-07, + "loss": 0.7795774, + "memory(GiB)": 138.1, + "step": 70490, + "train_speed(iter/s)": 0.201229 + }, + { + "acc": 0.76263819, + "epoch": 1.6448183846366964, + "grad_norm": 5.75, + "learning_rate": 8.043679384506014e-07, + "loss": 0.83604841, + "memory(GiB)": 138.1, + "step": 70500, + "train_speed(iter/s)": 0.201243 + }, + { + "epoch": 1.6448183846366964, + "eval_acc": 0.7446782069803348, + "eval_loss": 0.8044106364250183, + "eval_runtime": 1269.9954, + "eval_samples_per_second": 28.339, + "eval_steps_per_second": 14.17, + "step": 70500 + }, + { + "acc": 0.77483778, + "epoch": 1.6450516922089853, + "grad_norm": 5.84375, + "learning_rate": 8.033406812553962e-07, + "loss": 0.81247177, + "memory(GiB)": 138.1, + "step": 70510, + "train_speed(iter/s)": 0.200516 + }, + { + "acc": 0.78622189, + "epoch": 1.6452849997812742, + "grad_norm": 7.625, + "learning_rate": 8.023140231320919e-07, + "loss": 0.77647934, + "memory(GiB)": 138.1, + "step": 70520, + "train_speed(iter/s)": 0.200532 + }, + { + "acc": 0.76669769, + "epoch": 1.645518307353563, + "grad_norm": 4.9375, + "learning_rate": 8.012879642272392e-07, + "loss": 0.84020367, + "memory(GiB)": 138.1, + "step": 70530, + "train_speed(iter/s)": 0.200548 + }, + { + "acc": 0.77495613, + "epoch": 1.645751614925852, + "grad_norm": 4.15625, + "learning_rate": 8.002625046873114e-07, + "loss": 0.81265011, + "memory(GiB)": 138.1, + "step": 70540, + "train_speed(iter/s)": 0.200562 + }, + { + "acc": 0.77314444, + "epoch": 1.6459849224981409, + "grad_norm": 4.90625, + "learning_rate": 7.992376446586891e-07, + "loss": 0.81458588, + "memory(GiB)": 138.1, + "step": 70550, + "train_speed(iter/s)": 0.200577 + }, + { + "acc": 0.78395252, + "epoch": 1.6462182300704298, + "grad_norm": 6.5625, + "learning_rate": 7.982133842876744e-07, + "loss": 0.7673286, + "memory(GiB)": 138.1, + "step": 70560, + "train_speed(iter/s)": 0.200592 + }, + { + "acc": 0.79952912, + "epoch": 1.6464515376427187, + "grad_norm": 6.5625, + "learning_rate": 7.971897237204785e-07, + "loss": 0.71654468, + "memory(GiB)": 138.1, + "step": 70570, + "train_speed(iter/s)": 0.200608 + }, + { + "acc": 0.77154684, + "epoch": 1.6466848452150074, + "grad_norm": 5.4375, + "learning_rate": 7.961666631032273e-07, + "loss": 0.8394556, + "memory(GiB)": 138.1, + "step": 70580, + "train_speed(iter/s)": 0.200622 + }, + { + "acc": 0.78006334, + "epoch": 1.6469181527872965, + "grad_norm": 5.8125, + "learning_rate": 7.951442025819651e-07, + "loss": 0.79687848, + "memory(GiB)": 138.1, + "step": 70590, + "train_speed(iter/s)": 0.200636 + }, + { + "acc": 0.77312818, + "epoch": 1.6471514603595852, + "grad_norm": 12.5625, + "learning_rate": 7.941223423026445e-07, + "loss": 0.82311382, + "memory(GiB)": 138.1, + "step": 70600, + "train_speed(iter/s)": 0.20065 + }, + { + "acc": 0.78867445, + "epoch": 1.6473847679318743, + "grad_norm": 5.8125, + "learning_rate": 7.931010824111396e-07, + "loss": 0.76376724, + "memory(GiB)": 138.1, + "step": 70610, + "train_speed(iter/s)": 0.200664 + }, + { + "acc": 0.78184714, + "epoch": 1.647618075504163, + "grad_norm": 5.84375, + "learning_rate": 7.920804230532331e-07, + "loss": 0.77635255, + "memory(GiB)": 138.1, + "step": 70620, + "train_speed(iter/s)": 0.200679 + }, + { + "acc": 0.79582644, + "epoch": 1.647851383076452, + "grad_norm": 6.21875, + "learning_rate": 7.910603643746223e-07, + "loss": 0.72300158, + "memory(GiB)": 138.1, + "step": 70630, + "train_speed(iter/s)": 0.200693 + }, + { + "acc": 0.76628838, + "epoch": 1.6480846906487407, + "grad_norm": 6.71875, + "learning_rate": 7.90040906520923e-07, + "loss": 0.85222702, + "memory(GiB)": 138.1, + "step": 70640, + "train_speed(iter/s)": 0.200709 + }, + { + "acc": 0.7875648, + "epoch": 1.6483179982210299, + "grad_norm": 6.40625, + "learning_rate": 7.890220496376616e-07, + "loss": 0.76165543, + "memory(GiB)": 138.1, + "step": 70650, + "train_speed(iter/s)": 0.200724 + }, + { + "acc": 0.78504138, + "epoch": 1.6485513057933185, + "grad_norm": 4.65625, + "learning_rate": 7.880037938702789e-07, + "loss": 0.74155846, + "memory(GiB)": 138.1, + "step": 70660, + "train_speed(iter/s)": 0.200738 + }, + { + "acc": 0.78913331, + "epoch": 1.6487846133656077, + "grad_norm": 5.4375, + "learning_rate": 7.869861393641304e-07, + "loss": 0.75573359, + "memory(GiB)": 138.1, + "step": 70670, + "train_speed(iter/s)": 0.200752 + }, + { + "acc": 0.77992077, + "epoch": 1.6490179209378963, + "grad_norm": 6.5, + "learning_rate": 7.859690862644876e-07, + "loss": 0.78720312, + "memory(GiB)": 138.1, + "step": 70680, + "train_speed(iter/s)": 0.200767 + }, + { + "acc": 0.77604442, + "epoch": 1.6492512285101855, + "grad_norm": 6.6875, + "learning_rate": 7.849526347165321e-07, + "loss": 0.83598385, + "memory(GiB)": 138.1, + "step": 70690, + "train_speed(iter/s)": 0.200782 + }, + { + "acc": 0.78691902, + "epoch": 1.6494845360824741, + "grad_norm": 8.875, + "learning_rate": 7.83936784865365e-07, + "loss": 0.77053566, + "memory(GiB)": 138.1, + "step": 70700, + "train_speed(iter/s)": 0.200796 + }, + { + "acc": 0.79002995, + "epoch": 1.6497178436547633, + "grad_norm": 3.6875, + "learning_rate": 7.829215368559967e-07, + "loss": 0.74581184, + "memory(GiB)": 138.1, + "step": 70710, + "train_speed(iter/s)": 0.200811 + }, + { + "acc": 0.78356628, + "epoch": 1.649951151227052, + "grad_norm": 5.28125, + "learning_rate": 7.819068908333532e-07, + "loss": 0.76630058, + "memory(GiB)": 138.1, + "step": 70720, + "train_speed(iter/s)": 0.200827 + }, + { + "acc": 0.79238472, + "epoch": 1.650184458799341, + "grad_norm": 5.21875, + "learning_rate": 7.808928469422766e-07, + "loss": 0.7440876, + "memory(GiB)": 138.1, + "step": 70730, + "train_speed(iter/s)": 0.200841 + }, + { + "acc": 0.7771996, + "epoch": 1.6504177663716297, + "grad_norm": 5.875, + "learning_rate": 7.798794053275193e-07, + "loss": 0.80373068, + "memory(GiB)": 138.1, + "step": 70740, + "train_speed(iter/s)": 0.200856 + }, + { + "acc": 0.75577822, + "epoch": 1.6506510739439189, + "grad_norm": 5.5625, + "learning_rate": 7.78866566133753e-07, + "loss": 0.90629873, + "memory(GiB)": 138.1, + "step": 70750, + "train_speed(iter/s)": 0.200869 + }, + { + "acc": 0.79528246, + "epoch": 1.6508843815162075, + "grad_norm": 5.84375, + "learning_rate": 7.778543295055563e-07, + "loss": 0.73617687, + "memory(GiB)": 138.1, + "step": 70760, + "train_speed(iter/s)": 0.200884 + }, + { + "acc": 0.77932301, + "epoch": 1.6511176890884964, + "grad_norm": 5.53125, + "learning_rate": 7.768426955874287e-07, + "loss": 0.78966246, + "memory(GiB)": 138.1, + "step": 70770, + "train_speed(iter/s)": 0.2009 + }, + { + "acc": 0.78333249, + "epoch": 1.6513509966607853, + "grad_norm": 5.46875, + "learning_rate": 7.758316645237791e-07, + "loss": 0.76522617, + "memory(GiB)": 138.1, + "step": 70780, + "train_speed(iter/s)": 0.200915 + }, + { + "acc": 0.78264174, + "epoch": 1.6515843042330742, + "grad_norm": 4.59375, + "learning_rate": 7.748212364589314e-07, + "loss": 0.77815781, + "memory(GiB)": 138.1, + "step": 70790, + "train_speed(iter/s)": 0.20093 + }, + { + "acc": 0.7721468, + "epoch": 1.6518176118053631, + "grad_norm": 6.09375, + "learning_rate": 7.738114115371254e-07, + "loss": 0.82284946, + "memory(GiB)": 138.1, + "step": 70800, + "train_speed(iter/s)": 0.200945 + }, + { + "acc": 0.77733574, + "epoch": 1.652050919377652, + "grad_norm": 5.71875, + "learning_rate": 7.728021899025124e-07, + "loss": 0.80925665, + "memory(GiB)": 138.1, + "step": 70810, + "train_speed(iter/s)": 0.20096 + }, + { + "acc": 0.79648423, + "epoch": 1.652284226949941, + "grad_norm": 4.53125, + "learning_rate": 7.717935716991592e-07, + "loss": 0.72788992, + "memory(GiB)": 138.1, + "step": 70820, + "train_speed(iter/s)": 0.200975 + }, + { + "acc": 0.7860816, + "epoch": 1.6525175345222298, + "grad_norm": 7.125, + "learning_rate": 7.707855570710443e-07, + "loss": 0.75475664, + "memory(GiB)": 138.1, + "step": 70830, + "train_speed(iter/s)": 0.20099 + }, + { + "acc": 0.77979727, + "epoch": 1.6527508420945187, + "grad_norm": 5.03125, + "learning_rate": 7.69778146162064e-07, + "loss": 0.80329094, + "memory(GiB)": 138.1, + "step": 70840, + "train_speed(iter/s)": 0.201005 + }, + { + "acc": 0.77364244, + "epoch": 1.6529841496668076, + "grad_norm": 5.875, + "learning_rate": 7.68771339116024e-07, + "loss": 0.82023067, + "memory(GiB)": 138.1, + "step": 70850, + "train_speed(iter/s)": 0.201019 + }, + { + "acc": 0.77010069, + "epoch": 1.6532174572390965, + "grad_norm": 5.65625, + "learning_rate": 7.677651360766453e-07, + "loss": 0.85668049, + "memory(GiB)": 138.1, + "step": 70860, + "train_speed(iter/s)": 0.201034 + }, + { + "acc": 0.7844202, + "epoch": 1.6534507648113854, + "grad_norm": 7.375, + "learning_rate": 7.667595371875663e-07, + "loss": 0.7779212, + "memory(GiB)": 138.1, + "step": 70870, + "train_speed(iter/s)": 0.201048 + }, + { + "acc": 0.77224879, + "epoch": 1.6536840723836743, + "grad_norm": 5.03125, + "learning_rate": 7.657545425923313e-07, + "loss": 0.82133636, + "memory(GiB)": 138.1, + "step": 70880, + "train_speed(iter/s)": 0.201062 + }, + { + "acc": 0.76610975, + "epoch": 1.6539173799559632, + "grad_norm": 5.46875, + "learning_rate": 7.647501524344064e-07, + "loss": 0.8340517, + "memory(GiB)": 138.1, + "step": 70890, + "train_speed(iter/s)": 0.201077 + }, + { + "acc": 0.78516321, + "epoch": 1.6541506875282521, + "grad_norm": 4.75, + "learning_rate": 7.637463668571659e-07, + "loss": 0.76711388, + "memory(GiB)": 138.1, + "step": 70900, + "train_speed(iter/s)": 0.201091 + }, + { + "acc": 0.77286749, + "epoch": 1.654383995100541, + "grad_norm": 5.375, + "learning_rate": 7.627431860039019e-07, + "loss": 0.81947851, + "memory(GiB)": 138.1, + "step": 70910, + "train_speed(iter/s)": 0.201105 + }, + { + "acc": 0.76813011, + "epoch": 1.65461730267283, + "grad_norm": 5.71875, + "learning_rate": 7.617406100178171e-07, + "loss": 0.83200006, + "memory(GiB)": 138.1, + "step": 70920, + "train_speed(iter/s)": 0.201119 + }, + { + "acc": 0.78832273, + "epoch": 1.6548506102451188, + "grad_norm": 5.03125, + "learning_rate": 7.607386390420279e-07, + "loss": 0.74536948, + "memory(GiB)": 138.1, + "step": 70930, + "train_speed(iter/s)": 0.201134 + }, + { + "acc": 0.7811326, + "epoch": 1.6550839178174077, + "grad_norm": 5.4375, + "learning_rate": 7.597372732195674e-07, + "loss": 0.78975849, + "memory(GiB)": 138.1, + "step": 70940, + "train_speed(iter/s)": 0.201149 + }, + { + "acc": 0.77318063, + "epoch": 1.6553172253896964, + "grad_norm": 6.75, + "learning_rate": 7.587365126933782e-07, + "loss": 0.8096487, + "memory(GiB)": 138.1, + "step": 70950, + "train_speed(iter/s)": 0.201163 + }, + { + "acc": 0.78359327, + "epoch": 1.6555505329619855, + "grad_norm": 5.28125, + "learning_rate": 7.577363576063212e-07, + "loss": 0.77436295, + "memory(GiB)": 138.1, + "step": 70960, + "train_speed(iter/s)": 0.201178 + }, + { + "acc": 0.78843203, + "epoch": 1.6557838405342742, + "grad_norm": 5.46875, + "learning_rate": 7.567368081011656e-07, + "loss": 0.74306149, + "memory(GiB)": 138.1, + "step": 70970, + "train_speed(iter/s)": 0.201191 + }, + { + "acc": 0.78639054, + "epoch": 1.6560171481065633, + "grad_norm": 7.375, + "learning_rate": 7.557378643206003e-07, + "loss": 0.78001995, + "memory(GiB)": 138.1, + "step": 70980, + "train_speed(iter/s)": 0.201206 + }, + { + "acc": 0.79448624, + "epoch": 1.656250455678852, + "grad_norm": 7.6875, + "learning_rate": 7.547395264072193e-07, + "loss": 0.74755363, + "memory(GiB)": 138.1, + "step": 70990, + "train_speed(iter/s)": 0.20122 + }, + { + "acc": 0.78536434, + "epoch": 1.656483763251141, + "grad_norm": 6.78125, + "learning_rate": 7.537417945035391e-07, + "loss": 0.76806111, + "memory(GiB)": 138.1, + "step": 71000, + "train_speed(iter/s)": 0.201235 + }, + { + "epoch": 1.656483763251141, + "eval_acc": 0.7447166415960871, + "eval_loss": 0.8044018745422363, + "eval_runtime": 1270.2511, + "eval_samples_per_second": 28.334, + "eval_steps_per_second": 14.167, + "step": 71000 + }, + { + "acc": 0.78267994, + "epoch": 1.6567170708234298, + "grad_norm": 5.5625, + "learning_rate": 7.52744668751984e-07, + "loss": 0.78940582, + "memory(GiB)": 138.1, + "step": 71010, + "train_speed(iter/s)": 0.200514 + }, + { + "acc": 0.78548698, + "epoch": 1.656950378395719, + "grad_norm": 9.375, + "learning_rate": 7.517481492948925e-07, + "loss": 0.77665896, + "memory(GiB)": 138.1, + "step": 71020, + "train_speed(iter/s)": 0.200527 + }, + { + "acc": 0.76159725, + "epoch": 1.6571836859680076, + "grad_norm": 5.84375, + "learning_rate": 7.507522362745195e-07, + "loss": 0.8417695, + "memory(GiB)": 138.1, + "step": 71030, + "train_speed(iter/s)": 0.200541 + }, + { + "acc": 0.78062215, + "epoch": 1.6574169935402967, + "grad_norm": 5.59375, + "learning_rate": 7.497569298330293e-07, + "loss": 0.7981185, + "memory(GiB)": 138.1, + "step": 71040, + "train_speed(iter/s)": 0.200557 + }, + { + "acc": 0.76416349, + "epoch": 1.6576503011125854, + "grad_norm": 6.3125, + "learning_rate": 7.487622301125041e-07, + "loss": 0.87657032, + "memory(GiB)": 138.1, + "step": 71050, + "train_speed(iter/s)": 0.200572 + }, + { + "acc": 0.78121519, + "epoch": 1.6578836086848745, + "grad_norm": 4.65625, + "learning_rate": 7.477681372549355e-07, + "loss": 0.77680473, + "memory(GiB)": 138.1, + "step": 71060, + "train_speed(iter/s)": 0.200585 + }, + { + "acc": 0.77829385, + "epoch": 1.6581169162571632, + "grad_norm": 5.75, + "learning_rate": 7.467746514022284e-07, + "loss": 0.80236874, + "memory(GiB)": 138.1, + "step": 71070, + "train_speed(iter/s)": 0.200599 + }, + { + "acc": 0.76091347, + "epoch": 1.6583502238294523, + "grad_norm": 5.5625, + "learning_rate": 7.457817726962058e-07, + "loss": 0.86437483, + "memory(GiB)": 138.1, + "step": 71080, + "train_speed(iter/s)": 0.200613 + }, + { + "acc": 0.79476452, + "epoch": 1.658583531401741, + "grad_norm": 5.78125, + "learning_rate": 7.447895012785983e-07, + "loss": 0.73813944, + "memory(GiB)": 138.1, + "step": 71090, + "train_speed(iter/s)": 0.200627 + }, + { + "acc": 0.77920585, + "epoch": 1.65881683897403, + "grad_norm": 5.09375, + "learning_rate": 7.437978372910554e-07, + "loss": 0.80791998, + "memory(GiB)": 138.1, + "step": 71100, + "train_speed(iter/s)": 0.200641 + }, + { + "acc": 0.7747447, + "epoch": 1.6590501465463188, + "grad_norm": 4.9375, + "learning_rate": 7.428067808751327e-07, + "loss": 0.80314159, + "memory(GiB)": 138.1, + "step": 71110, + "train_speed(iter/s)": 0.200656 + }, + { + "acc": 0.78580241, + "epoch": 1.659283454118608, + "grad_norm": 5.71875, + "learning_rate": 7.41816332172306e-07, + "loss": 0.77066259, + "memory(GiB)": 138.1, + "step": 71120, + "train_speed(iter/s)": 0.200671 + }, + { + "acc": 0.78698373, + "epoch": 1.6595167616908966, + "grad_norm": 6.28125, + "learning_rate": 7.408264913239598e-07, + "loss": 0.74832349, + "memory(GiB)": 138.1, + "step": 71130, + "train_speed(iter/s)": 0.200685 + }, + { + "acc": 0.77769785, + "epoch": 1.6597500692631855, + "grad_norm": 5.96875, + "learning_rate": 7.398372584713964e-07, + "loss": 0.78736668, + "memory(GiB)": 138.1, + "step": 71140, + "train_speed(iter/s)": 0.2007 + }, + { + "acc": 0.78976698, + "epoch": 1.6599833768354744, + "grad_norm": 6.0625, + "learning_rate": 7.388486337558265e-07, + "loss": 0.75616255, + "memory(GiB)": 138.1, + "step": 71150, + "train_speed(iter/s)": 0.200714 + }, + { + "acc": 0.80499172, + "epoch": 1.6602166844077633, + "grad_norm": 5.5, + "learning_rate": 7.378606173183749e-07, + "loss": 0.70488954, + "memory(GiB)": 138.1, + "step": 71160, + "train_speed(iter/s)": 0.200729 + }, + { + "acc": 0.79924831, + "epoch": 1.6604499919800522, + "grad_norm": 5.5625, + "learning_rate": 7.36873209300083e-07, + "loss": 0.71615515, + "memory(GiB)": 138.1, + "step": 71170, + "train_speed(iter/s)": 0.200743 + }, + { + "acc": 0.77024212, + "epoch": 1.660683299552341, + "grad_norm": 5.1875, + "learning_rate": 7.35886409841901e-07, + "loss": 0.82428303, + "memory(GiB)": 138.1, + "step": 71180, + "train_speed(iter/s)": 0.200758 + }, + { + "acc": 0.78113737, + "epoch": 1.66091660712463, + "grad_norm": 4.3125, + "learning_rate": 7.349002190846965e-07, + "loss": 0.77647839, + "memory(GiB)": 138.1, + "step": 71190, + "train_speed(iter/s)": 0.200773 + }, + { + "acc": 0.78592319, + "epoch": 1.6611499146969189, + "grad_norm": 4.5625, + "learning_rate": 7.339146371692468e-07, + "loss": 0.77881441, + "memory(GiB)": 138.1, + "step": 71200, + "train_speed(iter/s)": 0.200787 + }, + { + "acc": 0.78388247, + "epoch": 1.6613832222692078, + "grad_norm": 7.34375, + "learning_rate": 7.329296642362438e-07, + "loss": 0.77189293, + "memory(GiB)": 138.1, + "step": 71210, + "train_speed(iter/s)": 0.200803 + }, + { + "acc": 0.77709417, + "epoch": 1.6616165298414967, + "grad_norm": 12.875, + "learning_rate": 7.319453004262911e-07, + "loss": 0.80911427, + "memory(GiB)": 138.1, + "step": 71220, + "train_speed(iter/s)": 0.200818 + }, + { + "acc": 0.78052092, + "epoch": 1.6618498374137856, + "grad_norm": 6.15625, + "learning_rate": 7.309615458799058e-07, + "loss": 0.80394659, + "memory(GiB)": 138.1, + "step": 71230, + "train_speed(iter/s)": 0.200832 + }, + { + "acc": 0.78070478, + "epoch": 1.6620831449860745, + "grad_norm": 4.84375, + "learning_rate": 7.299784007375205e-07, + "loss": 0.78910527, + "memory(GiB)": 138.1, + "step": 71240, + "train_speed(iter/s)": 0.200846 + }, + { + "acc": 0.79349833, + "epoch": 1.6623164525583634, + "grad_norm": 5.375, + "learning_rate": 7.289958651394774e-07, + "loss": 0.7258852, + "memory(GiB)": 138.1, + "step": 71250, + "train_speed(iter/s)": 0.20086 + }, + { + "acc": 0.76768055, + "epoch": 1.6625497601306523, + "grad_norm": 5.28125, + "learning_rate": 7.280139392260344e-07, + "loss": 0.83310871, + "memory(GiB)": 138.1, + "step": 71260, + "train_speed(iter/s)": 0.200874 + }, + { + "acc": 0.76861515, + "epoch": 1.6627830677029412, + "grad_norm": 7.6875, + "learning_rate": 7.270326231373598e-07, + "loss": 0.81961651, + "memory(GiB)": 138.1, + "step": 71270, + "train_speed(iter/s)": 0.20089 + }, + { + "acc": 0.81069078, + "epoch": 1.66301637527523, + "grad_norm": 6.3125, + "learning_rate": 7.260519170135383e-07, + "loss": 0.6584691, + "memory(GiB)": 138.1, + "step": 71280, + "train_speed(iter/s)": 0.200904 + }, + { + "acc": 0.7901927, + "epoch": 1.663249682847519, + "grad_norm": 6.90625, + "learning_rate": 7.25071820994564e-07, + "loss": 0.76735644, + "memory(GiB)": 138.1, + "step": 71290, + "train_speed(iter/s)": 0.200919 + }, + { + "acc": 0.77350497, + "epoch": 1.6634829904198078, + "grad_norm": 6.15625, + "learning_rate": 7.240923352203438e-07, + "loss": 0.81890402, + "memory(GiB)": 138.1, + "step": 71300, + "train_speed(iter/s)": 0.200934 + }, + { + "acc": 0.77600427, + "epoch": 1.6637162979920967, + "grad_norm": 4.46875, + "learning_rate": 7.231134598307022e-07, + "loss": 0.80230637, + "memory(GiB)": 138.1, + "step": 71310, + "train_speed(iter/s)": 0.200949 + }, + { + "acc": 0.78995695, + "epoch": 1.6639496055643856, + "grad_norm": 4.40625, + "learning_rate": 7.221351949653715e-07, + "loss": 0.73928757, + "memory(GiB)": 138.1, + "step": 71320, + "train_speed(iter/s)": 0.200965 + }, + { + "acc": 0.78438892, + "epoch": 1.6641829131366745, + "grad_norm": 4.8125, + "learning_rate": 7.211575407639987e-07, + "loss": 0.77428908, + "memory(GiB)": 138.1, + "step": 71330, + "train_speed(iter/s)": 0.200979 + }, + { + "acc": 0.77166224, + "epoch": 1.6644162207089632, + "grad_norm": 7.84375, + "learning_rate": 7.20180497366143e-07, + "loss": 0.83759232, + "memory(GiB)": 138.1, + "step": 71340, + "train_speed(iter/s)": 0.200994 + }, + { + "acc": 0.79361496, + "epoch": 1.6646495282812523, + "grad_norm": 4.28125, + "learning_rate": 7.192040649112797e-07, + "loss": 0.74620438, + "memory(GiB)": 138.1, + "step": 71350, + "train_speed(iter/s)": 0.201009 + }, + { + "acc": 0.77841673, + "epoch": 1.664882835853541, + "grad_norm": 5.0625, + "learning_rate": 7.182282435387922e-07, + "loss": 0.77743587, + "memory(GiB)": 138.1, + "step": 71360, + "train_speed(iter/s)": 0.201024 + }, + { + "acc": 0.77280235, + "epoch": 1.6651161434258301, + "grad_norm": 5.4375, + "learning_rate": 7.172530333879774e-07, + "loss": 0.83453283, + "memory(GiB)": 138.1, + "step": 71370, + "train_speed(iter/s)": 0.201039 + }, + { + "acc": 0.78664598, + "epoch": 1.6653494509981188, + "grad_norm": 6.0, + "learning_rate": 7.1627843459805e-07, + "loss": 0.78319445, + "memory(GiB)": 138.1, + "step": 71380, + "train_speed(iter/s)": 0.201053 + }, + { + "acc": 0.77737265, + "epoch": 1.665582758570408, + "grad_norm": 5.65625, + "learning_rate": 7.153044473081299e-07, + "loss": 0.79047518, + "memory(GiB)": 138.1, + "step": 71390, + "train_speed(iter/s)": 0.201068 + }, + { + "acc": 0.7710959, + "epoch": 1.6658160661426966, + "grad_norm": 6.0625, + "learning_rate": 7.143310716572565e-07, + "loss": 0.83085079, + "memory(GiB)": 138.1, + "step": 71400, + "train_speed(iter/s)": 0.201083 + }, + { + "acc": 0.797966, + "epoch": 1.6660493737149857, + "grad_norm": 5.75, + "learning_rate": 7.133583077843776e-07, + "loss": 0.70618973, + "memory(GiB)": 138.1, + "step": 71410, + "train_speed(iter/s)": 0.201099 + }, + { + "acc": 0.77575045, + "epoch": 1.6662826812872744, + "grad_norm": 5.34375, + "learning_rate": 7.12386155828354e-07, + "loss": 0.82906694, + "memory(GiB)": 138.1, + "step": 71420, + "train_speed(iter/s)": 0.201114 + }, + { + "acc": 0.75881324, + "epoch": 1.6665159888595635, + "grad_norm": 5.65625, + "learning_rate": 7.114146159279622e-07, + "loss": 0.8684248, + "memory(GiB)": 138.1, + "step": 71430, + "train_speed(iter/s)": 0.201129 + }, + { + "acc": 0.77309985, + "epoch": 1.6667492964318522, + "grad_norm": 6.28125, + "learning_rate": 7.104436882218879e-07, + "loss": 0.81412992, + "memory(GiB)": 138.1, + "step": 71440, + "train_speed(iter/s)": 0.201143 + }, + { + "acc": 0.78402615, + "epoch": 1.6669826040041413, + "grad_norm": 3.8125, + "learning_rate": 7.094733728487313e-07, + "loss": 0.78737326, + "memory(GiB)": 138.1, + "step": 71450, + "train_speed(iter/s)": 0.201157 + }, + { + "acc": 0.79735641, + "epoch": 1.66721591157643, + "grad_norm": 6.6875, + "learning_rate": 7.085036699470027e-07, + "loss": 0.71820598, + "memory(GiB)": 138.1, + "step": 71460, + "train_speed(iter/s)": 0.201172 + }, + { + "acc": 0.77270122, + "epoch": 1.6674492191487191, + "grad_norm": 5.03125, + "learning_rate": 7.075345796551303e-07, + "loss": 0.82472687, + "memory(GiB)": 138.1, + "step": 71470, + "train_speed(iter/s)": 0.201185 + }, + { + "acc": 0.79649363, + "epoch": 1.6676825267210078, + "grad_norm": 6.3125, + "learning_rate": 7.065661021114478e-07, + "loss": 0.72360072, + "memory(GiB)": 138.1, + "step": 71480, + "train_speed(iter/s)": 0.201199 + }, + { + "acc": 0.78411646, + "epoch": 1.667915834293297, + "grad_norm": 6.09375, + "learning_rate": 7.055982374542086e-07, + "loss": 0.75441995, + "memory(GiB)": 138.1, + "step": 71490, + "train_speed(iter/s)": 0.201214 + }, + { + "acc": 0.76906228, + "epoch": 1.6681491418655856, + "grad_norm": 5.0, + "learning_rate": 7.046309858215733e-07, + "loss": 0.84269314, + "memory(GiB)": 138.1, + "step": 71500, + "train_speed(iter/s)": 0.201228 + }, + { + "epoch": 1.6681491418655856, + "eval_acc": 0.7446756446726179, + "eval_loss": 0.8043976426124573, + "eval_runtime": 1270.6288, + "eval_samples_per_second": 28.325, + "eval_steps_per_second": 14.163, + "step": 71500 + }, + { + "acc": 0.77692218, + "epoch": 1.6683824494378747, + "grad_norm": 5.90625, + "learning_rate": 7.036643473516164e-07, + "loss": 0.81663551, + "memory(GiB)": 138.1, + "step": 71510, + "train_speed(iter/s)": 0.200512 + }, + { + "acc": 0.77694969, + "epoch": 1.6686157570101634, + "grad_norm": 5.625, + "learning_rate": 7.026983221823264e-07, + "loss": 0.81028423, + "memory(GiB)": 138.1, + "step": 71520, + "train_speed(iter/s)": 0.200526 + }, + { + "acc": 0.77378316, + "epoch": 1.6688490645824523, + "grad_norm": 6.46875, + "learning_rate": 7.017329104516013e-07, + "loss": 0.7947732, + "memory(GiB)": 138.1, + "step": 71530, + "train_speed(iter/s)": 0.200541 + }, + { + "acc": 0.77548847, + "epoch": 1.6690823721547412, + "grad_norm": 4.65625, + "learning_rate": 7.007681122972559e-07, + "loss": 0.80890713, + "memory(GiB)": 138.1, + "step": 71540, + "train_speed(iter/s)": 0.200555 + }, + { + "acc": 0.78633842, + "epoch": 1.66931567972703, + "grad_norm": 5.4375, + "learning_rate": 6.998039278570134e-07, + "loss": 0.74881501, + "memory(GiB)": 138.1, + "step": 71550, + "train_speed(iter/s)": 0.20057 + }, + { + "acc": 0.76336951, + "epoch": 1.669548987299319, + "grad_norm": 5.78125, + "learning_rate": 6.988403572685115e-07, + "loss": 0.82800121, + "memory(GiB)": 138.1, + "step": 71560, + "train_speed(iter/s)": 0.200585 + }, + { + "acc": 0.78569932, + "epoch": 1.669782294871608, + "grad_norm": 4.25, + "learning_rate": 6.978774006692984e-07, + "loss": 0.76846313, + "memory(GiB)": 138.1, + "step": 71570, + "train_speed(iter/s)": 0.200599 + }, + { + "acc": 0.78505049, + "epoch": 1.6700156024438968, + "grad_norm": 4.0625, + "learning_rate": 6.969150581968359e-07, + "loss": 0.76446638, + "memory(GiB)": 138.1, + "step": 71580, + "train_speed(iter/s)": 0.200613 + }, + { + "acc": 0.79139848, + "epoch": 1.6702489100161857, + "grad_norm": 3.65625, + "learning_rate": 6.959533299885001e-07, + "loss": 0.75788527, + "memory(GiB)": 138.1, + "step": 71590, + "train_speed(iter/s)": 0.200628 + }, + { + "acc": 0.78600264, + "epoch": 1.6704822175884746, + "grad_norm": 5.28125, + "learning_rate": 6.949922161815748e-07, + "loss": 0.76887283, + "memory(GiB)": 138.1, + "step": 71600, + "train_speed(iter/s)": 0.200643 + }, + { + "acc": 0.77672091, + "epoch": 1.6707155251607635, + "grad_norm": 5.59375, + "learning_rate": 6.94031716913261e-07, + "loss": 0.80708447, + "memory(GiB)": 138.1, + "step": 71610, + "train_speed(iter/s)": 0.200657 + }, + { + "acc": 0.78550982, + "epoch": 1.6709488327330524, + "grad_norm": 6.84375, + "learning_rate": 6.930718323206676e-07, + "loss": 0.78040328, + "memory(GiB)": 138.1, + "step": 71620, + "train_speed(iter/s)": 0.200672 + }, + { + "acc": 0.78563089, + "epoch": 1.6711821403053413, + "grad_norm": 7.84375, + "learning_rate": 6.921125625408198e-07, + "loss": 0.75789003, + "memory(GiB)": 138.1, + "step": 71630, + "train_speed(iter/s)": 0.200686 + }, + { + "acc": 0.77634792, + "epoch": 1.6714154478776302, + "grad_norm": 7.25, + "learning_rate": 6.911539077106527e-07, + "loss": 0.81327028, + "memory(GiB)": 138.1, + "step": 71640, + "train_speed(iter/s)": 0.200701 + }, + { + "acc": 0.75646667, + "epoch": 1.671648755449919, + "grad_norm": 5.09375, + "learning_rate": 6.901958679670123e-07, + "loss": 0.90101776, + "memory(GiB)": 138.1, + "step": 71650, + "train_speed(iter/s)": 0.200715 + }, + { + "acc": 0.80443058, + "epoch": 1.671882063022208, + "grad_norm": 5.625, + "learning_rate": 6.892384434466609e-07, + "loss": 0.70652561, + "memory(GiB)": 138.1, + "step": 71660, + "train_speed(iter/s)": 0.200729 + }, + { + "acc": 0.77762895, + "epoch": 1.6721153705944969, + "grad_norm": 6.875, + "learning_rate": 6.882816342862692e-07, + "loss": 0.80890656, + "memory(GiB)": 138.1, + "step": 71670, + "train_speed(iter/s)": 0.200743 + }, + { + "acc": 0.78075476, + "epoch": 1.6723486781667858, + "grad_norm": 4.5, + "learning_rate": 6.873254406224223e-07, + "loss": 0.79064302, + "memory(GiB)": 138.1, + "step": 71680, + "train_speed(iter/s)": 0.200758 + }, + { + "acc": 0.78527708, + "epoch": 1.6725819857390747, + "grad_norm": 6.09375, + "learning_rate": 6.863698625916137e-07, + "loss": 0.74829164, + "memory(GiB)": 138.1, + "step": 71690, + "train_speed(iter/s)": 0.200773 + }, + { + "acc": 0.77983356, + "epoch": 1.6728152933113636, + "grad_norm": 4.25, + "learning_rate": 6.854149003302562e-07, + "loss": 0.77540479, + "memory(GiB)": 138.1, + "step": 71700, + "train_speed(iter/s)": 0.200787 + }, + { + "acc": 0.79154415, + "epoch": 1.6730486008836525, + "grad_norm": 4.90625, + "learning_rate": 6.844605539746679e-07, + "loss": 0.7527297, + "memory(GiB)": 138.1, + "step": 71710, + "train_speed(iter/s)": 0.200801 + }, + { + "acc": 0.81196423, + "epoch": 1.6732819084559414, + "grad_norm": 3.984375, + "learning_rate": 6.835068236610809e-07, + "loss": 0.66903067, + "memory(GiB)": 138.1, + "step": 71720, + "train_speed(iter/s)": 0.200815 + }, + { + "acc": 0.80410776, + "epoch": 1.67351521602823, + "grad_norm": 5.28125, + "learning_rate": 6.825537095256418e-07, + "loss": 0.68753047, + "memory(GiB)": 138.1, + "step": 71730, + "train_speed(iter/s)": 0.200829 + }, + { + "acc": 0.76747966, + "epoch": 1.6737485236005192, + "grad_norm": 4.625, + "learning_rate": 6.816012117044052e-07, + "loss": 0.83944092, + "memory(GiB)": 138.1, + "step": 71740, + "train_speed(iter/s)": 0.200843 + }, + { + "acc": 0.7719739, + "epoch": 1.6739818311728079, + "grad_norm": 4.8125, + "learning_rate": 6.806493303333422e-07, + "loss": 0.82619953, + "memory(GiB)": 138.1, + "step": 71750, + "train_speed(iter/s)": 0.200859 + }, + { + "acc": 0.77261276, + "epoch": 1.674215138745097, + "grad_norm": 8.25, + "learning_rate": 6.796980655483315e-07, + "loss": 0.80542164, + "memory(GiB)": 138.1, + "step": 71760, + "train_speed(iter/s)": 0.200874 + }, + { + "acc": 0.77031894, + "epoch": 1.6744484463173857, + "grad_norm": 4.46875, + "learning_rate": 6.787474174851683e-07, + "loss": 0.84091234, + "memory(GiB)": 138.1, + "step": 71770, + "train_speed(iter/s)": 0.200889 + }, + { + "acc": 0.78931608, + "epoch": 1.6746817538896748, + "grad_norm": 4.71875, + "learning_rate": 6.777973862795556e-07, + "loss": 0.76971698, + "memory(GiB)": 138.1, + "step": 71780, + "train_speed(iter/s)": 0.200904 + }, + { + "acc": 0.80558147, + "epoch": 1.6749150614619635, + "grad_norm": 5.0, + "learning_rate": 6.768479720671106e-07, + "loss": 0.69032288, + "memory(GiB)": 138.1, + "step": 71790, + "train_speed(iter/s)": 0.200918 + }, + { + "acc": 0.79586306, + "epoch": 1.6751483690342526, + "grad_norm": 3.1875, + "learning_rate": 6.758991749833616e-07, + "loss": 0.72956157, + "memory(GiB)": 138.1, + "step": 71800, + "train_speed(iter/s)": 0.200932 + }, + { + "acc": 0.77934403, + "epoch": 1.6753816766065412, + "grad_norm": 4.65625, + "learning_rate": 6.749509951637484e-07, + "loss": 0.78227091, + "memory(GiB)": 138.1, + "step": 71810, + "train_speed(iter/s)": 0.200947 + }, + { + "acc": 0.7744832, + "epoch": 1.6756149841788304, + "grad_norm": 7.0625, + "learning_rate": 6.740034327436251e-07, + "loss": 0.81382933, + "memory(GiB)": 138.1, + "step": 71820, + "train_speed(iter/s)": 0.200961 + }, + { + "acc": 0.77508135, + "epoch": 1.675848291751119, + "grad_norm": 5.84375, + "learning_rate": 6.730564878582535e-07, + "loss": 0.81892805, + "memory(GiB)": 138.1, + "step": 71830, + "train_speed(iter/s)": 0.200975 + }, + { + "acc": 0.80630512, + "epoch": 1.6760815993234082, + "grad_norm": 6.15625, + "learning_rate": 6.721101606428132e-07, + "loss": 0.70892029, + "memory(GiB)": 138.1, + "step": 71840, + "train_speed(iter/s)": 0.200989 + }, + { + "acc": 0.7737956, + "epoch": 1.6763149068956968, + "grad_norm": 5.8125, + "learning_rate": 6.711644512323895e-07, + "loss": 0.83846264, + "memory(GiB)": 138.1, + "step": 71850, + "train_speed(iter/s)": 0.201004 + }, + { + "acc": 0.76679764, + "epoch": 1.676548214467986, + "grad_norm": 4.3125, + "learning_rate": 6.702193597619821e-07, + "loss": 0.84609222, + "memory(GiB)": 138.1, + "step": 71860, + "train_speed(iter/s)": 0.201018 + }, + { + "acc": 0.75910959, + "epoch": 1.6767815220402746, + "grad_norm": 5.40625, + "learning_rate": 6.692748863665044e-07, + "loss": 0.85825319, + "memory(GiB)": 138.1, + "step": 71870, + "train_speed(iter/s)": 0.201033 + }, + { + "acc": 0.78103266, + "epoch": 1.6770148296125638, + "grad_norm": 5.34375, + "learning_rate": 6.683310311807772e-07, + "loss": 0.78897181, + "memory(GiB)": 138.1, + "step": 71880, + "train_speed(iter/s)": 0.201048 + }, + { + "acc": 0.78252711, + "epoch": 1.6772481371848524, + "grad_norm": 5.71875, + "learning_rate": 6.673877943395385e-07, + "loss": 0.7663425, + "memory(GiB)": 138.1, + "step": 71890, + "train_speed(iter/s)": 0.201063 + }, + { + "acc": 0.79238405, + "epoch": 1.6774814447571416, + "grad_norm": 6.40625, + "learning_rate": 6.664451759774332e-07, + "loss": 0.7801724, + "memory(GiB)": 138.1, + "step": 71900, + "train_speed(iter/s)": 0.201077 + }, + { + "acc": 0.78016853, + "epoch": 1.6777147523294302, + "grad_norm": 5.625, + "learning_rate": 6.655031762290203e-07, + "loss": 0.77946939, + "memory(GiB)": 138.1, + "step": 71910, + "train_speed(iter/s)": 0.201091 + }, + { + "acc": 0.75844526, + "epoch": 1.6779480599017191, + "grad_norm": 4.71875, + "learning_rate": 6.645617952287686e-07, + "loss": 0.85980473, + "memory(GiB)": 138.1, + "step": 71920, + "train_speed(iter/s)": 0.201105 + }, + { + "acc": 0.77880216, + "epoch": 1.678181367474008, + "grad_norm": 6.15625, + "learning_rate": 6.636210331110621e-07, + "loss": 0.78121481, + "memory(GiB)": 138.1, + "step": 71930, + "train_speed(iter/s)": 0.201119 + }, + { + "acc": 0.76943226, + "epoch": 1.678414675046297, + "grad_norm": 4.6875, + "learning_rate": 6.626808900101939e-07, + "loss": 0.82415466, + "memory(GiB)": 138.1, + "step": 71940, + "train_speed(iter/s)": 0.201134 + }, + { + "acc": 0.76650505, + "epoch": 1.6786479826185858, + "grad_norm": 5.59375, + "learning_rate": 6.617413660603672e-07, + "loss": 0.86484928, + "memory(GiB)": 138.1, + "step": 71950, + "train_speed(iter/s)": 0.201149 + }, + { + "acc": 0.78949447, + "epoch": 1.6788812901908747, + "grad_norm": 5.0, + "learning_rate": 6.608024613957015e-07, + "loss": 0.77047791, + "memory(GiB)": 138.1, + "step": 71960, + "train_speed(iter/s)": 0.201164 + }, + { + "acc": 0.77228599, + "epoch": 1.6791145977631636, + "grad_norm": 6.4375, + "learning_rate": 6.598641761502222e-07, + "loss": 0.82509651, + "memory(GiB)": 138.1, + "step": 71970, + "train_speed(iter/s)": 0.201178 + }, + { + "acc": 0.75860147, + "epoch": 1.6793479053354525, + "grad_norm": 5.4375, + "learning_rate": 6.58926510457873e-07, + "loss": 0.88990307, + "memory(GiB)": 138.1, + "step": 71980, + "train_speed(iter/s)": 0.201192 + }, + { + "acc": 0.77799234, + "epoch": 1.6795812129077414, + "grad_norm": 4.5625, + "learning_rate": 6.579894644525026e-07, + "loss": 0.83457985, + "memory(GiB)": 138.1, + "step": 71990, + "train_speed(iter/s)": 0.201207 + }, + { + "acc": 0.80525131, + "epoch": 1.6798145204800303, + "grad_norm": 3.59375, + "learning_rate": 6.570530382678741e-07, + "loss": 0.69075212, + "memory(GiB)": 138.1, + "step": 72000, + "train_speed(iter/s)": 0.201221 + }, + { + "epoch": 1.6798145204800303, + "eval_acc": 0.7446879757785052, + "eval_loss": 0.8043897747993469, + "eval_runtime": 1271.86, + "eval_samples_per_second": 28.298, + "eval_steps_per_second": 14.149, + "step": 72000 + }, + { + "acc": 0.77684412, + "epoch": 1.6800478280523192, + "grad_norm": 7.34375, + "learning_rate": 6.561172320376647e-07, + "loss": 0.79601202, + "memory(GiB)": 138.1, + "step": 72010, + "train_speed(iter/s)": 0.200509 + }, + { + "acc": 0.76988397, + "epoch": 1.6802811356246081, + "grad_norm": 5.65625, + "learning_rate": 6.551820458954561e-07, + "loss": 0.80573511, + "memory(GiB)": 138.1, + "step": 72020, + "train_speed(iter/s)": 0.200524 + }, + { + "acc": 0.77471781, + "epoch": 1.680514443196897, + "grad_norm": 4.53125, + "learning_rate": 6.5424747997475e-07, + "loss": 0.79388332, + "memory(GiB)": 138.1, + "step": 72030, + "train_speed(iter/s)": 0.200539 + }, + { + "acc": 0.76678681, + "epoch": 1.680747750769186, + "grad_norm": 4.5, + "learning_rate": 6.53313534408952e-07, + "loss": 0.85442133, + "memory(GiB)": 138.1, + "step": 72040, + "train_speed(iter/s)": 0.200554 + }, + { + "acc": 0.78160801, + "epoch": 1.6809810583414748, + "grad_norm": 7.375, + "learning_rate": 6.523802093313857e-07, + "loss": 0.75970039, + "memory(GiB)": 138.1, + "step": 72050, + "train_speed(iter/s)": 0.20057 + }, + { + "acc": 0.75993977, + "epoch": 1.6812143659137637, + "grad_norm": 6.5625, + "learning_rate": 6.514475048752805e-07, + "loss": 0.88980999, + "memory(GiB)": 138.1, + "step": 72060, + "train_speed(iter/s)": 0.200585 + }, + { + "acc": 0.78098273, + "epoch": 1.6814476734860526, + "grad_norm": 4.4375, + "learning_rate": 6.505154211737813e-07, + "loss": 0.7791935, + "memory(GiB)": 138.1, + "step": 72070, + "train_speed(iter/s)": 0.200599 + }, + { + "acc": 0.78494759, + "epoch": 1.6816809810583415, + "grad_norm": 6.375, + "learning_rate": 6.495839583599428e-07, + "loss": 0.765938, + "memory(GiB)": 138.1, + "step": 72080, + "train_speed(iter/s)": 0.200614 + }, + { + "acc": 0.78756342, + "epoch": 1.6819142886306304, + "grad_norm": 4.90625, + "learning_rate": 6.486531165667292e-07, + "loss": 0.74883633, + "memory(GiB)": 138.1, + "step": 72090, + "train_speed(iter/s)": 0.200629 + }, + { + "acc": 0.78392973, + "epoch": 1.6821475962029193, + "grad_norm": 5.03125, + "learning_rate": 6.477228959270199e-07, + "loss": 0.78099174, + "memory(GiB)": 138.1, + "step": 72100, + "train_speed(iter/s)": 0.200643 + }, + { + "acc": 0.78292713, + "epoch": 1.6823809037752082, + "grad_norm": 5.46875, + "learning_rate": 6.467932965736024e-07, + "loss": 0.75630102, + "memory(GiB)": 138.1, + "step": 72110, + "train_speed(iter/s)": 0.200657 + }, + { + "acc": 0.78047161, + "epoch": 1.682614211347497, + "grad_norm": 5.125, + "learning_rate": 6.458643186391789e-07, + "loss": 0.76697688, + "memory(GiB)": 138.1, + "step": 72120, + "train_speed(iter/s)": 0.200672 + }, + { + "acc": 0.78350782, + "epoch": 1.682847518919786, + "grad_norm": 7.0, + "learning_rate": 6.449359622563567e-07, + "loss": 0.76841931, + "memory(GiB)": 138.1, + "step": 72130, + "train_speed(iter/s)": 0.200686 + }, + { + "acc": 0.79610729, + "epoch": 1.6830808264920747, + "grad_norm": 4.375, + "learning_rate": 6.44008227557662e-07, + "loss": 0.74552526, + "memory(GiB)": 138.1, + "step": 72140, + "train_speed(iter/s)": 0.200701 + }, + { + "acc": 0.75990648, + "epoch": 1.6833141340643638, + "grad_norm": 5.34375, + "learning_rate": 6.430811146755272e-07, + "loss": 0.85680218, + "memory(GiB)": 138.1, + "step": 72150, + "train_speed(iter/s)": 0.200715 + }, + { + "acc": 0.78126965, + "epoch": 1.6835474416366525, + "grad_norm": 6.0625, + "learning_rate": 6.421546237422971e-07, + "loss": 0.78917265, + "memory(GiB)": 138.1, + "step": 72160, + "train_speed(iter/s)": 0.200729 + }, + { + "acc": 0.77440681, + "epoch": 1.6837807492089416, + "grad_norm": 4.53125, + "learning_rate": 6.412287548902291e-07, + "loss": 0.82259197, + "memory(GiB)": 138.1, + "step": 72170, + "train_speed(iter/s)": 0.200744 + }, + { + "acc": 0.79978375, + "epoch": 1.6840140567812303, + "grad_norm": 9.0625, + "learning_rate": 6.403035082514891e-07, + "loss": 0.71059504, + "memory(GiB)": 138.1, + "step": 72180, + "train_speed(iter/s)": 0.200758 + }, + { + "acc": 0.78088136, + "epoch": 1.6842473643535194, + "grad_norm": 6.4375, + "learning_rate": 6.393788839581578e-07, + "loss": 0.78694282, + "memory(GiB)": 138.1, + "step": 72190, + "train_speed(iter/s)": 0.200771 + }, + { + "acc": 0.76632071, + "epoch": 1.684480671925808, + "grad_norm": 7.75, + "learning_rate": 6.384548821422243e-07, + "loss": 0.8362587, + "memory(GiB)": 138.1, + "step": 72200, + "train_speed(iter/s)": 0.200785 + }, + { + "acc": 0.75542583, + "epoch": 1.6847139794980972, + "grad_norm": 5.25, + "learning_rate": 6.375315029355883e-07, + "loss": 0.88243284, + "memory(GiB)": 138.1, + "step": 72210, + "train_speed(iter/s)": 0.2008 + }, + { + "acc": 0.79456859, + "epoch": 1.6849472870703859, + "grad_norm": 6.1875, + "learning_rate": 6.366087464700637e-07, + "loss": 0.7337564, + "memory(GiB)": 138.1, + "step": 72220, + "train_speed(iter/s)": 0.200814 + }, + { + "acc": 0.79592638, + "epoch": 1.685180594642675, + "grad_norm": 5.0, + "learning_rate": 6.35686612877372e-07, + "loss": 0.7540689, + "memory(GiB)": 138.1, + "step": 72230, + "train_speed(iter/s)": 0.200828 + }, + { + "acc": 0.76712937, + "epoch": 1.6854139022149637, + "grad_norm": 4.84375, + "learning_rate": 6.34765102289151e-07, + "loss": 0.85317764, + "memory(GiB)": 138.1, + "step": 72240, + "train_speed(iter/s)": 0.200843 + }, + { + "acc": 0.77703066, + "epoch": 1.6856472097872528, + "grad_norm": 4.875, + "learning_rate": 6.338442148369406e-07, + "loss": 0.77372303, + "memory(GiB)": 138.1, + "step": 72250, + "train_speed(iter/s)": 0.200857 + }, + { + "acc": 0.78281322, + "epoch": 1.6858805173595415, + "grad_norm": 4.5, + "learning_rate": 6.329239506522017e-07, + "loss": 0.77638769, + "memory(GiB)": 138.1, + "step": 72260, + "train_speed(iter/s)": 0.200872 + }, + { + "acc": 0.78430648, + "epoch": 1.6861138249318306, + "grad_norm": 7.125, + "learning_rate": 6.320043098662992e-07, + "loss": 0.7674891, + "memory(GiB)": 138.1, + "step": 72270, + "train_speed(iter/s)": 0.200887 + }, + { + "acc": 0.77010136, + "epoch": 1.6863471325041193, + "grad_norm": 4.6875, + "learning_rate": 6.310852926105138e-07, + "loss": 0.82719336, + "memory(GiB)": 138.1, + "step": 72280, + "train_speed(iter/s)": 0.2009 + }, + { + "acc": 0.78210573, + "epoch": 1.6865804400764084, + "grad_norm": 5.4375, + "learning_rate": 6.301668990160331e-07, + "loss": 0.79944468, + "memory(GiB)": 138.1, + "step": 72290, + "train_speed(iter/s)": 0.200914 + }, + { + "acc": 0.80032415, + "epoch": 1.686813747648697, + "grad_norm": 4.9375, + "learning_rate": 6.292491292139574e-07, + "loss": 0.72096748, + "memory(GiB)": 138.1, + "step": 72300, + "train_speed(iter/s)": 0.200929 + }, + { + "acc": 0.7697011, + "epoch": 1.687047055220986, + "grad_norm": 7.03125, + "learning_rate": 6.283319833353002e-07, + "loss": 0.82724371, + "memory(GiB)": 138.1, + "step": 72310, + "train_speed(iter/s)": 0.200943 + }, + { + "acc": 0.79757509, + "epoch": 1.6872803627932749, + "grad_norm": 6.6875, + "learning_rate": 6.274154615109812e-07, + "loss": 0.70658522, + "memory(GiB)": 138.1, + "step": 72320, + "train_speed(iter/s)": 0.200957 + }, + { + "acc": 0.79727383, + "epoch": 1.6875136703655638, + "grad_norm": 6.875, + "learning_rate": 6.264995638718352e-07, + "loss": 0.7160409, + "memory(GiB)": 138.1, + "step": 72330, + "train_speed(iter/s)": 0.200971 + }, + { + "acc": 0.76207228, + "epoch": 1.6877469779378527, + "grad_norm": 9.125, + "learning_rate": 6.255842905486065e-07, + "loss": 0.85418339, + "memory(GiB)": 138.1, + "step": 72340, + "train_speed(iter/s)": 0.200986 + }, + { + "acc": 0.77744265, + "epoch": 1.6879802855101416, + "grad_norm": 5.125, + "learning_rate": 6.246696416719495e-07, + "loss": 0.80833817, + "memory(GiB)": 138.1, + "step": 72350, + "train_speed(iter/s)": 0.201 + }, + { + "acc": 0.80837536, + "epoch": 1.6882135930824305, + "grad_norm": 5.4375, + "learning_rate": 6.237556173724291e-07, + "loss": 0.70104604, + "memory(GiB)": 138.1, + "step": 72360, + "train_speed(iter/s)": 0.201015 + }, + { + "acc": 0.79856534, + "epoch": 1.6884469006547194, + "grad_norm": 4.8125, + "learning_rate": 6.228422177805244e-07, + "loss": 0.70726795, + "memory(GiB)": 138.1, + "step": 72370, + "train_speed(iter/s)": 0.20103 + }, + { + "acc": 0.76452069, + "epoch": 1.6886802082270083, + "grad_norm": 8.4375, + "learning_rate": 6.21929443026621e-07, + "loss": 0.84759407, + "memory(GiB)": 138.1, + "step": 72380, + "train_speed(iter/s)": 0.201045 + }, + { + "acc": 0.78385258, + "epoch": 1.6889135157992972, + "grad_norm": 4.90625, + "learning_rate": 6.210172932410169e-07, + "loss": 0.76335421, + "memory(GiB)": 138.1, + "step": 72390, + "train_speed(iter/s)": 0.201059 + }, + { + "acc": 0.77586222, + "epoch": 1.689146823371586, + "grad_norm": 5.9375, + "learning_rate": 6.20105768553923e-07, + "loss": 0.79805136, + "memory(GiB)": 138.1, + "step": 72400, + "train_speed(iter/s)": 0.201075 + }, + { + "acc": 0.78417902, + "epoch": 1.689380130943875, + "grad_norm": 5.375, + "learning_rate": 6.191948690954575e-07, + "loss": 0.75983038, + "memory(GiB)": 138.1, + "step": 72410, + "train_speed(iter/s)": 0.201089 + }, + { + "acc": 0.78113298, + "epoch": 1.6896134385161639, + "grad_norm": 4.6875, + "learning_rate": 6.182845949956523e-07, + "loss": 0.80795841, + "memory(GiB)": 138.1, + "step": 72420, + "train_speed(iter/s)": 0.201104 + }, + { + "acc": 0.78343716, + "epoch": 1.6898467460884528, + "grad_norm": 5.1875, + "learning_rate": 6.173749463844486e-07, + "loss": 0.76608415, + "memory(GiB)": 138.1, + "step": 72430, + "train_speed(iter/s)": 0.201118 + }, + { + "acc": 0.79046288, + "epoch": 1.6900800536607417, + "grad_norm": 5.1875, + "learning_rate": 6.164659233916976e-07, + "loss": 0.75967202, + "memory(GiB)": 138.1, + "step": 72440, + "train_speed(iter/s)": 0.201131 + }, + { + "acc": 0.76052351, + "epoch": 1.6903133612330306, + "grad_norm": 6.0625, + "learning_rate": 6.15557526147163e-07, + "loss": 0.86396046, + "memory(GiB)": 138.1, + "step": 72450, + "train_speed(iter/s)": 0.201145 + }, + { + "acc": 0.7680316, + "epoch": 1.6905466688053195, + "grad_norm": 4.875, + "learning_rate": 6.146497547805169e-07, + "loss": 0.83324471, + "memory(GiB)": 138.1, + "step": 72460, + "train_speed(iter/s)": 0.201159 + }, + { + "acc": 0.78809872, + "epoch": 1.6907799763776084, + "grad_norm": 3.703125, + "learning_rate": 6.137426094213466e-07, + "loss": 0.74023752, + "memory(GiB)": 138.1, + "step": 72470, + "train_speed(iter/s)": 0.201174 + }, + { + "acc": 0.77969913, + "epoch": 1.6910132839498973, + "grad_norm": 4.71875, + "learning_rate": 6.128360901991426e-07, + "loss": 0.79022303, + "memory(GiB)": 138.1, + "step": 72480, + "train_speed(iter/s)": 0.201189 + }, + { + "acc": 0.80677795, + "epoch": 1.691246591522186, + "grad_norm": 10.1875, + "learning_rate": 6.119301972433128e-07, + "loss": 0.69705276, + "memory(GiB)": 138.1, + "step": 72490, + "train_speed(iter/s)": 0.201203 + }, + { + "acc": 0.76214151, + "epoch": 1.691479899094475, + "grad_norm": 7.3125, + "learning_rate": 6.110249306831733e-07, + "loss": 0.86750393, + "memory(GiB)": 138.1, + "step": 72500, + "train_speed(iter/s)": 0.201218 + }, + { + "epoch": 1.691479899094475, + "eval_acc": 0.7446806091438193, + "eval_loss": 0.8044036626815796, + "eval_runtime": 1270.6076, + "eval_samples_per_second": 28.326, + "eval_steps_per_second": 14.163, + "step": 72500 + }, + { + "acc": 0.80648994, + "epoch": 1.6917132066667637, + "grad_norm": 6.1875, + "learning_rate": 6.101202906479487e-07, + "loss": 0.67632608, + "memory(GiB)": 138.1, + "step": 72510, + "train_speed(iter/s)": 0.200512 + }, + { + "acc": 0.77256756, + "epoch": 1.6919465142390528, + "grad_norm": 3.859375, + "learning_rate": 6.092162772667781e-07, + "loss": 0.80494852, + "memory(GiB)": 138.1, + "step": 72520, + "train_speed(iter/s)": 0.200527 + }, + { + "acc": 0.77806387, + "epoch": 1.6921798218113415, + "grad_norm": 6.625, + "learning_rate": 6.083128906687074e-07, + "loss": 0.78679771, + "memory(GiB)": 138.1, + "step": 72530, + "train_speed(iter/s)": 0.200542 + }, + { + "acc": 0.77233944, + "epoch": 1.6924131293836306, + "grad_norm": 5.09375, + "learning_rate": 6.074101309826968e-07, + "loss": 0.80660133, + "memory(GiB)": 138.1, + "step": 72540, + "train_speed(iter/s)": 0.200557 + }, + { + "acc": 0.79014969, + "epoch": 1.6926464369559193, + "grad_norm": 6.875, + "learning_rate": 6.065079983376132e-07, + "loss": 0.76289606, + "memory(GiB)": 138.1, + "step": 72550, + "train_speed(iter/s)": 0.200571 + }, + { + "acc": 0.78846631, + "epoch": 1.6928797445282084, + "grad_norm": 5.75, + "learning_rate": 6.056064928622374e-07, + "loss": 0.77690454, + "memory(GiB)": 138.1, + "step": 72560, + "train_speed(iter/s)": 0.200586 + }, + { + "acc": 0.79145346, + "epoch": 1.6931130521004971, + "grad_norm": 6.15625, + "learning_rate": 6.047056146852575e-07, + "loss": 0.74644814, + "memory(GiB)": 138.1, + "step": 72570, + "train_speed(iter/s)": 0.2006 + }, + { + "acc": 0.78505545, + "epoch": 1.6933463596727862, + "grad_norm": 4.125, + "learning_rate": 6.038053639352754e-07, + "loss": 0.76110735, + "memory(GiB)": 138.1, + "step": 72580, + "train_speed(iter/s)": 0.200613 + }, + { + "acc": 0.78503914, + "epoch": 1.693579667245075, + "grad_norm": 4.21875, + "learning_rate": 6.029057407407995e-07, + "loss": 0.7781271, + "memory(GiB)": 138.1, + "step": 72590, + "train_speed(iter/s)": 0.200628 + }, + { + "acc": 0.82557945, + "epoch": 1.693812974817364, + "grad_norm": 4.21875, + "learning_rate": 6.020067452302514e-07, + "loss": 0.6177803, + "memory(GiB)": 138.1, + "step": 72600, + "train_speed(iter/s)": 0.200642 + }, + { + "acc": 0.79121065, + "epoch": 1.6940462823896527, + "grad_norm": 5.78125, + "learning_rate": 6.011083775319637e-07, + "loss": 0.73865213, + "memory(GiB)": 138.1, + "step": 72610, + "train_speed(iter/s)": 0.200657 + }, + { + "acc": 0.77857733, + "epoch": 1.6942795899619418, + "grad_norm": 4.15625, + "learning_rate": 6.002106377741762e-07, + "loss": 0.78328385, + "memory(GiB)": 138.1, + "step": 72620, + "train_speed(iter/s)": 0.200671 + }, + { + "acc": 0.7810358, + "epoch": 1.6945128975342305, + "grad_norm": 6.53125, + "learning_rate": 5.99313526085043e-07, + "loss": 0.7782196, + "memory(GiB)": 138.1, + "step": 72630, + "train_speed(iter/s)": 0.200686 + }, + { + "acc": 0.79591274, + "epoch": 1.6947462051065196, + "grad_norm": 16.125, + "learning_rate": 5.984170425926256e-07, + "loss": 0.72331409, + "memory(GiB)": 138.1, + "step": 72640, + "train_speed(iter/s)": 0.2007 + }, + { + "acc": 0.77169046, + "epoch": 1.6949795126788083, + "grad_norm": 5.09375, + "learning_rate": 5.975211874248954e-07, + "loss": 0.83417196, + "memory(GiB)": 138.1, + "step": 72650, + "train_speed(iter/s)": 0.200715 + }, + { + "acc": 0.80202818, + "epoch": 1.6952128202510974, + "grad_norm": 6.21875, + "learning_rate": 5.96625960709738e-07, + "loss": 0.72186661, + "memory(GiB)": 138.1, + "step": 72660, + "train_speed(iter/s)": 0.200729 + }, + { + "acc": 0.78016157, + "epoch": 1.695446127823386, + "grad_norm": 5.53125, + "learning_rate": 5.957313625749445e-07, + "loss": 0.80601139, + "memory(GiB)": 138.1, + "step": 72670, + "train_speed(iter/s)": 0.200743 + }, + { + "acc": 0.78957376, + "epoch": 1.695679435395675, + "grad_norm": 4.75, + "learning_rate": 5.948373931482204e-07, + "loss": 0.75643826, + "memory(GiB)": 138.1, + "step": 72680, + "train_speed(iter/s)": 0.200758 + }, + { + "acc": 0.75675497, + "epoch": 1.695912742967964, + "grad_norm": 6.875, + "learning_rate": 5.939440525571788e-07, + "loss": 0.88091316, + "memory(GiB)": 138.1, + "step": 72690, + "train_speed(iter/s)": 0.200771 + }, + { + "acc": 0.77509961, + "epoch": 1.6961460505402528, + "grad_norm": 6.21875, + "learning_rate": 5.930513409293437e-07, + "loss": 0.79928541, + "memory(GiB)": 138.1, + "step": 72700, + "train_speed(iter/s)": 0.200786 + }, + { + "acc": 0.77624722, + "epoch": 1.6963793581125417, + "grad_norm": 6.1875, + "learning_rate": 5.921592583921488e-07, + "loss": 0.7726511, + "memory(GiB)": 138.1, + "step": 72710, + "train_speed(iter/s)": 0.2008 + }, + { + "acc": 0.77374887, + "epoch": 1.6966126656848306, + "grad_norm": 5.9375, + "learning_rate": 5.912678050729398e-07, + "loss": 0.7831955, + "memory(GiB)": 138.1, + "step": 72720, + "train_speed(iter/s)": 0.200814 + }, + { + "acc": 0.7693531, + "epoch": 1.6968459732571195, + "grad_norm": 4.34375, + "learning_rate": 5.903769810989713e-07, + "loss": 0.82154694, + "memory(GiB)": 138.1, + "step": 72730, + "train_speed(iter/s)": 0.200828 + }, + { + "acc": 0.77684445, + "epoch": 1.6970792808294084, + "grad_norm": 4.4375, + "learning_rate": 5.894867865974064e-07, + "loss": 0.80565586, + "memory(GiB)": 138.1, + "step": 72740, + "train_speed(iter/s)": 0.200843 + }, + { + "acc": 0.78173313, + "epoch": 1.6973125884016973, + "grad_norm": 4.25, + "learning_rate": 5.885972216953223e-07, + "loss": 0.78659649, + "memory(GiB)": 138.1, + "step": 72750, + "train_speed(iter/s)": 0.200857 + }, + { + "acc": 0.78354235, + "epoch": 1.6975458959739862, + "grad_norm": 5.09375, + "learning_rate": 5.877082865197026e-07, + "loss": 0.80655441, + "memory(GiB)": 138.1, + "step": 72760, + "train_speed(iter/s)": 0.200872 + }, + { + "acc": 0.79456587, + "epoch": 1.697779203546275, + "grad_norm": 6.65625, + "learning_rate": 5.86819981197444e-07, + "loss": 0.73504858, + "memory(GiB)": 138.1, + "step": 72770, + "train_speed(iter/s)": 0.200886 + }, + { + "acc": 0.79108696, + "epoch": 1.698012511118564, + "grad_norm": 8.5, + "learning_rate": 5.859323058553512e-07, + "loss": 0.75745306, + "memory(GiB)": 138.1, + "step": 72780, + "train_speed(iter/s)": 0.200901 + }, + { + "acc": 0.7710947, + "epoch": 1.698245818690853, + "grad_norm": 4.25, + "learning_rate": 5.850452606201384e-07, + "loss": 0.81637497, + "memory(GiB)": 138.1, + "step": 72790, + "train_speed(iter/s)": 0.200915 + }, + { + "acc": 0.8023735, + "epoch": 1.6984791262631418, + "grad_norm": 5.125, + "learning_rate": 5.841588456184333e-07, + "loss": 0.69645176, + "memory(GiB)": 138.1, + "step": 72800, + "train_speed(iter/s)": 0.20093 + }, + { + "acc": 0.78629084, + "epoch": 1.6987124338354307, + "grad_norm": 7.1875, + "learning_rate": 5.8327306097677e-07, + "loss": 0.75006866, + "memory(GiB)": 138.1, + "step": 72810, + "train_speed(iter/s)": 0.200943 + }, + { + "acc": 0.77999191, + "epoch": 1.6989457414077196, + "grad_norm": 10.5625, + "learning_rate": 5.823879068215943e-07, + "loss": 0.80349598, + "memory(GiB)": 138.1, + "step": 72820, + "train_speed(iter/s)": 0.200958 + }, + { + "acc": 0.77250009, + "epoch": 1.6991790489800085, + "grad_norm": 6.09375, + "learning_rate": 5.815033832792605e-07, + "loss": 0.79956465, + "memory(GiB)": 138.1, + "step": 72830, + "train_speed(iter/s)": 0.200972 + }, + { + "acc": 0.78692751, + "epoch": 1.6994123565522974, + "grad_norm": 4.90625, + "learning_rate": 5.806194904760365e-07, + "loss": 0.75956717, + "memory(GiB)": 138.1, + "step": 72840, + "train_speed(iter/s)": 0.200985 + }, + { + "acc": 0.7845644, + "epoch": 1.6996456641245863, + "grad_norm": 5.75, + "learning_rate": 5.797362285380948e-07, + "loss": 0.77232275, + "memory(GiB)": 138.1, + "step": 72850, + "train_speed(iter/s)": 0.200999 + }, + { + "acc": 0.78012753, + "epoch": 1.6998789716968752, + "grad_norm": 6.8125, + "learning_rate": 5.788535975915239e-07, + "loss": 0.77585268, + "memory(GiB)": 138.1, + "step": 72860, + "train_speed(iter/s)": 0.201013 + }, + { + "acc": 0.77515068, + "epoch": 1.700112279269164, + "grad_norm": 5.34375, + "learning_rate": 5.779715977623168e-07, + "loss": 0.81616535, + "memory(GiB)": 138.1, + "step": 72870, + "train_speed(iter/s)": 0.201026 + }, + { + "acc": 0.79815941, + "epoch": 1.7003455868414528, + "grad_norm": 5.625, + "learning_rate": 5.770902291763791e-07, + "loss": 0.71203232, + "memory(GiB)": 138.1, + "step": 72880, + "train_speed(iter/s)": 0.20104 + }, + { + "acc": 0.80398407, + "epoch": 1.7005788944137419, + "grad_norm": 5.8125, + "learning_rate": 5.762094919595274e-07, + "loss": 0.71322527, + "memory(GiB)": 138.1, + "step": 72890, + "train_speed(iter/s)": 0.201052 + }, + { + "acc": 0.79169493, + "epoch": 1.7008122019860306, + "grad_norm": 5.125, + "learning_rate": 5.753293862374842e-07, + "loss": 0.73782015, + "memory(GiB)": 138.1, + "step": 72900, + "train_speed(iter/s)": 0.201067 + }, + { + "acc": 0.78701067, + "epoch": 1.7010455095583197, + "grad_norm": 7.125, + "learning_rate": 5.744499121358871e-07, + "loss": 0.77875328, + "memory(GiB)": 138.1, + "step": 72910, + "train_speed(iter/s)": 0.201081 + }, + { + "acc": 0.79096022, + "epoch": 1.7012788171306084, + "grad_norm": 4.4375, + "learning_rate": 5.735710697802793e-07, + "loss": 0.73712015, + "memory(GiB)": 138.1, + "step": 72920, + "train_speed(iter/s)": 0.201095 + }, + { + "acc": 0.77540636, + "epoch": 1.7015121247028975, + "grad_norm": 4.84375, + "learning_rate": 5.726928592961156e-07, + "loss": 0.80175991, + "memory(GiB)": 138.1, + "step": 72930, + "train_speed(iter/s)": 0.20111 + }, + { + "acc": 0.77421494, + "epoch": 1.7017454322751862, + "grad_norm": 7.53125, + "learning_rate": 5.718152808087601e-07, + "loss": 0.81656933, + "memory(GiB)": 138.1, + "step": 72940, + "train_speed(iter/s)": 0.201125 + }, + { + "acc": 0.76136036, + "epoch": 1.7019787398474753, + "grad_norm": 4.5, + "learning_rate": 5.709383344434854e-07, + "loss": 0.8624939, + "memory(GiB)": 138.1, + "step": 72950, + "train_speed(iter/s)": 0.201139 + }, + { + "acc": 0.790942, + "epoch": 1.702212047419764, + "grad_norm": 7.21875, + "learning_rate": 5.700620203254781e-07, + "loss": 0.74097085, + "memory(GiB)": 138.1, + "step": 72960, + "train_speed(iter/s)": 0.201154 + }, + { + "acc": 0.77590914, + "epoch": 1.702445354992053, + "grad_norm": 5.875, + "learning_rate": 5.691863385798296e-07, + "loss": 0.81017885, + "memory(GiB)": 138.1, + "step": 72970, + "train_speed(iter/s)": 0.201168 + }, + { + "acc": 0.7668539, + "epoch": 1.7026786625643417, + "grad_norm": 5.21875, + "learning_rate": 5.683112893315451e-07, + "loss": 0.86975269, + "memory(GiB)": 138.1, + "step": 72980, + "train_speed(iter/s)": 0.201183 + }, + { + "acc": 0.78860574, + "epoch": 1.7029119701366309, + "grad_norm": 5.53125, + "learning_rate": 5.674368727055351e-07, + "loss": 0.74698706, + "memory(GiB)": 138.1, + "step": 72990, + "train_speed(iter/s)": 0.201197 + }, + { + "acc": 0.77102966, + "epoch": 1.7031452777089195, + "grad_norm": 5.3125, + "learning_rate": 5.665630888266254e-07, + "loss": 0.82879314, + "memory(GiB)": 138.1, + "step": 73000, + "train_speed(iter/s)": 0.201211 + }, + { + "epoch": 1.7031452777089195, + "eval_acc": 0.7447366596251248, + "eval_loss": 0.8043876886367798, + "eval_runtime": 1271.875, + "eval_samples_per_second": 28.298, + "eval_steps_per_second": 14.149, + "step": 73000 + }, + { + "acc": 0.78933325, + "epoch": 1.7033785852812087, + "grad_norm": 5.0625, + "learning_rate": 5.656899378195468e-07, + "loss": 0.75085564, + "memory(GiB)": 138.1, + "step": 73010, + "train_speed(iter/s)": 0.200509 + }, + { + "acc": 0.79252825, + "epoch": 1.7036118928534973, + "grad_norm": 4.34375, + "learning_rate": 5.648174198089407e-07, + "loss": 0.74576683, + "memory(GiB)": 138.1, + "step": 73020, + "train_speed(iter/s)": 0.200523 + }, + { + "acc": 0.79472671, + "epoch": 1.7038452004257865, + "grad_norm": 5.8125, + "learning_rate": 5.639455349193602e-07, + "loss": 0.7537653, + "memory(GiB)": 138.1, + "step": 73030, + "train_speed(iter/s)": 0.200537 + }, + { + "acc": 0.78937383, + "epoch": 1.7040785079980751, + "grad_norm": 5.28125, + "learning_rate": 5.630742832752655e-07, + "loss": 0.74292388, + "memory(GiB)": 138.1, + "step": 73040, + "train_speed(iter/s)": 0.200551 + }, + { + "acc": 0.78601589, + "epoch": 1.7043118155703643, + "grad_norm": 3.828125, + "learning_rate": 5.622036650010281e-07, + "loss": 0.77539072, + "memory(GiB)": 138.1, + "step": 73050, + "train_speed(iter/s)": 0.200565 + }, + { + "acc": 0.80177393, + "epoch": 1.704545123142653, + "grad_norm": 4.65625, + "learning_rate": 5.613336802209274e-07, + "loss": 0.7048872, + "memory(GiB)": 138.1, + "step": 73060, + "train_speed(iter/s)": 0.200579 + }, + { + "acc": 0.76926432, + "epoch": 1.7047784307149418, + "grad_norm": 5.78125, + "learning_rate": 5.604643290591555e-07, + "loss": 0.82517433, + "memory(GiB)": 138.1, + "step": 73070, + "train_speed(iter/s)": 0.200594 + }, + { + "acc": 0.78017607, + "epoch": 1.7050117382872307, + "grad_norm": 5.5625, + "learning_rate": 5.595956116398111e-07, + "loss": 0.75298491, + "memory(GiB)": 138.1, + "step": 73080, + "train_speed(iter/s)": 0.200608 + }, + { + "acc": 0.77815399, + "epoch": 1.7052450458595196, + "grad_norm": 5.1875, + "learning_rate": 5.58727528086902e-07, + "loss": 0.78736992, + "memory(GiB)": 138.1, + "step": 73090, + "train_speed(iter/s)": 0.200623 + }, + { + "acc": 0.77575245, + "epoch": 1.7054783534318085, + "grad_norm": 5.375, + "learning_rate": 5.578600785243493e-07, + "loss": 0.81773357, + "memory(GiB)": 138.1, + "step": 73100, + "train_speed(iter/s)": 0.200638 + }, + { + "acc": 0.79463716, + "epoch": 1.7057116610040974, + "grad_norm": 4.3125, + "learning_rate": 5.569932630759789e-07, + "loss": 0.74758506, + "memory(GiB)": 138.1, + "step": 73110, + "train_speed(iter/s)": 0.200652 + }, + { + "acc": 0.78639936, + "epoch": 1.7059449685763863, + "grad_norm": 4.90625, + "learning_rate": 5.561270818655301e-07, + "loss": 0.76089525, + "memory(GiB)": 138.1, + "step": 73120, + "train_speed(iter/s)": 0.200666 + }, + { + "acc": 0.7882925, + "epoch": 1.7061782761486752, + "grad_norm": 10.375, + "learning_rate": 5.552615350166496e-07, + "loss": 0.76413431, + "memory(GiB)": 138.1, + "step": 73130, + "train_speed(iter/s)": 0.200681 + }, + { + "acc": 0.78352389, + "epoch": 1.7064115837209641, + "grad_norm": 5.375, + "learning_rate": 5.54396622652893e-07, + "loss": 0.77671185, + "memory(GiB)": 138.1, + "step": 73140, + "train_speed(iter/s)": 0.200694 + }, + { + "acc": 0.78264346, + "epoch": 1.706644891293253, + "grad_norm": 8.0625, + "learning_rate": 5.535323448977275e-07, + "loss": 0.76960349, + "memory(GiB)": 138.1, + "step": 73150, + "train_speed(iter/s)": 0.200709 + }, + { + "acc": 0.8065589, + "epoch": 1.706878198865542, + "grad_norm": 6.84375, + "learning_rate": 5.526687018745286e-07, + "loss": 0.69353619, + "memory(GiB)": 138.1, + "step": 73160, + "train_speed(iter/s)": 0.200723 + }, + { + "acc": 0.78679342, + "epoch": 1.7071115064378308, + "grad_norm": 5.0625, + "learning_rate": 5.518056937065802e-07, + "loss": 0.77192287, + "memory(GiB)": 138.1, + "step": 73170, + "train_speed(iter/s)": 0.200737 + }, + { + "acc": 0.7717082, + "epoch": 1.7073448140101197, + "grad_norm": 4.15625, + "learning_rate": 5.509433205170761e-07, + "loss": 0.82198477, + "memory(GiB)": 138.1, + "step": 73180, + "train_speed(iter/s)": 0.200751 + }, + { + "acc": 0.78816519, + "epoch": 1.7075781215824086, + "grad_norm": 6.03125, + "learning_rate": 5.500815824291216e-07, + "loss": 0.78270626, + "memory(GiB)": 138.1, + "step": 73190, + "train_speed(iter/s)": 0.200764 + }, + { + "acc": 0.76701832, + "epoch": 1.7078114291546975, + "grad_norm": 6.4375, + "learning_rate": 5.492204795657274e-07, + "loss": 0.82847338, + "memory(GiB)": 138.1, + "step": 73200, + "train_speed(iter/s)": 0.200779 + }, + { + "acc": 0.7891695, + "epoch": 1.7080447367269864, + "grad_norm": 4.59375, + "learning_rate": 5.483600120498178e-07, + "loss": 0.74464188, + "memory(GiB)": 138.1, + "step": 73210, + "train_speed(iter/s)": 0.200792 + }, + { + "acc": 0.76037254, + "epoch": 1.7082780442992753, + "grad_norm": 5.0, + "learning_rate": 5.475001800042228e-07, + "loss": 0.87401991, + "memory(GiB)": 138.1, + "step": 73220, + "train_speed(iter/s)": 0.200806 + }, + { + "acc": 0.78426857, + "epoch": 1.7085113518715642, + "grad_norm": 5.15625, + "learning_rate": 5.466409835516834e-07, + "loss": 0.7634037, + "memory(GiB)": 138.1, + "step": 73230, + "train_speed(iter/s)": 0.20082 + }, + { + "acc": 0.80567493, + "epoch": 1.7087446594438531, + "grad_norm": 4.78125, + "learning_rate": 5.457824228148506e-07, + "loss": 0.68272343, + "memory(GiB)": 138.1, + "step": 73240, + "train_speed(iter/s)": 0.200834 + }, + { + "acc": 0.77797771, + "epoch": 1.708977967016142, + "grad_norm": 4.78125, + "learning_rate": 5.449244979162816e-07, + "loss": 0.80399952, + "memory(GiB)": 138.1, + "step": 73250, + "train_speed(iter/s)": 0.200848 + }, + { + "acc": 0.78294101, + "epoch": 1.709211274588431, + "grad_norm": 6.6875, + "learning_rate": 5.440672089784476e-07, + "loss": 0.78328276, + "memory(GiB)": 138.1, + "step": 73260, + "train_speed(iter/s)": 0.20086 + }, + { + "acc": 0.79575491, + "epoch": 1.7094445821607196, + "grad_norm": 5.5625, + "learning_rate": 5.43210556123725e-07, + "loss": 0.73883858, + "memory(GiB)": 138.1, + "step": 73270, + "train_speed(iter/s)": 0.200876 + }, + { + "acc": 0.78026762, + "epoch": 1.7096778897330087, + "grad_norm": 5.625, + "learning_rate": 5.423545394744012e-07, + "loss": 0.79718757, + "memory(GiB)": 138.1, + "step": 73280, + "train_speed(iter/s)": 0.20089 + }, + { + "acc": 0.79239721, + "epoch": 1.7099111973052974, + "grad_norm": 5.5, + "learning_rate": 5.414991591526714e-07, + "loss": 0.74911137, + "memory(GiB)": 138.1, + "step": 73290, + "train_speed(iter/s)": 0.200904 + }, + { + "acc": 0.78526192, + "epoch": 1.7101445048775865, + "grad_norm": 9.625, + "learning_rate": 5.406444152806406e-07, + "loss": 0.75527325, + "memory(GiB)": 138.1, + "step": 73300, + "train_speed(iter/s)": 0.200919 + }, + { + "acc": 0.77781644, + "epoch": 1.7103778124498752, + "grad_norm": 5.28125, + "learning_rate": 5.397903079803251e-07, + "loss": 0.81301641, + "memory(GiB)": 138.1, + "step": 73310, + "train_speed(iter/s)": 0.200933 + }, + { + "acc": 0.78796282, + "epoch": 1.7106111200221643, + "grad_norm": 6.09375, + "learning_rate": 5.389368373736464e-07, + "loss": 0.75768142, + "memory(GiB)": 138.1, + "step": 73320, + "train_speed(iter/s)": 0.200947 + }, + { + "acc": 0.80338306, + "epoch": 1.710844427594453, + "grad_norm": 7.4375, + "learning_rate": 5.380840035824397e-07, + "loss": 0.6969276, + "memory(GiB)": 138.1, + "step": 73330, + "train_speed(iter/s)": 0.200961 + }, + { + "acc": 0.78724689, + "epoch": 1.711077735166742, + "grad_norm": 5.84375, + "learning_rate": 5.372318067284438e-07, + "loss": 0.76626501, + "memory(GiB)": 138.1, + "step": 73340, + "train_speed(iter/s)": 0.200976 + }, + { + "acc": 0.78899498, + "epoch": 1.7113110427390308, + "grad_norm": 5.71875, + "learning_rate": 5.363802469333118e-07, + "loss": 0.75074024, + "memory(GiB)": 138.1, + "step": 73350, + "train_speed(iter/s)": 0.200989 + }, + { + "acc": 0.77315474, + "epoch": 1.71154435031132, + "grad_norm": 6.8125, + "learning_rate": 5.355293243186033e-07, + "loss": 0.82559099, + "memory(GiB)": 138.1, + "step": 73360, + "train_speed(iter/s)": 0.201005 + }, + { + "acc": 0.77842388, + "epoch": 1.7117776578836086, + "grad_norm": 6.96875, + "learning_rate": 5.34679039005786e-07, + "loss": 0.8188942, + "memory(GiB)": 138.1, + "step": 73370, + "train_speed(iter/s)": 0.201019 + }, + { + "acc": 0.76217799, + "epoch": 1.7120109654558977, + "grad_norm": 5.8125, + "learning_rate": 5.338293911162401e-07, + "loss": 0.85048714, + "memory(GiB)": 138.1, + "step": 73380, + "train_speed(iter/s)": 0.201033 + }, + { + "acc": 0.76450157, + "epoch": 1.7122442730281864, + "grad_norm": 7.03125, + "learning_rate": 5.329803807712497e-07, + "loss": 0.85439548, + "memory(GiB)": 138.1, + "step": 73390, + "train_speed(iter/s)": 0.201048 + }, + { + "acc": 0.78162451, + "epoch": 1.7124775806004755, + "grad_norm": 7.53125, + "learning_rate": 5.321320080920128e-07, + "loss": 0.78223362, + "memory(GiB)": 138.1, + "step": 73400, + "train_speed(iter/s)": 0.201062 + }, + { + "acc": 0.77304645, + "epoch": 1.7127108881727642, + "grad_norm": 5.15625, + "learning_rate": 5.312842731996332e-07, + "loss": 0.82387562, + "memory(GiB)": 138.1, + "step": 73410, + "train_speed(iter/s)": 0.201076 + }, + { + "acc": 0.77667975, + "epoch": 1.7129441957450533, + "grad_norm": 4.03125, + "learning_rate": 5.304371762151261e-07, + "loss": 0.80046349, + "memory(GiB)": 138.1, + "step": 73420, + "train_speed(iter/s)": 0.20109 + }, + { + "acc": 0.79003797, + "epoch": 1.713177503317342, + "grad_norm": 10.625, + "learning_rate": 5.295907172594139e-07, + "loss": 0.75672364, + "memory(GiB)": 138.1, + "step": 73430, + "train_speed(iter/s)": 0.201105 + }, + { + "acc": 0.79200077, + "epoch": 1.713410810889631, + "grad_norm": 5.71875, + "learning_rate": 5.287448964533276e-07, + "loss": 0.7386096, + "memory(GiB)": 138.1, + "step": 73440, + "train_speed(iter/s)": 0.201119 + }, + { + "acc": 0.77990046, + "epoch": 1.7136441184619198, + "grad_norm": 6.75, + "learning_rate": 5.278997139176084e-07, + "loss": 0.79957833, + "memory(GiB)": 138.1, + "step": 73450, + "train_speed(iter/s)": 0.201132 + }, + { + "acc": 0.74390507, + "epoch": 1.7138774260342087, + "grad_norm": 4.1875, + "learning_rate": 5.270551697729059e-07, + "loss": 0.94712648, + "memory(GiB)": 138.1, + "step": 73460, + "train_speed(iter/s)": 0.201147 + }, + { + "acc": 0.77517686, + "epoch": 1.7141107336064976, + "grad_norm": 4.875, + "learning_rate": 5.262112641397788e-07, + "loss": 0.80620651, + "memory(GiB)": 138.1, + "step": 73470, + "train_speed(iter/s)": 0.201161 + }, + { + "acc": 0.76355195, + "epoch": 1.7143440411787865, + "grad_norm": 7.25, + "learning_rate": 5.253679971386932e-07, + "loss": 0.84303713, + "memory(GiB)": 138.1, + "step": 73480, + "train_speed(iter/s)": 0.201173 + }, + { + "acc": 0.78939095, + "epoch": 1.7145773487510754, + "grad_norm": 5.09375, + "learning_rate": 5.245253688900287e-07, + "loss": 0.75973749, + "memory(GiB)": 138.1, + "step": 73490, + "train_speed(iter/s)": 0.201187 + }, + { + "acc": 0.78993163, + "epoch": 1.7148106563233643, + "grad_norm": 5.375, + "learning_rate": 5.236833795140651e-07, + "loss": 0.73632402, + "memory(GiB)": 138.1, + "step": 73500, + "train_speed(iter/s)": 0.201202 + }, + { + "epoch": 1.7148106563233643, + "eval_acc": 0.7447105561152596, + "eval_loss": 0.8043909072875977, + "eval_runtime": 1271.1227, + "eval_samples_per_second": 28.314, + "eval_steps_per_second": 14.158, + "step": 73500 + }, + { + "acc": 0.79015398, + "epoch": 1.7150439638956532, + "grad_norm": 5.1875, + "learning_rate": 5.228420291309999e-07, + "loss": 0.74385405, + "memory(GiB)": 138.1, + "step": 73510, + "train_speed(iter/s)": 0.200504 + }, + { + "acc": 0.7631135, + "epoch": 1.715277271467942, + "grad_norm": 5.5625, + "learning_rate": 5.22001317860934e-07, + "loss": 0.84219952, + "memory(GiB)": 138.1, + "step": 73520, + "train_speed(iter/s)": 0.200519 + }, + { + "acc": 0.76704111, + "epoch": 1.715510579040231, + "grad_norm": 3.984375, + "learning_rate": 5.211612458238785e-07, + "loss": 0.87439299, + "memory(GiB)": 138.1, + "step": 73530, + "train_speed(iter/s)": 0.200533 + }, + { + "acc": 0.79748411, + "epoch": 1.7157438866125199, + "grad_norm": 6.0625, + "learning_rate": 5.203218131397553e-07, + "loss": 0.73548927, + "memory(GiB)": 138.1, + "step": 73540, + "train_speed(iter/s)": 0.200547 + }, + { + "acc": 0.77375269, + "epoch": 1.7159771941848088, + "grad_norm": 4.875, + "learning_rate": 5.194830199283907e-07, + "loss": 0.80748701, + "memory(GiB)": 138.1, + "step": 73550, + "train_speed(iter/s)": 0.200561 + }, + { + "acc": 0.77218609, + "epoch": 1.7162105017570977, + "grad_norm": 6.5, + "learning_rate": 5.18644866309524e-07, + "loss": 0.81236382, + "memory(GiB)": 138.1, + "step": 73560, + "train_speed(iter/s)": 0.200575 + }, + { + "acc": 0.79794626, + "epoch": 1.7164438093293866, + "grad_norm": 3.90625, + "learning_rate": 5.178073524028016e-07, + "loss": 0.71432037, + "memory(GiB)": 138.1, + "step": 73570, + "train_speed(iter/s)": 0.200589 + }, + { + "acc": 0.75880527, + "epoch": 1.7166771169016755, + "grad_norm": 6.28125, + "learning_rate": 5.169704783277756e-07, + "loss": 0.84431038, + "memory(GiB)": 138.1, + "step": 73580, + "train_speed(iter/s)": 0.200603 + }, + { + "acc": 0.78814268, + "epoch": 1.7169104244739644, + "grad_norm": 5.375, + "learning_rate": 5.16134244203913e-07, + "loss": 0.75521536, + "memory(GiB)": 138.1, + "step": 73590, + "train_speed(iter/s)": 0.200618 + }, + { + "acc": 0.78718247, + "epoch": 1.7171437320462533, + "grad_norm": 6.625, + "learning_rate": 5.152986501505835e-07, + "loss": 0.76596746, + "memory(GiB)": 138.1, + "step": 73600, + "train_speed(iter/s)": 0.200633 + }, + { + "acc": 0.77716827, + "epoch": 1.7173770396185422, + "grad_norm": 3.65625, + "learning_rate": 5.144636962870708e-07, + "loss": 0.80403767, + "memory(GiB)": 138.1, + "step": 73610, + "train_speed(iter/s)": 0.200646 + }, + { + "acc": 0.75140824, + "epoch": 1.717610347190831, + "grad_norm": 5.5625, + "learning_rate": 5.136293827325606e-07, + "loss": 0.89990616, + "memory(GiB)": 138.1, + "step": 73620, + "train_speed(iter/s)": 0.20066 + }, + { + "acc": 0.77322063, + "epoch": 1.71784365476312, + "grad_norm": 6.125, + "learning_rate": 5.127957096061537e-07, + "loss": 0.83046427, + "memory(GiB)": 138.1, + "step": 73630, + "train_speed(iter/s)": 0.200674 + }, + { + "acc": 0.76129708, + "epoch": 1.7180769623354089, + "grad_norm": 5.90625, + "learning_rate": 5.119626770268543e-07, + "loss": 0.86177006, + "memory(GiB)": 138.1, + "step": 73640, + "train_speed(iter/s)": 0.200688 + }, + { + "acc": 0.78243065, + "epoch": 1.7183102699076978, + "grad_norm": 3.5625, + "learning_rate": 5.111302851135802e-07, + "loss": 0.79324946, + "memory(GiB)": 138.1, + "step": 73650, + "train_speed(iter/s)": 0.200702 + }, + { + "acc": 0.79360733, + "epoch": 1.7185435774799864, + "grad_norm": 4.84375, + "learning_rate": 5.10298533985154e-07, + "loss": 0.74446688, + "memory(GiB)": 138.1, + "step": 73660, + "train_speed(iter/s)": 0.200716 + }, + { + "acc": 0.78374872, + "epoch": 1.7187768850522755, + "grad_norm": 5.25, + "learning_rate": 5.09467423760307e-07, + "loss": 0.74643164, + "memory(GiB)": 138.1, + "step": 73670, + "train_speed(iter/s)": 0.200731 + }, + { + "acc": 0.78896942, + "epoch": 1.7190101926245642, + "grad_norm": 4.84375, + "learning_rate": 5.086369545576814e-07, + "loss": 0.75824614, + "memory(GiB)": 138.1, + "step": 73680, + "train_speed(iter/s)": 0.200746 + }, + { + "acc": 0.78774047, + "epoch": 1.7192435001968533, + "grad_norm": 5.375, + "learning_rate": 5.078071264958245e-07, + "loss": 0.74300013, + "memory(GiB)": 138.1, + "step": 73690, + "train_speed(iter/s)": 0.20076 + }, + { + "acc": 0.79266844, + "epoch": 1.719476807769142, + "grad_norm": 4.125, + "learning_rate": 5.069779396931967e-07, + "loss": 0.74105115, + "memory(GiB)": 138.1, + "step": 73700, + "train_speed(iter/s)": 0.200774 + }, + { + "acc": 0.76223621, + "epoch": 1.7197101153414311, + "grad_norm": 13.625, + "learning_rate": 5.061493942681628e-07, + "loss": 0.88398075, + "memory(GiB)": 138.1, + "step": 73710, + "train_speed(iter/s)": 0.200788 + }, + { + "acc": 0.77189965, + "epoch": 1.7199434229137198, + "grad_norm": 5.71875, + "learning_rate": 5.053214903389975e-07, + "loss": 0.81558266, + "memory(GiB)": 138.1, + "step": 73720, + "train_speed(iter/s)": 0.200802 + }, + { + "acc": 0.78657942, + "epoch": 1.720176730486009, + "grad_norm": 5.71875, + "learning_rate": 5.044942280238835e-07, + "loss": 0.7773448, + "memory(GiB)": 138.1, + "step": 73730, + "train_speed(iter/s)": 0.200815 + }, + { + "acc": 0.76862726, + "epoch": 1.7204100380582976, + "grad_norm": 4.40625, + "learning_rate": 5.036676074409114e-07, + "loss": 0.85371304, + "memory(GiB)": 138.1, + "step": 73740, + "train_speed(iter/s)": 0.200829 + }, + { + "acc": 0.77896681, + "epoch": 1.7206433456305867, + "grad_norm": 7.53125, + "learning_rate": 5.028416287080834e-07, + "loss": 0.7777874, + "memory(GiB)": 138.1, + "step": 73750, + "train_speed(iter/s)": 0.200844 + }, + { + "acc": 0.77873478, + "epoch": 1.7208766532028754, + "grad_norm": 5.28125, + "learning_rate": 5.020162919433053e-07, + "loss": 0.79268093, + "memory(GiB)": 138.1, + "step": 73760, + "train_speed(iter/s)": 0.200858 + }, + { + "acc": 0.78746619, + "epoch": 1.7211099607751645, + "grad_norm": 7.21875, + "learning_rate": 5.01191597264396e-07, + "loss": 0.75564404, + "memory(GiB)": 138.1, + "step": 73770, + "train_speed(iter/s)": 0.200871 + }, + { + "acc": 0.78620825, + "epoch": 1.7213432683474532, + "grad_norm": 6.28125, + "learning_rate": 5.003675447890782e-07, + "loss": 0.75644531, + "memory(GiB)": 138.1, + "step": 73780, + "train_speed(iter/s)": 0.200885 + }, + { + "acc": 0.76999698, + "epoch": 1.7215765759197423, + "grad_norm": 4.8125, + "learning_rate": 4.995441346349872e-07, + "loss": 0.83167076, + "memory(GiB)": 138.1, + "step": 73790, + "train_speed(iter/s)": 0.200899 + }, + { + "acc": 0.80347652, + "epoch": 1.721809883492031, + "grad_norm": 4.28125, + "learning_rate": 4.987213669196639e-07, + "loss": 0.68127127, + "memory(GiB)": 138.1, + "step": 73800, + "train_speed(iter/s)": 0.200913 + }, + { + "acc": 0.79354105, + "epoch": 1.7220431910643201, + "grad_norm": 4.5625, + "learning_rate": 4.978992417605566e-07, + "loss": 0.73909149, + "memory(GiB)": 138.1, + "step": 73810, + "train_speed(iter/s)": 0.200927 + }, + { + "acc": 0.8010478, + "epoch": 1.7222764986366088, + "grad_norm": 5.46875, + "learning_rate": 4.970777592750253e-07, + "loss": 0.69659333, + "memory(GiB)": 138.1, + "step": 73820, + "train_speed(iter/s)": 0.20094 + }, + { + "acc": 0.76971216, + "epoch": 1.722509806208898, + "grad_norm": 5.09375, + "learning_rate": 4.962569195803352e-07, + "loss": 0.8232933, + "memory(GiB)": 138.1, + "step": 73830, + "train_speed(iter/s)": 0.200953 + }, + { + "acc": 0.77865553, + "epoch": 1.7227431137811866, + "grad_norm": 4.15625, + "learning_rate": 4.954367227936635e-07, + "loss": 0.7898417, + "memory(GiB)": 138.1, + "step": 73840, + "train_speed(iter/s)": 0.200967 + }, + { + "acc": 0.77204628, + "epoch": 1.7229764213534755, + "grad_norm": 6.71875, + "learning_rate": 4.946171690320889e-07, + "loss": 0.82885933, + "memory(GiB)": 138.1, + "step": 73850, + "train_speed(iter/s)": 0.200981 + }, + { + "acc": 0.79157839, + "epoch": 1.7232097289257644, + "grad_norm": 5.5, + "learning_rate": 4.937982584126055e-07, + "loss": 0.73304939, + "memory(GiB)": 138.1, + "step": 73860, + "train_speed(iter/s)": 0.200995 + }, + { + "acc": 0.7956171, + "epoch": 1.7234430364980533, + "grad_norm": 5.5625, + "learning_rate": 4.929799910521116e-07, + "loss": 0.73145752, + "memory(GiB)": 138.1, + "step": 73870, + "train_speed(iter/s)": 0.201009 + }, + { + "acc": 0.76525025, + "epoch": 1.7236763440703422, + "grad_norm": 4.96875, + "learning_rate": 4.921623670674142e-07, + "loss": 0.84171267, + "memory(GiB)": 138.1, + "step": 73880, + "train_speed(iter/s)": 0.201024 + }, + { + "acc": 0.80335445, + "epoch": 1.723909651642631, + "grad_norm": 4.9375, + "learning_rate": 4.913453865752299e-07, + "loss": 0.69952669, + "memory(GiB)": 138.1, + "step": 73890, + "train_speed(iter/s)": 0.201038 + }, + { + "acc": 0.76818728, + "epoch": 1.72414295921492, + "grad_norm": 11.875, + "learning_rate": 4.905290496921811e-07, + "loss": 0.84597569, + "memory(GiB)": 138.1, + "step": 73900, + "train_speed(iter/s)": 0.201053 + }, + { + "acc": 0.79364624, + "epoch": 1.724376266787209, + "grad_norm": 5.875, + "learning_rate": 4.897133565348012e-07, + "loss": 0.73016977, + "memory(GiB)": 138.1, + "step": 73910, + "train_speed(iter/s)": 0.201067 + }, + { + "acc": 0.79641161, + "epoch": 1.7246095743594978, + "grad_norm": 3.640625, + "learning_rate": 4.888983072195296e-07, + "loss": 0.72878189, + "memory(GiB)": 138.1, + "step": 73920, + "train_speed(iter/s)": 0.201082 + }, + { + "acc": 0.7892786, + "epoch": 1.7248428819317867, + "grad_norm": 5.34375, + "learning_rate": 4.880839018627132e-07, + "loss": 0.74224253, + "memory(GiB)": 138.1, + "step": 73930, + "train_speed(iter/s)": 0.201096 + }, + { + "acc": 0.78093824, + "epoch": 1.7250761895040756, + "grad_norm": 5.65625, + "learning_rate": 4.872701405806101e-07, + "loss": 0.77963114, + "memory(GiB)": 138.1, + "step": 73940, + "train_speed(iter/s)": 0.20111 + }, + { + "acc": 0.77672095, + "epoch": 1.7253094970763645, + "grad_norm": 7.0625, + "learning_rate": 4.864570234893834e-07, + "loss": 0.78376875, + "memory(GiB)": 138.1, + "step": 73950, + "train_speed(iter/s)": 0.201123 + }, + { + "acc": 0.78407612, + "epoch": 1.7255428046486534, + "grad_norm": 6.125, + "learning_rate": 4.856445507051049e-07, + "loss": 0.77355375, + "memory(GiB)": 138.1, + "step": 73960, + "train_speed(iter/s)": 0.201137 + }, + { + "acc": 0.7757288, + "epoch": 1.7257761122209423, + "grad_norm": 4.5625, + "learning_rate": 4.848327223437554e-07, + "loss": 0.80582609, + "memory(GiB)": 138.1, + "step": 73970, + "train_speed(iter/s)": 0.201151 + }, + { + "acc": 0.76491833, + "epoch": 1.7260094197932312, + "grad_norm": 6.34375, + "learning_rate": 4.840215385212232e-07, + "loss": 0.86212664, + "memory(GiB)": 138.1, + "step": 73980, + "train_speed(iter/s)": 0.201165 + }, + { + "acc": 0.77588701, + "epoch": 1.72624272736552, + "grad_norm": 7.25, + "learning_rate": 4.832109993533041e-07, + "loss": 0.80320539, + "memory(GiB)": 138.1, + "step": 73990, + "train_speed(iter/s)": 0.201178 + }, + { + "acc": 0.7706954, + "epoch": 1.726476034937809, + "grad_norm": 5.3125, + "learning_rate": 4.824011049557037e-07, + "loss": 0.83672714, + "memory(GiB)": 138.1, + "step": 74000, + "train_speed(iter/s)": 0.201192 + }, + { + "epoch": 1.726476034937809, + "eval_acc": 0.7447166415960871, + "eval_loss": 0.8044154047966003, + "eval_runtime": 1272.3059, + "eval_samples_per_second": 28.288, + "eval_steps_per_second": 14.144, + "step": 74000 + }, + { + "acc": 0.77697601, + "epoch": 1.7267093425100979, + "grad_norm": 7.1875, + "learning_rate": 4.815918554440324e-07, + "loss": 0.802526, + "memory(GiB)": 138.1, + "step": 74010, + "train_speed(iter/s)": 0.200499 + }, + { + "acc": 0.78431935, + "epoch": 1.7269426500823868, + "grad_norm": 4.71875, + "learning_rate": 4.807832509338112e-07, + "loss": 0.771733, + "memory(GiB)": 138.1, + "step": 74020, + "train_speed(iter/s)": 0.200512 + }, + { + "acc": 0.78355474, + "epoch": 1.7271759576546755, + "grad_norm": 4.40625, + "learning_rate": 4.799752915404682e-07, + "loss": 0.77948675, + "memory(GiB)": 138.1, + "step": 74030, + "train_speed(iter/s)": 0.200525 + }, + { + "acc": 0.77410603, + "epoch": 1.7274092652269646, + "grad_norm": 4.9375, + "learning_rate": 4.791679773793389e-07, + "loss": 0.82328606, + "memory(GiB)": 138.1, + "step": 74040, + "train_speed(iter/s)": 0.200539 + }, + { + "acc": 0.77805958, + "epoch": 1.7276425727992533, + "grad_norm": 5.96875, + "learning_rate": 4.78361308565668e-07, + "loss": 0.80983696, + "memory(GiB)": 138.1, + "step": 74050, + "train_speed(iter/s)": 0.200553 + }, + { + "acc": 0.76219211, + "epoch": 1.7278758803715424, + "grad_norm": 8.875, + "learning_rate": 4.77555285214607e-07, + "loss": 0.85069485, + "memory(GiB)": 138.1, + "step": 74060, + "train_speed(iter/s)": 0.200567 + }, + { + "acc": 0.77103195, + "epoch": 1.728109187943831, + "grad_norm": 3.90625, + "learning_rate": 4.767499074412152e-07, + "loss": 0.82191906, + "memory(GiB)": 138.1, + "step": 74070, + "train_speed(iter/s)": 0.200579 + }, + { + "acc": 0.80303154, + "epoch": 1.7283424955161202, + "grad_norm": 7.125, + "learning_rate": 4.7594517536045936e-07, + "loss": 0.72213902, + "memory(GiB)": 138.1, + "step": 74080, + "train_speed(iter/s)": 0.200593 + }, + { + "acc": 0.76572065, + "epoch": 1.7285758030884089, + "grad_norm": 6.1875, + "learning_rate": 4.751410890872166e-07, + "loss": 0.85232534, + "memory(GiB)": 138.1, + "step": 74090, + "train_speed(iter/s)": 0.200607 + }, + { + "acc": 0.77745919, + "epoch": 1.728809110660698, + "grad_norm": 5.53125, + "learning_rate": 4.743376487362683e-07, + "loss": 0.81853027, + "memory(GiB)": 138.1, + "step": 74100, + "train_speed(iter/s)": 0.200621 + }, + { + "acc": 0.79782553, + "epoch": 1.7290424182329867, + "grad_norm": 5.03125, + "learning_rate": 4.7353485442230507e-07, + "loss": 0.70489302, + "memory(GiB)": 138.1, + "step": 74110, + "train_speed(iter/s)": 0.200635 + }, + { + "acc": 0.77856512, + "epoch": 1.7292757258052758, + "grad_norm": 11.5, + "learning_rate": 4.7273270625992794e-07, + "loss": 0.80788822, + "memory(GiB)": 138.1, + "step": 74120, + "train_speed(iter/s)": 0.200649 + }, + { + "acc": 0.76892738, + "epoch": 1.7295090333775645, + "grad_norm": 6.59375, + "learning_rate": 4.719312043636404e-07, + "loss": 0.83906326, + "memory(GiB)": 138.1, + "step": 74130, + "train_speed(iter/s)": 0.200663 + }, + { + "acc": 0.77776084, + "epoch": 1.7297423409498536, + "grad_norm": 6.0, + "learning_rate": 4.711303488478591e-07, + "loss": 0.78263907, + "memory(GiB)": 138.1, + "step": 74140, + "train_speed(iter/s)": 0.200676 + }, + { + "acc": 0.7878387, + "epoch": 1.7299756485221423, + "grad_norm": 5.28125, + "learning_rate": 4.7033013982690446e-07, + "loss": 0.76044273, + "memory(GiB)": 138.1, + "step": 74150, + "train_speed(iter/s)": 0.20069 + }, + { + "acc": 0.77629614, + "epoch": 1.7302089560944314, + "grad_norm": 7.53125, + "learning_rate": 4.695305774150061e-07, + "loss": 0.79856586, + "memory(GiB)": 138.1, + "step": 74160, + "train_speed(iter/s)": 0.200705 + }, + { + "acc": 0.7986949, + "epoch": 1.73044226366672, + "grad_norm": 6.40625, + "learning_rate": 4.687316617263027e-07, + "loss": 0.73556232, + "memory(GiB)": 138.1, + "step": 74170, + "train_speed(iter/s)": 0.20072 + }, + { + "acc": 0.79174337, + "epoch": 1.7306755712390092, + "grad_norm": 6.375, + "learning_rate": 4.679333928748375e-07, + "loss": 0.73341341, + "memory(GiB)": 138.1, + "step": 74180, + "train_speed(iter/s)": 0.200733 + }, + { + "acc": 0.77004156, + "epoch": 1.7309088788112978, + "grad_norm": 7.09375, + "learning_rate": 4.671357709745644e-07, + "loss": 0.83492775, + "memory(GiB)": 138.1, + "step": 74190, + "train_speed(iter/s)": 0.200748 + }, + { + "acc": 0.79907241, + "epoch": 1.731142186383587, + "grad_norm": 5.375, + "learning_rate": 4.6633879613934227e-07, + "loss": 0.6985404, + "memory(GiB)": 138.1, + "step": 74200, + "train_speed(iter/s)": 0.200762 + }, + { + "acc": 0.75945568, + "epoch": 1.7313754939558756, + "grad_norm": 4.78125, + "learning_rate": 4.6554246848294127e-07, + "loss": 0.86438541, + "memory(GiB)": 138.1, + "step": 74210, + "train_speed(iter/s)": 0.200777 + }, + { + "acc": 0.78431191, + "epoch": 1.7316088015281648, + "grad_norm": 5.1875, + "learning_rate": 4.6474678811903483e-07, + "loss": 0.77434368, + "memory(GiB)": 138.1, + "step": 74220, + "train_speed(iter/s)": 0.200791 + }, + { + "acc": 0.7858922, + "epoch": 1.7318421091004534, + "grad_norm": 6.09375, + "learning_rate": 4.639517551612066e-07, + "loss": 0.74613075, + "memory(GiB)": 138.1, + "step": 74230, + "train_speed(iter/s)": 0.200803 + }, + { + "acc": 0.78975163, + "epoch": 1.7320754166727423, + "grad_norm": 5.65625, + "learning_rate": 4.63157369722948e-07, + "loss": 0.74640374, + "memory(GiB)": 138.1, + "step": 74240, + "train_speed(iter/s)": 0.200817 + }, + { + "acc": 0.78697009, + "epoch": 1.7323087242450312, + "grad_norm": 4.96875, + "learning_rate": 4.6236363191765666e-07, + "loss": 0.76076813, + "memory(GiB)": 138.1, + "step": 74250, + "train_speed(iter/s)": 0.200831 + }, + { + "acc": 0.774368, + "epoch": 1.7325420318173201, + "grad_norm": 6.96875, + "learning_rate": 4.615705418586391e-07, + "loss": 0.84603786, + "memory(GiB)": 138.1, + "step": 74260, + "train_speed(iter/s)": 0.200845 + }, + { + "acc": 0.78506193, + "epoch": 1.732775339389609, + "grad_norm": 6.28125, + "learning_rate": 4.6077809965910716e-07, + "loss": 0.76752243, + "memory(GiB)": 138.1, + "step": 74270, + "train_speed(iter/s)": 0.200859 + }, + { + "acc": 0.77089567, + "epoch": 1.733008646961898, + "grad_norm": 4.875, + "learning_rate": 4.59986305432184e-07, + "loss": 0.81474075, + "memory(GiB)": 138.1, + "step": 74280, + "train_speed(iter/s)": 0.200873 + }, + { + "acc": 0.7874424, + "epoch": 1.7332419545341868, + "grad_norm": 6.3125, + "learning_rate": 4.591951592908972e-07, + "loss": 0.76511889, + "memory(GiB)": 138.1, + "step": 74290, + "train_speed(iter/s)": 0.200887 + }, + { + "acc": 0.77571363, + "epoch": 1.7334752621064757, + "grad_norm": 4.1875, + "learning_rate": 4.5840466134818184e-07, + "loss": 0.8134244, + "memory(GiB)": 138.1, + "step": 74300, + "train_speed(iter/s)": 0.200901 + }, + { + "acc": 0.77631946, + "epoch": 1.7337085696787646, + "grad_norm": 5.90625, + "learning_rate": 4.576148117168816e-07, + "loss": 0.78171606, + "memory(GiB)": 138.1, + "step": 74310, + "train_speed(iter/s)": 0.200915 + }, + { + "acc": 0.79241109, + "epoch": 1.7339418772510535, + "grad_norm": 8.0, + "learning_rate": 4.568256105097468e-07, + "loss": 0.72888813, + "memory(GiB)": 138.1, + "step": 74320, + "train_speed(iter/s)": 0.200929 + }, + { + "acc": 0.78169894, + "epoch": 1.7341751848233424, + "grad_norm": 4.9375, + "learning_rate": 4.560370578394374e-07, + "loss": 0.76154728, + "memory(GiB)": 138.1, + "step": 74330, + "train_speed(iter/s)": 0.200943 + }, + { + "acc": 0.79366865, + "epoch": 1.7344084923956313, + "grad_norm": 5.875, + "learning_rate": 4.5524915381851663e-07, + "loss": 0.74610434, + "memory(GiB)": 138.1, + "step": 74340, + "train_speed(iter/s)": 0.200956 + }, + { + "acc": 0.78711934, + "epoch": 1.7346417999679202, + "grad_norm": 7.71875, + "learning_rate": 4.544618985594601e-07, + "loss": 0.75808601, + "memory(GiB)": 138.1, + "step": 74350, + "train_speed(iter/s)": 0.200971 + }, + { + "acc": 0.7786828, + "epoch": 1.7348751075402091, + "grad_norm": 5.40625, + "learning_rate": 4.5367529217464733e-07, + "loss": 0.78535357, + "memory(GiB)": 138.1, + "step": 74360, + "train_speed(iter/s)": 0.200986 + }, + { + "acc": 0.78281074, + "epoch": 1.735108415112498, + "grad_norm": 5.78125, + "learning_rate": 4.5288933477636466e-07, + "loss": 0.77032428, + "memory(GiB)": 138.1, + "step": 74370, + "train_speed(iter/s)": 0.200999 + }, + { + "acc": 0.77319117, + "epoch": 1.735341722684787, + "grad_norm": 5.96875, + "learning_rate": 4.5210402647680895e-07, + "loss": 0.81330729, + "memory(GiB)": 138.1, + "step": 74380, + "train_speed(iter/s)": 0.201013 + }, + { + "acc": 0.78810053, + "epoch": 1.7355750302570758, + "grad_norm": 4.8125, + "learning_rate": 4.5131936738808225e-07, + "loss": 0.76743989, + "memory(GiB)": 138.1, + "step": 74390, + "train_speed(iter/s)": 0.201028 + }, + { + "acc": 0.79243145, + "epoch": 1.7358083378293647, + "grad_norm": 6.375, + "learning_rate": 4.5053535762219494e-07, + "loss": 0.74746494, + "memory(GiB)": 138.1, + "step": 74400, + "train_speed(iter/s)": 0.201042 + }, + { + "acc": 0.7838623, + "epoch": 1.7360416454016536, + "grad_norm": 5.4375, + "learning_rate": 4.4975199729106355e-07, + "loss": 0.78794641, + "memory(GiB)": 138.1, + "step": 74410, + "train_speed(iter/s)": 0.201056 + }, + { + "acc": 0.8007185, + "epoch": 1.7362749529739423, + "grad_norm": 6.4375, + "learning_rate": 4.489692865065126e-07, + "loss": 0.72447138, + "memory(GiB)": 138.1, + "step": 74420, + "train_speed(iter/s)": 0.201071 + }, + { + "acc": 0.78932304, + "epoch": 1.7365082605462314, + "grad_norm": 4.125, + "learning_rate": 4.4818722538027326e-07, + "loss": 0.75672827, + "memory(GiB)": 138.1, + "step": 74430, + "train_speed(iter/s)": 0.201086 + }, + { + "acc": 0.78215332, + "epoch": 1.73674156811852, + "grad_norm": 4.75, + "learning_rate": 4.474058140239862e-07, + "loss": 0.77884374, + "memory(GiB)": 138.1, + "step": 74440, + "train_speed(iter/s)": 0.2011 + }, + { + "acc": 0.78150392, + "epoch": 1.7369748756908092, + "grad_norm": 6.09375, + "learning_rate": 4.4662505254919665e-07, + "loss": 0.75879669, + "memory(GiB)": 138.1, + "step": 74450, + "train_speed(iter/s)": 0.201114 + }, + { + "acc": 0.79613762, + "epoch": 1.737208183263098, + "grad_norm": 5.8125, + "learning_rate": 4.4584494106735707e-07, + "loss": 0.73280921, + "memory(GiB)": 138.1, + "step": 74460, + "train_speed(iter/s)": 0.201128 + }, + { + "acc": 0.78308897, + "epoch": 1.737441490835387, + "grad_norm": 5.84375, + "learning_rate": 4.4506547968983016e-07, + "loss": 0.78369331, + "memory(GiB)": 138.1, + "step": 74470, + "train_speed(iter/s)": 0.201143 + }, + { + "acc": 0.77581263, + "epoch": 1.7376747984076757, + "grad_norm": 8.125, + "learning_rate": 4.4428666852788126e-07, + "loss": 0.80782318, + "memory(GiB)": 138.1, + "step": 74480, + "train_speed(iter/s)": 0.201157 + }, + { + "acc": 0.76641207, + "epoch": 1.7379081059799648, + "grad_norm": 5.375, + "learning_rate": 4.4350850769268874e-07, + "loss": 0.84937601, + "memory(GiB)": 138.1, + "step": 74490, + "train_speed(iter/s)": 0.20117 + }, + { + "acc": 0.78670392, + "epoch": 1.7381414135522535, + "grad_norm": 4.65625, + "learning_rate": 4.4273099729533255e-07, + "loss": 0.76552391, + "memory(GiB)": 138.1, + "step": 74500, + "train_speed(iter/s)": 0.201183 + }, + { + "epoch": 1.7381414135522535, + "eval_acc": 0.7446762852495471, + "eval_loss": 0.804404079914093, + "eval_runtime": 1271.2914, + "eval_samples_per_second": 28.311, + "eval_steps_per_second": 14.156, + "step": 74500 + }, + { + "acc": 0.7867198, + "epoch": 1.7383747211245426, + "grad_norm": 4.5, + "learning_rate": 4.419541374468023e-07, + "loss": 0.76824493, + "memory(GiB)": 138.1, + "step": 74510, + "train_speed(iter/s)": 0.200496 + }, + { + "acc": 0.77137146, + "epoch": 1.7386080286968313, + "grad_norm": 5.75, + "learning_rate": 4.411779282579959e-07, + "loss": 0.82607307, + "memory(GiB)": 138.1, + "step": 74520, + "train_speed(iter/s)": 0.200509 + }, + { + "acc": 0.78419704, + "epoch": 1.7388413362691204, + "grad_norm": 4.25, + "learning_rate": 4.4040236983971476e-07, + "loss": 0.76362429, + "memory(GiB)": 138.1, + "step": 74530, + "train_speed(iter/s)": 0.200523 + }, + { + "acc": 0.78158565, + "epoch": 1.739074643841409, + "grad_norm": 4.8125, + "learning_rate": 4.3962746230267084e-07, + "loss": 0.77455659, + "memory(GiB)": 138.1, + "step": 74540, + "train_speed(iter/s)": 0.200537 + }, + { + "acc": 0.7929049, + "epoch": 1.7393079514136982, + "grad_norm": 6.9375, + "learning_rate": 4.388532057574818e-07, + "loss": 0.73814707, + "memory(GiB)": 138.1, + "step": 74550, + "train_speed(iter/s)": 0.200551 + }, + { + "acc": 0.79585075, + "epoch": 1.7395412589859869, + "grad_norm": 4.21875, + "learning_rate": 4.380796003146731e-07, + "loss": 0.71769018, + "memory(GiB)": 138.1, + "step": 74560, + "train_speed(iter/s)": 0.200565 + }, + { + "acc": 0.76547136, + "epoch": 1.739774566558276, + "grad_norm": 4.71875, + "learning_rate": 4.3730664608467534e-07, + "loss": 0.81900177, + "memory(GiB)": 138.1, + "step": 74570, + "train_speed(iter/s)": 0.200579 + }, + { + "acc": 0.77250733, + "epoch": 1.7400078741305647, + "grad_norm": 10.375, + "learning_rate": 4.3653434317782905e-07, + "loss": 0.79657941, + "memory(GiB)": 138.1, + "step": 74580, + "train_speed(iter/s)": 0.200593 + }, + { + "acc": 0.77615132, + "epoch": 1.7402411817028538, + "grad_norm": 5.3125, + "learning_rate": 4.3576269170438e-07, + "loss": 0.81013956, + "memory(GiB)": 138.1, + "step": 74590, + "train_speed(iter/s)": 0.200607 + }, + { + "acc": 0.80237694, + "epoch": 1.7404744892751425, + "grad_norm": 5.53125, + "learning_rate": 4.349916917744801e-07, + "loss": 0.69212132, + "memory(GiB)": 138.1, + "step": 74600, + "train_speed(iter/s)": 0.200621 + }, + { + "acc": 0.77684078, + "epoch": 1.7407077968474314, + "grad_norm": 5.1875, + "learning_rate": 4.3422134349819014e-07, + "loss": 0.82745504, + "memory(GiB)": 138.1, + "step": 74610, + "train_speed(iter/s)": 0.200635 + }, + { + "acc": 0.7814332, + "epoch": 1.7409411044197203, + "grad_norm": 6.59375, + "learning_rate": 4.334516469854766e-07, + "loss": 0.75724936, + "memory(GiB)": 138.1, + "step": 74620, + "train_speed(iter/s)": 0.200649 + }, + { + "acc": 0.76991339, + "epoch": 1.7411744119920092, + "grad_norm": 7.21875, + "learning_rate": 4.3268260234621497e-07, + "loss": 0.82924404, + "memory(GiB)": 138.1, + "step": 74630, + "train_speed(iter/s)": 0.200663 + }, + { + "acc": 0.78513436, + "epoch": 1.741407719564298, + "grad_norm": 6.34375, + "learning_rate": 4.319142096901846e-07, + "loss": 0.76789331, + "memory(GiB)": 138.1, + "step": 74640, + "train_speed(iter/s)": 0.200677 + }, + { + "acc": 0.77668796, + "epoch": 1.741641027136587, + "grad_norm": 5.15625, + "learning_rate": 4.3114646912707394e-07, + "loss": 0.8108284, + "memory(GiB)": 138.1, + "step": 74650, + "train_speed(iter/s)": 0.200691 + }, + { + "acc": 0.7892621, + "epoch": 1.7418743347088759, + "grad_norm": 5.0625, + "learning_rate": 4.30379380766478e-07, + "loss": 0.76283331, + "memory(GiB)": 138.1, + "step": 74660, + "train_speed(iter/s)": 0.200705 + }, + { + "acc": 0.80962782, + "epoch": 1.7421076422811648, + "grad_norm": 4.96875, + "learning_rate": 4.296129447178965e-07, + "loss": 0.68484173, + "memory(GiB)": 138.1, + "step": 74670, + "train_speed(iter/s)": 0.200718 + }, + { + "acc": 0.78058996, + "epoch": 1.7423409498534537, + "grad_norm": 5.53125, + "learning_rate": 4.288471610907402e-07, + "loss": 0.80650549, + "memory(GiB)": 138.1, + "step": 74680, + "train_speed(iter/s)": 0.200732 + }, + { + "acc": 0.77258387, + "epoch": 1.7425742574257426, + "grad_norm": 7.625, + "learning_rate": 4.2808202999432335e-07, + "loss": 0.80087252, + "memory(GiB)": 138.1, + "step": 74690, + "train_speed(iter/s)": 0.200746 + }, + { + "acc": 0.77971539, + "epoch": 1.7428075649980315, + "grad_norm": 4.90625, + "learning_rate": 4.2731755153786915e-07, + "loss": 0.79446883, + "memory(GiB)": 138.1, + "step": 74700, + "train_speed(iter/s)": 0.200761 + }, + { + "acc": 0.78136635, + "epoch": 1.7430408725703204, + "grad_norm": 4.4375, + "learning_rate": 4.2655372583050525e-07, + "loss": 0.78863945, + "memory(GiB)": 138.1, + "step": 74710, + "train_speed(iter/s)": 0.200775 + }, + { + "acc": 0.78437352, + "epoch": 1.7432741801426093, + "grad_norm": 4.15625, + "learning_rate": 4.2579055298126945e-07, + "loss": 0.7832253, + "memory(GiB)": 138.1, + "step": 74720, + "train_speed(iter/s)": 0.20079 + }, + { + "acc": 0.77852464, + "epoch": 1.7435074877148982, + "grad_norm": 5.1875, + "learning_rate": 4.2502803309910354e-07, + "loss": 0.79541359, + "memory(GiB)": 138.1, + "step": 74730, + "train_speed(iter/s)": 0.200804 + }, + { + "acc": 0.77891903, + "epoch": 1.743740795287187, + "grad_norm": 5.28125, + "learning_rate": 4.2426616629285544e-07, + "loss": 0.79765043, + "memory(GiB)": 138.1, + "step": 74740, + "train_speed(iter/s)": 0.200818 + }, + { + "acc": 0.76886244, + "epoch": 1.743974102859476, + "grad_norm": 29.25, + "learning_rate": 4.235049526712848e-07, + "loss": 0.8290019, + "memory(GiB)": 138.1, + "step": 74750, + "train_speed(iter/s)": 0.200833 + }, + { + "acc": 0.78655448, + "epoch": 1.7442074104317649, + "grad_norm": 14.1875, + "learning_rate": 4.227443923430513e-07, + "loss": 0.75835438, + "memory(GiB)": 138.1, + "step": 74760, + "train_speed(iter/s)": 0.200847 + }, + { + "acc": 0.78014526, + "epoch": 1.7444407180040538, + "grad_norm": 6.5, + "learning_rate": 4.2198448541672654e-07, + "loss": 0.7994524, + "memory(GiB)": 138.1, + "step": 74770, + "train_speed(iter/s)": 0.200861 + }, + { + "acc": 0.77601213, + "epoch": 1.7446740255763427, + "grad_norm": 5.1875, + "learning_rate": 4.212252320007859e-07, + "loss": 0.83280792, + "memory(GiB)": 138.1, + "step": 74780, + "train_speed(iter/s)": 0.200875 + }, + { + "acc": 0.79946394, + "epoch": 1.7449073331486316, + "grad_norm": 4.96875, + "learning_rate": 4.204666322036138e-07, + "loss": 0.70037394, + "memory(GiB)": 138.1, + "step": 74790, + "train_speed(iter/s)": 0.200889 + }, + { + "acc": 0.78642559, + "epoch": 1.7451406407209205, + "grad_norm": 6.40625, + "learning_rate": 4.197086861334998e-07, + "loss": 0.78597708, + "memory(GiB)": 138.1, + "step": 74800, + "train_speed(iter/s)": 0.200903 + }, + { + "acc": 0.76866412, + "epoch": 1.7453739482932091, + "grad_norm": 5.5625, + "learning_rate": 4.189513938986395e-07, + "loss": 0.82813854, + "memory(GiB)": 138.1, + "step": 74810, + "train_speed(iter/s)": 0.200917 + }, + { + "acc": 0.77864561, + "epoch": 1.7456072558654983, + "grad_norm": 7.96875, + "learning_rate": 4.181947556071381e-07, + "loss": 0.79182148, + "memory(GiB)": 138.1, + "step": 74820, + "train_speed(iter/s)": 0.200931 + }, + { + "acc": 0.77730217, + "epoch": 1.745840563437787, + "grad_norm": 6.6875, + "learning_rate": 4.1743877136700307e-07, + "loss": 0.80669651, + "memory(GiB)": 138.1, + "step": 74830, + "train_speed(iter/s)": 0.200944 + }, + { + "acc": 0.78707137, + "epoch": 1.746073871010076, + "grad_norm": 5.8125, + "learning_rate": 4.1668344128615254e-07, + "loss": 0.77385092, + "memory(GiB)": 138.1, + "step": 74840, + "train_speed(iter/s)": 0.200957 + }, + { + "acc": 0.79038467, + "epoch": 1.7463071785823647, + "grad_norm": 6.09375, + "learning_rate": 4.1592876547241035e-07, + "loss": 0.74476175, + "memory(GiB)": 138.1, + "step": 74850, + "train_speed(iter/s)": 0.200971 + }, + { + "acc": 0.78900266, + "epoch": 1.7465404861546538, + "grad_norm": 5.34375, + "learning_rate": 4.151747440335047e-07, + "loss": 0.74805627, + "memory(GiB)": 138.1, + "step": 74860, + "train_speed(iter/s)": 0.200984 + }, + { + "acc": 0.77410288, + "epoch": 1.7467737937269425, + "grad_norm": 4.40625, + "learning_rate": 4.144213770770711e-07, + "loss": 0.82338734, + "memory(GiB)": 138.1, + "step": 74870, + "train_speed(iter/s)": 0.200996 + }, + { + "acc": 0.7824645, + "epoch": 1.7470071012992316, + "grad_norm": 4.5, + "learning_rate": 4.1366866471065524e-07, + "loss": 0.77526579, + "memory(GiB)": 138.1, + "step": 74880, + "train_speed(iter/s)": 0.20101 + }, + { + "acc": 0.76652699, + "epoch": 1.7472404088715203, + "grad_norm": 4.90625, + "learning_rate": 4.129166070417051e-07, + "loss": 0.85960999, + "memory(GiB)": 138.1, + "step": 74890, + "train_speed(iter/s)": 0.201024 + }, + { + "acc": 0.78324442, + "epoch": 1.7474737164438094, + "grad_norm": 6.5625, + "learning_rate": 4.121652041775759e-07, + "loss": 0.75736356, + "memory(GiB)": 138.1, + "step": 74900, + "train_speed(iter/s)": 0.201038 + }, + { + "acc": 0.78140106, + "epoch": 1.7477070240160981, + "grad_norm": 5.78125, + "learning_rate": 4.1141445622553175e-07, + "loss": 0.80148916, + "memory(GiB)": 138.1, + "step": 74910, + "train_speed(iter/s)": 0.201053 + }, + { + "acc": 0.79169626, + "epoch": 1.7479403315883872, + "grad_norm": 5.59375, + "learning_rate": 4.106643632927404e-07, + "loss": 0.7412683, + "memory(GiB)": 138.1, + "step": 74920, + "train_speed(iter/s)": 0.201066 + }, + { + "acc": 0.77393618, + "epoch": 1.748173639160676, + "grad_norm": 6.71875, + "learning_rate": 4.099149254862783e-07, + "loss": 0.78559313, + "memory(GiB)": 138.1, + "step": 74930, + "train_speed(iter/s)": 0.20108 + }, + { + "acc": 0.78671503, + "epoch": 1.748406946732965, + "grad_norm": 7.40625, + "learning_rate": 4.091661429131277e-07, + "loss": 0.77732067, + "memory(GiB)": 138.1, + "step": 74940, + "train_speed(iter/s)": 0.201095 + }, + { + "acc": 0.76118011, + "epoch": 1.7486402543052537, + "grad_norm": 5.65625, + "learning_rate": 4.0841801568017534e-07, + "loss": 0.86246977, + "memory(GiB)": 138.1, + "step": 74950, + "train_speed(iter/s)": 0.201108 + }, + { + "acc": 0.76429033, + "epoch": 1.7488735618775428, + "grad_norm": 5.375, + "learning_rate": 4.0767054389421857e-07, + "loss": 0.85855999, + "memory(GiB)": 138.1, + "step": 74960, + "train_speed(iter/s)": 0.201121 + }, + { + "acc": 0.78033648, + "epoch": 1.7491068694498315, + "grad_norm": 4.15625, + "learning_rate": 4.069237276619564e-07, + "loss": 0.76560135, + "memory(GiB)": 138.1, + "step": 74970, + "train_speed(iter/s)": 0.201134 + }, + { + "acc": 0.76945028, + "epoch": 1.7493401770221206, + "grad_norm": 4.71875, + "learning_rate": 4.0617756708999976e-07, + "loss": 0.82791023, + "memory(GiB)": 138.1, + "step": 74980, + "train_speed(iter/s)": 0.201148 + }, + { + "acc": 0.76738582, + "epoch": 1.7495734845944093, + "grad_norm": 5.46875, + "learning_rate": 4.05432062284859e-07, + "loss": 0.834793, + "memory(GiB)": 138.1, + "step": 74990, + "train_speed(iter/s)": 0.201161 + }, + { + "acc": 0.78293915, + "epoch": 1.7498067921666982, + "grad_norm": 6.5, + "learning_rate": 4.0468721335295726e-07, + "loss": 0.78276381, + "memory(GiB)": 138.1, + "step": 75000, + "train_speed(iter/s)": 0.201175 + }, + { + "epoch": 1.7498067921666982, + "eval_acc": 0.7446785272687994, + "eval_loss": 0.8044347763061523, + "eval_runtime": 1272.0014, + "eval_samples_per_second": 28.295, + "eval_steps_per_second": 14.148, + "step": 75000 + }, + { + "acc": 0.77675529, + "epoch": 1.750040099738987, + "grad_norm": 5.9375, + "learning_rate": 4.0394302040062117e-07, + "loss": 0.78634806, + "memory(GiB)": 138.1, + "step": 75010, + "train_speed(iter/s)": 0.200492 + }, + { + "acc": 0.79533587, + "epoch": 1.750273407311276, + "grad_norm": 4.46875, + "learning_rate": 4.03199483534083e-07, + "loss": 0.73570852, + "memory(GiB)": 138.1, + "step": 75020, + "train_speed(iter/s)": 0.200505 + }, + { + "acc": 0.78597369, + "epoch": 1.750506714883565, + "grad_norm": 6.53125, + "learning_rate": 4.0245660285948394e-07, + "loss": 0.75488353, + "memory(GiB)": 138.1, + "step": 75030, + "train_speed(iter/s)": 0.200519 + }, + { + "acc": 0.8074131, + "epoch": 1.7507400224558538, + "grad_norm": 11.6875, + "learning_rate": 4.0171437848286867e-07, + "loss": 0.698316, + "memory(GiB)": 138.1, + "step": 75040, + "train_speed(iter/s)": 0.200532 + }, + { + "acc": 0.77187395, + "epoch": 1.7509733300281427, + "grad_norm": 6.53125, + "learning_rate": 4.009728105101901e-07, + "loss": 0.81652546, + "memory(GiB)": 138.1, + "step": 75050, + "train_speed(iter/s)": 0.200547 + }, + { + "acc": 0.77719297, + "epoch": 1.7512066376004316, + "grad_norm": 6.0, + "learning_rate": 4.00231899047307e-07, + "loss": 0.8079525, + "memory(GiB)": 138.1, + "step": 75060, + "train_speed(iter/s)": 0.20056 + }, + { + "acc": 0.77472291, + "epoch": 1.7514399451727205, + "grad_norm": 6.15625, + "learning_rate": 3.994916441999841e-07, + "loss": 0.80752106, + "memory(GiB)": 138.1, + "step": 75070, + "train_speed(iter/s)": 0.200573 + }, + { + "acc": 0.79288235, + "epoch": 1.7516732527450094, + "grad_norm": 6.0625, + "learning_rate": 3.9875204607389304e-07, + "loss": 0.76388402, + "memory(GiB)": 138.1, + "step": 75080, + "train_speed(iter/s)": 0.200587 + }, + { + "acc": 0.77766056, + "epoch": 1.7519065603172983, + "grad_norm": 4.25, + "learning_rate": 3.980131047746105e-07, + "loss": 0.80273018, + "memory(GiB)": 138.1, + "step": 75090, + "train_speed(iter/s)": 0.200601 + }, + { + "acc": 0.78243771, + "epoch": 1.7521398678895872, + "grad_norm": 4.875, + "learning_rate": 3.9727482040762044e-07, + "loss": 0.77541404, + "memory(GiB)": 138.1, + "step": 75100, + "train_speed(iter/s)": 0.200615 + }, + { + "acc": 0.78391609, + "epoch": 1.752373175461876, + "grad_norm": 5.59375, + "learning_rate": 3.965371930783113e-07, + "loss": 0.78062081, + "memory(GiB)": 138.1, + "step": 75110, + "train_speed(iter/s)": 0.200628 + }, + { + "acc": 0.79337425, + "epoch": 1.752606483034165, + "grad_norm": 5.0, + "learning_rate": 3.958002228919822e-07, + "loss": 0.73935184, + "memory(GiB)": 138.1, + "step": 75120, + "train_speed(iter/s)": 0.200642 + }, + { + "acc": 0.80228825, + "epoch": 1.752839790606454, + "grad_norm": 3.96875, + "learning_rate": 3.9506390995383225e-07, + "loss": 0.70532112, + "memory(GiB)": 138.1, + "step": 75130, + "train_speed(iter/s)": 0.200656 + }, + { + "acc": 0.79691057, + "epoch": 1.7530730981787428, + "grad_norm": 5.5, + "learning_rate": 3.943282543689725e-07, + "loss": 0.73715506, + "memory(GiB)": 138.1, + "step": 75140, + "train_speed(iter/s)": 0.20067 + }, + { + "acc": 0.78184276, + "epoch": 1.7533064057510317, + "grad_norm": 5.0625, + "learning_rate": 3.935932562424166e-07, + "loss": 0.78103657, + "memory(GiB)": 138.1, + "step": 75150, + "train_speed(iter/s)": 0.200681 + }, + { + "acc": 0.78166604, + "epoch": 1.7535397133233206, + "grad_norm": 4.6875, + "learning_rate": 3.9285891567908465e-07, + "loss": 0.78287544, + "memory(GiB)": 138.1, + "step": 75160, + "train_speed(iter/s)": 0.200695 + }, + { + "acc": 0.78169422, + "epoch": 1.7537730208956095, + "grad_norm": 6.28125, + "learning_rate": 3.9212523278380434e-07, + "loss": 0.75878248, + "memory(GiB)": 138.1, + "step": 75170, + "train_speed(iter/s)": 0.200707 + }, + { + "acc": 0.78662972, + "epoch": 1.7540063284678984, + "grad_norm": 7.4375, + "learning_rate": 3.9139220766130803e-07, + "loss": 0.75259981, + "memory(GiB)": 138.1, + "step": 75180, + "train_speed(iter/s)": 0.20072 + }, + { + "acc": 0.77813616, + "epoch": 1.7542396360401873, + "grad_norm": 9.8125, + "learning_rate": 3.9065984041623594e-07, + "loss": 0.7942369, + "memory(GiB)": 138.1, + "step": 75190, + "train_speed(iter/s)": 0.200733 + }, + { + "acc": 0.77014294, + "epoch": 1.754472943612476, + "grad_norm": 4.09375, + "learning_rate": 3.8992813115313164e-07, + "loss": 0.83160582, + "memory(GiB)": 138.1, + "step": 75200, + "train_speed(iter/s)": 0.200747 + }, + { + "acc": 0.77763653, + "epoch": 1.754706251184765, + "grad_norm": 5.28125, + "learning_rate": 3.8919707997644884e-07, + "loss": 0.8163887, + "memory(GiB)": 138.1, + "step": 75210, + "train_speed(iter/s)": 0.200761 + }, + { + "acc": 0.77771187, + "epoch": 1.7549395587570538, + "grad_norm": 4.6875, + "learning_rate": 3.8846668699054233e-07, + "loss": 0.80659122, + "memory(GiB)": 138.1, + "step": 75220, + "train_speed(iter/s)": 0.200774 + }, + { + "acc": 0.78318548, + "epoch": 1.7551728663293429, + "grad_norm": 5.90625, + "learning_rate": 3.877369522996771e-07, + "loss": 0.80368767, + "memory(GiB)": 138.1, + "step": 75230, + "train_speed(iter/s)": 0.200787 + }, + { + "acc": 0.7785296, + "epoch": 1.7554061739016316, + "grad_norm": 4.625, + "learning_rate": 3.8700787600802203e-07, + "loss": 0.78689766, + "memory(GiB)": 138.1, + "step": 75240, + "train_speed(iter/s)": 0.200801 + }, + { + "acc": 0.77669411, + "epoch": 1.7556394814739207, + "grad_norm": 6.90625, + "learning_rate": 3.862794582196522e-07, + "loss": 0.8024128, + "memory(GiB)": 138.1, + "step": 75250, + "train_speed(iter/s)": 0.200815 + }, + { + "acc": 0.77892799, + "epoch": 1.7558727890462094, + "grad_norm": 5.96875, + "learning_rate": 3.8555169903854993e-07, + "loss": 0.7945653, + "memory(GiB)": 138.1, + "step": 75260, + "train_speed(iter/s)": 0.200829 + }, + { + "acc": 0.79608464, + "epoch": 1.7561060966184985, + "grad_norm": 6.03125, + "learning_rate": 3.848245985686011e-07, + "loss": 0.72459307, + "memory(GiB)": 138.1, + "step": 75270, + "train_speed(iter/s)": 0.200843 + }, + { + "acc": 0.77168674, + "epoch": 1.7563394041907872, + "grad_norm": 5.96875, + "learning_rate": 3.84098156913601e-07, + "loss": 0.82376051, + "memory(GiB)": 138.1, + "step": 75280, + "train_speed(iter/s)": 0.200857 + }, + { + "acc": 0.79666181, + "epoch": 1.7565727117630763, + "grad_norm": 6.6875, + "learning_rate": 3.8337237417724827e-07, + "loss": 0.72730808, + "memory(GiB)": 138.1, + "step": 75290, + "train_speed(iter/s)": 0.200871 + }, + { + "acc": 0.8078371, + "epoch": 1.756806019335365, + "grad_norm": 5.8125, + "learning_rate": 3.826472504631473e-07, + "loss": 0.68620291, + "memory(GiB)": 138.1, + "step": 75300, + "train_speed(iter/s)": 0.200884 + }, + { + "acc": 0.78211536, + "epoch": 1.757039326907654, + "grad_norm": 6.28125, + "learning_rate": 3.8192278587481035e-07, + "loss": 0.77496066, + "memory(GiB)": 138.1, + "step": 75310, + "train_speed(iter/s)": 0.200898 + }, + { + "acc": 0.77927465, + "epoch": 1.7572726344799428, + "grad_norm": 5.21875, + "learning_rate": 3.811989805156546e-07, + "loss": 0.80956287, + "memory(GiB)": 138.1, + "step": 75320, + "train_speed(iter/s)": 0.200912 + }, + { + "acc": 0.79260778, + "epoch": 1.7575059420522319, + "grad_norm": 7.46875, + "learning_rate": 3.804758344890025e-07, + "loss": 0.74879017, + "memory(GiB)": 138.1, + "step": 75330, + "train_speed(iter/s)": 0.200926 + }, + { + "acc": 0.78099527, + "epoch": 1.7577392496245205, + "grad_norm": 5.75, + "learning_rate": 3.7975334789808194e-07, + "loss": 0.78341789, + "memory(GiB)": 138.1, + "step": 75340, + "train_speed(iter/s)": 0.20094 + }, + { + "acc": 0.77331486, + "epoch": 1.7579725571968097, + "grad_norm": 4.84375, + "learning_rate": 3.790315208460299e-07, + "loss": 0.82067747, + "memory(GiB)": 138.1, + "step": 75350, + "train_speed(iter/s)": 0.200953 + }, + { + "acc": 0.77612276, + "epoch": 1.7582058647690983, + "grad_norm": 5.25, + "learning_rate": 3.783103534358845e-07, + "loss": 0.79735045, + "memory(GiB)": 138.1, + "step": 75360, + "train_speed(iter/s)": 0.200967 + }, + { + "acc": 0.77384906, + "epoch": 1.7584391723413875, + "grad_norm": 6.6875, + "learning_rate": 3.775898457705951e-07, + "loss": 0.81172314, + "memory(GiB)": 138.1, + "step": 75370, + "train_speed(iter/s)": 0.200981 + }, + { + "acc": 0.77728834, + "epoch": 1.7586724799136761, + "grad_norm": 5.8125, + "learning_rate": 3.768699979530122e-07, + "loss": 0.77499533, + "memory(GiB)": 138.1, + "step": 75380, + "train_speed(iter/s)": 0.200994 + }, + { + "acc": 0.78484507, + "epoch": 1.758905787485965, + "grad_norm": 4.65625, + "learning_rate": 3.761508100858929e-07, + "loss": 0.7881773, + "memory(GiB)": 138.1, + "step": 75390, + "train_speed(iter/s)": 0.201007 + }, + { + "acc": 0.76613727, + "epoch": 1.759139095058254, + "grad_norm": 7.375, + "learning_rate": 3.7543228227190307e-07, + "loss": 0.84683084, + "memory(GiB)": 138.1, + "step": 75400, + "train_speed(iter/s)": 0.201021 + }, + { + "acc": 0.80847015, + "epoch": 1.7593724026305428, + "grad_norm": 4.125, + "learning_rate": 3.747144146136111e-07, + "loss": 0.67406015, + "memory(GiB)": 138.1, + "step": 75410, + "train_speed(iter/s)": 0.201035 + }, + { + "acc": 0.79223614, + "epoch": 1.7596057102028317, + "grad_norm": 3.734375, + "learning_rate": 3.739972072134934e-07, + "loss": 0.75303974, + "memory(GiB)": 138.1, + "step": 75420, + "train_speed(iter/s)": 0.201048 + }, + { + "acc": 0.7955615, + "epoch": 1.7598390177751206, + "grad_norm": 5.40625, + "learning_rate": 3.7328066017393025e-07, + "loss": 0.72555103, + "memory(GiB)": 138.1, + "step": 75430, + "train_speed(iter/s)": 0.201061 + }, + { + "acc": 0.7834197, + "epoch": 1.7600723253474095, + "grad_norm": 6.5625, + "learning_rate": 3.725647735972093e-07, + "loss": 0.77190681, + "memory(GiB)": 138.1, + "step": 75440, + "train_speed(iter/s)": 0.201075 + }, + { + "acc": 0.78522491, + "epoch": 1.7603056329196984, + "grad_norm": 3.6875, + "learning_rate": 3.7184954758552215e-07, + "loss": 0.75268602, + "memory(GiB)": 138.1, + "step": 75450, + "train_speed(iter/s)": 0.201089 + }, + { + "acc": 0.77963495, + "epoch": 1.7605389404919873, + "grad_norm": 5.65625, + "learning_rate": 3.711349822409671e-07, + "loss": 0.78821554, + "memory(GiB)": 138.1, + "step": 75460, + "train_speed(iter/s)": 0.201103 + }, + { + "acc": 0.77956514, + "epoch": 1.7607722480642762, + "grad_norm": 5.875, + "learning_rate": 3.7042107766554925e-07, + "loss": 0.78396311, + "memory(GiB)": 138.1, + "step": 75470, + "train_speed(iter/s)": 0.201115 + }, + { + "acc": 0.79036522, + "epoch": 1.7610055556365651, + "grad_norm": 5.40625, + "learning_rate": 3.6970783396117706e-07, + "loss": 0.77081327, + "memory(GiB)": 138.1, + "step": 75480, + "train_speed(iter/s)": 0.201128 + }, + { + "acc": 0.77387805, + "epoch": 1.761238863208854, + "grad_norm": 6.625, + "learning_rate": 3.689952512296674e-07, + "loss": 0.83468933, + "memory(GiB)": 138.1, + "step": 75490, + "train_speed(iter/s)": 0.201142 + }, + { + "acc": 0.78259726, + "epoch": 1.761472170781143, + "grad_norm": 5.53125, + "learning_rate": 3.682833295727389e-07, + "loss": 0.76884212, + "memory(GiB)": 138.1, + "step": 75500, + "train_speed(iter/s)": 0.201156 + }, + { + "epoch": 1.761472170781143, + "eval_acc": 0.7446898975092928, + "eval_loss": 0.8043956756591797, + "eval_runtime": 1270.8807, + "eval_samples_per_second": 28.32, + "eval_steps_per_second": 14.16, + "step": 75500 + }, + { + "acc": 0.775494, + "epoch": 1.7617054783534318, + "grad_norm": 3.890625, + "learning_rate": 3.675720690920209e-07, + "loss": 0.7966898, + "memory(GiB)": 138.1, + "step": 75510, + "train_speed(iter/s)": 0.200477 + }, + { + "acc": 0.77051344, + "epoch": 1.7619387859257207, + "grad_norm": 4.96875, + "learning_rate": 3.668614698890444e-07, + "loss": 0.8196578, + "memory(GiB)": 138.1, + "step": 75520, + "train_speed(iter/s)": 0.200491 + }, + { + "acc": 0.77491951, + "epoch": 1.7621720934980096, + "grad_norm": 4.5, + "learning_rate": 3.661515320652459e-07, + "loss": 0.82127848, + "memory(GiB)": 138.1, + "step": 75530, + "train_speed(iter/s)": 0.200505 + }, + { + "acc": 0.80416489, + "epoch": 1.7624054010702985, + "grad_norm": 5.28125, + "learning_rate": 3.654422557219711e-07, + "loss": 0.6906394, + "memory(GiB)": 138.1, + "step": 75540, + "train_speed(iter/s)": 0.200519 + }, + { + "acc": 0.76411433, + "epoch": 1.7626387086425874, + "grad_norm": 5.46875, + "learning_rate": 3.6473364096046795e-07, + "loss": 0.85491076, + "memory(GiB)": 138.1, + "step": 75550, + "train_speed(iter/s)": 0.200534 + }, + { + "acc": 0.7577879, + "epoch": 1.7628720162148763, + "grad_norm": 5.0625, + "learning_rate": 3.64025687881891e-07, + "loss": 0.87597389, + "memory(GiB)": 138.1, + "step": 75560, + "train_speed(iter/s)": 0.200548 + }, + { + "acc": 0.78338909, + "epoch": 1.763105323787165, + "grad_norm": 4.5, + "learning_rate": 3.633183965872994e-07, + "loss": 0.78178053, + "memory(GiB)": 138.1, + "step": 75570, + "train_speed(iter/s)": 0.200561 + }, + { + "acc": 0.80338173, + "epoch": 1.7633386313594541, + "grad_norm": 5.21875, + "learning_rate": 3.6261176717766076e-07, + "loss": 0.71166687, + "memory(GiB)": 138.1, + "step": 75580, + "train_speed(iter/s)": 0.200575 + }, + { + "acc": 0.77883034, + "epoch": 1.7635719389317428, + "grad_norm": 6.46875, + "learning_rate": 3.6190579975384497e-07, + "loss": 0.81209888, + "memory(GiB)": 138.1, + "step": 75590, + "train_speed(iter/s)": 0.200588 + }, + { + "acc": 0.78879728, + "epoch": 1.763805246504032, + "grad_norm": 4.4375, + "learning_rate": 3.6120049441662805e-07, + "loss": 0.76921787, + "memory(GiB)": 138.1, + "step": 75600, + "train_speed(iter/s)": 0.200602 + }, + { + "acc": 0.78142576, + "epoch": 1.7640385540763206, + "grad_norm": 5.46875, + "learning_rate": 3.6049585126669395e-07, + "loss": 0.79211183, + "memory(GiB)": 138.1, + "step": 75610, + "train_speed(iter/s)": 0.200615 + }, + { + "acc": 0.78412828, + "epoch": 1.7642718616486097, + "grad_norm": 6.3125, + "learning_rate": 3.5979187040462883e-07, + "loss": 0.77902346, + "memory(GiB)": 138.1, + "step": 75620, + "train_speed(iter/s)": 0.200629 + }, + { + "acc": 0.75922632, + "epoch": 1.7645051692208984, + "grad_norm": 6.3125, + "learning_rate": 3.5908855193092617e-07, + "loss": 0.87735043, + "memory(GiB)": 138.1, + "step": 75630, + "train_speed(iter/s)": 0.200644 + }, + { + "acc": 0.79961157, + "epoch": 1.7647384767931875, + "grad_norm": 5.46875, + "learning_rate": 3.5838589594598514e-07, + "loss": 0.7121191, + "memory(GiB)": 138.1, + "step": 75640, + "train_speed(iter/s)": 0.200658 + }, + { + "acc": 0.7941186, + "epoch": 1.7649717843654762, + "grad_norm": 5.125, + "learning_rate": 3.5768390255010833e-07, + "loss": 0.71949234, + "memory(GiB)": 138.1, + "step": 75650, + "train_speed(iter/s)": 0.200672 + }, + { + "acc": 0.77251234, + "epoch": 1.7652050919377653, + "grad_norm": 5.5625, + "learning_rate": 3.569825718435066e-07, + "loss": 0.82141552, + "memory(GiB)": 138.1, + "step": 75660, + "train_speed(iter/s)": 0.200686 + }, + { + "acc": 0.7619668, + "epoch": 1.765438399510054, + "grad_norm": 4.65625, + "learning_rate": 3.562819039262938e-07, + "loss": 0.87379723, + "memory(GiB)": 138.1, + "step": 75670, + "train_speed(iter/s)": 0.200701 + }, + { + "acc": 0.80607891, + "epoch": 1.7656717070823431, + "grad_norm": 7.5, + "learning_rate": 3.5558189889848995e-07, + "loss": 0.67555485, + "memory(GiB)": 138.1, + "step": 75680, + "train_speed(iter/s)": 0.200715 + }, + { + "acc": 0.78483949, + "epoch": 1.7659050146546318, + "grad_norm": 4.0625, + "learning_rate": 3.5488255686002005e-07, + "loss": 0.80067978, + "memory(GiB)": 138.1, + "step": 75690, + "train_speed(iter/s)": 0.200728 + }, + { + "acc": 0.77885909, + "epoch": 1.766138322226921, + "grad_norm": 5.28125, + "learning_rate": 3.5418387791071706e-07, + "loss": 0.81857595, + "memory(GiB)": 138.1, + "step": 75700, + "train_speed(iter/s)": 0.200743 + }, + { + "acc": 0.77617831, + "epoch": 1.7663716297992096, + "grad_norm": 5.125, + "learning_rate": 3.5348586215031457e-07, + "loss": 0.79372025, + "memory(GiB)": 138.1, + "step": 75710, + "train_speed(iter/s)": 0.200756 + }, + { + "acc": 0.78535709, + "epoch": 1.7666049373714987, + "grad_norm": 6.03125, + "learning_rate": 3.527885096784567e-07, + "loss": 0.76806793, + "memory(GiB)": 138.1, + "step": 75720, + "train_speed(iter/s)": 0.200769 + }, + { + "acc": 0.76860857, + "epoch": 1.7668382449437874, + "grad_norm": 5.59375, + "learning_rate": 3.520918205946883e-07, + "loss": 0.81936932, + "memory(GiB)": 138.1, + "step": 75730, + "train_speed(iter/s)": 0.200784 + }, + { + "acc": 0.77166557, + "epoch": 1.7670715525160765, + "grad_norm": 5.375, + "learning_rate": 3.51395794998462e-07, + "loss": 0.82154722, + "memory(GiB)": 138.1, + "step": 75740, + "train_speed(iter/s)": 0.200798 + }, + { + "acc": 0.78490963, + "epoch": 1.7673048600883652, + "grad_norm": 6.53125, + "learning_rate": 3.507004329891367e-07, + "loss": 0.76154852, + "memory(GiB)": 138.1, + "step": 75750, + "train_speed(iter/s)": 0.200812 + }, + { + "acc": 0.78819742, + "epoch": 1.7675381676606543, + "grad_norm": 5.96875, + "learning_rate": 3.5000573466597243e-07, + "loss": 0.74255095, + "memory(GiB)": 138.1, + "step": 75760, + "train_speed(iter/s)": 0.200825 + }, + { + "acc": 0.79479284, + "epoch": 1.767771475232943, + "grad_norm": 6.65625, + "learning_rate": 3.4931170012813985e-07, + "loss": 0.73127079, + "memory(GiB)": 138.1, + "step": 75770, + "train_speed(iter/s)": 0.200838 + }, + { + "acc": 0.79238229, + "epoch": 1.7680047828052319, + "grad_norm": 6.1875, + "learning_rate": 3.4861832947471086e-07, + "loss": 0.73390646, + "memory(GiB)": 138.1, + "step": 75780, + "train_speed(iter/s)": 0.200852 + }, + { + "acc": 0.78550787, + "epoch": 1.7682380903775208, + "grad_norm": 5.0625, + "learning_rate": 3.479256228046646e-07, + "loss": 0.75722055, + "memory(GiB)": 138.1, + "step": 75790, + "train_speed(iter/s)": 0.200866 + }, + { + "acc": 0.777526, + "epoch": 1.7684713979498097, + "grad_norm": 9.1875, + "learning_rate": 3.4723358021688303e-07, + "loss": 0.78414106, + "memory(GiB)": 138.1, + "step": 75800, + "train_speed(iter/s)": 0.20088 + }, + { + "acc": 0.77671871, + "epoch": 1.7687047055220986, + "grad_norm": 4.90625, + "learning_rate": 3.465422018101572e-07, + "loss": 0.80290337, + "memory(GiB)": 138.1, + "step": 75810, + "train_speed(iter/s)": 0.200894 + }, + { + "acc": 0.79054794, + "epoch": 1.7689380130943875, + "grad_norm": 5.28125, + "learning_rate": 3.4585148768317975e-07, + "loss": 0.74591765, + "memory(GiB)": 138.1, + "step": 75820, + "train_speed(iter/s)": 0.200909 + }, + { + "acc": 0.77464447, + "epoch": 1.7691713206666764, + "grad_norm": 6.28125, + "learning_rate": 3.4516143793455027e-07, + "loss": 0.83038616, + "memory(GiB)": 138.1, + "step": 75830, + "train_speed(iter/s)": 0.200923 + }, + { + "acc": 0.78378277, + "epoch": 1.7694046282389653, + "grad_norm": 4.34375, + "learning_rate": 3.4447205266277373e-07, + "loss": 0.75231061, + "memory(GiB)": 138.1, + "step": 75840, + "train_speed(iter/s)": 0.200936 + }, + { + "acc": 0.78303156, + "epoch": 1.7696379358112542, + "grad_norm": 4.53125, + "learning_rate": 3.437833319662587e-07, + "loss": 0.78000083, + "memory(GiB)": 138.1, + "step": 75850, + "train_speed(iter/s)": 0.20095 + }, + { + "acc": 0.79331565, + "epoch": 1.769871243383543, + "grad_norm": 5.65625, + "learning_rate": 3.430952759433209e-07, + "loss": 0.74238071, + "memory(GiB)": 138.1, + "step": 75860, + "train_speed(iter/s)": 0.200964 + }, + { + "acc": 0.78225794, + "epoch": 1.770104550955832, + "grad_norm": 5.5625, + "learning_rate": 3.4240788469217966e-07, + "loss": 0.78348684, + "memory(GiB)": 138.1, + "step": 75870, + "train_speed(iter/s)": 0.200977 + }, + { + "acc": 0.79212933, + "epoch": 1.7703378585281209, + "grad_norm": 6.96875, + "learning_rate": 3.417211583109592e-07, + "loss": 0.73944921, + "memory(GiB)": 138.1, + "step": 75880, + "train_speed(iter/s)": 0.200991 + }, + { + "acc": 0.77557068, + "epoch": 1.7705711661004098, + "grad_norm": 5.03125, + "learning_rate": 3.4103509689769165e-07, + "loss": 0.78513107, + "memory(GiB)": 138.1, + "step": 75890, + "train_speed(iter/s)": 0.201004 + }, + { + "acc": 0.77322273, + "epoch": 1.7708044736726987, + "grad_norm": 6.78125, + "learning_rate": 3.4034970055030923e-07, + "loss": 0.81071148, + "memory(GiB)": 138.1, + "step": 75900, + "train_speed(iter/s)": 0.201018 + }, + { + "acc": 0.79248972, + "epoch": 1.7710377812449876, + "grad_norm": 4.28125, + "learning_rate": 3.396649693666537e-07, + "loss": 0.74013586, + "memory(GiB)": 138.1, + "step": 75910, + "train_speed(iter/s)": 0.201031 + }, + { + "acc": 0.79005871, + "epoch": 1.7712710888172765, + "grad_norm": 6.03125, + "learning_rate": 3.3898090344446966e-07, + "loss": 0.73053083, + "memory(GiB)": 138.1, + "step": 75920, + "train_speed(iter/s)": 0.201044 + }, + { + "acc": 0.76683903, + "epoch": 1.7715043963895654, + "grad_norm": 7.09375, + "learning_rate": 3.382975028814078e-07, + "loss": 0.8392231, + "memory(GiB)": 138.1, + "step": 75930, + "train_speed(iter/s)": 0.201059 + }, + { + "acc": 0.78439379, + "epoch": 1.7717377039618543, + "grad_norm": 6.59375, + "learning_rate": 3.3761476777502355e-07, + "loss": 0.77608676, + "memory(GiB)": 138.1, + "step": 75940, + "train_speed(iter/s)": 0.201072 + }, + { + "acc": 0.78523664, + "epoch": 1.7719710115341432, + "grad_norm": 4.9375, + "learning_rate": 3.369326982227761e-07, + "loss": 0.76700554, + "memory(GiB)": 138.1, + "step": 75950, + "train_speed(iter/s)": 0.201086 + }, + { + "acc": 0.78907003, + "epoch": 1.7722043191064318, + "grad_norm": 4.4375, + "learning_rate": 3.3625129432203197e-07, + "loss": 0.75762873, + "memory(GiB)": 138.1, + "step": 75960, + "train_speed(iter/s)": 0.201099 + }, + { + "acc": 0.77812548, + "epoch": 1.772437626678721, + "grad_norm": 4.28125, + "learning_rate": 3.3557055617006006e-07, + "loss": 0.76968746, + "memory(GiB)": 138.1, + "step": 75970, + "train_speed(iter/s)": 0.201113 + }, + { + "acc": 0.77837658, + "epoch": 1.7726709342510096, + "grad_norm": 4.34375, + "learning_rate": 3.348904838640371e-07, + "loss": 0.77840204, + "memory(GiB)": 138.1, + "step": 75980, + "train_speed(iter/s)": 0.201127 + }, + { + "acc": 0.77046366, + "epoch": 1.7729042418232988, + "grad_norm": 5.40625, + "learning_rate": 3.3421107750104155e-07, + "loss": 0.83179445, + "memory(GiB)": 138.1, + "step": 75990, + "train_speed(iter/s)": 0.201141 + }, + { + "acc": 0.77653723, + "epoch": 1.7731375493955874, + "grad_norm": 5.96875, + "learning_rate": 3.3353233717805967e-07, + "loss": 0.80116844, + "memory(GiB)": 138.1, + "step": 76000, + "train_speed(iter/s)": 0.201154 + }, + { + "epoch": 1.7731375493955874, + "eval_acc": 0.744712317701815, + "eval_loss": 0.804376482963562, + "eval_runtime": 1270.1032, + "eval_samples_per_second": 28.337, + "eval_steps_per_second": 14.169, + "step": 76000 + }, + { + "acc": 0.77820454, + "epoch": 1.7733708569678766, + "grad_norm": 4.59375, + "learning_rate": 3.3285426299198175e-07, + "loss": 0.81404266, + "memory(GiB)": 138.1, + "step": 76010, + "train_speed(iter/s)": 0.200481 + }, + { + "acc": 0.77452345, + "epoch": 1.7736041645401652, + "grad_norm": 4.4375, + "learning_rate": 3.321768550396015e-07, + "loss": 0.81841717, + "memory(GiB)": 138.1, + "step": 76020, + "train_speed(iter/s)": 0.200495 + }, + { + "acc": 0.78945813, + "epoch": 1.7738374721124543, + "grad_norm": 4.96875, + "learning_rate": 3.3150011341761933e-07, + "loss": 0.75137343, + "memory(GiB)": 138.1, + "step": 76030, + "train_speed(iter/s)": 0.200509 + }, + { + "acc": 0.7791081, + "epoch": 1.774070779684743, + "grad_norm": 5.375, + "learning_rate": 3.308240382226385e-07, + "loss": 0.7906178, + "memory(GiB)": 138.1, + "step": 76040, + "train_speed(iter/s)": 0.200523 + }, + { + "acc": 0.78823118, + "epoch": 1.7743040872570321, + "grad_norm": 4.59375, + "learning_rate": 3.301486295511713e-07, + "loss": 0.77563086, + "memory(GiB)": 138.1, + "step": 76050, + "train_speed(iter/s)": 0.200537 + }, + { + "acc": 0.78934536, + "epoch": 1.7745373948293208, + "grad_norm": 7.5, + "learning_rate": 3.294738874996295e-07, + "loss": 0.7632966, + "memory(GiB)": 138.1, + "step": 76060, + "train_speed(iter/s)": 0.200552 + }, + { + "acc": 0.77486506, + "epoch": 1.77477070240161, + "grad_norm": 7.0, + "learning_rate": 3.2879981216433433e-07, + "loss": 0.85285072, + "memory(GiB)": 138.1, + "step": 76070, + "train_speed(iter/s)": 0.200566 + }, + { + "acc": 0.7874649, + "epoch": 1.7750040099738986, + "grad_norm": 4.25, + "learning_rate": 3.281264036415088e-07, + "loss": 0.75169353, + "memory(GiB)": 138.1, + "step": 76080, + "train_speed(iter/s)": 0.200579 + }, + { + "acc": 0.77508116, + "epoch": 1.7752373175461877, + "grad_norm": 6.125, + "learning_rate": 3.274536620272811e-07, + "loss": 0.80775452, + "memory(GiB)": 138.1, + "step": 76090, + "train_speed(iter/s)": 0.200593 + }, + { + "acc": 0.77886529, + "epoch": 1.7754706251184764, + "grad_norm": 4.8125, + "learning_rate": 3.267815874176866e-07, + "loss": 0.80793839, + "memory(GiB)": 138.1, + "step": 76100, + "train_speed(iter/s)": 0.200607 + }, + { + "acc": 0.77209039, + "epoch": 1.7757039326907655, + "grad_norm": 7.125, + "learning_rate": 3.2611017990866244e-07, + "loss": 0.82538052, + "memory(GiB)": 138.1, + "step": 76110, + "train_speed(iter/s)": 0.200621 + }, + { + "acc": 0.77398095, + "epoch": 1.7759372402630542, + "grad_norm": 4.96875, + "learning_rate": 3.254394395960536e-07, + "loss": 0.81047382, + "memory(GiB)": 138.1, + "step": 76120, + "train_speed(iter/s)": 0.200635 + }, + { + "acc": 0.76875162, + "epoch": 1.7761705478353433, + "grad_norm": 5.0, + "learning_rate": 3.247693665756052e-07, + "loss": 0.84657841, + "memory(GiB)": 138.1, + "step": 76130, + "train_speed(iter/s)": 0.200648 + }, + { + "acc": 0.7959794, + "epoch": 1.776403855407632, + "grad_norm": 4.8125, + "learning_rate": 3.2409996094297294e-07, + "loss": 0.74632068, + "memory(GiB)": 138.1, + "step": 76140, + "train_speed(iter/s)": 0.200661 + }, + { + "acc": 0.77729673, + "epoch": 1.776637162979921, + "grad_norm": 6.25, + "learning_rate": 3.234312227937114e-07, + "loss": 0.82577114, + "memory(GiB)": 138.1, + "step": 76150, + "train_speed(iter/s)": 0.200674 + }, + { + "acc": 0.80631809, + "epoch": 1.7768704705522098, + "grad_norm": 6.125, + "learning_rate": 3.2276315222328547e-07, + "loss": 0.71360807, + "memory(GiB)": 138.1, + "step": 76160, + "train_speed(iter/s)": 0.200689 + }, + { + "acc": 0.80361118, + "epoch": 1.7771037781244987, + "grad_norm": 4.1875, + "learning_rate": 3.2209574932706043e-07, + "loss": 0.69395614, + "memory(GiB)": 138.1, + "step": 76170, + "train_speed(iter/s)": 0.200703 + }, + { + "acc": 0.77744942, + "epoch": 1.7773370856967876, + "grad_norm": 5.0625, + "learning_rate": 3.214290142003079e-07, + "loss": 0.80556965, + "memory(GiB)": 138.1, + "step": 76180, + "train_speed(iter/s)": 0.200716 + }, + { + "acc": 0.78394184, + "epoch": 1.7775703932690765, + "grad_norm": 5.875, + "learning_rate": 3.207629469382051e-07, + "loss": 0.78173552, + "memory(GiB)": 138.1, + "step": 76190, + "train_speed(iter/s)": 0.20073 + }, + { + "acc": 0.7806911, + "epoch": 1.7778037008413654, + "grad_norm": 4.875, + "learning_rate": 3.2009754763583143e-07, + "loss": 0.79640622, + "memory(GiB)": 138.1, + "step": 76200, + "train_speed(iter/s)": 0.200743 + }, + { + "acc": 0.78159871, + "epoch": 1.7780370084136543, + "grad_norm": 5.34375, + "learning_rate": 3.194328163881738e-07, + "loss": 0.77030783, + "memory(GiB)": 138.1, + "step": 76210, + "train_speed(iter/s)": 0.200756 + }, + { + "acc": 0.78937597, + "epoch": 1.7782703159859432, + "grad_norm": 4.5, + "learning_rate": 3.1876875329012235e-07, + "loss": 0.74239864, + "memory(GiB)": 138.1, + "step": 76220, + "train_speed(iter/s)": 0.200768 + }, + { + "acc": 0.76783009, + "epoch": 1.778503623558232, + "grad_norm": 5.03125, + "learning_rate": 3.181053584364707e-07, + "loss": 0.84220142, + "memory(GiB)": 138.1, + "step": 76230, + "train_speed(iter/s)": 0.200783 + }, + { + "acc": 0.77021403, + "epoch": 1.778736931130521, + "grad_norm": 5.5, + "learning_rate": 3.174426319219204e-07, + "loss": 0.83477764, + "memory(GiB)": 138.1, + "step": 76240, + "train_speed(iter/s)": 0.200798 + }, + { + "acc": 0.78976417, + "epoch": 1.77897023870281, + "grad_norm": 5.03125, + "learning_rate": 3.167805738410723e-07, + "loss": 0.75705347, + "memory(GiB)": 138.1, + "step": 76250, + "train_speed(iter/s)": 0.200812 + }, + { + "acc": 0.78457289, + "epoch": 1.7792035462750988, + "grad_norm": 6.6875, + "learning_rate": 3.161191842884381e-07, + "loss": 0.76162119, + "memory(GiB)": 138.1, + "step": 76260, + "train_speed(iter/s)": 0.200827 + }, + { + "acc": 0.78860617, + "epoch": 1.7794368538473877, + "grad_norm": 4.5625, + "learning_rate": 3.1545846335842843e-07, + "loss": 0.76644807, + "memory(GiB)": 138.1, + "step": 76270, + "train_speed(iter/s)": 0.20084 + }, + { + "acc": 0.78241735, + "epoch": 1.7796701614196766, + "grad_norm": 5.59375, + "learning_rate": 3.1479841114536334e-07, + "loss": 0.7895534, + "memory(GiB)": 138.1, + "step": 76280, + "train_speed(iter/s)": 0.200854 + }, + { + "acc": 0.76458092, + "epoch": 1.7799034689919655, + "grad_norm": 5.59375, + "learning_rate": 3.1413902774346305e-07, + "loss": 0.85796795, + "memory(GiB)": 138.1, + "step": 76290, + "train_speed(iter/s)": 0.200868 + }, + { + "acc": 0.7956975, + "epoch": 1.7801367765642544, + "grad_norm": 6.59375, + "learning_rate": 3.134803132468561e-07, + "loss": 0.73320594, + "memory(GiB)": 138.1, + "step": 76300, + "train_speed(iter/s)": 0.200882 + }, + { + "acc": 0.80051413, + "epoch": 1.7803700841365433, + "grad_norm": 5.4375, + "learning_rate": 3.128222677495729e-07, + "loss": 0.71121464, + "memory(GiB)": 138.1, + "step": 76310, + "train_speed(iter/s)": 0.200896 + }, + { + "acc": 0.78982344, + "epoch": 1.7806033917088322, + "grad_norm": 5.9375, + "learning_rate": 3.1216489134554886e-07, + "loss": 0.76041822, + "memory(GiB)": 138.1, + "step": 76320, + "train_speed(iter/s)": 0.200909 + }, + { + "acc": 0.78644514, + "epoch": 1.780836699281121, + "grad_norm": 5.5625, + "learning_rate": 3.115081841286255e-07, + "loss": 0.76248388, + "memory(GiB)": 138.1, + "step": 76330, + "train_speed(iter/s)": 0.200923 + }, + { + "acc": 0.78216338, + "epoch": 1.78107000685341, + "grad_norm": 5.21875, + "learning_rate": 3.108521461925457e-07, + "loss": 0.80187321, + "memory(GiB)": 138.1, + "step": 76340, + "train_speed(iter/s)": 0.200937 + }, + { + "acc": 0.75998535, + "epoch": 1.7813033144256987, + "grad_norm": 5.125, + "learning_rate": 3.101967776309617e-07, + "loss": 0.8771349, + "memory(GiB)": 138.1, + "step": 76350, + "train_speed(iter/s)": 0.200951 + }, + { + "acc": 0.80899925, + "epoch": 1.7815366219979878, + "grad_norm": 5.0625, + "learning_rate": 3.095420785374237e-07, + "loss": 0.66126566, + "memory(GiB)": 138.1, + "step": 76360, + "train_speed(iter/s)": 0.200965 + }, + { + "acc": 0.76196551, + "epoch": 1.7817699295702765, + "grad_norm": 6.78125, + "learning_rate": 3.088880490053925e-07, + "loss": 0.87400131, + "memory(GiB)": 138.1, + "step": 76370, + "train_speed(iter/s)": 0.200978 + }, + { + "acc": 0.76612358, + "epoch": 1.7820032371425656, + "grad_norm": 6.21875, + "learning_rate": 3.0823468912822895e-07, + "loss": 0.84581919, + "memory(GiB)": 138.1, + "step": 76380, + "train_speed(iter/s)": 0.200992 + }, + { + "acc": 0.78834362, + "epoch": 1.7822365447148543, + "grad_norm": 11.0, + "learning_rate": 3.0758199899920014e-07, + "loss": 0.74992781, + "memory(GiB)": 138.1, + "step": 76390, + "train_speed(iter/s)": 0.201005 + }, + { + "acc": 0.77249632, + "epoch": 1.7824698522871434, + "grad_norm": 4.71875, + "learning_rate": 3.069299787114782e-07, + "loss": 0.83343058, + "memory(GiB)": 138.1, + "step": 76400, + "train_speed(iter/s)": 0.201019 + }, + { + "acc": 0.78593674, + "epoch": 1.782703159859432, + "grad_norm": 4.46875, + "learning_rate": 3.0627862835813814e-07, + "loss": 0.75706806, + "memory(GiB)": 138.1, + "step": 76410, + "train_speed(iter/s)": 0.201033 + }, + { + "acc": 0.77988739, + "epoch": 1.7829364674317212, + "grad_norm": 4.8125, + "learning_rate": 3.0562794803216114e-07, + "loss": 0.77060046, + "memory(GiB)": 138.1, + "step": 76420, + "train_speed(iter/s)": 0.201045 + }, + { + "acc": 0.80068836, + "epoch": 1.7831697750040099, + "grad_norm": 6.96875, + "learning_rate": 3.0497793782642946e-07, + "loss": 0.69606977, + "memory(GiB)": 138.1, + "step": 76430, + "train_speed(iter/s)": 0.201059 + }, + { + "acc": 0.80669575, + "epoch": 1.783403082576299, + "grad_norm": 4.6875, + "learning_rate": 3.043285978337346e-07, + "loss": 0.69010477, + "memory(GiB)": 138.1, + "step": 76440, + "train_speed(iter/s)": 0.201073 + }, + { + "acc": 0.78510504, + "epoch": 1.7836363901485877, + "grad_norm": 6.0, + "learning_rate": 3.036799281467678e-07, + "loss": 0.75290413, + "memory(GiB)": 138.1, + "step": 76450, + "train_speed(iter/s)": 0.201086 + }, + { + "acc": 0.78098297, + "epoch": 1.7838696977208768, + "grad_norm": 4.96875, + "learning_rate": 3.0303192885812737e-07, + "loss": 0.77756319, + "memory(GiB)": 138.1, + "step": 76460, + "train_speed(iter/s)": 0.2011 + }, + { + "acc": 0.79312925, + "epoch": 1.7841030052931655, + "grad_norm": 4.8125, + "learning_rate": 3.023846000603148e-07, + "loss": 0.733179, + "memory(GiB)": 138.1, + "step": 76470, + "train_speed(iter/s)": 0.201115 + }, + { + "acc": 0.79176178, + "epoch": 1.7843363128654546, + "grad_norm": 5.375, + "learning_rate": 3.0173794184573444e-07, + "loss": 0.75639806, + "memory(GiB)": 138.1, + "step": 76480, + "train_speed(iter/s)": 0.201129 + }, + { + "acc": 0.78836441, + "epoch": 1.7845696204377433, + "grad_norm": 5.28125, + "learning_rate": 3.0109195430669925e-07, + "loss": 0.76128173, + "memory(GiB)": 138.1, + "step": 76490, + "train_speed(iter/s)": 0.201142 + }, + { + "acc": 0.78055067, + "epoch": 1.7848029280100324, + "grad_norm": 5.78125, + "learning_rate": 3.00446637535422e-07, + "loss": 0.79550552, + "memory(GiB)": 138.1, + "step": 76500, + "train_speed(iter/s)": 0.201156 + }, + { + "epoch": 1.7848029280100324, + "eval_acc": 0.7446987054420694, + "eval_loss": 0.8044025897979736, + "eval_runtime": 1269.883, + "eval_samples_per_second": 28.342, + "eval_steps_per_second": 14.171, + "step": 76500 + }, + { + "acc": 0.78949766, + "epoch": 1.785036235582321, + "grad_norm": 4.96875, + "learning_rate": 2.9980199162402245e-07, + "loss": 0.77280111, + "memory(GiB)": 138.1, + "step": 76510, + "train_speed(iter/s)": 0.200486 + }, + { + "acc": 0.7705359, + "epoch": 1.7852695431546102, + "grad_norm": 5.9375, + "learning_rate": 2.9915801666452307e-07, + "loss": 0.82575054, + "memory(GiB)": 138.1, + "step": 76520, + "train_speed(iter/s)": 0.200499 + }, + { + "acc": 0.79377584, + "epoch": 1.7855028507268988, + "grad_norm": 5.96875, + "learning_rate": 2.985147127488508e-07, + "loss": 0.73605633, + "memory(GiB)": 138.1, + "step": 76530, + "train_speed(iter/s)": 0.200512 + }, + { + "acc": 0.77772698, + "epoch": 1.7857361582991877, + "grad_norm": 6.75, + "learning_rate": 2.978720799688378e-07, + "loss": 0.77994833, + "memory(GiB)": 138.1, + "step": 76540, + "train_speed(iter/s)": 0.200526 + }, + { + "acc": 0.78651562, + "epoch": 1.7859694658714766, + "grad_norm": 7.78125, + "learning_rate": 2.9723011841621905e-07, + "loss": 0.74728775, + "memory(GiB)": 138.1, + "step": 76550, + "train_speed(iter/s)": 0.200539 + }, + { + "acc": 0.76202259, + "epoch": 1.7862027734437655, + "grad_norm": 8.3125, + "learning_rate": 2.965888281826357e-07, + "loss": 0.84415607, + "memory(GiB)": 138.1, + "step": 76560, + "train_speed(iter/s)": 0.200552 + }, + { + "acc": 0.77949867, + "epoch": 1.7864360810160544, + "grad_norm": 4.3125, + "learning_rate": 2.9594820935963e-07, + "loss": 0.76835766, + "memory(GiB)": 138.1, + "step": 76570, + "train_speed(iter/s)": 0.200565 + }, + { + "acc": 0.78318701, + "epoch": 1.7866693885883433, + "grad_norm": 4.03125, + "learning_rate": 2.953082620386516e-07, + "loss": 0.77278767, + "memory(GiB)": 138.1, + "step": 76580, + "train_speed(iter/s)": 0.200579 + }, + { + "acc": 0.78765154, + "epoch": 1.7869026961606322, + "grad_norm": 7.375, + "learning_rate": 2.946689863110508e-07, + "loss": 0.74817243, + "memory(GiB)": 138.1, + "step": 76590, + "train_speed(iter/s)": 0.200593 + }, + { + "acc": 0.79166756, + "epoch": 1.7871360037329211, + "grad_norm": 4.875, + "learning_rate": 2.9403038226808625e-07, + "loss": 0.74761534, + "memory(GiB)": 138.1, + "step": 76600, + "train_speed(iter/s)": 0.200606 + }, + { + "acc": 0.77928109, + "epoch": 1.78736931130521, + "grad_norm": 5.9375, + "learning_rate": 2.9339245000091776e-07, + "loss": 0.80569696, + "memory(GiB)": 138.1, + "step": 76610, + "train_speed(iter/s)": 0.20062 + }, + { + "acc": 0.78152971, + "epoch": 1.787602618877499, + "grad_norm": 5.90625, + "learning_rate": 2.9275518960060867e-07, + "loss": 0.78764749, + "memory(GiB)": 138.1, + "step": 76620, + "train_speed(iter/s)": 0.200633 + }, + { + "acc": 0.78837299, + "epoch": 1.7878359264497878, + "grad_norm": 6.40625, + "learning_rate": 2.9211860115813005e-07, + "loss": 0.7382781, + "memory(GiB)": 138.1, + "step": 76630, + "train_speed(iter/s)": 0.200647 + }, + { + "acc": 0.77095346, + "epoch": 1.7880692340220767, + "grad_norm": 5.0, + "learning_rate": 2.9148268476435206e-07, + "loss": 0.82100334, + "memory(GiB)": 138.1, + "step": 76640, + "train_speed(iter/s)": 0.200661 + }, + { + "acc": 0.77619829, + "epoch": 1.7883025415943656, + "grad_norm": 7.4375, + "learning_rate": 2.908474405100542e-07, + "loss": 0.82527828, + "memory(GiB)": 138.1, + "step": 76650, + "train_speed(iter/s)": 0.200675 + }, + { + "acc": 0.77772455, + "epoch": 1.7885358491666545, + "grad_norm": 3.921875, + "learning_rate": 2.9021286848591626e-07, + "loss": 0.79985743, + "memory(GiB)": 138.1, + "step": 76660, + "train_speed(iter/s)": 0.200688 + }, + { + "acc": 0.78497052, + "epoch": 1.7887691567389434, + "grad_norm": 6.3125, + "learning_rate": 2.895789687825218e-07, + "loss": 0.77265472, + "memory(GiB)": 138.1, + "step": 76670, + "train_speed(iter/s)": 0.200702 + }, + { + "acc": 0.78477559, + "epoch": 1.7890024643112323, + "grad_norm": 5.6875, + "learning_rate": 2.8894574149036237e-07, + "loss": 0.77249074, + "memory(GiB)": 138.1, + "step": 76680, + "train_speed(iter/s)": 0.200715 + }, + { + "acc": 0.78217597, + "epoch": 1.7892357718835212, + "grad_norm": 5.3125, + "learning_rate": 2.8831318669982956e-07, + "loss": 0.78129129, + "memory(GiB)": 138.1, + "step": 76690, + "train_speed(iter/s)": 0.200728 + }, + { + "acc": 0.78776398, + "epoch": 1.7894690794558101, + "grad_norm": 4.5, + "learning_rate": 2.876813045012211e-07, + "loss": 0.76364908, + "memory(GiB)": 138.1, + "step": 76700, + "train_speed(iter/s)": 0.200742 + }, + { + "acc": 0.78187065, + "epoch": 1.789702387028099, + "grad_norm": 6.8125, + "learning_rate": 2.8705009498473604e-07, + "loss": 0.77821112, + "memory(GiB)": 138.1, + "step": 76710, + "train_speed(iter/s)": 0.200756 + }, + { + "acc": 0.78065195, + "epoch": 1.789935694600388, + "grad_norm": 12.5625, + "learning_rate": 2.8641955824048216e-07, + "loss": 0.78781352, + "memory(GiB)": 138.1, + "step": 76720, + "train_speed(iter/s)": 0.20077 + }, + { + "acc": 0.75964537, + "epoch": 1.7901690021726768, + "grad_norm": 5.03125, + "learning_rate": 2.857896943584665e-07, + "loss": 0.86047745, + "memory(GiB)": 138.1, + "step": 76730, + "train_speed(iter/s)": 0.200784 + }, + { + "acc": 0.76801662, + "epoch": 1.7904023097449655, + "grad_norm": 7.28125, + "learning_rate": 2.85160503428602e-07, + "loss": 0.84608536, + "memory(GiB)": 138.1, + "step": 76740, + "train_speed(iter/s)": 0.200797 + }, + { + "acc": 0.78553681, + "epoch": 1.7906356173172546, + "grad_norm": 5.1875, + "learning_rate": 2.8453198554070694e-07, + "loss": 0.78472672, + "memory(GiB)": 138.1, + "step": 76750, + "train_speed(iter/s)": 0.20081 + }, + { + "acc": 0.7960628, + "epoch": 1.7908689248895433, + "grad_norm": 5.34375, + "learning_rate": 2.8390414078450003e-07, + "loss": 0.71955853, + "memory(GiB)": 138.1, + "step": 76760, + "train_speed(iter/s)": 0.200824 + }, + { + "acc": 0.78666868, + "epoch": 1.7911022324618324, + "grad_norm": 13.3125, + "learning_rate": 2.8327696924960737e-07, + "loss": 0.76741972, + "memory(GiB)": 138.1, + "step": 76770, + "train_speed(iter/s)": 0.200837 + }, + { + "acc": 0.7942173, + "epoch": 1.791335540034121, + "grad_norm": 5.59375, + "learning_rate": 2.8265047102555733e-07, + "loss": 0.75203381, + "memory(GiB)": 138.1, + "step": 76780, + "train_speed(iter/s)": 0.20085 + }, + { + "acc": 0.79992652, + "epoch": 1.7915688476064102, + "grad_norm": 6.03125, + "learning_rate": 2.8202464620178225e-07, + "loss": 0.70470304, + "memory(GiB)": 138.1, + "step": 76790, + "train_speed(iter/s)": 0.200864 + }, + { + "acc": 0.80589857, + "epoch": 1.791802155178699, + "grad_norm": 4.15625, + "learning_rate": 2.8139949486761853e-07, + "loss": 0.67734222, + "memory(GiB)": 138.1, + "step": 76800, + "train_speed(iter/s)": 0.200877 + }, + { + "acc": 0.78081474, + "epoch": 1.792035462750988, + "grad_norm": 5.9375, + "learning_rate": 2.807750171123058e-07, + "loss": 0.78283153, + "memory(GiB)": 138.1, + "step": 76810, + "train_speed(iter/s)": 0.200891 + }, + { + "acc": 0.78747606, + "epoch": 1.7922687703232767, + "grad_norm": 6.5625, + "learning_rate": 2.8015121302498894e-07, + "loss": 0.75321217, + "memory(GiB)": 138.1, + "step": 76820, + "train_speed(iter/s)": 0.200905 + }, + { + "acc": 0.78687334, + "epoch": 1.7925020778955658, + "grad_norm": 3.875, + "learning_rate": 2.7952808269471445e-07, + "loss": 0.75753322, + "memory(GiB)": 138.1, + "step": 76830, + "train_speed(iter/s)": 0.200918 + }, + { + "acc": 0.77859888, + "epoch": 1.7927353854678545, + "grad_norm": 6.28125, + "learning_rate": 2.7890562621043503e-07, + "loss": 0.7866539, + "memory(GiB)": 138.1, + "step": 76840, + "train_speed(iter/s)": 0.200932 + }, + { + "acc": 0.78868618, + "epoch": 1.7929686930401436, + "grad_norm": 4.1875, + "learning_rate": 2.782838436610058e-07, + "loss": 0.75083804, + "memory(GiB)": 138.1, + "step": 76850, + "train_speed(iter/s)": 0.200945 + }, + { + "acc": 0.76023216, + "epoch": 1.7932020006124323, + "grad_norm": 6.53125, + "learning_rate": 2.776627351351868e-07, + "loss": 0.87442131, + "memory(GiB)": 138.1, + "step": 76860, + "train_speed(iter/s)": 0.200959 + }, + { + "acc": 0.7890811, + "epoch": 1.7934353081847214, + "grad_norm": 4.96875, + "learning_rate": 2.770423007216411e-07, + "loss": 0.75472221, + "memory(GiB)": 138.1, + "step": 76870, + "train_speed(iter/s)": 0.200973 + }, + { + "acc": 0.77964153, + "epoch": 1.79366861575701, + "grad_norm": 5.875, + "learning_rate": 2.764225405089332e-07, + "loss": 0.80402193, + "memory(GiB)": 138.1, + "step": 76880, + "train_speed(iter/s)": 0.200986 + }, + { + "acc": 0.76858959, + "epoch": 1.7939019233292992, + "grad_norm": 5.3125, + "learning_rate": 2.7580345458553705e-07, + "loss": 0.84677277, + "memory(GiB)": 138.1, + "step": 76890, + "train_speed(iter/s)": 0.201 + }, + { + "acc": 0.78171978, + "epoch": 1.7941352309015879, + "grad_norm": 6.0625, + "learning_rate": 2.75185043039824e-07, + "loss": 0.78139849, + "memory(GiB)": 138.1, + "step": 76900, + "train_speed(iter/s)": 0.201014 + }, + { + "acc": 0.76607041, + "epoch": 1.794368538473877, + "grad_norm": 4.9375, + "learning_rate": 2.7456730596007454e-07, + "loss": 0.83766766, + "memory(GiB)": 138.1, + "step": 76910, + "train_speed(iter/s)": 0.201027 + }, + { + "acc": 0.79166379, + "epoch": 1.7946018460461657, + "grad_norm": 4.75, + "learning_rate": 2.739502434344693e-07, + "loss": 0.73072734, + "memory(GiB)": 138.1, + "step": 76920, + "train_speed(iter/s)": 0.201041 + }, + { + "acc": 0.7861433, + "epoch": 1.7948351536184546, + "grad_norm": 5.5, + "learning_rate": 2.733338555510939e-07, + "loss": 0.77780027, + "memory(GiB)": 138.1, + "step": 76930, + "train_speed(iter/s)": 0.201054 + }, + { + "acc": 0.78073063, + "epoch": 1.7950684611907435, + "grad_norm": 4.625, + "learning_rate": 2.7271814239793693e-07, + "loss": 0.77235231, + "memory(GiB)": 138.1, + "step": 76940, + "train_speed(iter/s)": 0.201067 + }, + { + "acc": 0.79449263, + "epoch": 1.7953017687630324, + "grad_norm": 6.34375, + "learning_rate": 2.721031040628924e-07, + "loss": 0.74123163, + "memory(GiB)": 138.1, + "step": 76950, + "train_speed(iter/s)": 0.20108 + }, + { + "acc": 0.77358427, + "epoch": 1.7955350763353213, + "grad_norm": 6.84375, + "learning_rate": 2.714887406337563e-07, + "loss": 0.8185647, + "memory(GiB)": 138.1, + "step": 76960, + "train_speed(iter/s)": 0.201093 + }, + { + "acc": 0.77752705, + "epoch": 1.7957683839076102, + "grad_norm": 5.125, + "learning_rate": 2.708750521982284e-07, + "loss": 0.80426521, + "memory(GiB)": 138.1, + "step": 76970, + "train_speed(iter/s)": 0.201107 + }, + { + "acc": 0.7755044, + "epoch": 1.796001691479899, + "grad_norm": 4.78125, + "learning_rate": 2.7026203884391313e-07, + "loss": 0.81011715, + "memory(GiB)": 138.1, + "step": 76980, + "train_speed(iter/s)": 0.201121 + }, + { + "acc": 0.78045125, + "epoch": 1.796234999052188, + "grad_norm": 5.21875, + "learning_rate": 2.696497006583176e-07, + "loss": 0.79744205, + "memory(GiB)": 138.1, + "step": 76990, + "train_speed(iter/s)": 0.201135 + }, + { + "acc": 0.77946234, + "epoch": 1.7964683066244769, + "grad_norm": 5.09375, + "learning_rate": 2.6903803772885375e-07, + "loss": 0.79059172, + "memory(GiB)": 138.1, + "step": 77000, + "train_speed(iter/s)": 0.201149 + }, + { + "epoch": 1.7964683066244769, + "eval_acc": 0.7447200046249655, + "eval_loss": 0.8044254779815674, + "eval_runtime": 1271.9514, + "eval_samples_per_second": 28.296, + "eval_steps_per_second": 14.148, + "step": 77000 + }, + { + "acc": 0.78546667, + "epoch": 1.7967016141967658, + "grad_norm": 6.96875, + "learning_rate": 2.6842705014283545e-07, + "loss": 0.76823864, + "memory(GiB)": 138.1, + "step": 77010, + "train_speed(iter/s)": 0.200482 + }, + { + "acc": 0.77572174, + "epoch": 1.7969349217690547, + "grad_norm": 4.21875, + "learning_rate": 2.6781673798748074e-07, + "loss": 0.8007781, + "memory(GiB)": 138.1, + "step": 77020, + "train_speed(iter/s)": 0.200497 + }, + { + "acc": 0.77363772, + "epoch": 1.7971682293413436, + "grad_norm": 6.09375, + "learning_rate": 2.672071013499122e-07, + "loss": 0.82122746, + "memory(GiB)": 138.1, + "step": 77030, + "train_speed(iter/s)": 0.200509 + }, + { + "acc": 0.76963663, + "epoch": 1.7974015369136325, + "grad_norm": 5.40625, + "learning_rate": 2.665981403171558e-07, + "loss": 0.84492531, + "memory(GiB)": 138.1, + "step": 77040, + "train_speed(iter/s)": 0.200522 + }, + { + "acc": 0.78084383, + "epoch": 1.7976348444859214, + "grad_norm": 23.875, + "learning_rate": 2.6598985497613915e-07, + "loss": 0.7569222, + "memory(GiB)": 138.1, + "step": 77050, + "train_speed(iter/s)": 0.200536 + }, + { + "acc": 0.77769785, + "epoch": 1.7978681520582103, + "grad_norm": 5.46875, + "learning_rate": 2.653822454136951e-07, + "loss": 0.78896704, + "memory(GiB)": 138.1, + "step": 77060, + "train_speed(iter/s)": 0.20055 + }, + { + "acc": 0.80396709, + "epoch": 1.7981014596304992, + "grad_norm": 4.40625, + "learning_rate": 2.647753117165608e-07, + "loss": 0.6958436, + "memory(GiB)": 138.1, + "step": 77070, + "train_speed(iter/s)": 0.200564 + }, + { + "acc": 0.79132442, + "epoch": 1.798334767202788, + "grad_norm": 3.875, + "learning_rate": 2.641690539713743e-07, + "loss": 0.75024261, + "memory(GiB)": 138.1, + "step": 77080, + "train_speed(iter/s)": 0.200577 + }, + { + "acc": 0.78543453, + "epoch": 1.798568074775077, + "grad_norm": 5.8125, + "learning_rate": 2.635634722646807e-07, + "loss": 0.77864475, + "memory(GiB)": 138.1, + "step": 77090, + "train_speed(iter/s)": 0.200591 + }, + { + "acc": 0.77233229, + "epoch": 1.7988013823473659, + "grad_norm": 4.65625, + "learning_rate": 2.6295856668292487e-07, + "loss": 0.81739025, + "memory(GiB)": 138.1, + "step": 77100, + "train_speed(iter/s)": 0.200605 + }, + { + "acc": 0.76904373, + "epoch": 1.7990346899196545, + "grad_norm": 5.65625, + "learning_rate": 2.623543373124571e-07, + "loss": 0.81224604, + "memory(GiB)": 138.1, + "step": 77110, + "train_speed(iter/s)": 0.200618 + }, + { + "acc": 0.78083372, + "epoch": 1.7992679974919437, + "grad_norm": 5.90625, + "learning_rate": 2.6175078423953225e-07, + "loss": 0.79091768, + "memory(GiB)": 138.1, + "step": 77120, + "train_speed(iter/s)": 0.200631 + }, + { + "acc": 0.80353069, + "epoch": 1.7995013050642323, + "grad_norm": 5.625, + "learning_rate": 2.6114790755030593e-07, + "loss": 0.72558646, + "memory(GiB)": 138.1, + "step": 77130, + "train_speed(iter/s)": 0.200644 + }, + { + "acc": 0.77595344, + "epoch": 1.7997346126365215, + "grad_norm": 6.28125, + "learning_rate": 2.605457073308393e-07, + "loss": 0.79515977, + "memory(GiB)": 138.1, + "step": 77140, + "train_speed(iter/s)": 0.200658 + }, + { + "acc": 0.7906909, + "epoch": 1.7999679202088101, + "grad_norm": 6.53125, + "learning_rate": 2.5994418366709686e-07, + "loss": 0.75885439, + "memory(GiB)": 138.1, + "step": 77150, + "train_speed(iter/s)": 0.200672 + }, + { + "acc": 0.76935129, + "epoch": 1.8002012277810993, + "grad_norm": 6.25, + "learning_rate": 2.5934333664494436e-07, + "loss": 0.84422007, + "memory(GiB)": 138.1, + "step": 77160, + "train_speed(iter/s)": 0.200685 + }, + { + "acc": 0.77682958, + "epoch": 1.800434535353388, + "grad_norm": 5.28125, + "learning_rate": 2.5874316635015383e-07, + "loss": 0.82697449, + "memory(GiB)": 138.1, + "step": 77170, + "train_speed(iter/s)": 0.200699 + }, + { + "acc": 0.78309669, + "epoch": 1.800667842925677, + "grad_norm": 6.25, + "learning_rate": 2.581436728683984e-07, + "loss": 0.76987929, + "memory(GiB)": 138.1, + "step": 77180, + "train_speed(iter/s)": 0.200712 + }, + { + "acc": 0.79188871, + "epoch": 1.8009011504979657, + "grad_norm": 6.625, + "learning_rate": 2.575448562852567e-07, + "loss": 0.76328211, + "memory(GiB)": 138.1, + "step": 77190, + "train_speed(iter/s)": 0.200726 + }, + { + "acc": 0.7809392, + "epoch": 1.8011344580702549, + "grad_norm": 6.65625, + "learning_rate": 2.569467166862083e-07, + "loss": 0.79415689, + "memory(GiB)": 138.1, + "step": 77200, + "train_speed(iter/s)": 0.200738 + }, + { + "acc": 0.77853994, + "epoch": 1.8013677656425435, + "grad_norm": 4.875, + "learning_rate": 2.563492541566387e-07, + "loss": 0.80777569, + "memory(GiB)": 138.1, + "step": 77210, + "train_speed(iter/s)": 0.200752 + }, + { + "acc": 0.78512239, + "epoch": 1.8016010732148326, + "grad_norm": 5.09375, + "learning_rate": 2.557524687818347e-07, + "loss": 0.76266022, + "memory(GiB)": 138.1, + "step": 77220, + "train_speed(iter/s)": 0.200764 + }, + { + "acc": 0.78263688, + "epoch": 1.8018343807871213, + "grad_norm": 6.0625, + "learning_rate": 2.551563606469881e-07, + "loss": 0.7762639, + "memory(GiB)": 138.1, + "step": 77230, + "train_speed(iter/s)": 0.200777 + }, + { + "acc": 0.79786873, + "epoch": 1.8020676883594104, + "grad_norm": 5.09375, + "learning_rate": 2.545609298371926e-07, + "loss": 0.72444544, + "memory(GiB)": 138.1, + "step": 77240, + "train_speed(iter/s)": 0.20079 + }, + { + "acc": 0.78850036, + "epoch": 1.8023009959316991, + "grad_norm": 4.3125, + "learning_rate": 2.539661764374457e-07, + "loss": 0.75326676, + "memory(GiB)": 138.1, + "step": 77250, + "train_speed(iter/s)": 0.200803 + }, + { + "acc": 0.77271185, + "epoch": 1.8025343035039882, + "grad_norm": 4.65625, + "learning_rate": 2.5337210053264893e-07, + "loss": 0.80799408, + "memory(GiB)": 138.1, + "step": 77260, + "train_speed(iter/s)": 0.200816 + }, + { + "acc": 0.76806946, + "epoch": 1.802767611076277, + "grad_norm": 5.59375, + "learning_rate": 2.5277870220760504e-07, + "loss": 0.85381813, + "memory(GiB)": 138.1, + "step": 77270, + "train_speed(iter/s)": 0.20083 + }, + { + "acc": 0.77597227, + "epoch": 1.803000918648566, + "grad_norm": 7.71875, + "learning_rate": 2.521859815470229e-07, + "loss": 0.81069126, + "memory(GiB)": 138.1, + "step": 77280, + "train_speed(iter/s)": 0.200844 + }, + { + "acc": 0.785149, + "epoch": 1.8032342262208547, + "grad_norm": 7.34375, + "learning_rate": 2.515939386355121e-07, + "loss": 0.76063423, + "memory(GiB)": 138.1, + "step": 77290, + "train_speed(iter/s)": 0.200857 + }, + { + "acc": 0.77492638, + "epoch": 1.8034675337931438, + "grad_norm": 4.59375, + "learning_rate": 2.5100257355758715e-07, + "loss": 0.78454142, + "memory(GiB)": 138.1, + "step": 77300, + "train_speed(iter/s)": 0.200871 + }, + { + "acc": 0.77450662, + "epoch": 1.8037008413654325, + "grad_norm": 4.96875, + "learning_rate": 2.5041188639766624e-07, + "loss": 0.78545609, + "memory(GiB)": 138.1, + "step": 77310, + "train_speed(iter/s)": 0.200885 + }, + { + "acc": 0.77065268, + "epoch": 1.8039341489377214, + "grad_norm": 4.5625, + "learning_rate": 2.498218772400673e-07, + "loss": 0.83770046, + "memory(GiB)": 138.1, + "step": 77320, + "train_speed(iter/s)": 0.200898 + }, + { + "acc": 0.77967792, + "epoch": 1.8041674565100103, + "grad_norm": 10.25, + "learning_rate": 2.4923254616901646e-07, + "loss": 0.80178738, + "memory(GiB)": 138.1, + "step": 77330, + "train_speed(iter/s)": 0.200912 + }, + { + "acc": 0.74775844, + "epoch": 1.8044007640822992, + "grad_norm": 5.40625, + "learning_rate": 2.486438932686386e-07, + "loss": 0.90301533, + "memory(GiB)": 138.1, + "step": 77340, + "train_speed(iter/s)": 0.200926 + }, + { + "acc": 0.7848135, + "epoch": 1.8046340716545881, + "grad_norm": 10.9375, + "learning_rate": 2.4805591862296587e-07, + "loss": 0.75431414, + "memory(GiB)": 138.1, + "step": 77350, + "train_speed(iter/s)": 0.20094 + }, + { + "acc": 0.78480129, + "epoch": 1.804867379226877, + "grad_norm": 5.21875, + "learning_rate": 2.4746862231593006e-07, + "loss": 0.76673164, + "memory(GiB)": 138.1, + "step": 77360, + "train_speed(iter/s)": 0.200953 + }, + { + "acc": 0.77168055, + "epoch": 1.805100686799166, + "grad_norm": 6.15625, + "learning_rate": 2.468820044313669e-07, + "loss": 0.83345318, + "memory(GiB)": 138.1, + "step": 77370, + "train_speed(iter/s)": 0.200967 + }, + { + "acc": 0.77629547, + "epoch": 1.8053339943714548, + "grad_norm": 4.65625, + "learning_rate": 2.4629606505301775e-07, + "loss": 0.81766853, + "memory(GiB)": 138.1, + "step": 77380, + "train_speed(iter/s)": 0.20098 + }, + { + "acc": 0.80229378, + "epoch": 1.8055673019437437, + "grad_norm": 4.25, + "learning_rate": 2.457108042645245e-07, + "loss": 0.71003008, + "memory(GiB)": 138.1, + "step": 77390, + "train_speed(iter/s)": 0.200993 + }, + { + "acc": 0.76214561, + "epoch": 1.8058006095160326, + "grad_norm": 5.4375, + "learning_rate": 2.4512622214943274e-07, + "loss": 0.87743969, + "memory(GiB)": 138.1, + "step": 77400, + "train_speed(iter/s)": 0.201007 + }, + { + "acc": 0.78265681, + "epoch": 1.8060339170883215, + "grad_norm": 4.21875, + "learning_rate": 2.445423187911905e-07, + "loss": 0.78319178, + "memory(GiB)": 138.1, + "step": 77410, + "train_speed(iter/s)": 0.201019 + }, + { + "acc": 0.77975445, + "epoch": 1.8062672246606104, + "grad_norm": 6.09375, + "learning_rate": 2.439590942731518e-07, + "loss": 0.79706392, + "memory(GiB)": 138.1, + "step": 77420, + "train_speed(iter/s)": 0.201033 + }, + { + "acc": 0.80120268, + "epoch": 1.8065005322328993, + "grad_norm": 5.5, + "learning_rate": 2.433765486785694e-07, + "loss": 0.70945711, + "memory(GiB)": 138.1, + "step": 77430, + "train_speed(iter/s)": 0.201046 + }, + { + "acc": 0.76658869, + "epoch": 1.8067338398051882, + "grad_norm": 4.1875, + "learning_rate": 2.427946820906041e-07, + "loss": 0.84224901, + "memory(GiB)": 138.1, + "step": 77440, + "train_speed(iter/s)": 0.201059 + }, + { + "acc": 0.78487082, + "epoch": 1.806967147377477, + "grad_norm": 7.28125, + "learning_rate": 2.42213494592316e-07, + "loss": 0.76289835, + "memory(GiB)": 138.1, + "step": 77450, + "train_speed(iter/s)": 0.201072 + }, + { + "acc": 0.78675003, + "epoch": 1.807200454949766, + "grad_norm": 5.84375, + "learning_rate": 2.4163298626666885e-07, + "loss": 0.76830788, + "memory(GiB)": 138.1, + "step": 77460, + "train_speed(iter/s)": 0.201085 + }, + { + "acc": 0.79696965, + "epoch": 1.807433762522055, + "grad_norm": 5.125, + "learning_rate": 2.410531571965308e-07, + "loss": 0.73548222, + "memory(GiB)": 138.1, + "step": 77470, + "train_speed(iter/s)": 0.201099 + }, + { + "acc": 0.76947789, + "epoch": 1.8076670700943438, + "grad_norm": 5.34375, + "learning_rate": 2.4047400746467165e-07, + "loss": 0.84600887, + "memory(GiB)": 138.1, + "step": 77480, + "train_speed(iter/s)": 0.201112 + }, + { + "acc": 0.78702273, + "epoch": 1.8079003776666327, + "grad_norm": 4.0625, + "learning_rate": 2.398955371537665e-07, + "loss": 0.76369286, + "memory(GiB)": 138.1, + "step": 77490, + "train_speed(iter/s)": 0.201126 + }, + { + "acc": 0.7907228, + "epoch": 1.8081336852389214, + "grad_norm": 7.03125, + "learning_rate": 2.393177463463897e-07, + "loss": 0.76782665, + "memory(GiB)": 138.1, + "step": 77500, + "train_speed(iter/s)": 0.201137 + }, + { + "epoch": 1.8081336852389214, + "eval_acc": 0.7447478697213858, + "eval_loss": 0.8044267296791077, + "eval_runtime": 1270.6205, + "eval_samples_per_second": 28.326, + "eval_steps_per_second": 14.163, + "step": 77500 + }, + { + "acc": 0.79640326, + "epoch": 1.8083669928112105, + "grad_norm": 12.0625, + "learning_rate": 2.387406351250221e-07, + "loss": 0.71954288, + "memory(GiB)": 138.1, + "step": 77510, + "train_speed(iter/s)": 0.200476 + }, + { + "acc": 0.78035679, + "epoch": 1.8086003003834992, + "grad_norm": 9.3125, + "learning_rate": 2.3816420357204495e-07, + "loss": 0.79402161, + "memory(GiB)": 138.1, + "step": 77520, + "train_speed(iter/s)": 0.200489 + }, + { + "acc": 0.80683222, + "epoch": 1.8088336079557883, + "grad_norm": 4.125, + "learning_rate": 2.3758845176974465e-07, + "loss": 0.70322952, + "memory(GiB)": 138.1, + "step": 77530, + "train_speed(iter/s)": 0.200502 + }, + { + "acc": 0.78853745, + "epoch": 1.809066915528077, + "grad_norm": 6.34375, + "learning_rate": 2.3701337980030993e-07, + "loss": 0.76086707, + "memory(GiB)": 138.1, + "step": 77540, + "train_speed(iter/s)": 0.200515 + }, + { + "acc": 0.77532892, + "epoch": 1.809300223100366, + "grad_norm": 5.6875, + "learning_rate": 2.3643898774583064e-07, + "loss": 0.8250165, + "memory(GiB)": 138.1, + "step": 77550, + "train_speed(iter/s)": 0.200528 + }, + { + "acc": 0.79044323, + "epoch": 1.8095335306726548, + "grad_norm": 4.625, + "learning_rate": 2.3586527568830286e-07, + "loss": 0.76854286, + "memory(GiB)": 138.1, + "step": 77560, + "train_speed(iter/s)": 0.200543 + }, + { + "acc": 0.76759367, + "epoch": 1.8097668382449439, + "grad_norm": 6.0625, + "learning_rate": 2.3529224370962223e-07, + "loss": 0.82133694, + "memory(GiB)": 138.1, + "step": 77570, + "train_speed(iter/s)": 0.200557 + }, + { + "acc": 0.79010715, + "epoch": 1.8100001458172326, + "grad_norm": 5.75, + "learning_rate": 2.347198918915905e-07, + "loss": 0.74337749, + "memory(GiB)": 138.1, + "step": 77580, + "train_speed(iter/s)": 0.200571 + }, + { + "acc": 0.78925729, + "epoch": 1.8102334533895217, + "grad_norm": 4.21875, + "learning_rate": 2.3414822031590956e-07, + "loss": 0.76391687, + "memory(GiB)": 138.1, + "step": 77590, + "train_speed(iter/s)": 0.200584 + }, + { + "acc": 0.78881116, + "epoch": 1.8104667609618104, + "grad_norm": 5.3125, + "learning_rate": 2.3357722906418523e-07, + "loss": 0.74917927, + "memory(GiB)": 138.1, + "step": 77600, + "train_speed(iter/s)": 0.200597 + }, + { + "acc": 0.77421007, + "epoch": 1.8107000685340995, + "grad_norm": 8.5, + "learning_rate": 2.3300691821792788e-07, + "loss": 0.81043053, + "memory(GiB)": 138.1, + "step": 77610, + "train_speed(iter/s)": 0.20061 + }, + { + "acc": 0.79148536, + "epoch": 1.8109333761063882, + "grad_norm": 6.21875, + "learning_rate": 2.3243728785854737e-07, + "loss": 0.7413094, + "memory(GiB)": 138.1, + "step": 77620, + "train_speed(iter/s)": 0.200622 + }, + { + "acc": 0.75671234, + "epoch": 1.8111666836786773, + "grad_norm": 5.15625, + "learning_rate": 2.318683380673592e-07, + "loss": 0.88378201, + "memory(GiB)": 138.1, + "step": 77630, + "train_speed(iter/s)": 0.200636 + }, + { + "acc": 0.77831712, + "epoch": 1.811399991250966, + "grad_norm": 6.625, + "learning_rate": 2.313000689255801e-07, + "loss": 0.779283, + "memory(GiB)": 138.1, + "step": 77640, + "train_speed(iter/s)": 0.20065 + }, + { + "acc": 0.77758551, + "epoch": 1.811633298823255, + "grad_norm": 5.40625, + "learning_rate": 2.3073248051433127e-07, + "loss": 0.79231982, + "memory(GiB)": 138.1, + "step": 77650, + "train_speed(iter/s)": 0.200664 + }, + { + "acc": 0.79954634, + "epoch": 1.8118666063955438, + "grad_norm": 8.3125, + "learning_rate": 2.301655729146357e-07, + "loss": 0.70971565, + "memory(GiB)": 138.1, + "step": 77660, + "train_speed(iter/s)": 0.200677 + }, + { + "acc": 0.79472017, + "epoch": 1.8120999139678329, + "grad_norm": 3.671875, + "learning_rate": 2.295993462074181e-07, + "loss": 0.72714438, + "memory(GiB)": 138.1, + "step": 77670, + "train_speed(iter/s)": 0.200691 + }, + { + "acc": 0.78361807, + "epoch": 1.8123332215401216, + "grad_norm": 9.3125, + "learning_rate": 2.2903380047350876e-07, + "loss": 0.76192293, + "memory(GiB)": 138.1, + "step": 77680, + "train_speed(iter/s)": 0.200704 + }, + { + "acc": 0.75392261, + "epoch": 1.8125665291124105, + "grad_norm": 5.75, + "learning_rate": 2.284689357936376e-07, + "loss": 0.89919205, + "memory(GiB)": 138.1, + "step": 77690, + "train_speed(iter/s)": 0.200718 + }, + { + "acc": 0.77323561, + "epoch": 1.8127998366846994, + "grad_norm": 7.1875, + "learning_rate": 2.2790475224844067e-07, + "loss": 0.82149096, + "memory(GiB)": 138.1, + "step": 77700, + "train_speed(iter/s)": 0.200732 + }, + { + "acc": 0.7900878, + "epoch": 1.8130331442569882, + "grad_norm": 5.84375, + "learning_rate": 2.2734124991845352e-07, + "loss": 0.74978971, + "memory(GiB)": 138.1, + "step": 77710, + "train_speed(iter/s)": 0.200745 + }, + { + "acc": 0.77167468, + "epoch": 1.8132664518292771, + "grad_norm": 6.09375, + "learning_rate": 2.2677842888411738e-07, + "loss": 0.83236217, + "memory(GiB)": 138.1, + "step": 77720, + "train_speed(iter/s)": 0.200759 + }, + { + "acc": 0.75530715, + "epoch": 1.813499759401566, + "grad_norm": 4.59375, + "learning_rate": 2.26216289225773e-07, + "loss": 0.8862648, + "memory(GiB)": 138.1, + "step": 77730, + "train_speed(iter/s)": 0.200772 + }, + { + "acc": 0.79051895, + "epoch": 1.813733066973855, + "grad_norm": 5.15625, + "learning_rate": 2.256548310236667e-07, + "loss": 0.74030609, + "memory(GiB)": 138.1, + "step": 77740, + "train_speed(iter/s)": 0.200784 + }, + { + "acc": 0.79147606, + "epoch": 1.8139663745461438, + "grad_norm": 4.6875, + "learning_rate": 2.2509405435794662e-07, + "loss": 0.72963748, + "memory(GiB)": 138.1, + "step": 77750, + "train_speed(iter/s)": 0.200797 + }, + { + "acc": 0.77560072, + "epoch": 1.8141996821184327, + "grad_norm": 6.15625, + "learning_rate": 2.2453395930866262e-07, + "loss": 0.8010334, + "memory(GiB)": 138.1, + "step": 77760, + "train_speed(iter/s)": 0.20081 + }, + { + "acc": 0.7697216, + "epoch": 1.8144329896907216, + "grad_norm": 5.25, + "learning_rate": 2.2397454595576906e-07, + "loss": 0.82412586, + "memory(GiB)": 138.1, + "step": 77770, + "train_speed(iter/s)": 0.200822 + }, + { + "acc": 0.79263906, + "epoch": 1.8146662972630105, + "grad_norm": 4.71875, + "learning_rate": 2.2341581437912097e-07, + "loss": 0.73385839, + "memory(GiB)": 138.1, + "step": 77780, + "train_speed(iter/s)": 0.200837 + }, + { + "acc": 0.77369003, + "epoch": 1.8148996048352994, + "grad_norm": 9.0625, + "learning_rate": 2.2285776465847842e-07, + "loss": 0.78555937, + "memory(GiB)": 138.1, + "step": 77790, + "train_speed(iter/s)": 0.20085 + }, + { + "acc": 0.81455336, + "epoch": 1.8151329124075883, + "grad_norm": 5.625, + "learning_rate": 2.2230039687350212e-07, + "loss": 0.65586133, + "memory(GiB)": 138.1, + "step": 77800, + "train_speed(iter/s)": 0.200865 + }, + { + "acc": 0.79386964, + "epoch": 1.8153662199798772, + "grad_norm": 5.46875, + "learning_rate": 2.217437111037557e-07, + "loss": 0.74161119, + "memory(GiB)": 138.1, + "step": 77810, + "train_speed(iter/s)": 0.200878 + }, + { + "acc": 0.784272, + "epoch": 1.8155995275521661, + "grad_norm": 5.90625, + "learning_rate": 2.211877074287072e-07, + "loss": 0.76136417, + "memory(GiB)": 138.1, + "step": 77820, + "train_speed(iter/s)": 0.200892 + }, + { + "acc": 0.78744574, + "epoch": 1.815832835124455, + "grad_norm": 7.90625, + "learning_rate": 2.206323859277254e-07, + "loss": 0.77874732, + "memory(GiB)": 138.1, + "step": 77830, + "train_speed(iter/s)": 0.200905 + }, + { + "acc": 0.78655195, + "epoch": 1.816066142696744, + "grad_norm": 4.03125, + "learning_rate": 2.200777466800813e-07, + "loss": 0.76195555, + "memory(GiB)": 138.1, + "step": 77840, + "train_speed(iter/s)": 0.200918 + }, + { + "acc": 0.77641582, + "epoch": 1.8162994502690328, + "grad_norm": 7.09375, + "learning_rate": 2.1952378976495048e-07, + "loss": 0.82479191, + "memory(GiB)": 138.1, + "step": 77850, + "train_speed(iter/s)": 0.200932 + }, + { + "acc": 0.78586106, + "epoch": 1.8165327578413217, + "grad_norm": 5.34375, + "learning_rate": 2.1897051526141022e-07, + "loss": 0.76681871, + "memory(GiB)": 138.1, + "step": 77860, + "train_speed(iter/s)": 0.200945 + }, + { + "acc": 0.78540492, + "epoch": 1.8167660654136106, + "grad_norm": 7.5625, + "learning_rate": 2.1841792324843958e-07, + "loss": 0.78315992, + "memory(GiB)": 138.1, + "step": 77870, + "train_speed(iter/s)": 0.200959 + }, + { + "acc": 0.78267908, + "epoch": 1.8169993729858995, + "grad_norm": 6.71875, + "learning_rate": 2.1786601380492156e-07, + "loss": 0.79527521, + "memory(GiB)": 138.1, + "step": 77880, + "train_speed(iter/s)": 0.200972 + }, + { + "acc": 0.78243675, + "epoch": 1.8172326805581882, + "grad_norm": 6.21875, + "learning_rate": 2.1731478700964093e-07, + "loss": 0.78953333, + "memory(GiB)": 138.1, + "step": 77890, + "train_speed(iter/s)": 0.200986 + }, + { + "acc": 0.77621989, + "epoch": 1.8174659881304773, + "grad_norm": 7.5625, + "learning_rate": 2.1676424294128474e-07, + "loss": 0.78211274, + "memory(GiB)": 138.1, + "step": 77900, + "train_speed(iter/s)": 0.200999 + }, + { + "acc": 0.79703307, + "epoch": 1.817699295702766, + "grad_norm": 6.5, + "learning_rate": 2.16214381678444e-07, + "loss": 0.70052767, + "memory(GiB)": 138.1, + "step": 77910, + "train_speed(iter/s)": 0.201012 + }, + { + "acc": 0.78100514, + "epoch": 1.8179326032750551, + "grad_norm": 6.1875, + "learning_rate": 2.1566520329960928e-07, + "loss": 0.76739492, + "memory(GiB)": 138.1, + "step": 77920, + "train_speed(iter/s)": 0.201026 + }, + { + "acc": 0.78988457, + "epoch": 1.8181659108473438, + "grad_norm": 6.75, + "learning_rate": 2.1511670788317839e-07, + "loss": 0.75334415, + "memory(GiB)": 138.1, + "step": 77930, + "train_speed(iter/s)": 0.201039 + }, + { + "acc": 0.79587126, + "epoch": 1.818399218419633, + "grad_norm": 6.25, + "learning_rate": 2.1456889550744707e-07, + "loss": 0.7218152, + "memory(GiB)": 138.1, + "step": 77940, + "train_speed(iter/s)": 0.201052 + }, + { + "acc": 0.76245584, + "epoch": 1.8186325259919216, + "grad_norm": 6.53125, + "learning_rate": 2.1402176625061554e-07, + "loss": 0.88262806, + "memory(GiB)": 138.1, + "step": 77950, + "train_speed(iter/s)": 0.201066 + }, + { + "acc": 0.77558794, + "epoch": 1.8188658335642107, + "grad_norm": 6.09375, + "learning_rate": 2.1347532019078686e-07, + "loss": 0.80598965, + "memory(GiB)": 138.1, + "step": 77960, + "train_speed(iter/s)": 0.201079 + }, + { + "acc": 0.77870779, + "epoch": 1.8190991411364994, + "grad_norm": 6.96875, + "learning_rate": 2.1292955740596478e-07, + "loss": 0.79289889, + "memory(GiB)": 138.1, + "step": 77970, + "train_speed(iter/s)": 0.201093 + }, + { + "acc": 0.77847414, + "epoch": 1.8193324487087885, + "grad_norm": 5.71875, + "learning_rate": 2.123844779740586e-07, + "loss": 0.80017395, + "memory(GiB)": 138.1, + "step": 77980, + "train_speed(iter/s)": 0.201106 + }, + { + "acc": 0.77644243, + "epoch": 1.8195657562810772, + "grad_norm": 8.125, + "learning_rate": 2.118400819728772e-07, + "loss": 0.80827723, + "memory(GiB)": 138.1, + "step": 77990, + "train_speed(iter/s)": 0.20112 + }, + { + "acc": 0.79161921, + "epoch": 1.8197990638533663, + "grad_norm": 8.25, + "learning_rate": 2.1129636948013287e-07, + "loss": 0.75301976, + "memory(GiB)": 138.1, + "step": 78000, + "train_speed(iter/s)": 0.201134 + }, + { + "epoch": 1.8197990638533663, + "eval_acc": 0.7446887764996667, + "eval_loss": 0.8044082522392273, + "eval_runtime": 1271.6729, + "eval_samples_per_second": 28.302, + "eval_steps_per_second": 14.151, + "step": 78000 + }, + { + "acc": 0.78124561, + "epoch": 1.820032371425655, + "grad_norm": 5.125, + "learning_rate": 2.1075334057344077e-07, + "loss": 0.79524374, + "memory(GiB)": 138.1, + "step": 78010, + "train_speed(iter/s)": 0.200477 + }, + { + "acc": 0.80066538, + "epoch": 1.8202656789979441, + "grad_norm": 4.03125, + "learning_rate": 2.102109953303183e-07, + "loss": 0.70290136, + "memory(GiB)": 138.1, + "step": 78020, + "train_speed(iter/s)": 0.20049 + }, + { + "acc": 0.76704898, + "epoch": 1.8204989865702328, + "grad_norm": 7.71875, + "learning_rate": 2.0966933382818465e-07, + "loss": 0.86864662, + "memory(GiB)": 138.1, + "step": 78030, + "train_speed(iter/s)": 0.200504 + }, + { + "acc": 0.78443837, + "epoch": 1.820732294142522, + "grad_norm": 6.0, + "learning_rate": 2.0912835614436132e-07, + "loss": 0.78189793, + "memory(GiB)": 138.1, + "step": 78040, + "train_speed(iter/s)": 0.200517 + }, + { + "acc": 0.76247149, + "epoch": 1.8209656017148106, + "grad_norm": 5.9375, + "learning_rate": 2.085880623560743e-07, + "loss": 0.85706472, + "memory(GiB)": 138.1, + "step": 78050, + "train_speed(iter/s)": 0.200531 + }, + { + "acc": 0.79499025, + "epoch": 1.8211989092870997, + "grad_norm": 5.46875, + "learning_rate": 2.080484525404497e-07, + "loss": 0.73801851, + "memory(GiB)": 138.1, + "step": 78060, + "train_speed(iter/s)": 0.200544 + }, + { + "acc": 0.77147074, + "epoch": 1.8214322168593884, + "grad_norm": 4.8125, + "learning_rate": 2.0750952677451585e-07, + "loss": 0.82764015, + "memory(GiB)": 138.1, + "step": 78070, + "train_speed(iter/s)": 0.200558 + }, + { + "acc": 0.78612347, + "epoch": 1.8216655244316773, + "grad_norm": 8.4375, + "learning_rate": 2.069712851352046e-07, + "loss": 0.74784603, + "memory(GiB)": 138.1, + "step": 78080, + "train_speed(iter/s)": 0.200572 + }, + { + "acc": 0.79965754, + "epoch": 1.8218988320039662, + "grad_norm": 7.21875, + "learning_rate": 2.0643372769935055e-07, + "loss": 0.71098585, + "memory(GiB)": 138.1, + "step": 78090, + "train_speed(iter/s)": 0.200585 + }, + { + "acc": 0.79127426, + "epoch": 1.822132139576255, + "grad_norm": 8.9375, + "learning_rate": 2.0589685454368957e-07, + "loss": 0.75095186, + "memory(GiB)": 138.1, + "step": 78100, + "train_speed(iter/s)": 0.200599 + }, + { + "acc": 0.79673557, + "epoch": 1.822365447148544, + "grad_norm": 6.96875, + "learning_rate": 2.0536066574485868e-07, + "loss": 0.72113428, + "memory(GiB)": 138.1, + "step": 78110, + "train_speed(iter/s)": 0.200612 + }, + { + "acc": 0.79205379, + "epoch": 1.8225987547208329, + "grad_norm": 4.1875, + "learning_rate": 2.0482516137940113e-07, + "loss": 0.73350883, + "memory(GiB)": 138.1, + "step": 78120, + "train_speed(iter/s)": 0.200625 + }, + { + "acc": 0.76067524, + "epoch": 1.8228320622931218, + "grad_norm": 5.125, + "learning_rate": 2.0429034152375793e-07, + "loss": 0.87135782, + "memory(GiB)": 138.1, + "step": 78130, + "train_speed(iter/s)": 0.200639 + }, + { + "acc": 0.77361522, + "epoch": 1.8230653698654107, + "grad_norm": 4.78125, + "learning_rate": 2.0375620625427584e-07, + "loss": 0.79716196, + "memory(GiB)": 138.1, + "step": 78140, + "train_speed(iter/s)": 0.200652 + }, + { + "acc": 0.76916037, + "epoch": 1.8232986774376996, + "grad_norm": 4.46875, + "learning_rate": 2.0322275564720163e-07, + "loss": 0.81197529, + "memory(GiB)": 138.1, + "step": 78150, + "train_speed(iter/s)": 0.200665 + }, + { + "acc": 0.78346782, + "epoch": 1.8235319850099885, + "grad_norm": 6.6875, + "learning_rate": 2.0268998977868603e-07, + "loss": 0.79479795, + "memory(GiB)": 138.1, + "step": 78160, + "train_speed(iter/s)": 0.200677 + }, + { + "acc": 0.78680763, + "epoch": 1.8237652925822774, + "grad_norm": 6.25, + "learning_rate": 2.0215790872478048e-07, + "loss": 0.75535355, + "memory(GiB)": 138.1, + "step": 78170, + "train_speed(iter/s)": 0.200691 + }, + { + "acc": 0.78493023, + "epoch": 1.8239986001545663, + "grad_norm": 7.21875, + "learning_rate": 2.016265125614403e-07, + "loss": 0.79210768, + "memory(GiB)": 138.1, + "step": 78180, + "train_speed(iter/s)": 0.200703 + }, + { + "acc": 0.77242117, + "epoch": 1.8242319077268552, + "grad_norm": 5.4375, + "learning_rate": 2.010958013645209e-07, + "loss": 0.8096137, + "memory(GiB)": 138.1, + "step": 78190, + "train_speed(iter/s)": 0.200716 + }, + { + "acc": 0.79208431, + "epoch": 1.824465215299144, + "grad_norm": 4.40625, + "learning_rate": 2.0056577520978171e-07, + "loss": 0.7458868, + "memory(GiB)": 138.1, + "step": 78200, + "train_speed(iter/s)": 0.200729 + }, + { + "acc": 0.76901507, + "epoch": 1.824698522871433, + "grad_norm": 4.90625, + "learning_rate": 2.0003643417288386e-07, + "loss": 0.82966356, + "memory(GiB)": 138.1, + "step": 78210, + "train_speed(iter/s)": 0.200743 + }, + { + "acc": 0.7897049, + "epoch": 1.8249318304437219, + "grad_norm": 7.875, + "learning_rate": 1.9950777832939029e-07, + "loss": 0.74858961, + "memory(GiB)": 138.1, + "step": 78220, + "train_speed(iter/s)": 0.200756 + }, + { + "acc": 0.76055508, + "epoch": 1.8251651380160108, + "grad_norm": 5.96875, + "learning_rate": 1.9897980775476778e-07, + "loss": 0.87515984, + "memory(GiB)": 138.1, + "step": 78230, + "train_speed(iter/s)": 0.20077 + }, + { + "acc": 0.79980593, + "epoch": 1.8253984455882997, + "grad_norm": 6.59375, + "learning_rate": 1.9845252252438274e-07, + "loss": 0.70951848, + "memory(GiB)": 138.1, + "step": 78240, + "train_speed(iter/s)": 0.200783 + }, + { + "acc": 0.79019027, + "epoch": 1.8256317531605886, + "grad_norm": 5.4375, + "learning_rate": 1.9792592271350443e-07, + "loss": 0.73835411, + "memory(GiB)": 138.1, + "step": 78250, + "train_speed(iter/s)": 0.200796 + }, + { + "acc": 0.76858778, + "epoch": 1.8258650607328775, + "grad_norm": 5.375, + "learning_rate": 1.9740000839730656e-07, + "loss": 0.82827644, + "memory(GiB)": 138.1, + "step": 78260, + "train_speed(iter/s)": 0.20081 + }, + { + "acc": 0.77280111, + "epoch": 1.8260983683051664, + "grad_norm": 5.46875, + "learning_rate": 1.9687477965086132e-07, + "loss": 0.81801071, + "memory(GiB)": 138.1, + "step": 78270, + "train_speed(iter/s)": 0.200824 + }, + { + "acc": 0.7763041, + "epoch": 1.826331675877455, + "grad_norm": 6.71875, + "learning_rate": 1.963502365491471e-07, + "loss": 0.82029505, + "memory(GiB)": 138.1, + "step": 78280, + "train_speed(iter/s)": 0.200838 + }, + { + "acc": 0.77627983, + "epoch": 1.8265649834497442, + "grad_norm": 4.65625, + "learning_rate": 1.958263791670406e-07, + "loss": 0.80829754, + "memory(GiB)": 138.1, + "step": 78290, + "train_speed(iter/s)": 0.200851 + }, + { + "acc": 0.80785828, + "epoch": 1.8267982910220328, + "grad_norm": 5.59375, + "learning_rate": 1.953032075793232e-07, + "loss": 0.68023977, + "memory(GiB)": 138.1, + "step": 78300, + "train_speed(iter/s)": 0.200865 + }, + { + "acc": 0.75986423, + "epoch": 1.827031598594322, + "grad_norm": 10.4375, + "learning_rate": 1.9478072186067676e-07, + "loss": 0.88966808, + "memory(GiB)": 138.1, + "step": 78310, + "train_speed(iter/s)": 0.200879 + }, + { + "acc": 0.77503624, + "epoch": 1.8272649061666106, + "grad_norm": 4.15625, + "learning_rate": 1.9425892208568665e-07, + "loss": 0.80476112, + "memory(GiB)": 138.1, + "step": 78320, + "train_speed(iter/s)": 0.200892 + }, + { + "acc": 0.77604232, + "epoch": 1.8274982137388998, + "grad_norm": 6.875, + "learning_rate": 1.9373780832883937e-07, + "loss": 0.82170696, + "memory(GiB)": 138.1, + "step": 78330, + "train_speed(iter/s)": 0.200905 + }, + { + "acc": 0.77696238, + "epoch": 1.8277315213111884, + "grad_norm": 4.34375, + "learning_rate": 1.9321738066452266e-07, + "loss": 0.80580139, + "memory(GiB)": 138.1, + "step": 78340, + "train_speed(iter/s)": 0.200918 + }, + { + "acc": 0.77858658, + "epoch": 1.8279648288834776, + "grad_norm": 9.375, + "learning_rate": 1.9269763916702988e-07, + "loss": 0.80092487, + "memory(GiB)": 138.1, + "step": 78350, + "train_speed(iter/s)": 0.200931 + }, + { + "acc": 0.78533554, + "epoch": 1.8281981364557662, + "grad_norm": 7.9375, + "learning_rate": 1.9217858391055167e-07, + "loss": 0.75122461, + "memory(GiB)": 138.1, + "step": 78360, + "train_speed(iter/s)": 0.200945 + }, + { + "acc": 0.7888196, + "epoch": 1.8284314440280554, + "grad_norm": 5.0625, + "learning_rate": 1.916602149691843e-07, + "loss": 0.74484324, + "memory(GiB)": 138.1, + "step": 78370, + "train_speed(iter/s)": 0.200958 + }, + { + "acc": 0.76858897, + "epoch": 1.828664751600344, + "grad_norm": 4.90625, + "learning_rate": 1.911425324169247e-07, + "loss": 0.82479372, + "memory(GiB)": 138.1, + "step": 78380, + "train_speed(iter/s)": 0.200972 + }, + { + "acc": 0.77135515, + "epoch": 1.8288980591726332, + "grad_norm": 7.0, + "learning_rate": 1.90625536327671e-07, + "loss": 0.83313236, + "memory(GiB)": 138.1, + "step": 78390, + "train_speed(iter/s)": 0.200985 + }, + { + "acc": 0.79445343, + "epoch": 1.8291313667449218, + "grad_norm": 5.5625, + "learning_rate": 1.9010922677522525e-07, + "loss": 0.74220562, + "memory(GiB)": 138.1, + "step": 78400, + "train_speed(iter/s)": 0.200998 + }, + { + "acc": 0.78015499, + "epoch": 1.829364674317211, + "grad_norm": 5.59375, + "learning_rate": 1.8959360383329017e-07, + "loss": 0.79001665, + "memory(GiB)": 138.1, + "step": 78410, + "train_speed(iter/s)": 0.201012 + }, + { + "acc": 0.76368427, + "epoch": 1.8295979818894996, + "grad_norm": 7.40625, + "learning_rate": 1.8907866757547077e-07, + "loss": 0.86557426, + "memory(GiB)": 138.1, + "step": 78420, + "train_speed(iter/s)": 0.201026 + }, + { + "acc": 0.79951019, + "epoch": 1.8298312894617887, + "grad_norm": 6.25, + "learning_rate": 1.8856441807527325e-07, + "loss": 0.71126404, + "memory(GiB)": 138.1, + "step": 78430, + "train_speed(iter/s)": 0.201039 + }, + { + "acc": 0.77727337, + "epoch": 1.8300645970340774, + "grad_norm": 5.78125, + "learning_rate": 1.8805085540610836e-07, + "loss": 0.78865066, + "memory(GiB)": 138.1, + "step": 78440, + "train_speed(iter/s)": 0.201053 + }, + { + "acc": 0.8113802, + "epoch": 1.8302979046063665, + "grad_norm": 14.6875, + "learning_rate": 1.8753797964128573e-07, + "loss": 0.68480577, + "memory(GiB)": 138.1, + "step": 78450, + "train_speed(iter/s)": 0.201065 + }, + { + "acc": 0.78258839, + "epoch": 1.8305312121786552, + "grad_norm": 5.0625, + "learning_rate": 1.870257908540174e-07, + "loss": 0.79557185, + "memory(GiB)": 138.1, + "step": 78460, + "train_speed(iter/s)": 0.201078 + }, + { + "acc": 0.78330555, + "epoch": 1.8307645197509441, + "grad_norm": 7.375, + "learning_rate": 1.8651428911742043e-07, + "loss": 0.77810469, + "memory(GiB)": 138.1, + "step": 78470, + "train_speed(iter/s)": 0.201092 + }, + { + "acc": 0.78139677, + "epoch": 1.830997827323233, + "grad_norm": 4.90625, + "learning_rate": 1.8600347450450974e-07, + "loss": 0.77297707, + "memory(GiB)": 138.1, + "step": 78480, + "train_speed(iter/s)": 0.201106 + }, + { + "acc": 0.79237366, + "epoch": 1.831231134895522, + "grad_norm": 7.4375, + "learning_rate": 1.8549334708820476e-07, + "loss": 0.73578663, + "memory(GiB)": 138.1, + "step": 78490, + "train_speed(iter/s)": 0.201118 + }, + { + "acc": 0.78118787, + "epoch": 1.8314644424678108, + "grad_norm": 3.578125, + "learning_rate": 1.8498390694132562e-07, + "loss": 0.77322454, + "memory(GiB)": 138.1, + "step": 78500, + "train_speed(iter/s)": 0.201132 + }, + { + "epoch": 1.8314644424678108, + "eval_acc": 0.7446793279899608, + "eval_loss": 0.8043855428695679, + "eval_runtime": 1271.7476, + "eval_samples_per_second": 28.3, + "eval_steps_per_second": 14.151, + "step": 78500 + }, + { + "acc": 0.79847651, + "epoch": 1.8316977500400997, + "grad_norm": 6.1875, + "learning_rate": 1.8447515413659578e-07, + "loss": 0.71289806, + "memory(GiB)": 138.1, + "step": 78510, + "train_speed(iter/s)": 0.200478 + }, + { + "acc": 0.76541691, + "epoch": 1.8319310576123886, + "grad_norm": 5.40625, + "learning_rate": 1.8396708874663826e-07, + "loss": 0.84451513, + "memory(GiB)": 138.1, + "step": 78520, + "train_speed(iter/s)": 0.200492 + }, + { + "acc": 0.75975571, + "epoch": 1.8321643651846775, + "grad_norm": 5.875, + "learning_rate": 1.8345971084398007e-07, + "loss": 0.87615738, + "memory(GiB)": 138.1, + "step": 78530, + "train_speed(iter/s)": 0.200504 + }, + { + "acc": 0.76963463, + "epoch": 1.8323976727569664, + "grad_norm": 6.0625, + "learning_rate": 1.829530205010488e-07, + "loss": 0.83870296, + "memory(GiB)": 138.1, + "step": 78540, + "train_speed(iter/s)": 0.200518 + }, + { + "acc": 0.7904026, + "epoch": 1.8326309803292553, + "grad_norm": 5.1875, + "learning_rate": 1.8244701779017438e-07, + "loss": 0.74433708, + "memory(GiB)": 138.1, + "step": 78550, + "train_speed(iter/s)": 0.200531 + }, + { + "acc": 0.78499613, + "epoch": 1.8328642879015442, + "grad_norm": 6.28125, + "learning_rate": 1.8194170278358847e-07, + "loss": 0.77916923, + "memory(GiB)": 138.1, + "step": 78560, + "train_speed(iter/s)": 0.200544 + }, + { + "acc": 0.76727276, + "epoch": 1.833097595473833, + "grad_norm": 4.5, + "learning_rate": 1.8143707555342504e-07, + "loss": 0.84287758, + "memory(GiB)": 138.1, + "step": 78570, + "train_speed(iter/s)": 0.200557 + }, + { + "acc": 0.78795671, + "epoch": 1.833330903046122, + "grad_norm": 3.9375, + "learning_rate": 1.8093313617171927e-07, + "loss": 0.77323265, + "memory(GiB)": 138.1, + "step": 78580, + "train_speed(iter/s)": 0.20057 + }, + { + "acc": 0.76881981, + "epoch": 1.833564210618411, + "grad_norm": 6.40625, + "learning_rate": 1.8042988471040856e-07, + "loss": 0.84690332, + "memory(GiB)": 138.1, + "step": 78590, + "train_speed(iter/s)": 0.200583 + }, + { + "acc": 0.78443508, + "epoch": 1.8337975181906998, + "grad_norm": 4.84375, + "learning_rate": 1.7992732124133106e-07, + "loss": 0.78077679, + "memory(GiB)": 138.1, + "step": 78600, + "train_speed(iter/s)": 0.200596 + }, + { + "acc": 0.79543123, + "epoch": 1.8340308257629887, + "grad_norm": 6.59375, + "learning_rate": 1.7942544583622878e-07, + "loss": 0.73268194, + "memory(GiB)": 138.1, + "step": 78610, + "train_speed(iter/s)": 0.200609 + }, + { + "acc": 0.77256894, + "epoch": 1.8342641333352776, + "grad_norm": 4.75, + "learning_rate": 1.7892425856674334e-07, + "loss": 0.83725958, + "memory(GiB)": 138.1, + "step": 78620, + "train_speed(iter/s)": 0.200622 + }, + { + "acc": 0.78704338, + "epoch": 1.8344974409075665, + "grad_norm": 6.09375, + "learning_rate": 1.7842375950442025e-07, + "loss": 0.75788522, + "memory(GiB)": 138.1, + "step": 78630, + "train_speed(iter/s)": 0.200636 + }, + { + "acc": 0.76816063, + "epoch": 1.8347307484798554, + "grad_norm": 5.96875, + "learning_rate": 1.7792394872070407e-07, + "loss": 0.82397146, + "memory(GiB)": 138.1, + "step": 78640, + "train_speed(iter/s)": 0.200649 + }, + { + "acc": 0.79935145, + "epoch": 1.834964056052144, + "grad_norm": 6.21875, + "learning_rate": 1.7742482628694379e-07, + "loss": 0.72347355, + "memory(GiB)": 138.1, + "step": 78650, + "train_speed(iter/s)": 0.200663 + }, + { + "acc": 0.78173332, + "epoch": 1.8351973636244332, + "grad_norm": 6.75, + "learning_rate": 1.76926392274388e-07, + "loss": 0.78438444, + "memory(GiB)": 138.1, + "step": 78660, + "train_speed(iter/s)": 0.200676 + }, + { + "acc": 0.79420872, + "epoch": 1.8354306711967219, + "grad_norm": 5.6875, + "learning_rate": 1.7642864675418925e-07, + "loss": 0.74262061, + "memory(GiB)": 138.1, + "step": 78670, + "train_speed(iter/s)": 0.200688 + }, + { + "acc": 0.78799567, + "epoch": 1.835663978769011, + "grad_norm": 4.625, + "learning_rate": 1.7593158979739955e-07, + "loss": 0.75903831, + "memory(GiB)": 138.1, + "step": 78680, + "train_speed(iter/s)": 0.200701 + }, + { + "acc": 0.77767072, + "epoch": 1.8358972863412997, + "grad_norm": 5.71875, + "learning_rate": 1.7543522147497382e-07, + "loss": 0.79137626, + "memory(GiB)": 138.1, + "step": 78690, + "train_speed(iter/s)": 0.200715 + }, + { + "acc": 0.79292173, + "epoch": 1.8361305939135888, + "grad_norm": 4.84375, + "learning_rate": 1.7493954185776928e-07, + "loss": 0.74533262, + "memory(GiB)": 138.1, + "step": 78700, + "train_speed(iter/s)": 0.200728 + }, + { + "acc": 0.79466348, + "epoch": 1.8363639014858775, + "grad_norm": 4.1875, + "learning_rate": 1.7444455101654267e-07, + "loss": 0.73135486, + "memory(GiB)": 138.1, + "step": 78710, + "train_speed(iter/s)": 0.200741 + }, + { + "acc": 0.78591375, + "epoch": 1.8365972090581666, + "grad_norm": 3.9375, + "learning_rate": 1.7395024902195522e-07, + "loss": 0.77585912, + "memory(GiB)": 138.1, + "step": 78720, + "train_speed(iter/s)": 0.200755 + }, + { + "acc": 0.77571945, + "epoch": 1.8368305166304553, + "grad_norm": 5.375, + "learning_rate": 1.7345663594456775e-07, + "loss": 0.79873123, + "memory(GiB)": 138.1, + "step": 78730, + "train_speed(iter/s)": 0.200768 + }, + { + "acc": 0.80318031, + "epoch": 1.8370638242027444, + "grad_norm": 6.53125, + "learning_rate": 1.7296371185484328e-07, + "loss": 0.6877492, + "memory(GiB)": 138.1, + "step": 78740, + "train_speed(iter/s)": 0.20078 + }, + { + "acc": 0.77101789, + "epoch": 1.837297131775033, + "grad_norm": 9.375, + "learning_rate": 1.7247147682314724e-07, + "loss": 0.8241971, + "memory(GiB)": 138.1, + "step": 78750, + "train_speed(iter/s)": 0.200793 + }, + { + "acc": 0.78852234, + "epoch": 1.8375304393473222, + "grad_norm": 6.8125, + "learning_rate": 1.7197993091974452e-07, + "loss": 0.75911512, + "memory(GiB)": 138.1, + "step": 78760, + "train_speed(iter/s)": 0.200806 + }, + { + "acc": 0.77903395, + "epoch": 1.8377637469196109, + "grad_norm": 5.3125, + "learning_rate": 1.7148907421480455e-07, + "loss": 0.7937254, + "memory(GiB)": 138.1, + "step": 78770, + "train_speed(iter/s)": 0.20082 + }, + { + "acc": 0.78879142, + "epoch": 1.8379970544919, + "grad_norm": 4.03125, + "learning_rate": 1.7099890677839626e-07, + "loss": 0.77068634, + "memory(GiB)": 138.1, + "step": 78780, + "train_speed(iter/s)": 0.200833 + }, + { + "acc": 0.78774767, + "epoch": 1.8382303620641887, + "grad_norm": 7.6875, + "learning_rate": 1.7050942868049147e-07, + "loss": 0.75463233, + "memory(GiB)": 138.1, + "step": 78790, + "train_speed(iter/s)": 0.200847 + }, + { + "acc": 0.78863096, + "epoch": 1.8384636696364778, + "grad_norm": 4.09375, + "learning_rate": 1.7002063999096208e-07, + "loss": 0.74385252, + "memory(GiB)": 138.1, + "step": 78800, + "train_speed(iter/s)": 0.20086 + }, + { + "acc": 0.77340384, + "epoch": 1.8386969772087665, + "grad_norm": 4.53125, + "learning_rate": 1.695325407795839e-07, + "loss": 0.82360716, + "memory(GiB)": 138.1, + "step": 78810, + "train_speed(iter/s)": 0.200874 + }, + { + "acc": 0.78802562, + "epoch": 1.8389302847810556, + "grad_norm": 5.9375, + "learning_rate": 1.6904513111603238e-07, + "loss": 0.76730204, + "memory(GiB)": 138.1, + "step": 78820, + "train_speed(iter/s)": 0.200886 + }, + { + "acc": 0.78368859, + "epoch": 1.8391635923533443, + "grad_norm": 4.46875, + "learning_rate": 1.685584110698846e-07, + "loss": 0.77572279, + "memory(GiB)": 138.1, + "step": 78830, + "train_speed(iter/s)": 0.2009 + }, + { + "acc": 0.79617405, + "epoch": 1.8393968999256334, + "grad_norm": 5.34375, + "learning_rate": 1.680723807106205e-07, + "loss": 0.70224414, + "memory(GiB)": 138.1, + "step": 78840, + "train_speed(iter/s)": 0.200912 + }, + { + "acc": 0.77466588, + "epoch": 1.839630207497922, + "grad_norm": 5.46875, + "learning_rate": 1.675870401076196e-07, + "loss": 0.81798153, + "memory(GiB)": 138.1, + "step": 78850, + "train_speed(iter/s)": 0.200925 + }, + { + "acc": 0.78734894, + "epoch": 1.839863515070211, + "grad_norm": 4.5625, + "learning_rate": 1.6710238933016597e-07, + "loss": 0.7503355, + "memory(GiB)": 138.1, + "step": 78860, + "train_speed(iter/s)": 0.200938 + }, + { + "acc": 0.79105768, + "epoch": 1.8400968226424999, + "grad_norm": 5.1875, + "learning_rate": 1.6661842844744148e-07, + "loss": 0.74174986, + "memory(GiB)": 138.1, + "step": 78870, + "train_speed(iter/s)": 0.200951 + }, + { + "acc": 0.76048555, + "epoch": 1.8403301302147888, + "grad_norm": 5.09375, + "learning_rate": 1.6613515752853303e-07, + "loss": 0.8763361, + "memory(GiB)": 138.1, + "step": 78880, + "train_speed(iter/s)": 0.200964 + }, + { + "acc": 0.77112312, + "epoch": 1.8405634377870776, + "grad_norm": 6.9375, + "learning_rate": 1.6565257664242606e-07, + "loss": 0.79881568, + "memory(GiB)": 138.1, + "step": 78890, + "train_speed(iter/s)": 0.200977 + }, + { + "acc": 0.76842785, + "epoch": 1.8407967453593665, + "grad_norm": 5.03125, + "learning_rate": 1.6517068585800932e-07, + "loss": 0.82623138, + "memory(GiB)": 138.1, + "step": 78900, + "train_speed(iter/s)": 0.200991 + }, + { + "acc": 0.78424797, + "epoch": 1.8410300529316554, + "grad_norm": 5.28125, + "learning_rate": 1.646894852440728e-07, + "loss": 0.76516953, + "memory(GiB)": 138.1, + "step": 78910, + "train_speed(iter/s)": 0.201004 + }, + { + "acc": 0.79357691, + "epoch": 1.8412633605039443, + "grad_norm": 5.59375, + "learning_rate": 1.642089748693071e-07, + "loss": 0.73923221, + "memory(GiB)": 138.1, + "step": 78920, + "train_speed(iter/s)": 0.201018 + }, + { + "acc": 0.77890062, + "epoch": 1.8414966680762332, + "grad_norm": 4.4375, + "learning_rate": 1.6372915480230622e-07, + "loss": 0.81259813, + "memory(GiB)": 138.1, + "step": 78930, + "train_speed(iter/s)": 0.201031 + }, + { + "acc": 0.80381775, + "epoch": 1.8417299756485221, + "grad_norm": 6.09375, + "learning_rate": 1.6325002511156262e-07, + "loss": 0.7070673, + "memory(GiB)": 138.1, + "step": 78940, + "train_speed(iter/s)": 0.201044 + }, + { + "acc": 0.77546864, + "epoch": 1.841963283220811, + "grad_norm": 4.71875, + "learning_rate": 1.6277158586547325e-07, + "loss": 0.81202116, + "memory(GiB)": 138.1, + "step": 78950, + "train_speed(iter/s)": 0.201056 + }, + { + "acc": 0.78609886, + "epoch": 1.8421965907931, + "grad_norm": 5.53125, + "learning_rate": 1.6229383713233516e-07, + "loss": 0.76256151, + "memory(GiB)": 138.1, + "step": 78960, + "train_speed(iter/s)": 0.20107 + }, + { + "acc": 0.77121072, + "epoch": 1.8424298983653888, + "grad_norm": 5.15625, + "learning_rate": 1.6181677898034597e-07, + "loss": 0.82044077, + "memory(GiB)": 138.1, + "step": 78970, + "train_speed(iter/s)": 0.201083 + }, + { + "acc": 0.7971004, + "epoch": 1.8426632059376777, + "grad_norm": 7.03125, + "learning_rate": 1.6134041147760738e-07, + "loss": 0.70855894, + "memory(GiB)": 138.1, + "step": 78980, + "train_speed(iter/s)": 0.201096 + }, + { + "acc": 0.76304283, + "epoch": 1.8428965135099666, + "grad_norm": 6.03125, + "learning_rate": 1.608647346921177e-07, + "loss": 0.84548931, + "memory(GiB)": 138.1, + "step": 78990, + "train_speed(iter/s)": 0.201108 + }, + { + "acc": 0.76878996, + "epoch": 1.8431298210822555, + "grad_norm": 6.9375, + "learning_rate": 1.6038974869178214e-07, + "loss": 0.8415431, + "memory(GiB)": 138.1, + "step": 79000, + "train_speed(iter/s)": 0.201122 + }, + { + "epoch": 1.8431298210822555, + "eval_acc": 0.744701748182483, + "eval_loss": 0.8044163584709167, + "eval_runtime": 1270.82, + "eval_samples_per_second": 28.321, + "eval_steps_per_second": 14.161, + "step": 79000 + }, + { + "acc": 0.7843946, + "epoch": 1.8433631286545444, + "grad_norm": 4.59375, + "learning_rate": 1.5991545354440363e-07, + "loss": 0.77073593, + "memory(GiB)": 138.1, + "step": 79010, + "train_speed(iter/s)": 0.200473 + }, + { + "acc": 0.78733177, + "epoch": 1.8435964362268333, + "grad_norm": 5.3125, + "learning_rate": 1.594418493176886e-07, + "loss": 0.76721573, + "memory(GiB)": 138.1, + "step": 79020, + "train_speed(iter/s)": 0.200486 + }, + { + "acc": 0.772614, + "epoch": 1.8438297437991222, + "grad_norm": 4.53125, + "learning_rate": 1.5896893607924346e-07, + "loss": 0.82392635, + "memory(GiB)": 138.1, + "step": 79030, + "train_speed(iter/s)": 0.200499 + }, + { + "acc": 0.78404236, + "epoch": 1.844063051371411, + "grad_norm": 5.25, + "learning_rate": 1.5849671389657594e-07, + "loss": 0.7704998, + "memory(GiB)": 138.1, + "step": 79040, + "train_speed(iter/s)": 0.200512 + }, + { + "acc": 0.77602654, + "epoch": 1.8442963589437, + "grad_norm": 6.9375, + "learning_rate": 1.58025182837096e-07, + "loss": 0.80820026, + "memory(GiB)": 138.1, + "step": 79050, + "train_speed(iter/s)": 0.200526 + }, + { + "acc": 0.79040246, + "epoch": 1.8445296665159887, + "grad_norm": 4.125, + "learning_rate": 1.5755434296811478e-07, + "loss": 0.75322657, + "memory(GiB)": 138.1, + "step": 79060, + "train_speed(iter/s)": 0.200539 + }, + { + "acc": 0.76732149, + "epoch": 1.8447629740882778, + "grad_norm": 4.59375, + "learning_rate": 1.5708419435684463e-07, + "loss": 0.85027685, + "memory(GiB)": 138.1, + "step": 79070, + "train_speed(iter/s)": 0.200552 + }, + { + "acc": 0.77900348, + "epoch": 1.8449962816605665, + "grad_norm": 7.15625, + "learning_rate": 1.5661473707039852e-07, + "loss": 0.7863019, + "memory(GiB)": 138.1, + "step": 79080, + "train_speed(iter/s)": 0.200564 + }, + { + "acc": 0.79656229, + "epoch": 1.8452295892328556, + "grad_norm": 4.15625, + "learning_rate": 1.561459711757918e-07, + "loss": 0.73082271, + "memory(GiB)": 138.1, + "step": 79090, + "train_speed(iter/s)": 0.200578 + }, + { + "acc": 0.78649101, + "epoch": 1.8454628968051443, + "grad_norm": 5.34375, + "learning_rate": 1.5567789673994026e-07, + "loss": 0.76644926, + "memory(GiB)": 138.1, + "step": 79100, + "train_speed(iter/s)": 0.20059 + }, + { + "acc": 0.76249952, + "epoch": 1.8456962043774334, + "grad_norm": 4.8125, + "learning_rate": 1.5521051382966224e-07, + "loss": 0.85472183, + "memory(GiB)": 138.1, + "step": 79110, + "train_speed(iter/s)": 0.200604 + }, + { + "acc": 0.79876027, + "epoch": 1.845929511949722, + "grad_norm": 5.15625, + "learning_rate": 1.5474382251167597e-07, + "loss": 0.70353813, + "memory(GiB)": 138.1, + "step": 79120, + "train_speed(iter/s)": 0.200616 + }, + { + "acc": 0.77245951, + "epoch": 1.8461628195220112, + "grad_norm": 5.0, + "learning_rate": 1.542778228526004e-07, + "loss": 0.79053502, + "memory(GiB)": 138.1, + "step": 79130, + "train_speed(iter/s)": 0.20063 + }, + { + "acc": 0.76596804, + "epoch": 1.8463961270943, + "grad_norm": 5.21875, + "learning_rate": 1.53812514918959e-07, + "loss": 0.83255501, + "memory(GiB)": 138.1, + "step": 79140, + "train_speed(iter/s)": 0.200644 + }, + { + "acc": 0.76959438, + "epoch": 1.846629434666589, + "grad_norm": 4.5625, + "learning_rate": 1.5334789877717248e-07, + "loss": 0.82380829, + "memory(GiB)": 138.1, + "step": 79150, + "train_speed(iter/s)": 0.200657 + }, + { + "acc": 0.76817465, + "epoch": 1.8468627422388777, + "grad_norm": 5.03125, + "learning_rate": 1.5288397449356617e-07, + "loss": 0.84938889, + "memory(GiB)": 138.1, + "step": 79160, + "train_speed(iter/s)": 0.20067 + }, + { + "acc": 0.77281103, + "epoch": 1.8470960498111668, + "grad_norm": 4.96875, + "learning_rate": 1.524207421343643e-07, + "loss": 0.81317339, + "memory(GiB)": 138.1, + "step": 79170, + "train_speed(iter/s)": 0.200683 + }, + { + "acc": 0.78833442, + "epoch": 1.8473293573834555, + "grad_norm": 4.78125, + "learning_rate": 1.5195820176569288e-07, + "loss": 0.73116922, + "memory(GiB)": 138.1, + "step": 79180, + "train_speed(iter/s)": 0.200696 + }, + { + "acc": 0.77847328, + "epoch": 1.8475626649557446, + "grad_norm": 5.8125, + "learning_rate": 1.5149635345358017e-07, + "loss": 0.80921946, + "memory(GiB)": 138.1, + "step": 79190, + "train_speed(iter/s)": 0.200708 + }, + { + "acc": 0.77257528, + "epoch": 1.8477959725280333, + "grad_norm": 5.40625, + "learning_rate": 1.51035197263954e-07, + "loss": 0.83006363, + "memory(GiB)": 138.1, + "step": 79200, + "train_speed(iter/s)": 0.200722 + }, + { + "acc": 0.78382149, + "epoch": 1.8480292801003224, + "grad_norm": 4.9375, + "learning_rate": 1.5057473326264614e-07, + "loss": 0.76725683, + "memory(GiB)": 138.1, + "step": 79210, + "train_speed(iter/s)": 0.200735 + }, + { + "acc": 0.79515305, + "epoch": 1.848262587672611, + "grad_norm": 4.6875, + "learning_rate": 1.5011496151538462e-07, + "loss": 0.73181725, + "memory(GiB)": 138.1, + "step": 79220, + "train_speed(iter/s)": 0.200748 + }, + { + "acc": 0.77747626, + "epoch": 1.8484958952449, + "grad_norm": 5.5, + "learning_rate": 1.4965588208780468e-07, + "loss": 0.77987266, + "memory(GiB)": 138.1, + "step": 79230, + "train_speed(iter/s)": 0.200761 + }, + { + "acc": 0.77652779, + "epoch": 1.8487292028171889, + "grad_norm": 5.75, + "learning_rate": 1.491974950454378e-07, + "loss": 0.80380249, + "memory(GiB)": 138.1, + "step": 79240, + "train_speed(iter/s)": 0.200775 + }, + { + "acc": 0.78755636, + "epoch": 1.8489625103894778, + "grad_norm": 4.4375, + "learning_rate": 1.4873980045371938e-07, + "loss": 0.75492868, + "memory(GiB)": 138.1, + "step": 79250, + "train_speed(iter/s)": 0.200787 + }, + { + "acc": 0.80663643, + "epoch": 1.8491958179617667, + "grad_norm": 4.90625, + "learning_rate": 1.4828279837798553e-07, + "loss": 0.68906574, + "memory(GiB)": 138.1, + "step": 79260, + "train_speed(iter/s)": 0.200801 + }, + { + "acc": 0.78660917, + "epoch": 1.8494291255340556, + "grad_norm": 4.96875, + "learning_rate": 1.478264888834724e-07, + "loss": 0.76002073, + "memory(GiB)": 138.1, + "step": 79270, + "train_speed(iter/s)": 0.200814 + }, + { + "acc": 0.77109876, + "epoch": 1.8496624331063445, + "grad_norm": 6.40625, + "learning_rate": 1.4737087203531896e-07, + "loss": 0.81629725, + "memory(GiB)": 138.1, + "step": 79280, + "train_speed(iter/s)": 0.200827 + }, + { + "acc": 0.78523669, + "epoch": 1.8498957406786334, + "grad_norm": 7.78125, + "learning_rate": 1.4691594789856268e-07, + "loss": 0.76274586, + "memory(GiB)": 138.1, + "step": 79290, + "train_speed(iter/s)": 0.20084 + }, + { + "acc": 0.78136597, + "epoch": 1.8501290482509223, + "grad_norm": 5.65625, + "learning_rate": 1.46461716538146e-07, + "loss": 0.79236145, + "memory(GiB)": 138.1, + "step": 79300, + "train_speed(iter/s)": 0.200852 + }, + { + "acc": 0.77388811, + "epoch": 1.8503623558232112, + "grad_norm": 6.90625, + "learning_rate": 1.4600817801890933e-07, + "loss": 0.82036371, + "memory(GiB)": 138.1, + "step": 79310, + "train_speed(iter/s)": 0.200865 + }, + { + "acc": 0.74084215, + "epoch": 1.8505956633955, + "grad_norm": 5.65625, + "learning_rate": 1.4555533240559526e-07, + "loss": 0.93160048, + "memory(GiB)": 138.1, + "step": 79320, + "train_speed(iter/s)": 0.200878 + }, + { + "acc": 0.77508183, + "epoch": 1.850828970967789, + "grad_norm": 6.1875, + "learning_rate": 1.4510317976284715e-07, + "loss": 0.81556673, + "memory(GiB)": 138.1, + "step": 79330, + "train_speed(iter/s)": 0.200891 + }, + { + "acc": 0.7719595, + "epoch": 1.8510622785400779, + "grad_norm": 9.0, + "learning_rate": 1.4465172015520945e-07, + "loss": 0.82613068, + "memory(GiB)": 138.1, + "step": 79340, + "train_speed(iter/s)": 0.200903 + }, + { + "acc": 0.79376431, + "epoch": 1.8512955861123668, + "grad_norm": 4.75, + "learning_rate": 1.4420095364712838e-07, + "loss": 0.72482729, + "memory(GiB)": 138.1, + "step": 79350, + "train_speed(iter/s)": 0.200915 + }, + { + "acc": 0.76974692, + "epoch": 1.8515288936846557, + "grad_norm": 6.25, + "learning_rate": 1.4375088030295027e-07, + "loss": 0.83236866, + "memory(GiB)": 138.1, + "step": 79360, + "train_speed(iter/s)": 0.200928 + }, + { + "acc": 0.79396596, + "epoch": 1.8517622012569446, + "grad_norm": 4.0, + "learning_rate": 1.433015001869237e-07, + "loss": 0.74162588, + "memory(GiB)": 138.1, + "step": 79370, + "train_speed(iter/s)": 0.20094 + }, + { + "acc": 0.76310349, + "epoch": 1.8519955088292335, + "grad_norm": 5.6875, + "learning_rate": 1.428528133631968e-07, + "loss": 0.85866594, + "memory(GiB)": 138.1, + "step": 79380, + "train_speed(iter/s)": 0.200953 + }, + { + "acc": 0.75772862, + "epoch": 1.8522288164015224, + "grad_norm": 6.75, + "learning_rate": 1.4240481989581944e-07, + "loss": 0.87741089, + "memory(GiB)": 138.1, + "step": 79390, + "train_speed(iter/s)": 0.200967 + }, + { + "acc": 0.77879143, + "epoch": 1.8524621239738113, + "grad_norm": 5.125, + "learning_rate": 1.4195751984874383e-07, + "loss": 0.79758248, + "memory(GiB)": 138.1, + "step": 79400, + "train_speed(iter/s)": 0.20098 + }, + { + "acc": 0.78456173, + "epoch": 1.8526954315461002, + "grad_norm": 5.5, + "learning_rate": 1.4151091328582e-07, + "loss": 0.75210719, + "memory(GiB)": 138.1, + "step": 79410, + "train_speed(iter/s)": 0.200993 + }, + { + "acc": 0.77924018, + "epoch": 1.852928739118389, + "grad_norm": 4.03125, + "learning_rate": 1.410650002708025e-07, + "loss": 0.80060263, + "memory(GiB)": 138.1, + "step": 79420, + "train_speed(iter/s)": 0.201006 + }, + { + "acc": 0.78018885, + "epoch": 1.8531620466906777, + "grad_norm": 5.03125, + "learning_rate": 1.4061978086734484e-07, + "loss": 0.77695847, + "memory(GiB)": 138.1, + "step": 79430, + "train_speed(iter/s)": 0.20102 + }, + { + "acc": 0.76331925, + "epoch": 1.8533953542629669, + "grad_norm": 6.84375, + "learning_rate": 1.4017525513900175e-07, + "loss": 0.85669298, + "memory(GiB)": 138.1, + "step": 79440, + "train_speed(iter/s)": 0.201033 + }, + { + "acc": 0.79373131, + "epoch": 1.8536286618352555, + "grad_norm": 10.5, + "learning_rate": 1.3973142314922862e-07, + "loss": 0.73819771, + "memory(GiB)": 138.1, + "step": 79450, + "train_speed(iter/s)": 0.201046 + }, + { + "acc": 0.77359915, + "epoch": 1.8538619694075447, + "grad_norm": 6.625, + "learning_rate": 1.3928828496138358e-07, + "loss": 0.83192682, + "memory(GiB)": 138.1, + "step": 79460, + "train_speed(iter/s)": 0.201058 + }, + { + "acc": 0.79424295, + "epoch": 1.8540952769798333, + "grad_norm": 7.75, + "learning_rate": 1.3884584063872386e-07, + "loss": 0.72979498, + "memory(GiB)": 138.1, + "step": 79470, + "train_speed(iter/s)": 0.201071 + }, + { + "acc": 0.79742351, + "epoch": 1.8543285845521225, + "grad_norm": 4.625, + "learning_rate": 1.3840409024440726e-07, + "loss": 0.72036037, + "memory(GiB)": 138.1, + "step": 79480, + "train_speed(iter/s)": 0.201084 + }, + { + "acc": 0.76095319, + "epoch": 1.8545618921244111, + "grad_norm": 6.125, + "learning_rate": 1.3796303384149557e-07, + "loss": 0.855832, + "memory(GiB)": 138.1, + "step": 79490, + "train_speed(iter/s)": 0.201098 + }, + { + "acc": 0.768857, + "epoch": 1.8547951996967003, + "grad_norm": 4.84375, + "learning_rate": 1.375226714929473e-07, + "loss": 0.83169584, + "memory(GiB)": 138.1, + "step": 79500, + "train_speed(iter/s)": 0.201111 + }, + { + "epoch": 1.8547951996967003, + "eval_acc": 0.744718723471107, + "eval_loss": 0.8043954968452454, + "eval_runtime": 1271.2392, + "eval_samples_per_second": 28.312, + "eval_steps_per_second": 14.156, + "step": 79500 + }, + { + "acc": 0.80413227, + "epoch": 1.855028507268989, + "grad_norm": 4.5625, + "learning_rate": 1.3708300326162605e-07, + "loss": 0.69383755, + "memory(GiB)": 138.1, + "step": 79510, + "train_speed(iter/s)": 0.200467 + }, + { + "acc": 0.77757564, + "epoch": 1.855261814841278, + "grad_norm": 5.1875, + "learning_rate": 1.3664402921029328e-07, + "loss": 0.78825073, + "memory(GiB)": 138.1, + "step": 79520, + "train_speed(iter/s)": 0.20048 + }, + { + "acc": 0.75611916, + "epoch": 1.8554951224135667, + "grad_norm": 4.21875, + "learning_rate": 1.3620574940161168e-07, + "loss": 0.88193607, + "memory(GiB)": 138.1, + "step": 79530, + "train_speed(iter/s)": 0.200492 + }, + { + "acc": 0.78447027, + "epoch": 1.8557284299858559, + "grad_norm": 5.59375, + "learning_rate": 1.357681638981473e-07, + "loss": 0.78287034, + "memory(GiB)": 138.1, + "step": 79540, + "train_speed(iter/s)": 0.200504 + }, + { + "acc": 0.78698115, + "epoch": 1.8559617375581445, + "grad_norm": 6.875, + "learning_rate": 1.3533127276236458e-07, + "loss": 0.78185997, + "memory(GiB)": 138.1, + "step": 79550, + "train_speed(iter/s)": 0.200518 + }, + { + "acc": 0.77826419, + "epoch": 1.8561950451304337, + "grad_norm": 4.4375, + "learning_rate": 1.348950760566292e-07, + "loss": 0.79370699, + "memory(GiB)": 138.1, + "step": 79560, + "train_speed(iter/s)": 0.200531 + }, + { + "acc": 0.78184977, + "epoch": 1.8564283527027223, + "grad_norm": 5.78125, + "learning_rate": 1.3445957384320808e-07, + "loss": 0.77437334, + "memory(GiB)": 138.1, + "step": 79570, + "train_speed(iter/s)": 0.200545 + }, + { + "acc": 0.76415167, + "epoch": 1.8566616602750114, + "grad_norm": 6.125, + "learning_rate": 1.340247661842692e-07, + "loss": 0.85826664, + "memory(GiB)": 138.1, + "step": 79580, + "train_speed(iter/s)": 0.200557 + }, + { + "acc": 0.76513901, + "epoch": 1.8568949678473001, + "grad_norm": 5.625, + "learning_rate": 1.335906531418818e-07, + "loss": 0.84353552, + "memory(GiB)": 138.1, + "step": 79590, + "train_speed(iter/s)": 0.20057 + }, + { + "acc": 0.79837427, + "epoch": 1.8571282754195892, + "grad_norm": 4.8125, + "learning_rate": 1.3315723477801467e-07, + "loss": 0.71099424, + "memory(GiB)": 138.1, + "step": 79600, + "train_speed(iter/s)": 0.200583 + }, + { + "acc": 0.77831469, + "epoch": 1.857361582991878, + "grad_norm": 4.90625, + "learning_rate": 1.3272451115453888e-07, + "loss": 0.80567703, + "memory(GiB)": 138.1, + "step": 79610, + "train_speed(iter/s)": 0.200597 + }, + { + "acc": 0.77662849, + "epoch": 1.8575948905641668, + "grad_norm": 5.875, + "learning_rate": 1.32292482333225e-07, + "loss": 0.80015821, + "memory(GiB)": 138.1, + "step": 79620, + "train_speed(iter/s)": 0.20061 + }, + { + "acc": 0.78590217, + "epoch": 1.8578281981364557, + "grad_norm": 5.0625, + "learning_rate": 1.3186114837574538e-07, + "loss": 0.75417018, + "memory(GiB)": 138.1, + "step": 79630, + "train_speed(iter/s)": 0.200624 + }, + { + "acc": 0.77556219, + "epoch": 1.8580615057087446, + "grad_norm": 4.78125, + "learning_rate": 1.3143050934367187e-07, + "loss": 0.81039896, + "memory(GiB)": 138.1, + "step": 79640, + "train_speed(iter/s)": 0.200637 + }, + { + "acc": 0.78844309, + "epoch": 1.8582948132810335, + "grad_norm": 6.4375, + "learning_rate": 1.310005652984797e-07, + "loss": 0.75361748, + "memory(GiB)": 138.1, + "step": 79650, + "train_speed(iter/s)": 0.20065 + }, + { + "acc": 0.78295665, + "epoch": 1.8585281208533224, + "grad_norm": 11.5, + "learning_rate": 1.3057131630154208e-07, + "loss": 0.7822053, + "memory(GiB)": 138.1, + "step": 79660, + "train_speed(iter/s)": 0.200663 + }, + { + "acc": 0.76080747, + "epoch": 1.8587614284256113, + "grad_norm": 5.53125, + "learning_rate": 1.3014276241413438e-07, + "loss": 0.84956684, + "memory(GiB)": 138.1, + "step": 79670, + "train_speed(iter/s)": 0.200676 + }, + { + "acc": 0.78363895, + "epoch": 1.8589947359979002, + "grad_norm": 4.53125, + "learning_rate": 1.2971490369743323e-07, + "loss": 0.79574165, + "memory(GiB)": 138.1, + "step": 79680, + "train_speed(iter/s)": 0.200689 + }, + { + "acc": 0.77177901, + "epoch": 1.8592280435701891, + "grad_norm": 6.0, + "learning_rate": 1.2928774021251368e-07, + "loss": 0.8176857, + "memory(GiB)": 138.1, + "step": 79690, + "train_speed(iter/s)": 0.200702 + }, + { + "acc": 0.75965395, + "epoch": 1.859461351142478, + "grad_norm": 5.53125, + "learning_rate": 1.288612720203547e-07, + "loss": 0.86375732, + "memory(GiB)": 138.1, + "step": 79700, + "train_speed(iter/s)": 0.200715 + }, + { + "acc": 0.76926231, + "epoch": 1.859694658714767, + "grad_norm": 7.09375, + "learning_rate": 1.284354991818343e-07, + "loss": 0.83586464, + "memory(GiB)": 138.1, + "step": 79710, + "train_speed(iter/s)": 0.200729 + }, + { + "acc": 0.78583107, + "epoch": 1.8599279662870558, + "grad_norm": 5.125, + "learning_rate": 1.2801042175773104e-07, + "loss": 0.76661096, + "memory(GiB)": 138.1, + "step": 79720, + "train_speed(iter/s)": 0.200742 + }, + { + "acc": 0.7856154, + "epoch": 1.8601612738593447, + "grad_norm": 4.96875, + "learning_rate": 1.2758603980872419e-07, + "loss": 0.76946712, + "memory(GiB)": 138.1, + "step": 79730, + "train_speed(iter/s)": 0.200755 + }, + { + "acc": 0.78945646, + "epoch": 1.8603945814316336, + "grad_norm": 5.3125, + "learning_rate": 1.2716235339539585e-07, + "loss": 0.76702728, + "memory(GiB)": 138.1, + "step": 79740, + "train_speed(iter/s)": 0.200768 + }, + { + "acc": 0.78451262, + "epoch": 1.8606278890039225, + "grad_norm": 5.5625, + "learning_rate": 1.267393625782254e-07, + "loss": 0.75838819, + "memory(GiB)": 138.1, + "step": 79750, + "train_speed(iter/s)": 0.20078 + }, + { + "acc": 0.77074156, + "epoch": 1.8608611965762114, + "grad_norm": 4.71875, + "learning_rate": 1.263170674175951e-07, + "loss": 0.83515434, + "memory(GiB)": 138.1, + "step": 79760, + "train_speed(iter/s)": 0.200793 + }, + { + "acc": 0.77783523, + "epoch": 1.8610945041485003, + "grad_norm": 5.5625, + "learning_rate": 1.2589546797378783e-07, + "loss": 0.7946856, + "memory(GiB)": 138.1, + "step": 79770, + "train_speed(iter/s)": 0.200807 + }, + { + "acc": 0.76884985, + "epoch": 1.8613278117207892, + "grad_norm": 6.03125, + "learning_rate": 1.2547456430698656e-07, + "loss": 0.82223921, + "memory(GiB)": 138.1, + "step": 79780, + "train_speed(iter/s)": 0.20082 + }, + { + "acc": 0.7526144, + "epoch": 1.861561119293078, + "grad_norm": 5.15625, + "learning_rate": 1.2505435647727548e-07, + "loss": 0.89265423, + "memory(GiB)": 138.1, + "step": 79790, + "train_speed(iter/s)": 0.200833 + }, + { + "acc": 0.77072687, + "epoch": 1.861794426865367, + "grad_norm": 5.75, + "learning_rate": 1.2463484454463826e-07, + "loss": 0.82332354, + "memory(GiB)": 138.1, + "step": 79800, + "train_speed(iter/s)": 0.200845 + }, + { + "acc": 0.7775485, + "epoch": 1.862027734437656, + "grad_norm": 5.34375, + "learning_rate": 1.2421602856896087e-07, + "loss": 0.7913826, + "memory(GiB)": 138.1, + "step": 79810, + "train_speed(iter/s)": 0.200858 + }, + { + "acc": 0.78050547, + "epoch": 1.8622610420099446, + "grad_norm": 4.65625, + "learning_rate": 1.237979086100294e-07, + "loss": 0.7908287, + "memory(GiB)": 138.1, + "step": 79820, + "train_speed(iter/s)": 0.200871 + }, + { + "acc": 0.78652105, + "epoch": 1.8624943495822337, + "grad_norm": 4.96875, + "learning_rate": 1.233804847275294e-07, + "loss": 0.76471519, + "memory(GiB)": 138.1, + "step": 79830, + "train_speed(iter/s)": 0.200884 + }, + { + "acc": 0.78220301, + "epoch": 1.8627276571545224, + "grad_norm": 5.34375, + "learning_rate": 1.2296375698104878e-07, + "loss": 0.77280107, + "memory(GiB)": 138.1, + "step": 79840, + "train_speed(iter/s)": 0.200897 + }, + { + "acc": 0.78212929, + "epoch": 1.8629609647268115, + "grad_norm": 6.0625, + "learning_rate": 1.2254772543007442e-07, + "loss": 0.80081415, + "memory(GiB)": 138.1, + "step": 79850, + "train_speed(iter/s)": 0.20091 + }, + { + "acc": 0.78351145, + "epoch": 1.8631942722991002, + "grad_norm": 6.125, + "learning_rate": 1.2213239013399602e-07, + "loss": 0.77549877, + "memory(GiB)": 138.1, + "step": 79860, + "train_speed(iter/s)": 0.200923 + }, + { + "acc": 0.77944012, + "epoch": 1.8634275798713893, + "grad_norm": 5.3125, + "learning_rate": 1.217177511521017e-07, + "loss": 0.78213992, + "memory(GiB)": 138.1, + "step": 79870, + "train_speed(iter/s)": 0.200936 + }, + { + "acc": 0.78833656, + "epoch": 1.863660887443678, + "grad_norm": 4.78125, + "learning_rate": 1.2130380854358136e-07, + "loss": 0.74980211, + "memory(GiB)": 138.1, + "step": 79880, + "train_speed(iter/s)": 0.200949 + }, + { + "acc": 0.78198175, + "epoch": 1.863894195015967, + "grad_norm": 4.9375, + "learning_rate": 1.208905623675255e-07, + "loss": 0.80464964, + "memory(GiB)": 138.1, + "step": 79890, + "train_speed(iter/s)": 0.200961 + }, + { + "acc": 0.77077436, + "epoch": 1.8641275025882558, + "grad_norm": 6.78125, + "learning_rate": 1.2047801268292414e-07, + "loss": 0.82589788, + "memory(GiB)": 138.1, + "step": 79900, + "train_speed(iter/s)": 0.200974 + }, + { + "acc": 0.77802629, + "epoch": 1.864360810160545, + "grad_norm": 5.9375, + "learning_rate": 1.2006615954866906e-07, + "loss": 0.77997022, + "memory(GiB)": 138.1, + "step": 79910, + "train_speed(iter/s)": 0.200987 + }, + { + "acc": 0.77522287, + "epoch": 1.8645941177328336, + "grad_norm": 5.71875, + "learning_rate": 1.196550030235516e-07, + "loss": 0.80558329, + "memory(GiB)": 138.1, + "step": 79920, + "train_speed(iter/s)": 0.201 + }, + { + "acc": 0.76346202, + "epoch": 1.8648274253051227, + "grad_norm": 6.40625, + "learning_rate": 1.1924454316626478e-07, + "loss": 0.8442524, + "memory(GiB)": 138.1, + "step": 79930, + "train_speed(iter/s)": 0.201012 + }, + { + "acc": 0.77351999, + "epoch": 1.8650607328774114, + "grad_norm": 8.8125, + "learning_rate": 1.1883478003540172e-07, + "loss": 0.82075424, + "memory(GiB)": 138.1, + "step": 79940, + "train_speed(iter/s)": 0.201026 + }, + { + "acc": 0.7649683, + "epoch": 1.8652940404497005, + "grad_norm": 3.84375, + "learning_rate": 1.1842571368945566e-07, + "loss": 0.85019636, + "memory(GiB)": 138.1, + "step": 79950, + "train_speed(iter/s)": 0.201038 + }, + { + "acc": 0.78048048, + "epoch": 1.8655273480219892, + "grad_norm": 5.09375, + "learning_rate": 1.1801734418682154e-07, + "loss": 0.79374285, + "memory(GiB)": 138.1, + "step": 79960, + "train_speed(iter/s)": 0.201052 + }, + { + "acc": 0.78092804, + "epoch": 1.8657606555942783, + "grad_norm": 6.125, + "learning_rate": 1.1760967158579217e-07, + "loss": 0.80186243, + "memory(GiB)": 138.1, + "step": 79970, + "train_speed(iter/s)": 0.201065 + }, + { + "acc": 0.79199791, + "epoch": 1.865993963166567, + "grad_norm": 5.71875, + "learning_rate": 1.172026959445649e-07, + "loss": 0.74338923, + "memory(GiB)": 138.1, + "step": 79980, + "train_speed(iter/s)": 0.201077 + }, + { + "acc": 0.79580288, + "epoch": 1.866227270738856, + "grad_norm": 5.21875, + "learning_rate": 1.1679641732123382e-07, + "loss": 0.71093869, + "memory(GiB)": 138.1, + "step": 79990, + "train_speed(iter/s)": 0.201091 + }, + { + "acc": 0.77390499, + "epoch": 1.8664605783111448, + "grad_norm": 6.25, + "learning_rate": 1.163908357737964e-07, + "loss": 0.81473513, + "memory(GiB)": 138.1, + "step": 80000, + "train_speed(iter/s)": 0.201105 + }, + { + "epoch": 1.8664605783111448, + "eval_acc": 0.7447302538558327, + "eval_loss": 0.8044291138648987, + "eval_runtime": 1271.074, + "eval_samples_per_second": 28.315, + "eval_steps_per_second": 14.158, + "step": 80000 + }, + { + "acc": 0.79017296, + "epoch": 1.8666938858834337, + "grad_norm": 5.59375, + "learning_rate": 1.1598595136014745e-07, + "loss": 0.75285716, + "memory(GiB)": 138.1, + "step": 80010, + "train_speed(iter/s)": 0.200465 + }, + { + "acc": 0.76734552, + "epoch": 1.8669271934557226, + "grad_norm": 5.46875, + "learning_rate": 1.1558176413808519e-07, + "loss": 0.864007, + "memory(GiB)": 138.1, + "step": 80020, + "train_speed(iter/s)": 0.200477 + }, + { + "acc": 0.76343198, + "epoch": 1.8671605010280115, + "grad_norm": 5.34375, + "learning_rate": 1.1517827416530736e-07, + "loss": 0.8714529, + "memory(GiB)": 138.1, + "step": 80030, + "train_speed(iter/s)": 0.200489 + }, + { + "acc": 0.7741498, + "epoch": 1.8673938086003004, + "grad_norm": 4.0625, + "learning_rate": 1.1477548149941176e-07, + "loss": 0.79983683, + "memory(GiB)": 138.1, + "step": 80040, + "train_speed(iter/s)": 0.200503 + }, + { + "acc": 0.79464846, + "epoch": 1.8676271161725893, + "grad_norm": 5.71875, + "learning_rate": 1.143733861978974e-07, + "loss": 0.72727637, + "memory(GiB)": 138.1, + "step": 80050, + "train_speed(iter/s)": 0.200516 + }, + { + "acc": 0.7844614, + "epoch": 1.8678604237448782, + "grad_norm": 4.5, + "learning_rate": 1.1397198831816226e-07, + "loss": 0.73518605, + "memory(GiB)": 138.1, + "step": 80060, + "train_speed(iter/s)": 0.200529 + }, + { + "acc": 0.78245525, + "epoch": 1.868093731317167, + "grad_norm": 5.3125, + "learning_rate": 1.1357128791750716e-07, + "loss": 0.78568835, + "memory(GiB)": 138.1, + "step": 80070, + "train_speed(iter/s)": 0.200543 + }, + { + "acc": 0.79324665, + "epoch": 1.868327038889456, + "grad_norm": 5.28125, + "learning_rate": 1.1317128505313024e-07, + "loss": 0.75692196, + "memory(GiB)": 138.1, + "step": 80080, + "train_speed(iter/s)": 0.200556 + }, + { + "acc": 0.79333744, + "epoch": 1.8685603464617448, + "grad_norm": 5.6875, + "learning_rate": 1.1277197978213362e-07, + "loss": 0.71830125, + "memory(GiB)": 138.1, + "step": 80090, + "train_speed(iter/s)": 0.200569 + }, + { + "acc": 0.75221357, + "epoch": 1.8687936540340337, + "grad_norm": 5.5625, + "learning_rate": 1.1237337216151723e-07, + "loss": 0.89948711, + "memory(GiB)": 138.1, + "step": 80100, + "train_speed(iter/s)": 0.200582 + }, + { + "acc": 0.77064877, + "epoch": 1.8690269616063226, + "grad_norm": 6.15625, + "learning_rate": 1.1197546224818112e-07, + "loss": 0.82701855, + "memory(GiB)": 138.1, + "step": 80110, + "train_speed(iter/s)": 0.200595 + }, + { + "acc": 0.79913635, + "epoch": 1.8692602691786115, + "grad_norm": 5.125, + "learning_rate": 1.1157825009892931e-07, + "loss": 0.71697931, + "memory(GiB)": 138.1, + "step": 80120, + "train_speed(iter/s)": 0.200608 + }, + { + "acc": 0.76889296, + "epoch": 1.8694935767509004, + "grad_norm": 6.0, + "learning_rate": 1.1118173577046088e-07, + "loss": 0.82190304, + "memory(GiB)": 138.1, + "step": 80130, + "train_speed(iter/s)": 0.200621 + }, + { + "acc": 0.78341179, + "epoch": 1.8697268843231893, + "grad_norm": 5.15625, + "learning_rate": 1.1078591931937999e-07, + "loss": 0.77279377, + "memory(GiB)": 138.1, + "step": 80140, + "train_speed(iter/s)": 0.200633 + }, + { + "acc": 0.78050628, + "epoch": 1.8699601918954782, + "grad_norm": 5.25, + "learning_rate": 1.1039080080218811e-07, + "loss": 0.80319948, + "memory(GiB)": 138.1, + "step": 80150, + "train_speed(iter/s)": 0.200646 + }, + { + "acc": 0.78934498, + "epoch": 1.8701934994677671, + "grad_norm": 7.625, + "learning_rate": 1.0999638027528959e-07, + "loss": 0.74584684, + "memory(GiB)": 138.1, + "step": 80160, + "train_speed(iter/s)": 0.200659 + }, + { + "acc": 0.78847446, + "epoch": 1.870426807040056, + "grad_norm": 4.8125, + "learning_rate": 1.0960265779498769e-07, + "loss": 0.73895617, + "memory(GiB)": 138.1, + "step": 80170, + "train_speed(iter/s)": 0.200671 + }, + { + "acc": 0.77396088, + "epoch": 1.870660114612345, + "grad_norm": 5.4375, + "learning_rate": 1.092096334174847e-07, + "loss": 0.81397905, + "memory(GiB)": 138.1, + "step": 80180, + "train_speed(iter/s)": 0.200684 + }, + { + "acc": 0.78955622, + "epoch": 1.8708934221846336, + "grad_norm": 5.21875, + "learning_rate": 1.0881730719888628e-07, + "loss": 0.76100597, + "memory(GiB)": 138.1, + "step": 80190, + "train_speed(iter/s)": 0.200696 + }, + { + "acc": 0.80156097, + "epoch": 1.8711267297569227, + "grad_norm": 3.828125, + "learning_rate": 1.0842567919519597e-07, + "loss": 0.69538174, + "memory(GiB)": 138.1, + "step": 80200, + "train_speed(iter/s)": 0.200709 + }, + { + "acc": 0.78851342, + "epoch": 1.8713600373292114, + "grad_norm": 6.34375, + "learning_rate": 1.0803474946231963e-07, + "loss": 0.76109829, + "memory(GiB)": 138.1, + "step": 80210, + "train_speed(iter/s)": 0.200721 + }, + { + "acc": 0.78874664, + "epoch": 1.8715933449015005, + "grad_norm": 5.71875, + "learning_rate": 1.0764451805606091e-07, + "loss": 0.73757567, + "memory(GiB)": 138.1, + "step": 80220, + "train_speed(iter/s)": 0.200735 + }, + { + "acc": 0.77023153, + "epoch": 1.8718266524737892, + "grad_norm": 3.984375, + "learning_rate": 1.0725498503212694e-07, + "loss": 0.81957588, + "memory(GiB)": 138.1, + "step": 80230, + "train_speed(iter/s)": 0.200748 + }, + { + "acc": 0.77765932, + "epoch": 1.8720599600460783, + "grad_norm": 6.6875, + "learning_rate": 1.0686615044612159e-07, + "loss": 0.77763338, + "memory(GiB)": 138.1, + "step": 80240, + "train_speed(iter/s)": 0.20076 + }, + { + "acc": 0.7891181, + "epoch": 1.872293267618367, + "grad_norm": 4.84375, + "learning_rate": 1.0647801435355264e-07, + "loss": 0.75162325, + "memory(GiB)": 138.1, + "step": 80250, + "train_speed(iter/s)": 0.200774 + }, + { + "acc": 0.76905499, + "epoch": 1.8725265751906561, + "grad_norm": 4.5625, + "learning_rate": 1.0609057680982527e-07, + "loss": 0.82668858, + "memory(GiB)": 138.1, + "step": 80260, + "train_speed(iter/s)": 0.200787 + }, + { + "acc": 0.78598719, + "epoch": 1.8727598827629448, + "grad_norm": 4.9375, + "learning_rate": 1.0570383787024574e-07, + "loss": 0.76735053, + "memory(GiB)": 138.1, + "step": 80270, + "train_speed(iter/s)": 0.200798 + }, + { + "acc": 0.78083096, + "epoch": 1.872993190335234, + "grad_norm": 5.46875, + "learning_rate": 1.0531779759002214e-07, + "loss": 0.79012175, + "memory(GiB)": 138.1, + "step": 80280, + "train_speed(iter/s)": 0.200811 + }, + { + "acc": 0.7970469, + "epoch": 1.8732264979075226, + "grad_norm": 6.25, + "learning_rate": 1.0493245602426095e-07, + "loss": 0.71222925, + "memory(GiB)": 138.1, + "step": 80290, + "train_speed(iter/s)": 0.200823 + }, + { + "acc": 0.78838396, + "epoch": 1.8734598054798117, + "grad_norm": 6.0625, + "learning_rate": 1.0454781322796981e-07, + "loss": 0.74144182, + "memory(GiB)": 138.1, + "step": 80300, + "train_speed(iter/s)": 0.200836 + }, + { + "acc": 0.80160275, + "epoch": 1.8736931130521004, + "grad_norm": 4.78125, + "learning_rate": 1.0416386925605592e-07, + "loss": 0.71015835, + "memory(GiB)": 138.1, + "step": 80310, + "train_speed(iter/s)": 0.200849 + }, + { + "acc": 0.80436602, + "epoch": 1.8739264206243895, + "grad_norm": 3.828125, + "learning_rate": 1.0378062416332712e-07, + "loss": 0.70317974, + "memory(GiB)": 138.1, + "step": 80320, + "train_speed(iter/s)": 0.200862 + }, + { + "acc": 0.7857511, + "epoch": 1.8741597281966782, + "grad_norm": 4.625, + "learning_rate": 1.0339807800449241e-07, + "loss": 0.76099644, + "memory(GiB)": 138.1, + "step": 80330, + "train_speed(iter/s)": 0.200875 + }, + { + "acc": 0.77934189, + "epoch": 1.8743930357689673, + "grad_norm": 5.90625, + "learning_rate": 1.0301623083415924e-07, + "loss": 0.79090543, + "memory(GiB)": 138.1, + "step": 80340, + "train_speed(iter/s)": 0.200887 + }, + { + "acc": 0.79418821, + "epoch": 1.874626343341256, + "grad_norm": 5.65625, + "learning_rate": 1.0263508270683731e-07, + "loss": 0.73172894, + "memory(GiB)": 138.1, + "step": 80350, + "train_speed(iter/s)": 0.200899 + }, + { + "acc": 0.76580453, + "epoch": 1.8748596509135451, + "grad_norm": 5.71875, + "learning_rate": 1.0225463367693367e-07, + "loss": 0.84161091, + "memory(GiB)": 138.1, + "step": 80360, + "train_speed(iter/s)": 0.200912 + }, + { + "acc": 0.80100813, + "epoch": 1.8750929584858338, + "grad_norm": 5.4375, + "learning_rate": 1.0187488379875876e-07, + "loss": 0.71097794, + "memory(GiB)": 138.1, + "step": 80370, + "train_speed(iter/s)": 0.200925 + }, + { + "acc": 0.79370942, + "epoch": 1.875326266058123, + "grad_norm": 6.65625, + "learning_rate": 1.0149583312652089e-07, + "loss": 0.73821492, + "memory(GiB)": 138.1, + "step": 80380, + "train_speed(iter/s)": 0.200938 + }, + { + "acc": 0.79008064, + "epoch": 1.8755595736304116, + "grad_norm": 6.1875, + "learning_rate": 1.0111748171433067e-07, + "loss": 0.76331358, + "memory(GiB)": 138.1, + "step": 80390, + "train_speed(iter/s)": 0.20095 + }, + { + "acc": 0.76334047, + "epoch": 1.8757928812027005, + "grad_norm": 6.1875, + "learning_rate": 1.00739829616196e-07, + "loss": 0.85111656, + "memory(GiB)": 138.1, + "step": 80400, + "train_speed(iter/s)": 0.200962 + }, + { + "acc": 0.78283181, + "epoch": 1.8760261887749894, + "grad_norm": 4.78125, + "learning_rate": 1.0036287688602764e-07, + "loss": 0.76812816, + "memory(GiB)": 138.1, + "step": 80410, + "train_speed(iter/s)": 0.200976 + }, + { + "acc": 0.76459651, + "epoch": 1.8762594963472783, + "grad_norm": 5.71875, + "learning_rate": 9.998662357763534e-08, + "loss": 0.85813875, + "memory(GiB)": 138.1, + "step": 80420, + "train_speed(iter/s)": 0.200988 + }, + { + "acc": 0.77645798, + "epoch": 1.8764928039195672, + "grad_norm": 4.84375, + "learning_rate": 9.961106974472834e-08, + "loss": 0.80161037, + "memory(GiB)": 138.1, + "step": 80430, + "train_speed(iter/s)": 0.201 + }, + { + "acc": 0.77092047, + "epoch": 1.876726111491856, + "grad_norm": 5.625, + "learning_rate": 9.923621544091877e-08, + "loss": 0.8250824, + "memory(GiB)": 138.1, + "step": 80440, + "train_speed(iter/s)": 0.201013 + }, + { + "acc": 0.79298916, + "epoch": 1.876959419064145, + "grad_norm": 6.1875, + "learning_rate": 9.886206071971493e-08, + "loss": 0.75579348, + "memory(GiB)": 138.1, + "step": 80450, + "train_speed(iter/s)": 0.201025 + }, + { + "acc": 0.77953978, + "epoch": 1.8771927266364339, + "grad_norm": 7.0625, + "learning_rate": 9.848860563452855e-08, + "loss": 0.78975258, + "memory(GiB)": 138.1, + "step": 80460, + "train_speed(iter/s)": 0.201037 + }, + { + "acc": 0.78567953, + "epoch": 1.8774260342087228, + "grad_norm": 3.96875, + "learning_rate": 9.811585023866976e-08, + "loss": 0.76402936, + "memory(GiB)": 138.1, + "step": 80470, + "train_speed(iter/s)": 0.20105 + }, + { + "acc": 0.78225698, + "epoch": 1.8776593417810117, + "grad_norm": 4.875, + "learning_rate": 9.774379458534933e-08, + "loss": 0.78416901, + "memory(GiB)": 138.1, + "step": 80480, + "train_speed(iter/s)": 0.201063 + }, + { + "acc": 0.77153597, + "epoch": 1.8778926493533006, + "grad_norm": 5.28125, + "learning_rate": 9.73724387276781e-08, + "loss": 0.84029179, + "memory(GiB)": 138.1, + "step": 80490, + "train_speed(iter/s)": 0.201075 + }, + { + "acc": 0.77242804, + "epoch": 1.8781259569255895, + "grad_norm": 5.1875, + "learning_rate": 9.700178271866645e-08, + "loss": 0.81851034, + "memory(GiB)": 138.1, + "step": 80500, + "train_speed(iter/s)": 0.201087 + }, + { + "epoch": 1.8781259569255895, + "eval_acc": 0.7446892569323635, + "eval_loss": 0.804384708404541, + "eval_runtime": 1272.9616, + "eval_samples_per_second": 28.273, + "eval_steps_per_second": 14.137, + "step": 80500 + }, + { + "acc": 0.78566909, + "epoch": 1.8783592644978784, + "grad_norm": 5.0625, + "learning_rate": 9.66318266112265e-08, + "loss": 0.78130226, + "memory(GiB)": 138.1, + "step": 80510, + "train_speed(iter/s)": 0.20045 + }, + { + "acc": 0.78306484, + "epoch": 1.8785925720701673, + "grad_norm": 7.53125, + "learning_rate": 9.626257045816879e-08, + "loss": 0.77127929, + "memory(GiB)": 138.1, + "step": 80520, + "train_speed(iter/s)": 0.200462 + }, + { + "acc": 0.80443401, + "epoch": 1.8788258796424562, + "grad_norm": 4.59375, + "learning_rate": 9.589401431220502e-08, + "loss": 0.70631161, + "memory(GiB)": 138.1, + "step": 80530, + "train_speed(iter/s)": 0.200474 + }, + { + "acc": 0.78915844, + "epoch": 1.879059187214745, + "grad_norm": 5.4375, + "learning_rate": 9.552615822594536e-08, + "loss": 0.75501652, + "memory(GiB)": 138.1, + "step": 80540, + "train_speed(iter/s)": 0.200486 + }, + { + "acc": 0.77199645, + "epoch": 1.879292494787034, + "grad_norm": 7.125, + "learning_rate": 9.515900225190222e-08, + "loss": 0.8397831, + "memory(GiB)": 138.1, + "step": 80550, + "train_speed(iter/s)": 0.200499 + }, + { + "acc": 0.77649212, + "epoch": 1.8795258023593229, + "grad_norm": 8.0, + "learning_rate": 9.479254644248648e-08, + "loss": 0.80428057, + "memory(GiB)": 138.1, + "step": 80560, + "train_speed(iter/s)": 0.200511 + }, + { + "acc": 0.79153538, + "epoch": 1.8797591099316118, + "grad_norm": 5.5625, + "learning_rate": 9.442679085000961e-08, + "loss": 0.74753222, + "memory(GiB)": 138.1, + "step": 80570, + "train_speed(iter/s)": 0.200524 + }, + { + "acc": 0.77919335, + "epoch": 1.8799924175039004, + "grad_norm": 4.9375, + "learning_rate": 9.40617355266843e-08, + "loss": 0.80626259, + "memory(GiB)": 138.1, + "step": 80580, + "train_speed(iter/s)": 0.200538 + }, + { + "acc": 0.76336107, + "epoch": 1.8802257250761896, + "grad_norm": 5.1875, + "learning_rate": 9.369738052461996e-08, + "loss": 0.86293964, + "memory(GiB)": 138.1, + "step": 80590, + "train_speed(iter/s)": 0.20055 + }, + { + "acc": 0.79171991, + "epoch": 1.8804590326484782, + "grad_norm": 23.5, + "learning_rate": 9.333372589583e-08, + "loss": 0.75359583, + "memory(GiB)": 138.1, + "step": 80600, + "train_speed(iter/s)": 0.200563 + }, + { + "acc": 0.79271598, + "epoch": 1.8806923402207674, + "grad_norm": 8.5, + "learning_rate": 9.297077169222513e-08, + "loss": 0.74482245, + "memory(GiB)": 138.1, + "step": 80610, + "train_speed(iter/s)": 0.200576 + }, + { + "acc": 0.77063508, + "epoch": 1.880925647793056, + "grad_norm": 4.40625, + "learning_rate": 9.260851796561609e-08, + "loss": 0.84915276, + "memory(GiB)": 138.1, + "step": 80620, + "train_speed(iter/s)": 0.20059 + }, + { + "acc": 0.79158163, + "epoch": 1.8811589553653452, + "grad_norm": 3.96875, + "learning_rate": 9.224696476771655e-08, + "loss": 0.78680096, + "memory(GiB)": 138.1, + "step": 80630, + "train_speed(iter/s)": 0.200602 + }, + { + "acc": 0.77896152, + "epoch": 1.8813922629376338, + "grad_norm": 5.6875, + "learning_rate": 9.188611215013631e-08, + "loss": 0.79821224, + "memory(GiB)": 138.1, + "step": 80640, + "train_speed(iter/s)": 0.200614 + }, + { + "acc": 0.77518044, + "epoch": 1.881625570509923, + "grad_norm": 6.5, + "learning_rate": 9.152596016438864e-08, + "loss": 0.80904598, + "memory(GiB)": 138.1, + "step": 80650, + "train_speed(iter/s)": 0.200627 + }, + { + "acc": 0.7727581, + "epoch": 1.8818588780822116, + "grad_norm": 6.8125, + "learning_rate": 9.11665088618835e-08, + "loss": 0.83627472, + "memory(GiB)": 138.1, + "step": 80660, + "train_speed(iter/s)": 0.20064 + }, + { + "acc": 0.79154997, + "epoch": 1.8820921856545008, + "grad_norm": 4.25, + "learning_rate": 9.080775829393373e-08, + "loss": 0.75480595, + "memory(GiB)": 138.1, + "step": 80670, + "train_speed(iter/s)": 0.200653 + }, + { + "acc": 0.78853526, + "epoch": 1.8823254932267894, + "grad_norm": 6.28125, + "learning_rate": 9.044970851175006e-08, + "loss": 0.75794249, + "memory(GiB)": 138.1, + "step": 80680, + "train_speed(iter/s)": 0.200665 + }, + { + "acc": 0.77822475, + "epoch": 1.8825588007990786, + "grad_norm": 4.6875, + "learning_rate": 9.009235956644491e-08, + "loss": 0.79094992, + "memory(GiB)": 138.1, + "step": 80690, + "train_speed(iter/s)": 0.200677 + }, + { + "acc": 0.7686842, + "epoch": 1.8827921083713672, + "grad_norm": 4.75, + "learning_rate": 8.97357115090286e-08, + "loss": 0.84376907, + "memory(GiB)": 138.1, + "step": 80700, + "train_speed(iter/s)": 0.20069 + }, + { + "acc": 0.7691422, + "epoch": 1.8830254159436564, + "grad_norm": 5.625, + "learning_rate": 8.937976439041263e-08, + "loss": 0.83207283, + "memory(GiB)": 138.1, + "step": 80710, + "train_speed(iter/s)": 0.200702 + }, + { + "acc": 0.78920188, + "epoch": 1.883258723515945, + "grad_norm": 4.90625, + "learning_rate": 8.902451826140911e-08, + "loss": 0.74556618, + "memory(GiB)": 138.1, + "step": 80720, + "train_speed(iter/s)": 0.200715 + }, + { + "acc": 0.78106089, + "epoch": 1.8834920310882342, + "grad_norm": 6.59375, + "learning_rate": 8.866997317272863e-08, + "loss": 0.78032007, + "memory(GiB)": 138.1, + "step": 80730, + "train_speed(iter/s)": 0.200726 + }, + { + "acc": 0.80264349, + "epoch": 1.8837253386605228, + "grad_norm": 7.53125, + "learning_rate": 8.831612917498288e-08, + "loss": 0.69003105, + "memory(GiB)": 138.1, + "step": 80740, + "train_speed(iter/s)": 0.200739 + }, + { + "acc": 0.78006659, + "epoch": 1.883958646232812, + "grad_norm": 4.78125, + "learning_rate": 8.796298631868317e-08, + "loss": 0.79409275, + "memory(GiB)": 138.1, + "step": 80750, + "train_speed(iter/s)": 0.200752 + }, + { + "acc": 0.77235012, + "epoch": 1.8841919538051006, + "grad_norm": 6.09375, + "learning_rate": 8.761054465423969e-08, + "loss": 0.83254261, + "memory(GiB)": 138.1, + "step": 80760, + "train_speed(iter/s)": 0.200764 + }, + { + "acc": 0.79439788, + "epoch": 1.8844252613773895, + "grad_norm": 4.875, + "learning_rate": 8.725880423196442e-08, + "loss": 0.75280304, + "memory(GiB)": 138.1, + "step": 80770, + "train_speed(iter/s)": 0.200777 + }, + { + "acc": 0.7835988, + "epoch": 1.8846585689496784, + "grad_norm": 4.375, + "learning_rate": 8.690776510206723e-08, + "loss": 0.77280979, + "memory(GiB)": 138.1, + "step": 80780, + "train_speed(iter/s)": 0.20079 + }, + { + "acc": 0.79621954, + "epoch": 1.8848918765219673, + "grad_norm": 4.9375, + "learning_rate": 8.655742731465966e-08, + "loss": 0.74539442, + "memory(GiB)": 138.1, + "step": 80790, + "train_speed(iter/s)": 0.200803 + }, + { + "acc": 0.77132044, + "epoch": 1.8851251840942562, + "grad_norm": 6.1875, + "learning_rate": 8.62077909197523e-08, + "loss": 0.80378342, + "memory(GiB)": 138.1, + "step": 80800, + "train_speed(iter/s)": 0.200816 + }, + { + "acc": 0.79714594, + "epoch": 1.8853584916665451, + "grad_norm": 5.78125, + "learning_rate": 8.585885596725518e-08, + "loss": 0.71226463, + "memory(GiB)": 138.1, + "step": 80810, + "train_speed(iter/s)": 0.200829 + }, + { + "acc": 0.76155009, + "epoch": 1.885591799238834, + "grad_norm": 9.1875, + "learning_rate": 8.551062250697795e-08, + "loss": 0.89277039, + "memory(GiB)": 138.1, + "step": 80820, + "train_speed(iter/s)": 0.200842 + }, + { + "acc": 0.79220524, + "epoch": 1.885825106811123, + "grad_norm": 3.609375, + "learning_rate": 8.516309058863249e-08, + "loss": 0.72857275, + "memory(GiB)": 138.1, + "step": 80830, + "train_speed(iter/s)": 0.200855 + }, + { + "acc": 0.77188721, + "epoch": 1.8860584143834118, + "grad_norm": 4.71875, + "learning_rate": 8.481626026182798e-08, + "loss": 0.84043217, + "memory(GiB)": 138.1, + "step": 80840, + "train_speed(iter/s)": 0.200869 + }, + { + "acc": 0.79511795, + "epoch": 1.8862917219557007, + "grad_norm": 5.8125, + "learning_rate": 8.447013157607431e-08, + "loss": 0.7310348, + "memory(GiB)": 138.1, + "step": 80850, + "train_speed(iter/s)": 0.200881 + }, + { + "acc": 0.79568253, + "epoch": 1.8865250295279896, + "grad_norm": 6.03125, + "learning_rate": 8.412470458078137e-08, + "loss": 0.72156382, + "memory(GiB)": 138.1, + "step": 80860, + "train_speed(iter/s)": 0.200894 + }, + { + "acc": 0.75479288, + "epoch": 1.8867583371002785, + "grad_norm": 8.25, + "learning_rate": 8.37799793252586e-08, + "loss": 0.89089012, + "memory(GiB)": 138.1, + "step": 80870, + "train_speed(iter/s)": 0.200906 + }, + { + "acc": 0.79052868, + "epoch": 1.8869916446725674, + "grad_norm": 5.53125, + "learning_rate": 8.343595585871611e-08, + "loss": 0.74636126, + "memory(GiB)": 138.1, + "step": 80880, + "train_speed(iter/s)": 0.200919 + }, + { + "acc": 0.77063046, + "epoch": 1.8872249522448563, + "grad_norm": 4.78125, + "learning_rate": 8.309263423026237e-08, + "loss": 0.81843338, + "memory(GiB)": 138.1, + "step": 80890, + "train_speed(iter/s)": 0.200933 + }, + { + "acc": 0.77630882, + "epoch": 1.8874582598171452, + "grad_norm": 5.8125, + "learning_rate": 8.275001448890651e-08, + "loss": 0.81249247, + "memory(GiB)": 138.1, + "step": 80900, + "train_speed(iter/s)": 0.200945 + }, + { + "acc": 0.75715265, + "epoch": 1.887691567389434, + "grad_norm": 5.65625, + "learning_rate": 8.240809668355776e-08, + "loss": 0.88787174, + "memory(GiB)": 138.1, + "step": 80910, + "train_speed(iter/s)": 0.200959 + }, + { + "acc": 0.7806591, + "epoch": 1.887924874961723, + "grad_norm": 6.15625, + "learning_rate": 8.206688086302483e-08, + "loss": 0.80691376, + "memory(GiB)": 138.1, + "step": 80920, + "train_speed(iter/s)": 0.200972 + }, + { + "acc": 0.78252301, + "epoch": 1.888158182534012, + "grad_norm": 5.25, + "learning_rate": 8.1726367076016e-08, + "loss": 0.78188124, + "memory(GiB)": 138.1, + "step": 80930, + "train_speed(iter/s)": 0.200985 + }, + { + "acc": 0.77645779, + "epoch": 1.8883914901063008, + "grad_norm": 5.90625, + "learning_rate": 8.138655537113904e-08, + "loss": 0.82004871, + "memory(GiB)": 138.1, + "step": 80940, + "train_speed(iter/s)": 0.200998 + }, + { + "acc": 0.79138808, + "epoch": 1.8886247976785897, + "grad_norm": 4.375, + "learning_rate": 8.104744579690294e-08, + "loss": 0.75213003, + "memory(GiB)": 138.1, + "step": 80950, + "train_speed(iter/s)": 0.20101 + }, + { + "acc": 0.77159157, + "epoch": 1.8888581052508786, + "grad_norm": 5.15625, + "learning_rate": 8.070903840171451e-08, + "loss": 0.84294567, + "memory(GiB)": 138.1, + "step": 80960, + "train_speed(iter/s)": 0.201022 + }, + { + "acc": 0.77069464, + "epoch": 1.8890914128231673, + "grad_norm": 7.9375, + "learning_rate": 8.037133323388236e-08, + "loss": 0.82081795, + "memory(GiB)": 138.1, + "step": 80970, + "train_speed(iter/s)": 0.201035 + }, + { + "acc": 0.79293261, + "epoch": 1.8893247203954564, + "grad_norm": 5.75, + "learning_rate": 8.00343303416129e-08, + "loss": 0.74969621, + "memory(GiB)": 138.1, + "step": 80980, + "train_speed(iter/s)": 0.201049 + }, + { + "acc": 0.78427496, + "epoch": 1.889558027967745, + "grad_norm": 6.34375, + "learning_rate": 7.969802977301322e-08, + "loss": 0.77531166, + "memory(GiB)": 138.1, + "step": 80990, + "train_speed(iter/s)": 0.201062 + }, + { + "acc": 0.77599058, + "epoch": 1.8897913355400342, + "grad_norm": 5.4375, + "learning_rate": 7.936243157609103e-08, + "loss": 0.79406672, + "memory(GiB)": 138.1, + "step": 81000, + "train_speed(iter/s)": 0.201076 + }, + { + "epoch": 1.8897913355400342, + "eval_acc": 0.7446983851536048, + "eval_loss": 0.804445207118988, + "eval_runtime": 1271.5518, + "eval_samples_per_second": 28.305, + "eval_steps_per_second": 14.153, + "step": 81000 + }, + { + "acc": 0.7871388, + "epoch": 1.8900246431123229, + "grad_norm": 5.40625, + "learning_rate": 7.902753579875189e-08, + "loss": 0.78215551, + "memory(GiB)": 138.1, + "step": 81010, + "train_speed(iter/s)": 0.200443 + }, + { + "acc": 0.76717043, + "epoch": 1.890257950684612, + "grad_norm": 11.125, + "learning_rate": 7.869334248880256e-08, + "loss": 0.82725344, + "memory(GiB)": 138.1, + "step": 81020, + "train_speed(iter/s)": 0.200456 + }, + { + "acc": 0.77815256, + "epoch": 1.8904912582569007, + "grad_norm": 5.96875, + "learning_rate": 7.835985169394933e-08, + "loss": 0.7883893, + "memory(GiB)": 138.1, + "step": 81030, + "train_speed(iter/s)": 0.200469 + }, + { + "acc": 0.76834126, + "epoch": 1.8907245658291898, + "grad_norm": 4.8125, + "learning_rate": 7.802706346179744e-08, + "loss": 0.84197254, + "memory(GiB)": 138.1, + "step": 81040, + "train_speed(iter/s)": 0.200482 + }, + { + "acc": 0.7763195, + "epoch": 1.8909578734014785, + "grad_norm": 5.53125, + "learning_rate": 7.769497783985224e-08, + "loss": 0.81596165, + "memory(GiB)": 138.1, + "step": 81050, + "train_speed(iter/s)": 0.200494 + }, + { + "acc": 0.77682486, + "epoch": 1.8911911809737676, + "grad_norm": 4.71875, + "learning_rate": 7.736359487551859e-08, + "loss": 0.80225954, + "memory(GiB)": 138.1, + "step": 81060, + "train_speed(iter/s)": 0.200507 + }, + { + "acc": 0.77334614, + "epoch": 1.8914244885460563, + "grad_norm": 5.71875, + "learning_rate": 7.703291461610252e-08, + "loss": 0.81839981, + "memory(GiB)": 138.1, + "step": 81070, + "train_speed(iter/s)": 0.20052 + }, + { + "acc": 0.76000309, + "epoch": 1.8916577961183454, + "grad_norm": 7.65625, + "learning_rate": 7.670293710880683e-08, + "loss": 0.86824694, + "memory(GiB)": 138.1, + "step": 81080, + "train_speed(iter/s)": 0.200533 + }, + { + "acc": 0.75734501, + "epoch": 1.891891103690634, + "grad_norm": 4.4375, + "learning_rate": 7.637366240073717e-08, + "loss": 0.89891548, + "memory(GiB)": 138.1, + "step": 81090, + "train_speed(iter/s)": 0.200546 + }, + { + "acc": 0.78540058, + "epoch": 1.8921244112629232, + "grad_norm": 4.4375, + "learning_rate": 7.604509053889708e-08, + "loss": 0.7534976, + "memory(GiB)": 138.1, + "step": 81100, + "train_speed(iter/s)": 0.200559 + }, + { + "acc": 0.76539426, + "epoch": 1.8923577188352119, + "grad_norm": 6.15625, + "learning_rate": 7.571722157018957e-08, + "loss": 0.84462833, + "memory(GiB)": 138.1, + "step": 81110, + "train_speed(iter/s)": 0.200571 + }, + { + "acc": 0.79330382, + "epoch": 1.892591026407501, + "grad_norm": 4.5625, + "learning_rate": 7.539005554141831e-08, + "loss": 0.72500844, + "memory(GiB)": 138.1, + "step": 81120, + "train_speed(iter/s)": 0.200584 + }, + { + "acc": 0.80007896, + "epoch": 1.8928243339797897, + "grad_norm": 3.875, + "learning_rate": 7.506359249928542e-08, + "loss": 0.71526747, + "memory(GiB)": 138.1, + "step": 81130, + "train_speed(iter/s)": 0.200596 + }, + { + "acc": 0.77556849, + "epoch": 1.8930576415520788, + "grad_norm": 5.8125, + "learning_rate": 7.473783249039468e-08, + "loss": 0.80747452, + "memory(GiB)": 138.1, + "step": 81140, + "train_speed(iter/s)": 0.200609 + }, + { + "acc": 0.78302164, + "epoch": 1.8932909491243675, + "grad_norm": 5.8125, + "learning_rate": 7.441277556124781e-08, + "loss": 0.77237792, + "memory(GiB)": 138.1, + "step": 81150, + "train_speed(iter/s)": 0.200622 + }, + { + "acc": 0.78645997, + "epoch": 1.8935242566966564, + "grad_norm": 5.84375, + "learning_rate": 7.408842175824604e-08, + "loss": 0.76683159, + "memory(GiB)": 138.1, + "step": 81160, + "train_speed(iter/s)": 0.200635 + }, + { + "acc": 0.78576627, + "epoch": 1.8937575642689453, + "grad_norm": 5.28125, + "learning_rate": 7.376477112769064e-08, + "loss": 0.77226114, + "memory(GiB)": 138.1, + "step": 81170, + "train_speed(iter/s)": 0.200648 + }, + { + "acc": 0.77925673, + "epoch": 1.8939908718412342, + "grad_norm": 5.0, + "learning_rate": 7.344182371578356e-08, + "loss": 0.79336381, + "memory(GiB)": 138.1, + "step": 81180, + "train_speed(iter/s)": 0.200661 + }, + { + "acc": 0.79144859, + "epoch": 1.894224179413523, + "grad_norm": 4.46875, + "learning_rate": 7.311957956862459e-08, + "loss": 0.73583159, + "memory(GiB)": 138.1, + "step": 81190, + "train_speed(iter/s)": 0.200674 + }, + { + "acc": 0.7666995, + "epoch": 1.894457486985812, + "grad_norm": 5.5625, + "learning_rate": 7.279803873221469e-08, + "loss": 0.84476967, + "memory(GiB)": 138.1, + "step": 81200, + "train_speed(iter/s)": 0.200687 + }, + { + "acc": 0.80386267, + "epoch": 1.8946907945581009, + "grad_norm": 5.375, + "learning_rate": 7.247720125245328e-08, + "loss": 0.68902006, + "memory(GiB)": 138.1, + "step": 81210, + "train_speed(iter/s)": 0.2007 + }, + { + "acc": 0.79350948, + "epoch": 1.8949241021303898, + "grad_norm": 5.8125, + "learning_rate": 7.215706717513982e-08, + "loss": 0.72404013, + "memory(GiB)": 138.1, + "step": 81220, + "train_speed(iter/s)": 0.200712 + }, + { + "acc": 0.76372366, + "epoch": 1.8951574097026787, + "grad_norm": 5.375, + "learning_rate": 7.183763654597387e-08, + "loss": 0.8393321, + "memory(GiB)": 138.1, + "step": 81230, + "train_speed(iter/s)": 0.200725 + }, + { + "acc": 0.79516706, + "epoch": 1.8953907172749676, + "grad_norm": 6.53125, + "learning_rate": 7.15189094105534e-08, + "loss": 0.72492518, + "memory(GiB)": 138.1, + "step": 81240, + "train_speed(iter/s)": 0.200739 + }, + { + "acc": 0.76616926, + "epoch": 1.8956240248472565, + "grad_norm": 4.40625, + "learning_rate": 7.120088581437645e-08, + "loss": 0.85155525, + "memory(GiB)": 138.1, + "step": 81250, + "train_speed(iter/s)": 0.200751 + }, + { + "acc": 0.79208398, + "epoch": 1.8958573324195453, + "grad_norm": 4.1875, + "learning_rate": 7.088356580284228e-08, + "loss": 0.73564825, + "memory(GiB)": 138.1, + "step": 81260, + "train_speed(iter/s)": 0.200763 + }, + { + "acc": 0.77262831, + "epoch": 1.8960906399918342, + "grad_norm": 5.5625, + "learning_rate": 7.05669494212463e-08, + "loss": 0.818857, + "memory(GiB)": 138.1, + "step": 81270, + "train_speed(iter/s)": 0.200776 + }, + { + "acc": 0.77157354, + "epoch": 1.8963239475641231, + "grad_norm": 6.28125, + "learning_rate": 7.025103671478684e-08, + "loss": 0.81951456, + "memory(GiB)": 138.1, + "step": 81280, + "train_speed(iter/s)": 0.200789 + }, + { + "acc": 0.78546553, + "epoch": 1.896557255136412, + "grad_norm": 4.53125, + "learning_rate": 6.993582772855889e-08, + "loss": 0.77391062, + "memory(GiB)": 138.1, + "step": 81290, + "train_speed(iter/s)": 0.200801 + }, + { + "acc": 0.77268543, + "epoch": 1.896790562708701, + "grad_norm": 4.59375, + "learning_rate": 6.962132250756037e-08, + "loss": 0.82842484, + "memory(GiB)": 138.1, + "step": 81300, + "train_speed(iter/s)": 0.200815 + }, + { + "acc": 0.7767509, + "epoch": 1.8970238702809898, + "grad_norm": 5.53125, + "learning_rate": 6.930752109668481e-08, + "loss": 0.7966526, + "memory(GiB)": 138.1, + "step": 81310, + "train_speed(iter/s)": 0.200828 + }, + { + "acc": 0.7757802, + "epoch": 1.8972571778532787, + "grad_norm": 4.3125, + "learning_rate": 6.899442354072916e-08, + "loss": 0.79777894, + "memory(GiB)": 138.1, + "step": 81320, + "train_speed(iter/s)": 0.20084 + }, + { + "acc": 0.77078934, + "epoch": 1.8974904854255676, + "grad_norm": 5.84375, + "learning_rate": 6.868202988438655e-08, + "loss": 0.82341976, + "memory(GiB)": 138.1, + "step": 81330, + "train_speed(iter/s)": 0.200852 + }, + { + "acc": 0.78582072, + "epoch": 1.8977237929978565, + "grad_norm": 5.65625, + "learning_rate": 6.837034017225186e-08, + "loss": 0.78013291, + "memory(GiB)": 138.1, + "step": 81340, + "train_speed(iter/s)": 0.200864 + }, + { + "acc": 0.77980404, + "epoch": 1.8979571005701454, + "grad_norm": 4.96875, + "learning_rate": 6.80593544488184e-08, + "loss": 0.78744049, + "memory(GiB)": 138.1, + "step": 81350, + "train_speed(iter/s)": 0.200877 + }, + { + "acc": 0.77276716, + "epoch": 1.8981904081424341, + "grad_norm": 5.625, + "learning_rate": 6.774907275847898e-08, + "loss": 0.81919241, + "memory(GiB)": 138.1, + "step": 81360, + "train_speed(iter/s)": 0.200889 + }, + { + "acc": 0.76729136, + "epoch": 1.8984237157147232, + "grad_norm": 5.21875, + "learning_rate": 6.743949514552706e-08, + "loss": 0.83895359, + "memory(GiB)": 138.1, + "step": 81370, + "train_speed(iter/s)": 0.200903 + }, + { + "acc": 0.78235312, + "epoch": 1.898657023287012, + "grad_norm": 4.375, + "learning_rate": 6.713062165415451e-08, + "loss": 0.77152939, + "memory(GiB)": 138.1, + "step": 81380, + "train_speed(iter/s)": 0.200916 + }, + { + "acc": 0.79524784, + "epoch": 1.898890330859301, + "grad_norm": 7.625, + "learning_rate": 6.682245232845219e-08, + "loss": 0.74803662, + "memory(GiB)": 138.1, + "step": 81390, + "train_speed(iter/s)": 0.200929 + }, + { + "acc": 0.78636365, + "epoch": 1.8991236384315897, + "grad_norm": 6.09375, + "learning_rate": 6.651498721241212e-08, + "loss": 0.78164091, + "memory(GiB)": 138.1, + "step": 81400, + "train_speed(iter/s)": 0.200941 + }, + { + "acc": 0.7818541, + "epoch": 1.8993569460038788, + "grad_norm": 4.1875, + "learning_rate": 6.620822634992419e-08, + "loss": 0.79073544, + "memory(GiB)": 138.1, + "step": 81410, + "train_speed(iter/s)": 0.200954 + }, + { + "acc": 0.7828022, + "epoch": 1.8995902535761675, + "grad_norm": 5.71875, + "learning_rate": 6.590216978477836e-08, + "loss": 0.77449131, + "memory(GiB)": 138.1, + "step": 81420, + "train_speed(iter/s)": 0.200964 + }, + { + "acc": 0.78085957, + "epoch": 1.8998235611484566, + "grad_norm": 6.0625, + "learning_rate": 6.559681756066471e-08, + "loss": 0.7848876, + "memory(GiB)": 138.1, + "step": 81430, + "train_speed(iter/s)": 0.200978 + }, + { + "acc": 0.78292851, + "epoch": 1.9000568687207453, + "grad_norm": 5.15625, + "learning_rate": 6.529216972117225e-08, + "loss": 0.77457685, + "memory(GiB)": 138.1, + "step": 81440, + "train_speed(iter/s)": 0.20099 + }, + { + "acc": 0.76646495, + "epoch": 1.9002901762930344, + "grad_norm": 5.03125, + "learning_rate": 6.49882263097884e-08, + "loss": 0.8381773, + "memory(GiB)": 138.1, + "step": 81450, + "train_speed(iter/s)": 0.201003 + }, + { + "acc": 0.78961086, + "epoch": 1.900523483865323, + "grad_norm": 4.78125, + "learning_rate": 6.468498736990181e-08, + "loss": 0.73249388, + "memory(GiB)": 138.1, + "step": 81460, + "train_speed(iter/s)": 0.201015 + }, + { + "acc": 0.77733517, + "epoch": 1.9007567914376122, + "grad_norm": 5.0, + "learning_rate": 6.438245294480006e-08, + "loss": 0.78677988, + "memory(GiB)": 138.1, + "step": 81470, + "train_speed(iter/s)": 0.201028 + }, + { + "acc": 0.79435854, + "epoch": 1.900990099009901, + "grad_norm": 4.96875, + "learning_rate": 6.40806230776686e-08, + "loss": 0.71958585, + "memory(GiB)": 138.1, + "step": 81480, + "train_speed(iter/s)": 0.20104 + }, + { + "acc": 0.7977644, + "epoch": 1.90122340658219, + "grad_norm": 6.25, + "learning_rate": 6.37794978115952e-08, + "loss": 0.73467293, + "memory(GiB)": 138.1, + "step": 81490, + "train_speed(iter/s)": 0.201052 + }, + { + "acc": 0.78441677, + "epoch": 1.9014567141544787, + "grad_norm": 7.0625, + "learning_rate": 6.347907718956381e-08, + "loss": 0.77021914, + "memory(GiB)": 138.1, + "step": 81500, + "train_speed(iter/s)": 0.201064 + }, + { + "epoch": 1.9014567141544787, + "eval_acc": 0.7447350581828017, + "eval_loss": 0.8044015765190125, + "eval_runtime": 1270.7097, + "eval_samples_per_second": 28.324, + "eval_steps_per_second": 14.162, + "step": 81500 + }, + { + "acc": 0.78291373, + "epoch": 1.9016900217267678, + "grad_norm": 5.34375, + "learning_rate": 6.317936125446012e-08, + "loss": 0.78251152, + "memory(GiB)": 138.1, + "step": 81510, + "train_speed(iter/s)": 0.200436 + }, + { + "acc": 0.76142359, + "epoch": 1.9019233292990565, + "grad_norm": 6.21875, + "learning_rate": 6.288035004906878e-08, + "loss": 0.87129774, + "memory(GiB)": 138.1, + "step": 81520, + "train_speed(iter/s)": 0.200449 + }, + { + "acc": 0.80396862, + "epoch": 1.9021566368713456, + "grad_norm": 6.1875, + "learning_rate": 6.258204361607289e-08, + "loss": 0.69574623, + "memory(GiB)": 138.1, + "step": 81530, + "train_speed(iter/s)": 0.200461 + }, + { + "acc": 0.78162894, + "epoch": 1.9023899444436343, + "grad_norm": 4.46875, + "learning_rate": 6.228444199805617e-08, + "loss": 0.77852755, + "memory(GiB)": 138.1, + "step": 81540, + "train_speed(iter/s)": 0.200474 + }, + { + "acc": 0.77957335, + "epoch": 1.9026232520159232, + "grad_norm": 4.875, + "learning_rate": 6.198754523750072e-08, + "loss": 0.80971165, + "memory(GiB)": 138.1, + "step": 81550, + "train_speed(iter/s)": 0.200487 + }, + { + "acc": 0.7833252, + "epoch": 1.902856559588212, + "grad_norm": 5.84375, + "learning_rate": 6.169135337678878e-08, + "loss": 0.77970705, + "memory(GiB)": 138.1, + "step": 81560, + "train_speed(iter/s)": 0.2005 + }, + { + "acc": 0.77797418, + "epoch": 1.903089867160501, + "grad_norm": 7.125, + "learning_rate": 6.13958664582015e-08, + "loss": 0.79811144, + "memory(GiB)": 138.1, + "step": 81570, + "train_speed(iter/s)": 0.200513 + }, + { + "acc": 0.77659426, + "epoch": 1.90332317473279, + "grad_norm": 5.9375, + "learning_rate": 6.11010845239196e-08, + "loss": 0.80996256, + "memory(GiB)": 138.1, + "step": 81580, + "train_speed(iter/s)": 0.200525 + }, + { + "acc": 0.79570951, + "epoch": 1.9035564823050788, + "grad_norm": 4.21875, + "learning_rate": 6.080700761602331e-08, + "loss": 0.72712016, + "memory(GiB)": 138.1, + "step": 81590, + "train_speed(iter/s)": 0.200538 + }, + { + "acc": 0.77094784, + "epoch": 1.9037897898773677, + "grad_norm": 5.1875, + "learning_rate": 6.051363577649238e-08, + "loss": 0.83303509, + "memory(GiB)": 138.1, + "step": 81600, + "train_speed(iter/s)": 0.200551 + }, + { + "acc": 0.77590933, + "epoch": 1.9040230974496566, + "grad_norm": 5.25, + "learning_rate": 6.022096904720388e-08, + "loss": 0.79160347, + "memory(GiB)": 138.1, + "step": 81610, + "train_speed(iter/s)": 0.200562 + }, + { + "acc": 0.77262087, + "epoch": 1.9042564050219455, + "grad_norm": 4.75, + "learning_rate": 5.992900746993768e-08, + "loss": 0.82788048, + "memory(GiB)": 138.1, + "step": 81620, + "train_speed(iter/s)": 0.200575 + }, + { + "acc": 0.77261982, + "epoch": 1.9044897125942344, + "grad_norm": 8.3125, + "learning_rate": 5.963775108637048e-08, + "loss": 0.83426991, + "memory(GiB)": 138.1, + "step": 81630, + "train_speed(iter/s)": 0.200588 + }, + { + "acc": 0.76702747, + "epoch": 1.9047230201665233, + "grad_norm": 5.75, + "learning_rate": 5.9347199938079026e-08, + "loss": 0.8470952, + "memory(GiB)": 138.1, + "step": 81640, + "train_speed(iter/s)": 0.200601 + }, + { + "acc": 0.77795296, + "epoch": 1.9049563277388122, + "grad_norm": 4.84375, + "learning_rate": 5.9057354066539564e-08, + "loss": 0.79301691, + "memory(GiB)": 138.1, + "step": 81650, + "train_speed(iter/s)": 0.200613 + }, + { + "acc": 0.77648511, + "epoch": 1.905189635311101, + "grad_norm": 5.40625, + "learning_rate": 5.876821351312734e-08, + "loss": 0.81472111, + "memory(GiB)": 138.1, + "step": 81660, + "train_speed(iter/s)": 0.200626 + }, + { + "acc": 0.78697462, + "epoch": 1.90542294288339, + "grad_norm": 5.625, + "learning_rate": 5.8479778319117665e-08, + "loss": 0.76727009, + "memory(GiB)": 138.1, + "step": 81670, + "train_speed(iter/s)": 0.200639 + }, + { + "acc": 0.76950393, + "epoch": 1.9056562504556789, + "grad_norm": 4.59375, + "learning_rate": 5.819204852568372e-08, + "loss": 0.82860451, + "memory(GiB)": 138.1, + "step": 81680, + "train_speed(iter/s)": 0.200652 + }, + { + "acc": 0.76736298, + "epoch": 1.9058895580279678, + "grad_norm": 6.875, + "learning_rate": 5.790502417389876e-08, + "loss": 0.82692204, + "memory(GiB)": 138.1, + "step": 81690, + "train_speed(iter/s)": 0.200665 + }, + { + "acc": 0.77684221, + "epoch": 1.9061228656002567, + "grad_norm": 5.15625, + "learning_rate": 5.7618705304736676e-08, + "loss": 0.79659696, + "memory(GiB)": 138.1, + "step": 81700, + "train_speed(iter/s)": 0.200677 + }, + { + "acc": 0.76629496, + "epoch": 1.9063561731725456, + "grad_norm": 4.25, + "learning_rate": 5.733309195906811e-08, + "loss": 0.8493578, + "memory(GiB)": 138.1, + "step": 81710, + "train_speed(iter/s)": 0.20069 + }, + { + "acc": 0.76272459, + "epoch": 1.9065894807448345, + "grad_norm": 5.40625, + "learning_rate": 5.7048184177666e-08, + "loss": 0.84202557, + "memory(GiB)": 138.1, + "step": 81720, + "train_speed(iter/s)": 0.200703 + }, + { + "acc": 0.77872486, + "epoch": 1.9068227883171232, + "grad_norm": 7.9375, + "learning_rate": 5.676398200119837e-08, + "loss": 0.78867388, + "memory(GiB)": 138.1, + "step": 81730, + "train_speed(iter/s)": 0.200716 + }, + { + "acc": 0.79240322, + "epoch": 1.9070560958894123, + "grad_norm": 4.65625, + "learning_rate": 5.648048547023666e-08, + "loss": 0.71631851, + "memory(GiB)": 138.1, + "step": 81740, + "train_speed(iter/s)": 0.200729 + }, + { + "acc": 0.77871118, + "epoch": 1.907289403461701, + "grad_norm": 5.1875, + "learning_rate": 5.61976946252496e-08, + "loss": 0.79847703, + "memory(GiB)": 138.1, + "step": 81750, + "train_speed(iter/s)": 0.200742 + }, + { + "acc": 0.77508669, + "epoch": 1.90752271103399, + "grad_norm": 4.71875, + "learning_rate": 5.591560950660546e-08, + "loss": 0.8124464, + "memory(GiB)": 138.1, + "step": 81760, + "train_speed(iter/s)": 0.200755 + }, + { + "acc": 0.79896374, + "epoch": 1.9077560186062787, + "grad_norm": 5.15625, + "learning_rate": 5.563423015457203e-08, + "loss": 0.72898488, + "memory(GiB)": 138.1, + "step": 81770, + "train_speed(iter/s)": 0.200767 + }, + { + "acc": 0.80807238, + "epoch": 1.9079893261785679, + "grad_norm": 4.5, + "learning_rate": 5.535355660931552e-08, + "loss": 0.68237019, + "memory(GiB)": 138.1, + "step": 81780, + "train_speed(iter/s)": 0.200779 + }, + { + "acc": 0.77174668, + "epoch": 1.9082226337508565, + "grad_norm": 4.6875, + "learning_rate": 5.50735889109022e-08, + "loss": 0.82769318, + "memory(GiB)": 138.1, + "step": 81790, + "train_speed(iter/s)": 0.200792 + }, + { + "acc": 0.75991964, + "epoch": 1.9084559413231457, + "grad_norm": 4.3125, + "learning_rate": 5.4794327099297887e-08, + "loss": 0.86439896, + "memory(GiB)": 138.1, + "step": 81800, + "train_speed(iter/s)": 0.200805 + }, + { + "acc": 0.76893587, + "epoch": 1.9086892488954343, + "grad_norm": 5.15625, + "learning_rate": 5.45157712143668e-08, + "loss": 0.82247448, + "memory(GiB)": 138.1, + "step": 81810, + "train_speed(iter/s)": 0.200817 + }, + { + "acc": 0.79243269, + "epoch": 1.9089225564677235, + "grad_norm": 5.03125, + "learning_rate": 5.423792129587269e-08, + "loss": 0.71618028, + "memory(GiB)": 138.1, + "step": 81820, + "train_speed(iter/s)": 0.20083 + }, + { + "acc": 0.79535136, + "epoch": 1.9091558640400121, + "grad_norm": 4.21875, + "learning_rate": 5.396077738347882e-08, + "loss": 0.74078579, + "memory(GiB)": 138.1, + "step": 81830, + "train_speed(iter/s)": 0.200842 + }, + { + "acc": 0.79126263, + "epoch": 1.9093891716123013, + "grad_norm": 5.90625, + "learning_rate": 5.36843395167469e-08, + "loss": 0.76003933, + "memory(GiB)": 138.1, + "step": 81840, + "train_speed(iter/s)": 0.200855 + }, + { + "acc": 0.78736057, + "epoch": 1.90962247918459, + "grad_norm": 5.1875, + "learning_rate": 5.340860773513812e-08, + "loss": 0.76994362, + "memory(GiB)": 138.1, + "step": 81850, + "train_speed(iter/s)": 0.200867 + }, + { + "acc": 0.77512932, + "epoch": 1.909855786756879, + "grad_norm": 6.625, + "learning_rate": 5.313358207801433e-08, + "loss": 0.81734104, + "memory(GiB)": 138.1, + "step": 81860, + "train_speed(iter/s)": 0.20088 + }, + { + "acc": 0.77062392, + "epoch": 1.9100890943291677, + "grad_norm": 5.1875, + "learning_rate": 5.285926258463414e-08, + "loss": 0.80548515, + "memory(GiB)": 138.1, + "step": 81870, + "train_speed(iter/s)": 0.200893 + }, + { + "acc": 0.78182507, + "epoch": 1.9103224019014569, + "grad_norm": 5.4375, + "learning_rate": 5.2585649294157326e-08, + "loss": 0.79319897, + "memory(GiB)": 138.1, + "step": 81880, + "train_speed(iter/s)": 0.200904 + }, + { + "acc": 0.78957319, + "epoch": 1.9105557094737455, + "grad_norm": 7.28125, + "learning_rate": 5.231274224564154e-08, + "loss": 0.7621007, + "memory(GiB)": 138.1, + "step": 81890, + "train_speed(iter/s)": 0.200916 + }, + { + "acc": 0.77407713, + "epoch": 1.9107890170460347, + "grad_norm": 5.0625, + "learning_rate": 5.2040541478044496e-08, + "loss": 0.82251482, + "memory(GiB)": 138.1, + "step": 81900, + "train_speed(iter/s)": 0.200929 + }, + { + "acc": 0.75633006, + "epoch": 1.9110223246183233, + "grad_norm": 6.5, + "learning_rate": 5.176904703022345e-08, + "loss": 0.87226925, + "memory(GiB)": 138.1, + "step": 81910, + "train_speed(iter/s)": 0.200942 + }, + { + "acc": 0.77395215, + "epoch": 1.9112556321906125, + "grad_norm": 6.0625, + "learning_rate": 5.149825894093241e-08, + "loss": 0.80833683, + "memory(GiB)": 138.1, + "step": 81920, + "train_speed(iter/s)": 0.200955 + }, + { + "acc": 0.76668892, + "epoch": 1.9114889397629011, + "grad_norm": 7.75, + "learning_rate": 5.1228177248828224e-08, + "loss": 0.82612123, + "memory(GiB)": 138.1, + "step": 81930, + "train_speed(iter/s)": 0.200968 + }, + { + "acc": 0.7853972, + "epoch": 1.91172224733519, + "grad_norm": 6.59375, + "learning_rate": 5.0958801992463944e-08, + "loss": 0.76109056, + "memory(GiB)": 138.1, + "step": 81940, + "train_speed(iter/s)": 0.20098 + }, + { + "acc": 0.78190913, + "epoch": 1.911955554907479, + "grad_norm": 8.0, + "learning_rate": 5.069013321029326e-08, + "loss": 0.7810389, + "memory(GiB)": 138.1, + "step": 81950, + "train_speed(iter/s)": 0.200993 + }, + { + "acc": 0.76182289, + "epoch": 1.9121888624797678, + "grad_norm": 5.125, + "learning_rate": 5.0422170940667147e-08, + "loss": 0.85948324, + "memory(GiB)": 138.1, + "step": 81960, + "train_speed(iter/s)": 0.201006 + }, + { + "acc": 0.78014874, + "epoch": 1.9124221700520567, + "grad_norm": 7.4375, + "learning_rate": 5.015491522183946e-08, + "loss": 0.78254461, + "memory(GiB)": 138.1, + "step": 81970, + "train_speed(iter/s)": 0.201019 + }, + { + "acc": 0.77584715, + "epoch": 1.9126554776243456, + "grad_norm": 6.25, + "learning_rate": 4.9888366091959125e-08, + "loss": 0.8201292, + "memory(GiB)": 138.1, + "step": 81980, + "train_speed(iter/s)": 0.201031 + }, + { + "acc": 0.76424618, + "epoch": 1.9128887851966345, + "grad_norm": 5.1875, + "learning_rate": 4.962252358907627e-08, + "loss": 0.85319223, + "memory(GiB)": 138.1, + "step": 81990, + "train_speed(iter/s)": 0.201044 + }, + { + "acc": 0.76576996, + "epoch": 1.9131220927689234, + "grad_norm": 5.125, + "learning_rate": 4.935738775114052e-08, + "loss": 0.84802933, + "memory(GiB)": 138.1, + "step": 82000, + "train_speed(iter/s)": 0.201056 + }, + { + "epoch": 1.9131220927689234, + "eval_acc": 0.7447390617886093, + "eval_loss": 0.8044153451919556, + "eval_runtime": 1273.2096, + "eval_samples_per_second": 28.268, + "eval_steps_per_second": 14.134, + "step": 82000 + }, + { + "acc": 0.78968439, + "epoch": 1.9133554003412123, + "grad_norm": 5.59375, + "learning_rate": 4.9092958615999385e-08, + "loss": 0.77019253, + "memory(GiB)": 138.1, + "step": 82010, + "train_speed(iter/s)": 0.20043 + }, + { + "acc": 0.75527754, + "epoch": 1.9135887079135012, + "grad_norm": 14.875, + "learning_rate": 4.882923622140046e-08, + "loss": 0.87390118, + "memory(GiB)": 138.1, + "step": 82020, + "train_speed(iter/s)": 0.200442 + }, + { + "acc": 0.77405939, + "epoch": 1.9138220154857901, + "grad_norm": 5.5625, + "learning_rate": 4.856622060498972e-08, + "loss": 0.80371208, + "memory(GiB)": 138.1, + "step": 82030, + "train_speed(iter/s)": 0.200455 + }, + { + "acc": 0.78950992, + "epoch": 1.914055323058079, + "grad_norm": 3.859375, + "learning_rate": 4.83039118043116e-08, + "loss": 0.744206, + "memory(GiB)": 138.1, + "step": 82040, + "train_speed(iter/s)": 0.200467 + }, + { + "acc": 0.78136091, + "epoch": 1.914288630630368, + "grad_norm": 9.9375, + "learning_rate": 4.80423098568128e-08, + "loss": 0.781563, + "memory(GiB)": 138.1, + "step": 82050, + "train_speed(iter/s)": 0.20048 + }, + { + "acc": 0.76963954, + "epoch": 1.9145219382026568, + "grad_norm": 4.9375, + "learning_rate": 4.7781414799835116e-08, + "loss": 0.82037382, + "memory(GiB)": 138.1, + "step": 82060, + "train_speed(iter/s)": 0.200492 + }, + { + "acc": 0.80206232, + "epoch": 1.9147552457749457, + "grad_norm": 5.6875, + "learning_rate": 4.752122667062209e-08, + "loss": 0.70895433, + "memory(GiB)": 138.1, + "step": 82070, + "train_speed(iter/s)": 0.200505 + }, + { + "acc": 0.79812136, + "epoch": 1.9149885533472346, + "grad_norm": 4.40625, + "learning_rate": 4.726174550631513e-08, + "loss": 0.68837681, + "memory(GiB)": 138.1, + "step": 82080, + "train_speed(iter/s)": 0.200516 + }, + { + "acc": 0.78133001, + "epoch": 1.9152218609195235, + "grad_norm": 4.71875, + "learning_rate": 4.7002971343955153e-08, + "loss": 0.74567728, + "memory(GiB)": 138.1, + "step": 82090, + "train_speed(iter/s)": 0.200529 + }, + { + "acc": 0.80090256, + "epoch": 1.9154551684918124, + "grad_norm": 4.21875, + "learning_rate": 4.6744904220482056e-08, + "loss": 0.71008487, + "memory(GiB)": 138.1, + "step": 82100, + "train_speed(iter/s)": 0.200542 + }, + { + "acc": 0.7789093, + "epoch": 1.9156884760641013, + "grad_norm": 4.59375, + "learning_rate": 4.648754417273526e-08, + "loss": 0.78899159, + "memory(GiB)": 138.1, + "step": 82110, + "train_speed(iter/s)": 0.200554 + }, + { + "acc": 0.77255368, + "epoch": 1.91592178363639, + "grad_norm": 7.25, + "learning_rate": 4.623089123745261e-08, + "loss": 0.82642889, + "memory(GiB)": 138.1, + "step": 82120, + "train_speed(iter/s)": 0.200566 + }, + { + "acc": 0.79890847, + "epoch": 1.916155091208679, + "grad_norm": 5.15625, + "learning_rate": 4.597494545127035e-08, + "loss": 0.71255264, + "memory(GiB)": 138.1, + "step": 82130, + "train_speed(iter/s)": 0.200579 + }, + { + "acc": 0.77682538, + "epoch": 1.9163883987809678, + "grad_norm": 8.0625, + "learning_rate": 4.571970685072646e-08, + "loss": 0.7757277, + "memory(GiB)": 138.1, + "step": 82140, + "train_speed(iter/s)": 0.200591 + }, + { + "acc": 0.78598347, + "epoch": 1.916621706353257, + "grad_norm": 3.734375, + "learning_rate": 4.5465175472254594e-08, + "loss": 0.75754504, + "memory(GiB)": 138.1, + "step": 82150, + "train_speed(iter/s)": 0.200604 + }, + { + "acc": 0.78972554, + "epoch": 1.9168550139255456, + "grad_norm": 8.1875, + "learning_rate": 4.521135135218957e-08, + "loss": 0.7571701, + "memory(GiB)": 138.1, + "step": 82160, + "train_speed(iter/s)": 0.200615 + }, + { + "acc": 0.77707663, + "epoch": 1.9170883214978347, + "grad_norm": 4.84375, + "learning_rate": 4.495823452676518e-08, + "loss": 0.82133408, + "memory(GiB)": 138.1, + "step": 82170, + "train_speed(iter/s)": 0.200628 + }, + { + "acc": 0.76835198, + "epoch": 1.9173216290701234, + "grad_norm": 5.5, + "learning_rate": 4.4705825032113624e-08, + "loss": 0.83846817, + "memory(GiB)": 138.1, + "step": 82180, + "train_speed(iter/s)": 0.20064 + }, + { + "acc": 0.77485347, + "epoch": 1.9175549366424125, + "grad_norm": 6.78125, + "learning_rate": 4.445412290426554e-08, + "loss": 0.81650391, + "memory(GiB)": 138.1, + "step": 82190, + "train_speed(iter/s)": 0.200653 + }, + { + "acc": 0.77240715, + "epoch": 1.9177882442147012, + "grad_norm": 7.0625, + "learning_rate": 4.420312817915162e-08, + "loss": 0.81943703, + "memory(GiB)": 138.1, + "step": 82200, + "train_speed(iter/s)": 0.200666 + }, + { + "acc": 0.7817626, + "epoch": 1.9180215517869903, + "grad_norm": 6.25, + "learning_rate": 4.39528408926021e-08, + "loss": 0.78492823, + "memory(GiB)": 138.1, + "step": 82210, + "train_speed(iter/s)": 0.200678 + }, + { + "acc": 0.77328238, + "epoch": 1.918254859359279, + "grad_norm": 6.84375, + "learning_rate": 4.370326108034395e-08, + "loss": 0.81196861, + "memory(GiB)": 138.1, + "step": 82220, + "train_speed(iter/s)": 0.200691 + }, + { + "acc": 0.76239452, + "epoch": 1.918488166931568, + "grad_norm": 7.53125, + "learning_rate": 4.3454388778005894e-08, + "loss": 0.88454895, + "memory(GiB)": 138.1, + "step": 82230, + "train_speed(iter/s)": 0.200703 + }, + { + "acc": 0.76898079, + "epoch": 1.9187214745038568, + "grad_norm": 5.25, + "learning_rate": 4.3206224021113966e-08, + "loss": 0.84615307, + "memory(GiB)": 138.1, + "step": 82240, + "train_speed(iter/s)": 0.200715 + }, + { + "acc": 0.765411, + "epoch": 1.918954782076146, + "grad_norm": 6.9375, + "learning_rate": 4.2958766845093704e-08, + "loss": 0.85209064, + "memory(GiB)": 138.1, + "step": 82250, + "train_speed(iter/s)": 0.200728 + }, + { + "acc": 0.77735281, + "epoch": 1.9191880896484346, + "grad_norm": 6.15625, + "learning_rate": 4.271201728526963e-08, + "loss": 0.79499168, + "memory(GiB)": 138.1, + "step": 82260, + "train_speed(iter/s)": 0.200741 + }, + { + "acc": 0.78337259, + "epoch": 1.9194213972207237, + "grad_norm": 4.71875, + "learning_rate": 4.24659753768647e-08, + "loss": 0.80247488, + "memory(GiB)": 138.1, + "step": 82270, + "train_speed(iter/s)": 0.200755 + }, + { + "acc": 0.78709459, + "epoch": 1.9196547047930124, + "grad_norm": 5.21875, + "learning_rate": 4.222064115500191e-08, + "loss": 0.76674123, + "memory(GiB)": 138.1, + "step": 82280, + "train_speed(iter/s)": 0.200767 + }, + { + "acc": 0.77503223, + "epoch": 1.9198880123653015, + "grad_norm": 11.125, + "learning_rate": 4.1976014654702135e-08, + "loss": 0.8402914, + "memory(GiB)": 138.1, + "step": 82290, + "train_speed(iter/s)": 0.200779 + }, + { + "acc": 0.77781281, + "epoch": 1.9201213199375902, + "grad_norm": 7.5, + "learning_rate": 4.1732095910885785e-08, + "loss": 0.78994112, + "memory(GiB)": 138.1, + "step": 82300, + "train_speed(iter/s)": 0.200792 + }, + { + "acc": 0.79021931, + "epoch": 1.920354627509879, + "grad_norm": 4.34375, + "learning_rate": 4.148888495837222e-08, + "loss": 0.7459506, + "memory(GiB)": 138.1, + "step": 82310, + "train_speed(iter/s)": 0.200805 + }, + { + "acc": 0.79193068, + "epoch": 1.920587935082168, + "grad_norm": 4.40625, + "learning_rate": 4.1246381831880345e-08, + "loss": 0.73066068, + "memory(GiB)": 138.1, + "step": 82320, + "train_speed(iter/s)": 0.200818 + }, + { + "acc": 0.7667829, + "epoch": 1.9208212426544569, + "grad_norm": 4.65625, + "learning_rate": 4.1004586566026904e-08, + "loss": 0.80857086, + "memory(GiB)": 138.1, + "step": 82330, + "train_speed(iter/s)": 0.200831 + }, + { + "acc": 0.78284898, + "epoch": 1.9210545502267458, + "grad_norm": 6.53125, + "learning_rate": 4.076349919532763e-08, + "loss": 0.79354701, + "memory(GiB)": 138.1, + "step": 82340, + "train_speed(iter/s)": 0.200844 + }, + { + "acc": 0.79704752, + "epoch": 1.9212878577990347, + "grad_norm": 6.46875, + "learning_rate": 4.052311975419887e-08, + "loss": 0.75222983, + "memory(GiB)": 138.1, + "step": 82350, + "train_speed(iter/s)": 0.200856 + }, + { + "acc": 0.78382096, + "epoch": 1.9215211653713236, + "grad_norm": 4.25, + "learning_rate": 4.0283448276953186e-08, + "loss": 0.79394064, + "memory(GiB)": 138.1, + "step": 82360, + "train_speed(iter/s)": 0.200869 + }, + { + "acc": 0.78321552, + "epoch": 1.9217544729436125, + "grad_norm": 3.859375, + "learning_rate": 4.004448479780543e-08, + "loss": 0.77877493, + "memory(GiB)": 138.1, + "step": 82370, + "train_speed(iter/s)": 0.200881 + }, + { + "acc": 0.78489828, + "epoch": 1.9219877805159014, + "grad_norm": 3.953125, + "learning_rate": 3.9806229350865534e-08, + "loss": 0.76953378, + "memory(GiB)": 138.1, + "step": 82380, + "train_speed(iter/s)": 0.200893 + }, + { + "acc": 0.77053556, + "epoch": 1.9222210880881903, + "grad_norm": 4.78125, + "learning_rate": 3.95686819701463e-08, + "loss": 0.85348272, + "memory(GiB)": 138.1, + "step": 82390, + "train_speed(iter/s)": 0.200906 + }, + { + "acc": 0.77399349, + "epoch": 1.9224543956604792, + "grad_norm": 7.875, + "learning_rate": 3.93318426895567e-08, + "loss": 0.81090708, + "memory(GiB)": 138.1, + "step": 82400, + "train_speed(iter/s)": 0.200918 + }, + { + "acc": 0.79830041, + "epoch": 1.922687703232768, + "grad_norm": 5.28125, + "learning_rate": 3.9095711542905257e-08, + "loss": 0.71653395, + "memory(GiB)": 138.1, + "step": 82410, + "train_speed(iter/s)": 0.200931 + }, + { + "acc": 0.81262779, + "epoch": 1.922921010805057, + "grad_norm": 5.4375, + "learning_rate": 3.8860288563900006e-08, + "loss": 0.65959501, + "memory(GiB)": 138.1, + "step": 82420, + "train_speed(iter/s)": 0.200944 + }, + { + "acc": 0.77822075, + "epoch": 1.9231543183773459, + "grad_norm": 5.53125, + "learning_rate": 3.862557378614684e-08, + "loss": 0.79374828, + "memory(GiB)": 138.1, + "step": 82430, + "train_speed(iter/s)": 0.200957 + }, + { + "acc": 0.7975666, + "epoch": 1.9233876259496347, + "grad_norm": 7.4375, + "learning_rate": 3.839156724315174e-08, + "loss": 0.73447914, + "memory(GiB)": 138.1, + "step": 82440, + "train_speed(iter/s)": 0.200969 + }, + { + "acc": 0.78388729, + "epoch": 1.9236209335219236, + "grad_norm": 3.90625, + "learning_rate": 3.815826896831909e-08, + "loss": 0.79074955, + "memory(GiB)": 138.1, + "step": 82450, + "train_speed(iter/s)": 0.200981 + }, + { + "acc": 0.78147302, + "epoch": 1.9238542410942125, + "grad_norm": 5.875, + "learning_rate": 3.792567899495281e-08, + "loss": 0.78904285, + "memory(GiB)": 138.1, + "step": 82460, + "train_speed(iter/s)": 0.200994 + }, + { + "acc": 0.79638376, + "epoch": 1.9240875486665014, + "grad_norm": 6.28125, + "learning_rate": 3.7693797356254115e-08, + "loss": 0.72602043, + "memory(GiB)": 138.1, + "step": 82470, + "train_speed(iter/s)": 0.201007 + }, + { + "acc": 0.79340577, + "epoch": 1.9243208562387903, + "grad_norm": 5.25, + "learning_rate": 3.746262408532375e-08, + "loss": 0.72227745, + "memory(GiB)": 138.1, + "step": 82480, + "train_speed(iter/s)": 0.201019 + }, + { + "acc": 0.76795883, + "epoch": 1.9245541638110792, + "grad_norm": 6.40625, + "learning_rate": 3.723215921516254e-08, + "loss": 0.83940735, + "memory(GiB)": 138.1, + "step": 82490, + "train_speed(iter/s)": 0.201032 + }, + { + "acc": 0.78825803, + "epoch": 1.9247874713833681, + "grad_norm": 6.46875, + "learning_rate": 3.7002402778668625e-08, + "loss": 0.78783998, + "memory(GiB)": 138.1, + "step": 82500, + "train_speed(iter/s)": 0.201046 + }, + { + "epoch": 1.9247874713833681, + "eval_acc": 0.7447004670286247, + "eval_loss": 0.8044062852859497, + "eval_runtime": 1271.4855, + "eval_samples_per_second": 28.306, + "eval_steps_per_second": 14.154, + "step": 82500 + }, + { + "acc": 0.79530158, + "epoch": 1.9250207789556568, + "grad_norm": 4.71875, + "learning_rate": 3.6773354808640194e-08, + "loss": 0.71304455, + "memory(GiB)": 138.1, + "step": 82510, + "train_speed(iter/s)": 0.200425 + }, + { + "acc": 0.79802475, + "epoch": 1.925254086527946, + "grad_norm": 3.96875, + "learning_rate": 3.654501533777388e-08, + "loss": 0.71960392, + "memory(GiB)": 138.1, + "step": 82520, + "train_speed(iter/s)": 0.200437 + }, + { + "acc": 0.77804232, + "epoch": 1.9254873941002346, + "grad_norm": 4.375, + "learning_rate": 3.6317384398664167e-08, + "loss": 0.79983625, + "memory(GiB)": 138.1, + "step": 82530, + "train_speed(iter/s)": 0.20045 + }, + { + "acc": 0.78172102, + "epoch": 1.9257207016725237, + "grad_norm": 5.0625, + "learning_rate": 3.6090462023806175e-08, + "loss": 0.79019833, + "memory(GiB)": 138.1, + "step": 82540, + "train_speed(iter/s)": 0.200463 + }, + { + "acc": 0.77815161, + "epoch": 1.9259540092448124, + "grad_norm": 3.71875, + "learning_rate": 3.586424824559287e-08, + "loss": 0.80372667, + "memory(GiB)": 138.1, + "step": 82550, + "train_speed(iter/s)": 0.200475 + }, + { + "acc": 0.77129049, + "epoch": 1.9261873168171015, + "grad_norm": 5.28125, + "learning_rate": 3.563874309631565e-08, + "loss": 0.84257956, + "memory(GiB)": 138.1, + "step": 82560, + "train_speed(iter/s)": 0.200487 + }, + { + "acc": 0.77810478, + "epoch": 1.9264206243893902, + "grad_norm": 5.03125, + "learning_rate": 3.5413946608165995e-08, + "loss": 0.79976082, + "memory(GiB)": 138.1, + "step": 82570, + "train_speed(iter/s)": 0.200499 + }, + { + "acc": 0.77861633, + "epoch": 1.9266539319616793, + "grad_norm": 5.5625, + "learning_rate": 3.518985881323322e-08, + "loss": 0.78327084, + "memory(GiB)": 138.1, + "step": 82580, + "train_speed(iter/s)": 0.200511 + }, + { + "acc": 0.80329628, + "epoch": 1.926887239533968, + "grad_norm": 6.125, + "learning_rate": 3.496647974350509e-08, + "loss": 0.69266319, + "memory(GiB)": 138.1, + "step": 82590, + "train_speed(iter/s)": 0.200524 + }, + { + "acc": 0.77527142, + "epoch": 1.9271205471062571, + "grad_norm": 4.8125, + "learning_rate": 3.4743809430870525e-08, + "loss": 0.80601139, + "memory(GiB)": 138.1, + "step": 82600, + "train_speed(iter/s)": 0.200538 + }, + { + "acc": 0.7910471, + "epoch": 1.9273538546785458, + "grad_norm": 5.1875, + "learning_rate": 3.452184790711466e-08, + "loss": 0.7612885, + "memory(GiB)": 138.1, + "step": 82610, + "train_speed(iter/s)": 0.200551 + }, + { + "acc": 0.77904315, + "epoch": 1.927587162250835, + "grad_norm": 5.53125, + "learning_rate": 3.430059520392215e-08, + "loss": 0.79040279, + "memory(GiB)": 138.1, + "step": 82620, + "train_speed(iter/s)": 0.200564 + }, + { + "acc": 0.80225992, + "epoch": 1.9278204698231236, + "grad_norm": 4.65625, + "learning_rate": 3.408005135287773e-08, + "loss": 0.70399323, + "memory(GiB)": 138.1, + "step": 82630, + "train_speed(iter/s)": 0.200576 + }, + { + "acc": 0.75945187, + "epoch": 1.9280537773954127, + "grad_norm": 5.90625, + "learning_rate": 3.386021638546233e-08, + "loss": 0.87596512, + "memory(GiB)": 138.1, + "step": 82640, + "train_speed(iter/s)": 0.200586 + }, + { + "acc": 0.7902338, + "epoch": 1.9282870849677014, + "grad_norm": 5.5, + "learning_rate": 3.364109033305918e-08, + "loss": 0.74522243, + "memory(GiB)": 138.1, + "step": 82650, + "train_speed(iter/s)": 0.200599 + }, + { + "acc": 0.78515444, + "epoch": 1.9285203925399905, + "grad_norm": 4.71875, + "learning_rate": 3.3422673226947145e-08, + "loss": 0.76768475, + "memory(GiB)": 138.1, + "step": 82660, + "train_speed(iter/s)": 0.200611 + }, + { + "acc": 0.77809896, + "epoch": 1.9287537001122792, + "grad_norm": 6.21875, + "learning_rate": 3.3204965098306284e-08, + "loss": 0.79778728, + "memory(GiB)": 138.1, + "step": 82670, + "train_speed(iter/s)": 0.200624 + }, + { + "acc": 0.77451992, + "epoch": 1.9289870076845683, + "grad_norm": 7.25, + "learning_rate": 3.298796597821341e-08, + "loss": 0.80516586, + "memory(GiB)": 138.1, + "step": 82680, + "train_speed(iter/s)": 0.200636 + }, + { + "acc": 0.77414317, + "epoch": 1.929220315256857, + "grad_norm": 6.0, + "learning_rate": 3.2771675897645405e-08, + "loss": 0.80401745, + "memory(GiB)": 138.1, + "step": 82690, + "train_speed(iter/s)": 0.200649 + }, + { + "acc": 0.78727055, + "epoch": 1.929453622829146, + "grad_norm": 5.78125, + "learning_rate": 3.255609488747813e-08, + "loss": 0.74130497, + "memory(GiB)": 138.1, + "step": 82700, + "train_speed(iter/s)": 0.200661 + }, + { + "acc": 0.78334398, + "epoch": 1.9296869304014348, + "grad_norm": 4.40625, + "learning_rate": 3.234122297848474e-08, + "loss": 0.75857935, + "memory(GiB)": 138.1, + "step": 82710, + "train_speed(iter/s)": 0.200673 + }, + { + "acc": 0.78041697, + "epoch": 1.9299202379737237, + "grad_norm": 5.4375, + "learning_rate": 3.212706020133904e-08, + "loss": 0.80112228, + "memory(GiB)": 138.1, + "step": 82720, + "train_speed(iter/s)": 0.200685 + }, + { + "acc": 0.78621788, + "epoch": 1.9301535455460126, + "grad_norm": 5.875, + "learning_rate": 3.1913606586612135e-08, + "loss": 0.74973865, + "memory(GiB)": 138.1, + "step": 82730, + "train_speed(iter/s)": 0.200697 + }, + { + "acc": 0.76397328, + "epoch": 1.9303868531183015, + "grad_norm": 6.0, + "learning_rate": 3.170086216477464e-08, + "loss": 0.86834965, + "memory(GiB)": 138.1, + "step": 82740, + "train_speed(iter/s)": 0.20071 + }, + { + "acc": 0.76520152, + "epoch": 1.9306201606905904, + "grad_norm": 4.59375, + "learning_rate": 3.148882696619615e-08, + "loss": 0.86656876, + "memory(GiB)": 138.1, + "step": 82750, + "train_speed(iter/s)": 0.200721 + }, + { + "acc": 0.78473167, + "epoch": 1.9308534682628793, + "grad_norm": 5.59375, + "learning_rate": 3.127750102114358e-08, + "loss": 0.77946978, + "memory(GiB)": 138.1, + "step": 82760, + "train_speed(iter/s)": 0.200733 + }, + { + "acc": 0.78480234, + "epoch": 1.9310867758351682, + "grad_norm": 8.0, + "learning_rate": 3.1066884359785e-08, + "loss": 0.77041464, + "memory(GiB)": 138.1, + "step": 82770, + "train_speed(iter/s)": 0.200747 + }, + { + "acc": 0.81039772, + "epoch": 1.931320083407457, + "grad_norm": 5.5, + "learning_rate": 3.085697701218471e-08, + "loss": 0.65555139, + "memory(GiB)": 138.1, + "step": 82780, + "train_speed(iter/s)": 0.20076 + }, + { + "acc": 0.77078209, + "epoch": 1.931553390979746, + "grad_norm": 4.8125, + "learning_rate": 3.064777900830762e-08, + "loss": 0.82027807, + "memory(GiB)": 138.1, + "step": 82790, + "train_speed(iter/s)": 0.200772 + }, + { + "acc": 0.77345366, + "epoch": 1.9317866985520349, + "grad_norm": 5.875, + "learning_rate": 3.043929037801596e-08, + "loss": 0.79783955, + "memory(GiB)": 138.1, + "step": 82800, + "train_speed(iter/s)": 0.200786 + }, + { + "acc": 0.78663902, + "epoch": 1.9320200061243238, + "grad_norm": 8.375, + "learning_rate": 3.023151115107259e-08, + "loss": 0.77612534, + "memory(GiB)": 138.1, + "step": 82810, + "train_speed(iter/s)": 0.200799 + }, + { + "acc": 0.78559351, + "epoch": 1.9322533136966127, + "grad_norm": 7.21875, + "learning_rate": 3.002444135713711e-08, + "loss": 0.76495595, + "memory(GiB)": 138.1, + "step": 82820, + "train_speed(iter/s)": 0.200811 + }, + { + "acc": 0.79293275, + "epoch": 1.9324866212689016, + "grad_norm": 5.03125, + "learning_rate": 2.9818081025768667e-08, + "loss": 0.75699339, + "memory(GiB)": 138.1, + "step": 82830, + "train_speed(iter/s)": 0.200823 + }, + { + "acc": 0.77729588, + "epoch": 1.9327199288411905, + "grad_norm": 7.78125, + "learning_rate": 2.9612430186425346e-08, + "loss": 0.81681576, + "memory(GiB)": 138.1, + "step": 82840, + "train_speed(iter/s)": 0.200836 + }, + { + "acc": 0.78396635, + "epoch": 1.9329532364134794, + "grad_norm": 12.375, + "learning_rate": 2.9407488868463675e-08, + "loss": 0.77789707, + "memory(GiB)": 138.1, + "step": 82850, + "train_speed(iter/s)": 0.200849 + }, + { + "acc": 0.77509494, + "epoch": 1.9331865439857683, + "grad_norm": 6.46875, + "learning_rate": 2.9203257101139694e-08, + "loss": 0.80295143, + "memory(GiB)": 138.1, + "step": 82860, + "train_speed(iter/s)": 0.200862 + }, + { + "acc": 0.78139973, + "epoch": 1.9334198515580572, + "grad_norm": 5.03125, + "learning_rate": 2.8999734913606193e-08, + "loss": 0.7943119, + "memory(GiB)": 138.1, + "step": 82870, + "train_speed(iter/s)": 0.200875 + }, + { + "acc": 0.77936926, + "epoch": 1.933653159130346, + "grad_norm": 8.5, + "learning_rate": 2.8796922334916044e-08, + "loss": 0.78870902, + "memory(GiB)": 138.1, + "step": 82880, + "train_speed(iter/s)": 0.200887 + }, + { + "acc": 0.78523445, + "epoch": 1.933886466702635, + "grad_norm": 4.71875, + "learning_rate": 2.8594819394021646e-08, + "loss": 0.76724324, + "memory(GiB)": 138.1, + "step": 82890, + "train_speed(iter/s)": 0.2009 + }, + { + "acc": 0.78431993, + "epoch": 1.9341197742749237, + "grad_norm": 4.65625, + "learning_rate": 2.8393426119772138e-08, + "loss": 0.77278762, + "memory(GiB)": 138.1, + "step": 82900, + "train_speed(iter/s)": 0.200912 + }, + { + "acc": 0.77910967, + "epoch": 1.9343530818472128, + "grad_norm": 14.5625, + "learning_rate": 2.8192742540917305e-08, + "loss": 0.80649271, + "memory(GiB)": 138.1, + "step": 82910, + "train_speed(iter/s)": 0.200925 + }, + { + "acc": 0.78101177, + "epoch": 1.9345863894195015, + "grad_norm": 5.8125, + "learning_rate": 2.799276868610368e-08, + "loss": 0.7846736, + "memory(GiB)": 138.1, + "step": 82920, + "train_speed(iter/s)": 0.200938 + }, + { + "acc": 0.7781353, + "epoch": 1.9348196969917906, + "grad_norm": 4.75, + "learning_rate": 2.7793504583878417e-08, + "loss": 0.79075127, + "memory(GiB)": 138.1, + "step": 82930, + "train_speed(iter/s)": 0.20095 + }, + { + "acc": 0.77793598, + "epoch": 1.9350530045640792, + "grad_norm": 5.53125, + "learning_rate": 2.7594950262685438e-08, + "loss": 0.78132844, + "memory(GiB)": 138.1, + "step": 82940, + "train_speed(iter/s)": 0.200962 + }, + { + "acc": 0.76155319, + "epoch": 1.9352863121363684, + "grad_norm": 8.5625, + "learning_rate": 2.739710575086929e-08, + "loss": 0.8619318, + "memory(GiB)": 138.1, + "step": 82950, + "train_speed(iter/s)": 0.200975 + }, + { + "acc": 0.80455065, + "epoch": 1.935519619708657, + "grad_norm": 5.09375, + "learning_rate": 2.719997107667127e-08, + "loss": 0.71163216, + "memory(GiB)": 138.1, + "step": 82960, + "train_speed(iter/s)": 0.200987 + }, + { + "acc": 0.78168869, + "epoch": 1.9357529272809462, + "grad_norm": 8.0, + "learning_rate": 2.7003546268233317e-08, + "loss": 0.76260276, + "memory(GiB)": 138.1, + "step": 82970, + "train_speed(iter/s)": 0.200999 + }, + { + "acc": 0.75511971, + "epoch": 1.9359862348532348, + "grad_norm": 4.59375, + "learning_rate": 2.6807831353594106e-08, + "loss": 0.87710342, + "memory(GiB)": 138.1, + "step": 82980, + "train_speed(iter/s)": 0.20101 + }, + { + "acc": 0.80496244, + "epoch": 1.936219542425524, + "grad_norm": 5.28125, + "learning_rate": 2.66128263606924e-08, + "loss": 0.68820372, + "memory(GiB)": 138.1, + "step": 82990, + "train_speed(iter/s)": 0.201023 + }, + { + "acc": 0.78904171, + "epoch": 1.9364528499978126, + "grad_norm": 5.65625, + "learning_rate": 2.6418531317364825e-08, + "loss": 0.74845371, + "memory(GiB)": 138.1, + "step": 83000, + "train_speed(iter/s)": 0.201035 + }, + { + "epoch": 1.9364528499978126, + "eval_acc": 0.7447185633268747, + "eval_loss": 0.8043991327285767, + "eval_runtime": 1272.2124, + "eval_samples_per_second": 28.29, + "eval_steps_per_second": 14.145, + "step": 83000 + }, + { + "acc": 0.76983991, + "epoch": 1.9366861575701018, + "grad_norm": 5.0625, + "learning_rate": 2.6224946251346973e-08, + "loss": 0.84707108, + "memory(GiB)": 138.1, + "step": 83010, + "train_speed(iter/s)": 0.200418 + }, + { + "acc": 0.76536455, + "epoch": 1.9369194651423904, + "grad_norm": 4.5625, + "learning_rate": 2.60320711902734e-08, + "loss": 0.84149084, + "memory(GiB)": 138.1, + "step": 83020, + "train_speed(iter/s)": 0.200431 + }, + { + "acc": 0.78333254, + "epoch": 1.9371527727146796, + "grad_norm": 5.375, + "learning_rate": 2.5839906161676532e-08, + "loss": 0.77945223, + "memory(GiB)": 138.1, + "step": 83030, + "train_speed(iter/s)": 0.200443 + }, + { + "acc": 0.79502096, + "epoch": 1.9373860802869682, + "grad_norm": 5.1875, + "learning_rate": 2.564845119298831e-08, + "loss": 0.73158178, + "memory(GiB)": 138.1, + "step": 83040, + "train_speed(iter/s)": 0.200455 + }, + { + "acc": 0.77735152, + "epoch": 1.9376193878592574, + "grad_norm": 5.25, + "learning_rate": 2.5457706311538544e-08, + "loss": 0.81248274, + "memory(GiB)": 138.1, + "step": 83050, + "train_speed(iter/s)": 0.200467 + }, + { + "acc": 0.78180962, + "epoch": 1.937852695431546, + "grad_norm": 4.5625, + "learning_rate": 2.526767154455545e-08, + "loss": 0.7838274, + "memory(GiB)": 138.1, + "step": 83060, + "train_speed(iter/s)": 0.200479 + }, + { + "acc": 0.80066261, + "epoch": 1.9380860030038352, + "grad_norm": 5.5, + "learning_rate": 2.5078346919167883e-08, + "loss": 0.71001968, + "memory(GiB)": 138.1, + "step": 83070, + "train_speed(iter/s)": 0.200491 + }, + { + "acc": 0.76299033, + "epoch": 1.9383193105761238, + "grad_norm": 6.84375, + "learning_rate": 2.4889732462400895e-08, + "loss": 0.84023495, + "memory(GiB)": 138.1, + "step": 83080, + "train_speed(iter/s)": 0.200504 + }, + { + "acc": 0.79042563, + "epoch": 1.9385526181484127, + "grad_norm": 5.71875, + "learning_rate": 2.4701828201179057e-08, + "loss": 0.74814739, + "memory(GiB)": 138.1, + "step": 83090, + "train_speed(iter/s)": 0.200516 + }, + { + "acc": 0.78727136, + "epoch": 1.9387859257207016, + "grad_norm": 5.53125, + "learning_rate": 2.451463416232591e-08, + "loss": 0.76182685, + "memory(GiB)": 138.1, + "step": 83100, + "train_speed(iter/s)": 0.200529 + }, + { + "acc": 0.78427858, + "epoch": 1.9390192332929905, + "grad_norm": 6.0625, + "learning_rate": 2.4328150372563974e-08, + "loss": 0.77642002, + "memory(GiB)": 138.1, + "step": 83110, + "train_speed(iter/s)": 0.200541 + }, + { + "acc": 0.75748053, + "epoch": 1.9392525408652794, + "grad_norm": 5.46875, + "learning_rate": 2.4142376858512506e-08, + "loss": 0.87451792, + "memory(GiB)": 138.1, + "step": 83120, + "train_speed(iter/s)": 0.200554 + }, + { + "acc": 0.79059162, + "epoch": 1.9394858484375683, + "grad_norm": 5.4375, + "learning_rate": 2.3957313646691406e-08, + "loss": 0.74902925, + "memory(GiB)": 138.1, + "step": 83130, + "train_speed(iter/s)": 0.200567 + }, + { + "acc": 0.78442965, + "epoch": 1.9397191560098572, + "grad_norm": 4.5625, + "learning_rate": 2.3772960763518428e-08, + "loss": 0.77176137, + "memory(GiB)": 138.1, + "step": 83140, + "train_speed(iter/s)": 0.20058 + }, + { + "acc": 0.78968287, + "epoch": 1.9399524635821461, + "grad_norm": 4.34375, + "learning_rate": 2.358931823530919e-08, + "loss": 0.73917861, + "memory(GiB)": 138.1, + "step": 83150, + "train_speed(iter/s)": 0.200592 + }, + { + "acc": 0.79481835, + "epoch": 1.940185771154435, + "grad_norm": 4.625, + "learning_rate": 2.3406386088279387e-08, + "loss": 0.73219633, + "memory(GiB)": 138.1, + "step": 83160, + "train_speed(iter/s)": 0.200605 + }, + { + "acc": 0.78026934, + "epoch": 1.940419078726724, + "grad_norm": 9.0, + "learning_rate": 2.3224164348542576e-08, + "loss": 0.78946395, + "memory(GiB)": 138.1, + "step": 83170, + "train_speed(iter/s)": 0.200617 + }, + { + "acc": 0.77109671, + "epoch": 1.9406523862990128, + "grad_norm": 10.5625, + "learning_rate": 2.3042653042110175e-08, + "loss": 0.8287899, + "memory(GiB)": 138.1, + "step": 83180, + "train_speed(iter/s)": 0.20063 + }, + { + "acc": 0.77708349, + "epoch": 1.9408856938713017, + "grad_norm": 4.84375, + "learning_rate": 2.2861852194893118e-08, + "loss": 0.80719547, + "memory(GiB)": 138.1, + "step": 83190, + "train_speed(iter/s)": 0.200642 + }, + { + "acc": 0.78134727, + "epoch": 1.9411190014435906, + "grad_norm": 6.28125, + "learning_rate": 2.2681761832701323e-08, + "loss": 0.78660898, + "memory(GiB)": 138.1, + "step": 83200, + "train_speed(iter/s)": 0.200654 + }, + { + "acc": 0.76660013, + "epoch": 1.9413523090158795, + "grad_norm": 7.09375, + "learning_rate": 2.2502381981241993e-08, + "loss": 0.84043627, + "memory(GiB)": 138.1, + "step": 83210, + "train_speed(iter/s)": 0.200666 + }, + { + "acc": 0.78337593, + "epoch": 1.9415856165881684, + "grad_norm": 4.5, + "learning_rate": 2.2323712666121324e-08, + "loss": 0.77146144, + "memory(GiB)": 138.1, + "step": 83220, + "train_speed(iter/s)": 0.200679 + }, + { + "acc": 0.78208766, + "epoch": 1.9418189241604573, + "grad_norm": 4.625, + "learning_rate": 2.2145753912845014e-08, + "loss": 0.78833818, + "memory(GiB)": 138.1, + "step": 83230, + "train_speed(iter/s)": 0.200692 + }, + { + "acc": 0.77869391, + "epoch": 1.9420522317327462, + "grad_norm": 7.71875, + "learning_rate": 2.1968505746815527e-08, + "loss": 0.79904451, + "memory(GiB)": 138.1, + "step": 83240, + "train_speed(iter/s)": 0.200703 + }, + { + "acc": 0.78262606, + "epoch": 1.9422855393050351, + "grad_norm": 5.78125, + "learning_rate": 2.179196819333651e-08, + "loss": 0.77370358, + "memory(GiB)": 138.1, + "step": 83250, + "train_speed(iter/s)": 0.200716 + }, + { + "acc": 0.77597246, + "epoch": 1.942518846877324, + "grad_norm": 7.6875, + "learning_rate": 2.1616141277607804e-08, + "loss": 0.80707121, + "memory(GiB)": 138.1, + "step": 83260, + "train_speed(iter/s)": 0.200729 + }, + { + "acc": 0.79199247, + "epoch": 1.9427521544496127, + "grad_norm": 5.6875, + "learning_rate": 2.1441025024728225e-08, + "loss": 0.74981995, + "memory(GiB)": 138.1, + "step": 83270, + "train_speed(iter/s)": 0.200741 + }, + { + "acc": 0.77484131, + "epoch": 1.9429854620219018, + "grad_norm": 5.78125, + "learning_rate": 2.1266619459696102e-08, + "loss": 0.81854687, + "memory(GiB)": 138.1, + "step": 83280, + "train_speed(iter/s)": 0.200753 + }, + { + "acc": 0.78900671, + "epoch": 1.9432187695941905, + "grad_norm": 5.96875, + "learning_rate": 2.1092924607408195e-08, + "loss": 0.76061816, + "memory(GiB)": 138.1, + "step": 83290, + "train_speed(iter/s)": 0.200766 + }, + { + "acc": 0.7663866, + "epoch": 1.9434520771664796, + "grad_norm": 5.0, + "learning_rate": 2.091994049265855e-08, + "loss": 0.85271311, + "memory(GiB)": 138.1, + "step": 83300, + "train_speed(iter/s)": 0.200778 + }, + { + "acc": 0.79749975, + "epoch": 1.9436853847387683, + "grad_norm": 6.40625, + "learning_rate": 2.074766714014076e-08, + "loss": 0.72964478, + "memory(GiB)": 138.1, + "step": 83310, + "train_speed(iter/s)": 0.200791 + }, + { + "acc": 0.77364941, + "epoch": 1.9439186923110574, + "grad_norm": 5.90625, + "learning_rate": 2.057610457444792e-08, + "loss": 0.82341738, + "memory(GiB)": 138.1, + "step": 83320, + "train_speed(iter/s)": 0.200804 + }, + { + "acc": 0.78957624, + "epoch": 1.944151999883346, + "grad_norm": 5.875, + "learning_rate": 2.0405252820068776e-08, + "loss": 0.76410146, + "memory(GiB)": 138.1, + "step": 83330, + "train_speed(iter/s)": 0.200817 + }, + { + "acc": 0.77884073, + "epoch": 1.9443853074556352, + "grad_norm": 3.71875, + "learning_rate": 2.0235111901393266e-08, + "loss": 0.77160797, + "memory(GiB)": 138.1, + "step": 83340, + "train_speed(iter/s)": 0.200828 + }, + { + "acc": 0.76247535, + "epoch": 1.9446186150279239, + "grad_norm": 5.84375, + "learning_rate": 2.0065681842709185e-08, + "loss": 0.86031971, + "memory(GiB)": 138.1, + "step": 83350, + "train_speed(iter/s)": 0.20084 + }, + { + "acc": 0.77901812, + "epoch": 1.944851922600213, + "grad_norm": 4.34375, + "learning_rate": 1.989696266820218e-08, + "loss": 0.79860239, + "memory(GiB)": 138.1, + "step": 83360, + "train_speed(iter/s)": 0.200853 + }, + { + "acc": 0.77776785, + "epoch": 1.9450852301725017, + "grad_norm": 5.84375, + "learning_rate": 1.972895440195688e-08, + "loss": 0.7847723, + "memory(GiB)": 138.1, + "step": 83370, + "train_speed(iter/s)": 0.200865 + }, + { + "acc": 0.79139214, + "epoch": 1.9453185377447908, + "grad_norm": 4.375, + "learning_rate": 1.9561657067956874e-08, + "loss": 0.75544043, + "memory(GiB)": 138.1, + "step": 83380, + "train_speed(iter/s)": 0.200877 + }, + { + "acc": 0.76193299, + "epoch": 1.9455518453170795, + "grad_norm": 5.375, + "learning_rate": 1.9395070690083617e-08, + "loss": 0.84404163, + "memory(GiB)": 138.1, + "step": 83390, + "train_speed(iter/s)": 0.200889 + }, + { + "acc": 0.78581266, + "epoch": 1.9457851528893686, + "grad_norm": 5.4375, + "learning_rate": 1.9229195292116976e-08, + "loss": 0.76360941, + "memory(GiB)": 138.1, + "step": 83400, + "train_speed(iter/s)": 0.200901 + }, + { + "acc": 0.79550982, + "epoch": 1.9460184604616573, + "grad_norm": 6.375, + "learning_rate": 1.9064030897735232e-08, + "loss": 0.73615227, + "memory(GiB)": 138.1, + "step": 83410, + "train_speed(iter/s)": 0.200913 + }, + { + "acc": 0.78520379, + "epoch": 1.9462517680339464, + "grad_norm": 4.21875, + "learning_rate": 1.8899577530516744e-08, + "loss": 0.77724113, + "memory(GiB)": 138.1, + "step": 83420, + "train_speed(iter/s)": 0.200926 + }, + { + "acc": 0.78606596, + "epoch": 1.946485075606235, + "grad_norm": 5.90625, + "learning_rate": 1.8735835213936627e-08, + "loss": 0.76647534, + "memory(GiB)": 138.1, + "step": 83430, + "train_speed(iter/s)": 0.200938 + }, + { + "acc": 0.78347244, + "epoch": 1.9467183831785242, + "grad_norm": 5.59375, + "learning_rate": 1.8572803971368404e-08, + "loss": 0.80083179, + "memory(GiB)": 138.1, + "step": 83440, + "train_speed(iter/s)": 0.200951 + }, + { + "acc": 0.77931328, + "epoch": 1.9469516907508129, + "grad_norm": 8.6875, + "learning_rate": 1.841048382608568e-08, + "loss": 0.79017577, + "memory(GiB)": 138.1, + "step": 83450, + "train_speed(iter/s)": 0.200963 + }, + { + "acc": 0.77811432, + "epoch": 1.947184998323102, + "grad_norm": 5.8125, + "learning_rate": 1.8248874801259363e-08, + "loss": 0.79184465, + "memory(GiB)": 138.1, + "step": 83460, + "train_speed(iter/s)": 0.200976 + }, + { + "acc": 0.79206524, + "epoch": 1.9474183058953907, + "grad_norm": 5.625, + "learning_rate": 1.8087976919958783e-08, + "loss": 0.73837852, + "memory(GiB)": 138.1, + "step": 83470, + "train_speed(iter/s)": 0.200989 + }, + { + "acc": 0.78631954, + "epoch": 1.9476516134676796, + "grad_norm": 7.09375, + "learning_rate": 1.792779020515223e-08, + "loss": 0.76524954, + "memory(GiB)": 138.1, + "step": 83480, + "train_speed(iter/s)": 0.201002 + }, + { + "acc": 0.78290014, + "epoch": 1.9478849210399685, + "grad_norm": 6.65625, + "learning_rate": 1.776831467970641e-08, + "loss": 0.77942362, + "memory(GiB)": 138.1, + "step": 83490, + "train_speed(iter/s)": 0.201014 + }, + { + "acc": 0.79167795, + "epoch": 1.9481182286122574, + "grad_norm": 5.4375, + "learning_rate": 1.7609550366385897e-08, + "loss": 0.74010186, + "memory(GiB)": 138.1, + "step": 83500, + "train_speed(iter/s)": 0.201027 + }, + { + "epoch": 1.9481182286122574, + "eval_acc": 0.7446927801054742, + "eval_loss": 0.8044261932373047, + "eval_runtime": 1271.6845, + "eval_samples_per_second": 28.302, + "eval_steps_per_second": 14.151, + "step": 83500 + }, + { + "acc": 0.77565384, + "epoch": 1.9483515361845463, + "grad_norm": 5.25, + "learning_rate": 1.7451497287855334e-08, + "loss": 0.77604108, + "memory(GiB)": 138.1, + "step": 83510, + "train_speed(iter/s)": 0.200414 + }, + { + "acc": 0.7824501, + "epoch": 1.9485848437568352, + "grad_norm": 3.953125, + "learning_rate": 1.7294155466675567e-08, + "loss": 0.80296421, + "memory(GiB)": 138.1, + "step": 83520, + "train_speed(iter/s)": 0.200427 + }, + { + "acc": 0.7742733, + "epoch": 1.948818151329124, + "grad_norm": 4.5625, + "learning_rate": 1.7137524925307515e-08, + "loss": 0.83972244, + "memory(GiB)": 138.1, + "step": 83530, + "train_speed(iter/s)": 0.20044 + }, + { + "acc": 0.76697764, + "epoch": 1.949051458901413, + "grad_norm": 6.34375, + "learning_rate": 1.698160568611107e-08, + "loss": 0.82968264, + "memory(GiB)": 138.1, + "step": 83540, + "train_speed(iter/s)": 0.200453 + }, + { + "acc": 0.77781119, + "epoch": 1.9492847664737019, + "grad_norm": 5.5625, + "learning_rate": 1.6826397771342317e-08, + "loss": 0.80852528, + "memory(GiB)": 138.1, + "step": 83550, + "train_speed(iter/s)": 0.200466 + }, + { + "acc": 0.77753439, + "epoch": 1.9495180740459908, + "grad_norm": 7.4375, + "learning_rate": 1.6671901203157425e-08, + "loss": 0.83437614, + "memory(GiB)": 138.1, + "step": 83560, + "train_speed(iter/s)": 0.200479 + }, + { + "acc": 0.79008942, + "epoch": 1.9497513816182797, + "grad_norm": 5.875, + "learning_rate": 1.6518116003611527e-08, + "loss": 0.74177456, + "memory(GiB)": 138.1, + "step": 83570, + "train_speed(iter/s)": 0.200492 + }, + { + "acc": 0.80940037, + "epoch": 1.9499846891905686, + "grad_norm": 10.3125, + "learning_rate": 1.636504219465651e-08, + "loss": 0.66792626, + "memory(GiB)": 138.1, + "step": 83580, + "train_speed(iter/s)": 0.200505 + }, + { + "acc": 0.76242118, + "epoch": 1.9502179967628575, + "grad_norm": 5.65625, + "learning_rate": 1.6212679798143782e-08, + "loss": 0.85964575, + "memory(GiB)": 138.1, + "step": 83590, + "train_speed(iter/s)": 0.200517 + }, + { + "acc": 0.7804348, + "epoch": 1.9504513043351464, + "grad_norm": 5.96875, + "learning_rate": 1.6061028835823723e-08, + "loss": 0.78724127, + "memory(GiB)": 138.1, + "step": 83600, + "train_speed(iter/s)": 0.20053 + }, + { + "acc": 0.76076965, + "epoch": 1.9506846119074353, + "grad_norm": 6.0625, + "learning_rate": 1.591008932934346e-08, + "loss": 0.86802006, + "memory(GiB)": 138.1, + "step": 83610, + "train_speed(iter/s)": 0.200542 + }, + { + "acc": 0.77423449, + "epoch": 1.9509179194797241, + "grad_norm": 5.6875, + "learning_rate": 1.57598613002502e-08, + "loss": 0.81039219, + "memory(GiB)": 138.1, + "step": 83620, + "train_speed(iter/s)": 0.200555 + }, + { + "acc": 0.76987734, + "epoch": 1.951151227052013, + "grad_norm": 6.78125, + "learning_rate": 1.5610344769989017e-08, + "loss": 0.83137999, + "memory(GiB)": 138.1, + "step": 83630, + "train_speed(iter/s)": 0.200567 + }, + { + "acc": 0.76419888, + "epoch": 1.951384534624302, + "grad_norm": 5.84375, + "learning_rate": 1.5461539759902832e-08, + "loss": 0.84437933, + "memory(GiB)": 138.1, + "step": 83640, + "train_speed(iter/s)": 0.200579 + }, + { + "acc": 0.79972954, + "epoch": 1.9516178421965908, + "grad_norm": 4.40625, + "learning_rate": 1.5313446291234104e-08, + "loss": 0.706955, + "memory(GiB)": 138.1, + "step": 83650, + "train_speed(iter/s)": 0.200591 + }, + { + "acc": 0.79801159, + "epoch": 1.9518511497688795, + "grad_norm": 4.5, + "learning_rate": 1.516606438512258e-08, + "loss": 0.73285971, + "memory(GiB)": 138.1, + "step": 83660, + "train_speed(iter/s)": 0.200604 + }, + { + "acc": 0.77286844, + "epoch": 1.9520844573411686, + "grad_norm": 6.5, + "learning_rate": 1.501939406260755e-08, + "loss": 0.82769775, + "memory(GiB)": 138.1, + "step": 83670, + "train_speed(iter/s)": 0.200617 + }, + { + "acc": 0.78321648, + "epoch": 1.9523177649134573, + "grad_norm": 5.28125, + "learning_rate": 1.4873435344625597e-08, + "loss": 0.78056812, + "memory(GiB)": 138.1, + "step": 83680, + "train_speed(iter/s)": 0.200629 + }, + { + "acc": 0.77889185, + "epoch": 1.9525510724857464, + "grad_norm": 4.09375, + "learning_rate": 1.4728188252012832e-08, + "loss": 0.79473181, + "memory(GiB)": 138.1, + "step": 83690, + "train_speed(iter/s)": 0.200642 + }, + { + "acc": 0.78766642, + "epoch": 1.9527843800580351, + "grad_norm": 4.9375, + "learning_rate": 1.4583652805503223e-08, + "loss": 0.76746063, + "memory(GiB)": 138.1, + "step": 83700, + "train_speed(iter/s)": 0.200654 + }, + { + "acc": 0.79879084, + "epoch": 1.9530176876303242, + "grad_norm": 7.625, + "learning_rate": 1.4439829025728047e-08, + "loss": 0.72144575, + "memory(GiB)": 138.1, + "step": 83710, + "train_speed(iter/s)": 0.200667 + }, + { + "acc": 0.78694019, + "epoch": 1.953250995202613, + "grad_norm": 8.3125, + "learning_rate": 1.4296716933219768e-08, + "loss": 0.7669951, + "memory(GiB)": 138.1, + "step": 83720, + "train_speed(iter/s)": 0.200679 + }, + { + "acc": 0.77646313, + "epoch": 1.953484302774902, + "grad_norm": 5.09375, + "learning_rate": 1.4154316548406488e-08, + "loss": 0.8079896, + "memory(GiB)": 138.1, + "step": 83730, + "train_speed(iter/s)": 0.200692 + }, + { + "acc": 0.77782555, + "epoch": 1.9537176103471907, + "grad_norm": 5.625, + "learning_rate": 1.4012627891615838e-08, + "loss": 0.79529219, + "memory(GiB)": 138.1, + "step": 83740, + "train_speed(iter/s)": 0.200704 + }, + { + "acc": 0.77345643, + "epoch": 1.9539509179194798, + "grad_norm": 7.25, + "learning_rate": 1.3871650983074414e-08, + "loss": 0.81681423, + "memory(GiB)": 138.1, + "step": 83750, + "train_speed(iter/s)": 0.200717 + }, + { + "acc": 0.77701006, + "epoch": 1.9541842254917685, + "grad_norm": 5.625, + "learning_rate": 1.3731385842906675e-08, + "loss": 0.79945378, + "memory(GiB)": 138.1, + "step": 83760, + "train_speed(iter/s)": 0.200729 + }, + { + "acc": 0.78949852, + "epoch": 1.9544175330640576, + "grad_norm": 6.1875, + "learning_rate": 1.3591832491134382e-08, + "loss": 0.75286026, + "memory(GiB)": 138.1, + "step": 83770, + "train_speed(iter/s)": 0.200742 + }, + { + "acc": 0.77981601, + "epoch": 1.9546508406363463, + "grad_norm": 4.1875, + "learning_rate": 1.3452990947679933e-08, + "loss": 0.78779631, + "memory(GiB)": 138.1, + "step": 83780, + "train_speed(iter/s)": 0.200753 + }, + { + "acc": 0.77697887, + "epoch": 1.9548841482086354, + "grad_norm": 6.28125, + "learning_rate": 1.3314861232362475e-08, + "loss": 0.8031744, + "memory(GiB)": 138.1, + "step": 83790, + "train_speed(iter/s)": 0.200766 + }, + { + "acc": 0.7862566, + "epoch": 1.955117455780924, + "grad_norm": 7.625, + "learning_rate": 1.3177443364899567e-08, + "loss": 0.76605148, + "memory(GiB)": 138.1, + "step": 83800, + "train_speed(iter/s)": 0.200779 + }, + { + "acc": 0.78782101, + "epoch": 1.9553507633532132, + "grad_norm": 5.6875, + "learning_rate": 1.3040737364908295e-08, + "loss": 0.75679483, + "memory(GiB)": 138.1, + "step": 83810, + "train_speed(iter/s)": 0.200791 + }, + { + "acc": 0.77958326, + "epoch": 1.955584070925502, + "grad_norm": 5.5, + "learning_rate": 1.2904743251902496e-08, + "loss": 0.79298873, + "memory(GiB)": 138.1, + "step": 83820, + "train_speed(iter/s)": 0.200804 + }, + { + "acc": 0.77609186, + "epoch": 1.955817378497791, + "grad_norm": 6.21875, + "learning_rate": 1.2769461045296083e-08, + "loss": 0.81230192, + "memory(GiB)": 138.1, + "step": 83830, + "train_speed(iter/s)": 0.200816 + }, + { + "acc": 0.78773966, + "epoch": 1.9560506860700797, + "grad_norm": 5.75, + "learning_rate": 1.2634890764400832e-08, + "loss": 0.78044167, + "memory(GiB)": 138.1, + "step": 83840, + "train_speed(iter/s)": 0.200828 + }, + { + "acc": 0.77997303, + "epoch": 1.9562839936423686, + "grad_norm": 6.375, + "learning_rate": 1.2501032428425264e-08, + "loss": 0.80062265, + "memory(GiB)": 138.1, + "step": 83850, + "train_speed(iter/s)": 0.200841 + }, + { + "acc": 0.77704172, + "epoch": 1.9565173012146575, + "grad_norm": 6.25, + "learning_rate": 1.2367886056479095e-08, + "loss": 0.79110088, + "memory(GiB)": 138.1, + "step": 83860, + "train_speed(iter/s)": 0.200853 + }, + { + "acc": 0.77524996, + "epoch": 1.9567506087869464, + "grad_norm": 4.84375, + "learning_rate": 1.2235451667567678e-08, + "loss": 0.80665646, + "memory(GiB)": 138.1, + "step": 83870, + "train_speed(iter/s)": 0.200865 + }, + { + "acc": 0.80717812, + "epoch": 1.9569839163592353, + "grad_norm": 4.59375, + "learning_rate": 1.2103729280596998e-08, + "loss": 0.69145899, + "memory(GiB)": 138.1, + "step": 83880, + "train_speed(iter/s)": 0.200877 + }, + { + "acc": 0.7651278, + "epoch": 1.9572172239315242, + "grad_norm": 5.46875, + "learning_rate": 1.1972718914370351e-08, + "loss": 0.84759274, + "memory(GiB)": 138.1, + "step": 83890, + "train_speed(iter/s)": 0.20089 + }, + { + "acc": 0.78047352, + "epoch": 1.957450531503813, + "grad_norm": 4.625, + "learning_rate": 1.1842420587588333e-08, + "loss": 0.80353117, + "memory(GiB)": 138.1, + "step": 83900, + "train_speed(iter/s)": 0.200902 + }, + { + "acc": 0.77811413, + "epoch": 1.957683839076102, + "grad_norm": 5.375, + "learning_rate": 1.1712834318852173e-08, + "loss": 0.79200544, + "memory(GiB)": 138.1, + "step": 83910, + "train_speed(iter/s)": 0.200914 + }, + { + "acc": 0.79376502, + "epoch": 1.957917146648391, + "grad_norm": 6.0, + "learning_rate": 1.15839601266593e-08, + "loss": 0.74129534, + "memory(GiB)": 138.1, + "step": 83920, + "train_speed(iter/s)": 0.200927 + }, + { + "acc": 0.78470793, + "epoch": 1.9581504542206798, + "grad_norm": 5.5, + "learning_rate": 1.1455798029407772e-08, + "loss": 0.76864552, + "memory(GiB)": 138.1, + "step": 83930, + "train_speed(iter/s)": 0.200939 + }, + { + "acc": 0.78365679, + "epoch": 1.9583837617929687, + "grad_norm": 3.96875, + "learning_rate": 1.132834804539129e-08, + "loss": 0.75253015, + "memory(GiB)": 138.1, + "step": 83940, + "train_speed(iter/s)": 0.200952 + }, + { + "acc": 0.78898249, + "epoch": 1.9586170693652576, + "grad_norm": 4.96875, + "learning_rate": 1.120161019280419e-08, + "loss": 0.74453573, + "memory(GiB)": 138.1, + "step": 83950, + "train_speed(iter/s)": 0.200964 + }, + { + "acc": 0.77433839, + "epoch": 1.9588503769375465, + "grad_norm": 6.5, + "learning_rate": 1.1075584489737557e-08, + "loss": 0.82852116, + "memory(GiB)": 138.1, + "step": 83960, + "train_speed(iter/s)": 0.200977 + }, + { + "acc": 0.77746682, + "epoch": 1.9590836845098354, + "grad_norm": 4.78125, + "learning_rate": 1.0950270954181997e-08, + "loss": 0.79795814, + "memory(GiB)": 138.1, + "step": 83970, + "train_speed(iter/s)": 0.200989 + }, + { + "acc": 0.77838507, + "epoch": 1.9593169920821243, + "grad_norm": 6.46875, + "learning_rate": 1.0825669604026534e-08, + "loss": 0.79831772, + "memory(GiB)": 138.1, + "step": 83980, + "train_speed(iter/s)": 0.201001 + }, + { + "acc": 0.78435926, + "epoch": 1.9595502996544132, + "grad_norm": 6.28125, + "learning_rate": 1.0701780457056943e-08, + "loss": 0.79196739, + "memory(GiB)": 138.1, + "step": 83990, + "train_speed(iter/s)": 0.201014 + }, + { + "acc": 0.78845625, + "epoch": 1.959783607226702, + "grad_norm": 4.9375, + "learning_rate": 1.0578603530958519e-08, + "loss": 0.76660895, + "memory(GiB)": 138.1, + "step": 84000, + "train_speed(iter/s)": 0.201026 + }, + { + "epoch": 1.959783607226702, + "eval_acc": 0.7447569979426271, + "eval_loss": 0.8044305443763733, + "eval_runtime": 1271.2957, + "eval_samples_per_second": 28.31, + "eval_steps_per_second": 14.156, + "step": 84000 + }, + { + "acc": 0.77818713, + "epoch": 1.960016914798991, + "grad_norm": 6.59375, + "learning_rate": 1.0456138843315534e-08, + "loss": 0.80791035, + "memory(GiB)": 138.1, + "step": 84010, + "train_speed(iter/s)": 0.200416 + }, + { + "acc": 0.7748589, + "epoch": 1.9602502223712799, + "grad_norm": 6.375, + "learning_rate": 1.0334386411609e-08, + "loss": 0.7919076, + "memory(GiB)": 138.1, + "step": 84020, + "train_speed(iter/s)": 0.200429 + }, + { + "acc": 0.77099795, + "epoch": 1.9604835299435688, + "grad_norm": 5.40625, + "learning_rate": 1.0213346253219459e-08, + "loss": 0.7921875, + "memory(GiB)": 138.1, + "step": 84030, + "train_speed(iter/s)": 0.200441 + }, + { + "acc": 0.79554253, + "epoch": 1.9607168375158577, + "grad_norm": 5.4375, + "learning_rate": 1.0093018385424757e-08, + "loss": 0.73114367, + "memory(GiB)": 138.1, + "step": 84040, + "train_speed(iter/s)": 0.200454 + }, + { + "acc": 0.80636587, + "epoch": 1.9609501450881464, + "grad_norm": 6.21875, + "learning_rate": 9.973402825402823e-09, + "loss": 0.70025764, + "memory(GiB)": 138.1, + "step": 84050, + "train_speed(iter/s)": 0.200466 + }, + { + "acc": 0.79211783, + "epoch": 1.9611834526604355, + "grad_norm": 7.6875, + "learning_rate": 9.854499590227217e-09, + "loss": 0.72471952, + "memory(GiB)": 138.1, + "step": 84060, + "train_speed(iter/s)": 0.200478 + }, + { + "acc": 0.77074847, + "epoch": 1.9614167602327242, + "grad_norm": 4.5625, + "learning_rate": 9.736308696872698e-09, + "loss": 0.8087081, + "memory(GiB)": 138.1, + "step": 84070, + "train_speed(iter/s)": 0.200491 + }, + { + "acc": 0.77772079, + "epoch": 1.9616500678050133, + "grad_norm": 6.1875, + "learning_rate": 9.618830162210213e-09, + "loss": 0.80092773, + "memory(GiB)": 138.1, + "step": 84080, + "train_speed(iter/s)": 0.200502 + }, + { + "acc": 0.78463898, + "epoch": 1.961883375377302, + "grad_norm": 4.46875, + "learning_rate": 9.502064003010236e-09, + "loss": 0.78809996, + "memory(GiB)": 138.1, + "step": 84090, + "train_speed(iter/s)": 0.200515 + }, + { + "acc": 0.7911252, + "epoch": 1.962116682949591, + "grad_norm": 5.3125, + "learning_rate": 9.386010235940546e-09, + "loss": 0.76758842, + "memory(GiB)": 138.1, + "step": 84100, + "train_speed(iter/s)": 0.200527 + }, + { + "acc": 0.76237383, + "epoch": 1.9623499905218798, + "grad_norm": 7.28125, + "learning_rate": 9.270668877568444e-09, + "loss": 0.84567051, + "memory(GiB)": 138.1, + "step": 84110, + "train_speed(iter/s)": 0.200539 + }, + { + "acc": 0.77878971, + "epoch": 1.9625832980941689, + "grad_norm": 5.25, + "learning_rate": 9.156039944358542e-09, + "loss": 0.79879036, + "memory(GiB)": 138.1, + "step": 84120, + "train_speed(iter/s)": 0.200552 + }, + { + "acc": 0.76202507, + "epoch": 1.9628166056664575, + "grad_norm": 4.875, + "learning_rate": 9.042123452673856e-09, + "loss": 0.8723732, + "memory(GiB)": 138.1, + "step": 84130, + "train_speed(iter/s)": 0.200564 + }, + { + "acc": 0.77010603, + "epoch": 1.9630499132387467, + "grad_norm": 4.96875, + "learning_rate": 8.928919418776382e-09, + "loss": 0.83259068, + "memory(GiB)": 138.1, + "step": 84140, + "train_speed(iter/s)": 0.200577 + }, + { + "acc": 0.77653313, + "epoch": 1.9632832208110353, + "grad_norm": 5.65625, + "learning_rate": 8.816427858825416e-09, + "loss": 0.79427109, + "memory(GiB)": 138.1, + "step": 84150, + "train_speed(iter/s)": 0.200589 + }, + { + "acc": 0.79803519, + "epoch": 1.9635165283833245, + "grad_norm": 5.78125, + "learning_rate": 8.704648788879777e-09, + "loss": 0.73506384, + "memory(GiB)": 138.1, + "step": 84160, + "train_speed(iter/s)": 0.200601 + }, + { + "acc": 0.76183743, + "epoch": 1.9637498359556131, + "grad_norm": 4.375, + "learning_rate": 8.593582224895036e-09, + "loss": 0.85956383, + "memory(GiB)": 138.1, + "step": 84170, + "train_speed(iter/s)": 0.200613 + }, + { + "acc": 0.76150913, + "epoch": 1.9639831435279023, + "grad_norm": 8.1875, + "learning_rate": 8.483228182726843e-09, + "loss": 0.85972614, + "memory(GiB)": 138.1, + "step": 84180, + "train_speed(iter/s)": 0.200625 + }, + { + "acc": 0.79041133, + "epoch": 1.964216451100191, + "grad_norm": 4.71875, + "learning_rate": 8.3735866781276e-09, + "loss": 0.75700512, + "memory(GiB)": 138.1, + "step": 84190, + "train_speed(iter/s)": 0.200637 + }, + { + "acc": 0.76137877, + "epoch": 1.96444975867248, + "grad_norm": 5.78125, + "learning_rate": 8.264657726749226e-09, + "loss": 0.84933109, + "memory(GiB)": 138.1, + "step": 84200, + "train_speed(iter/s)": 0.200649 + }, + { + "acc": 0.782689, + "epoch": 1.9646830662447687, + "grad_norm": 4.1875, + "learning_rate": 8.156441344140398e-09, + "loss": 0.7834971, + "memory(GiB)": 138.1, + "step": 84210, + "train_speed(iter/s)": 0.200661 + }, + { + "acc": 0.78736172, + "epoch": 1.9649163738170579, + "grad_norm": 7.0625, + "learning_rate": 8.048937545749313e-09, + "loss": 0.74641953, + "memory(GiB)": 138.1, + "step": 84220, + "train_speed(iter/s)": 0.200674 + }, + { + "acc": 0.77960968, + "epoch": 1.9651496813893465, + "grad_norm": 6.65625, + "learning_rate": 7.942146346922586e-09, + "loss": 0.81022434, + "memory(GiB)": 138.1, + "step": 84230, + "train_speed(iter/s)": 0.200687 + }, + { + "acc": 0.75209432, + "epoch": 1.9653829889616354, + "grad_norm": 4.125, + "learning_rate": 7.83606776290413e-09, + "loss": 0.89543324, + "memory(GiB)": 138.1, + "step": 84240, + "train_speed(iter/s)": 0.200699 + }, + { + "acc": 0.79684596, + "epoch": 1.9656162965339243, + "grad_norm": 4.875, + "learning_rate": 7.730701808836837e-09, + "loss": 0.72995892, + "memory(GiB)": 138.1, + "step": 84250, + "train_speed(iter/s)": 0.200712 + }, + { + "acc": 0.78651543, + "epoch": 1.9658496041062132, + "grad_norm": 5.9375, + "learning_rate": 7.626048499761452e-09, + "loss": 0.74779282, + "memory(GiB)": 138.1, + "step": 84260, + "train_speed(iter/s)": 0.200723 + }, + { + "acc": 0.7833818, + "epoch": 1.9660829116785021, + "grad_norm": 6.28125, + "learning_rate": 7.522107850617689e-09, + "loss": 0.78160305, + "memory(GiB)": 138.1, + "step": 84270, + "train_speed(iter/s)": 0.200736 + }, + { + "acc": 0.80099115, + "epoch": 1.966316219250791, + "grad_norm": 3.59375, + "learning_rate": 7.418879876242014e-09, + "loss": 0.70863304, + "memory(GiB)": 138.1, + "step": 84280, + "train_speed(iter/s)": 0.200749 + }, + { + "acc": 0.76215916, + "epoch": 1.96654952682308, + "grad_norm": 6.40625, + "learning_rate": 7.316364591371527e-09, + "loss": 0.85770187, + "memory(GiB)": 138.1, + "step": 84290, + "train_speed(iter/s)": 0.20076 + }, + { + "acc": 0.79879122, + "epoch": 1.9667828343953688, + "grad_norm": 4.9375, + "learning_rate": 7.214562010639525e-09, + "loss": 0.73097854, + "memory(GiB)": 138.1, + "step": 84300, + "train_speed(iter/s)": 0.200772 + }, + { + "acc": 0.77161856, + "epoch": 1.9670161419676577, + "grad_norm": 6.625, + "learning_rate": 7.113472148578271e-09, + "loss": 0.85240879, + "memory(GiB)": 138.1, + "step": 84310, + "train_speed(iter/s)": 0.200784 + }, + { + "acc": 0.78391027, + "epoch": 1.9672494495399466, + "grad_norm": 4.625, + "learning_rate": 7.013095019618443e-09, + "loss": 0.76578526, + "memory(GiB)": 138.1, + "step": 84320, + "train_speed(iter/s)": 0.200796 + }, + { + "acc": 0.76423545, + "epoch": 1.9674827571122355, + "grad_norm": 6.28125, + "learning_rate": 6.9134306380885805e-09, + "loss": 0.87942724, + "memory(GiB)": 138.1, + "step": 84330, + "train_speed(iter/s)": 0.200808 + }, + { + "acc": 0.76918364, + "epoch": 1.9677160646845244, + "grad_norm": 8.1875, + "learning_rate": 6.814479018216192e-09, + "loss": 0.83055038, + "memory(GiB)": 138.1, + "step": 84340, + "train_speed(iter/s)": 0.200821 + }, + { + "acc": 0.76374469, + "epoch": 1.9679493722568133, + "grad_norm": 4.65625, + "learning_rate": 6.7162401741266425e-09, + "loss": 0.86087818, + "memory(GiB)": 138.1, + "step": 84350, + "train_speed(iter/s)": 0.200833 + }, + { + "acc": 0.76832781, + "epoch": 1.9681826798291022, + "grad_norm": 6.25, + "learning_rate": 6.6187141198431615e-09, + "loss": 0.83810291, + "memory(GiB)": 138.1, + "step": 84360, + "train_speed(iter/s)": 0.200845 + }, + { + "acc": 0.76127386, + "epoch": 1.9684159874013911, + "grad_norm": 7.15625, + "learning_rate": 6.52190086928739e-09, + "loss": 0.86279221, + "memory(GiB)": 138.1, + "step": 84370, + "train_speed(iter/s)": 0.200857 + }, + { + "acc": 0.78920145, + "epoch": 1.96864929497368, + "grad_norm": 5.59375, + "learning_rate": 6.425800436279383e-09, + "loss": 0.74230909, + "memory(GiB)": 138.1, + "step": 84380, + "train_speed(iter/s)": 0.20087 + }, + { + "acc": 0.77091441, + "epoch": 1.968882602545969, + "grad_norm": 5.125, + "learning_rate": 6.330412834538169e-09, + "loss": 0.83819075, + "memory(GiB)": 138.1, + "step": 84390, + "train_speed(iter/s)": 0.200882 + }, + { + "acc": 0.77305069, + "epoch": 1.9691159101182578, + "grad_norm": 5.1875, + "learning_rate": 6.235738077680076e-09, + "loss": 0.84126186, + "memory(GiB)": 138.1, + "step": 84400, + "train_speed(iter/s)": 0.200894 + }, + { + "acc": 0.7800487, + "epoch": 1.9693492176905467, + "grad_norm": 4.78125, + "learning_rate": 6.141776179219294e-09, + "loss": 0.7732388, + "memory(GiB)": 138.1, + "step": 84410, + "train_speed(iter/s)": 0.200906 + }, + { + "acc": 0.79444857, + "epoch": 1.9695825252628356, + "grad_norm": 10.0625, + "learning_rate": 6.048527152569539e-09, + "loss": 0.73840342, + "memory(GiB)": 138.1, + "step": 84420, + "train_speed(iter/s)": 0.200918 + }, + { + "acc": 0.78184214, + "epoch": 1.9698158328351245, + "grad_norm": 5.1875, + "learning_rate": 5.955991011041273e-09, + "loss": 0.80885363, + "memory(GiB)": 138.1, + "step": 84430, + "train_speed(iter/s)": 0.200931 + }, + { + "acc": 0.76946917, + "epoch": 1.9700491404074132, + "grad_norm": 11.5, + "learning_rate": 5.864167767845041e-09, + "loss": 0.82882366, + "memory(GiB)": 138.1, + "step": 84440, + "train_speed(iter/s)": 0.200943 + }, + { + "acc": 0.79174566, + "epoch": 1.9702824479797023, + "grad_norm": 8.375, + "learning_rate": 5.773057436087581e-09, + "loss": 0.72687259, + "memory(GiB)": 138.1, + "step": 84450, + "train_speed(iter/s)": 0.200956 + }, + { + "acc": 0.77029505, + "epoch": 1.970515755551991, + "grad_norm": 5.375, + "learning_rate": 5.6826600287757105e-09, + "loss": 0.83553743, + "memory(GiB)": 138.1, + "step": 84460, + "train_speed(iter/s)": 0.200968 + }, + { + "acc": 0.81181126, + "epoch": 1.97074906312428, + "grad_norm": 6.03125, + "learning_rate": 5.592975558813551e-09, + "loss": 0.68757429, + "memory(GiB)": 138.1, + "step": 84470, + "train_speed(iter/s)": 0.20098 + }, + { + "acc": 0.7760201, + "epoch": 1.9709823706965688, + "grad_norm": 4.9375, + "learning_rate": 5.504004039002531e-09, + "loss": 0.78468761, + "memory(GiB)": 138.1, + "step": 84480, + "train_speed(iter/s)": 0.200993 + }, + { + "acc": 0.78242598, + "epoch": 1.971215678268858, + "grad_norm": 6.6875, + "learning_rate": 5.41574548204471e-09, + "loss": 0.77616792, + "memory(GiB)": 138.1, + "step": 84490, + "train_speed(iter/s)": 0.201005 + }, + { + "acc": 0.77654591, + "epoch": 1.9714489858411466, + "grad_norm": 8.1875, + "learning_rate": 5.32819990053779e-09, + "loss": 0.82183952, + "memory(GiB)": 138.1, + "step": 84500, + "train_speed(iter/s)": 0.201017 + }, + { + "epoch": 1.9714489858411466, + "eval_acc": 0.7447488305867797, + "eval_loss": 0.8044179677963257, + "eval_runtime": 1271.4413, + "eval_samples_per_second": 28.307, + "eval_steps_per_second": 14.154, + "step": 84500 + }, + { + "acc": 0.77603683, + "epoch": 1.9716822934134357, + "grad_norm": 4.75, + "learning_rate": 5.24136730697955e-09, + "loss": 0.79193287, + "memory(GiB)": 138.1, + "step": 84510, + "train_speed(iter/s)": 0.200412 + }, + { + "acc": 0.7718574, + "epoch": 1.9719156009857244, + "grad_norm": 5.78125, + "learning_rate": 5.155247713765077e-09, + "loss": 0.82882061, + "memory(GiB)": 138.1, + "step": 84520, + "train_speed(iter/s)": 0.200425 + }, + { + "acc": 0.76282473, + "epoch": 1.9721489085580135, + "grad_norm": 4.0625, + "learning_rate": 5.069841133187869e-09, + "loss": 0.83978472, + "memory(GiB)": 138.1, + "step": 84530, + "train_speed(iter/s)": 0.200437 + }, + { + "acc": 0.79579945, + "epoch": 1.9723822161303022, + "grad_norm": 4.125, + "learning_rate": 4.985147577439842e-09, + "loss": 0.73812609, + "memory(GiB)": 138.1, + "step": 84540, + "train_speed(iter/s)": 0.200449 + }, + { + "acc": 0.7669486, + "epoch": 1.9726155237025913, + "grad_norm": 6.875, + "learning_rate": 4.901167058610767e-09, + "loss": 0.84001894, + "memory(GiB)": 138.1, + "step": 84550, + "train_speed(iter/s)": 0.200461 + }, + { + "acc": 0.78317232, + "epoch": 1.97284883127488, + "grad_norm": 11.375, + "learning_rate": 4.8178995886893895e-09, + "loss": 0.76386995, + "memory(GiB)": 138.1, + "step": 84560, + "train_speed(iter/s)": 0.200474 + }, + { + "acc": 0.78392029, + "epoch": 1.973082138847169, + "grad_norm": 4.90625, + "learning_rate": 4.735345179561757e-09, + "loss": 0.77154541, + "memory(GiB)": 138.1, + "step": 84570, + "train_speed(iter/s)": 0.200485 + }, + { + "acc": 0.77048054, + "epoch": 1.9733154464194578, + "grad_norm": 5.96875, + "learning_rate": 4.65350384301233e-09, + "loss": 0.82347736, + "memory(GiB)": 138.1, + "step": 84580, + "train_speed(iter/s)": 0.200498 + }, + { + "acc": 0.7574182, + "epoch": 1.973548753991747, + "grad_norm": 4.5625, + "learning_rate": 4.572375590723988e-09, + "loss": 0.88755779, + "memory(GiB)": 138.1, + "step": 84590, + "train_speed(iter/s)": 0.20051 + }, + { + "acc": 0.786063, + "epoch": 1.9737820615640356, + "grad_norm": 5.375, + "learning_rate": 4.49196043427802e-09, + "loss": 0.76739092, + "memory(GiB)": 138.1, + "step": 84600, + "train_speed(iter/s)": 0.200524 + }, + { + "acc": 0.75360541, + "epoch": 1.9740153691363247, + "grad_norm": 6.4375, + "learning_rate": 4.4122583851535785e-09, + "loss": 0.87514687, + "memory(GiB)": 138.1, + "step": 84610, + "train_speed(iter/s)": 0.200535 + }, + { + "acc": 0.78751106, + "epoch": 1.9742486767086134, + "grad_norm": 5.6875, + "learning_rate": 4.3332694547276736e-09, + "loss": 0.75879245, + "memory(GiB)": 138.1, + "step": 84620, + "train_speed(iter/s)": 0.200548 + }, + { + "acc": 0.77941394, + "epoch": 1.9744819842809023, + "grad_norm": 5.25, + "learning_rate": 4.254993654276285e-09, + "loss": 0.7969101, + "memory(GiB)": 138.1, + "step": 84630, + "train_speed(iter/s)": 0.20056 + }, + { + "acc": 0.78160868, + "epoch": 1.9747152918531912, + "grad_norm": 5.25, + "learning_rate": 4.177430994973808e-09, + "loss": 0.76313763, + "memory(GiB)": 138.1, + "step": 84640, + "train_speed(iter/s)": 0.200572 + }, + { + "acc": 0.78432083, + "epoch": 1.97494859942548, + "grad_norm": 7.34375, + "learning_rate": 4.1005814878913865e-09, + "loss": 0.78618231, + "memory(GiB)": 138.1, + "step": 84650, + "train_speed(iter/s)": 0.200584 + }, + { + "acc": 0.77199898, + "epoch": 1.975181906997769, + "grad_norm": 4.75, + "learning_rate": 4.024445143999689e-09, + "loss": 0.81763344, + "memory(GiB)": 138.1, + "step": 84660, + "train_speed(iter/s)": 0.200596 + }, + { + "acc": 0.77098942, + "epoch": 1.9754152145700579, + "grad_norm": 4.75, + "learning_rate": 3.9490219741672445e-09, + "loss": 0.82059212, + "memory(GiB)": 138.1, + "step": 84670, + "train_speed(iter/s)": 0.200608 + }, + { + "acc": 0.78812075, + "epoch": 1.9756485221423468, + "grad_norm": 6.84375, + "learning_rate": 3.87431198916044e-09, + "loss": 0.75257215, + "memory(GiB)": 138.1, + "step": 84680, + "train_speed(iter/s)": 0.20062 + }, + { + "acc": 0.78079576, + "epoch": 1.9758818297146357, + "grad_norm": 4.8125, + "learning_rate": 3.800315199644078e-09, + "loss": 0.78688898, + "memory(GiB)": 138.1, + "step": 84690, + "train_speed(iter/s)": 0.200632 + }, + { + "acc": 0.78553157, + "epoch": 1.9761151372869246, + "grad_norm": 4.34375, + "learning_rate": 3.727031616181376e-09, + "loss": 0.76703348, + "memory(GiB)": 138.1, + "step": 84700, + "train_speed(iter/s)": 0.200644 + }, + { + "acc": 0.78337941, + "epoch": 1.9763484448592135, + "grad_norm": 6.03125, + "learning_rate": 3.6544612492334097e-09, + "loss": 0.76780548, + "memory(GiB)": 138.1, + "step": 84710, + "train_speed(iter/s)": 0.200655 + }, + { + "acc": 0.77466536, + "epoch": 1.9765817524315024, + "grad_norm": 4.71875, + "learning_rate": 3.582604109159671e-09, + "loss": 0.80691957, + "memory(GiB)": 138.1, + "step": 84720, + "train_speed(iter/s)": 0.200668 + }, + { + "acc": 0.77310543, + "epoch": 1.9768150600037913, + "grad_norm": 4.375, + "learning_rate": 3.5114602062180646e-09, + "loss": 0.8129674, + "memory(GiB)": 138.1, + "step": 84730, + "train_speed(iter/s)": 0.200679 + }, + { + "acc": 0.77795973, + "epoch": 1.9770483675760802, + "grad_norm": 5.40625, + "learning_rate": 3.4410295505638013e-09, + "loss": 0.82256088, + "memory(GiB)": 138.1, + "step": 84740, + "train_speed(iter/s)": 0.200691 + }, + { + "acc": 0.78295822, + "epoch": 1.977281675148369, + "grad_norm": 5.90625, + "learning_rate": 3.3713121522510607e-09, + "loss": 0.79164433, + "memory(GiB)": 138.1, + "step": 84750, + "train_speed(iter/s)": 0.200703 + }, + { + "acc": 0.76906929, + "epoch": 1.977514982720658, + "grad_norm": 5.28125, + "learning_rate": 3.3023080212318814e-09, + "loss": 0.8484437, + "memory(GiB)": 138.1, + "step": 84760, + "train_speed(iter/s)": 0.200715 + }, + { + "acc": 0.80698633, + "epoch": 1.9777482902929469, + "grad_norm": 4.59375, + "learning_rate": 3.234017167356718e-09, + "loss": 0.68034239, + "memory(GiB)": 138.1, + "step": 84770, + "train_speed(iter/s)": 0.200727 + }, + { + "acc": 0.77349758, + "epoch": 1.9779815978652358, + "grad_norm": 7.46875, + "learning_rate": 3.1664396003738827e-09, + "loss": 0.82069979, + "memory(GiB)": 138.1, + "step": 84780, + "train_speed(iter/s)": 0.200739 + }, + { + "acc": 0.77547059, + "epoch": 1.9782149054375247, + "grad_norm": 5.28125, + "learning_rate": 3.0995753299306598e-09, + "loss": 0.80736742, + "memory(GiB)": 138.1, + "step": 84790, + "train_speed(iter/s)": 0.20075 + }, + { + "acc": 0.7942091, + "epoch": 1.9784482130098135, + "grad_norm": 5.15625, + "learning_rate": 3.0334243655710805e-09, + "loss": 0.7481492, + "memory(GiB)": 138.1, + "step": 84800, + "train_speed(iter/s)": 0.200762 + }, + { + "acc": 0.78032541, + "epoch": 1.9786815205821022, + "grad_norm": 5.5625, + "learning_rate": 2.9679867167387024e-09, + "loss": 0.79240317, + "memory(GiB)": 138.1, + "step": 84810, + "train_speed(iter/s)": 0.200775 + }, + { + "acc": 0.78130064, + "epoch": 1.9789148281543913, + "grad_norm": 5.125, + "learning_rate": 2.9032623927743864e-09, + "loss": 0.79106226, + "memory(GiB)": 138.1, + "step": 84820, + "train_speed(iter/s)": 0.200787 + }, + { + "acc": 0.77278428, + "epoch": 1.97914813572668, + "grad_norm": 5.53125, + "learning_rate": 2.839251402917964e-09, + "loss": 0.81999369, + "memory(GiB)": 138.1, + "step": 84830, + "train_speed(iter/s)": 0.200799 + }, + { + "acc": 0.78221178, + "epoch": 1.9793814432989691, + "grad_norm": 4.21875, + "learning_rate": 2.7759537563065706e-09, + "loss": 0.76479435, + "memory(GiB)": 138.1, + "step": 84840, + "train_speed(iter/s)": 0.200812 + }, + { + "acc": 0.7911087, + "epoch": 1.9796147508712578, + "grad_norm": 6.25, + "learning_rate": 2.7133694619763117e-09, + "loss": 0.75844345, + "memory(GiB)": 138.1, + "step": 84850, + "train_speed(iter/s)": 0.200824 + }, + { + "acc": 0.77396116, + "epoch": 1.979848058443547, + "grad_norm": 4.25, + "learning_rate": 2.6514985288605964e-09, + "loss": 0.83643398, + "memory(GiB)": 138.1, + "step": 84860, + "train_speed(iter/s)": 0.200836 + }, + { + "acc": 0.79459782, + "epoch": 1.9800813660158356, + "grad_norm": 10.875, + "learning_rate": 2.590340965791804e-09, + "loss": 0.74517059, + "memory(GiB)": 138.1, + "step": 84870, + "train_speed(iter/s)": 0.200848 + }, + { + "acc": 0.77023711, + "epoch": 1.9803146735881247, + "grad_norm": 5.40625, + "learning_rate": 2.529896781500174e-09, + "loss": 0.81559296, + "memory(GiB)": 138.1, + "step": 84880, + "train_speed(iter/s)": 0.200861 + }, + { + "acc": 0.75082717, + "epoch": 1.9805479811604134, + "grad_norm": 4.90625, + "learning_rate": 2.4701659846138036e-09, + "loss": 0.89876308, + "memory(GiB)": 138.1, + "step": 84890, + "train_speed(iter/s)": 0.200873 + }, + { + "acc": 0.7946104, + "epoch": 1.9807812887327025, + "grad_norm": 4.3125, + "learning_rate": 2.4111485836592065e-09, + "loss": 0.73043966, + "memory(GiB)": 138.1, + "step": 84900, + "train_speed(iter/s)": 0.200884 + }, + { + "acc": 0.76876898, + "epoch": 1.9810145963049912, + "grad_norm": 6.75, + "learning_rate": 2.3528445870618643e-09, + "loss": 0.83043594, + "memory(GiB)": 138.1, + "step": 84910, + "train_speed(iter/s)": 0.200896 + }, + { + "acc": 0.7821094, + "epoch": 1.9812479038772803, + "grad_norm": 5.71875, + "learning_rate": 2.2952540031440096e-09, + "loss": 0.79190273, + "memory(GiB)": 138.1, + "step": 84920, + "train_speed(iter/s)": 0.200909 + }, + { + "acc": 0.78448, + "epoch": 1.981481211449569, + "grad_norm": 5.21875, + "learning_rate": 2.2383768401268435e-09, + "loss": 0.78527408, + "memory(GiB)": 138.1, + "step": 84930, + "train_speed(iter/s)": 0.20092 + }, + { + "acc": 0.779772, + "epoch": 1.9817145190218581, + "grad_norm": 4.9375, + "learning_rate": 2.182213106129427e-09, + "loss": 0.78437147, + "memory(GiB)": 138.1, + "step": 84940, + "train_speed(iter/s)": 0.200933 + }, + { + "acc": 0.76009808, + "epoch": 1.9819478265941468, + "grad_norm": 4.625, + "learning_rate": 2.126762809169236e-09, + "loss": 0.84811478, + "memory(GiB)": 138.1, + "step": 84950, + "train_speed(iter/s)": 0.200945 + }, + { + "acc": 0.7567162, + "epoch": 1.982181134166436, + "grad_norm": 5.875, + "learning_rate": 2.072025957161605e-09, + "loss": 0.86951723, + "memory(GiB)": 138.1, + "step": 84960, + "train_speed(iter/s)": 0.200957 + }, + { + "acc": 0.78688498, + "epoch": 1.9824144417387246, + "grad_norm": 5.9375, + "learning_rate": 2.0180025579202844e-09, + "loss": 0.77867031, + "memory(GiB)": 138.1, + "step": 84970, + "train_speed(iter/s)": 0.200969 + }, + { + "acc": 0.79099741, + "epoch": 1.9826477493110137, + "grad_norm": 3.921875, + "learning_rate": 1.964692619157438e-09, + "loss": 0.73870716, + "memory(GiB)": 138.1, + "step": 84980, + "train_speed(iter/s)": 0.200981 + }, + { + "acc": 0.77988491, + "epoch": 1.9828810568833024, + "grad_norm": 3.890625, + "learning_rate": 1.912096148482534e-09, + "loss": 0.80207052, + "memory(GiB)": 138.1, + "step": 84990, + "train_speed(iter/s)": 0.200994 + }, + { + "acc": 0.78414559, + "epoch": 1.9831143644555915, + "grad_norm": 5.3125, + "learning_rate": 1.8602131534045665e-09, + "loss": 0.79055219, + "memory(GiB)": 138.1, + "step": 85000, + "train_speed(iter/s)": 0.201006 + }, + { + "epoch": 1.9831143644555915, + "eval_acc": 0.7447177626057132, + "eval_loss": 0.8044196963310242, + "eval_runtime": 1271.3001, + "eval_samples_per_second": 28.31, + "eval_steps_per_second": 14.156, + "step": 85000 + }, + { + "acc": 0.78565392, + "epoch": 1.9833476720278802, + "grad_norm": 6.03125, + "learning_rate": 1.8090436413287226e-09, + "loss": 0.76658969, + "memory(GiB)": 138.1, + "step": 85010, + "train_speed(iter/s)": 0.200404 + }, + { + "acc": 0.79666553, + "epoch": 1.983580979600169, + "grad_norm": 5.34375, + "learning_rate": 1.758587619559715e-09, + "loss": 0.73290777, + "memory(GiB)": 138.1, + "step": 85020, + "train_speed(iter/s)": 0.200416 + }, + { + "acc": 0.78800335, + "epoch": 1.983814287172458, + "grad_norm": 5.46875, + "learning_rate": 1.7088450953006708e-09, + "loss": 0.73815637, + "memory(GiB)": 138.1, + "step": 85030, + "train_speed(iter/s)": 0.200428 + }, + { + "acc": 0.78558655, + "epoch": 1.984047594744747, + "grad_norm": 5.09375, + "learning_rate": 1.659816075652021e-09, + "loss": 0.76726046, + "memory(GiB)": 138.1, + "step": 85040, + "train_speed(iter/s)": 0.20044 + }, + { + "acc": 0.79275599, + "epoch": 1.9842809023170358, + "grad_norm": 5.59375, + "learning_rate": 1.6115005676120565e-09, + "loss": 0.72810202, + "memory(GiB)": 138.1, + "step": 85050, + "train_speed(iter/s)": 0.200452 + }, + { + "acc": 0.78331842, + "epoch": 1.9845142098893247, + "grad_norm": 5.09375, + "learning_rate": 1.5638985780791483e-09, + "loss": 0.78223214, + "memory(GiB)": 138.1, + "step": 85060, + "train_speed(iter/s)": 0.200463 + }, + { + "acc": 0.77881184, + "epoch": 1.9847475174616136, + "grad_norm": 5.625, + "learning_rate": 1.517010113847306e-09, + "loss": 0.79921713, + "memory(GiB)": 138.1, + "step": 85070, + "train_speed(iter/s)": 0.200476 + }, + { + "acc": 0.77289662, + "epoch": 1.9849808250339025, + "grad_norm": 5.25, + "learning_rate": 1.4708351816100641e-09, + "loss": 0.80485191, + "memory(GiB)": 138.1, + "step": 85080, + "train_speed(iter/s)": 0.200488 + }, + { + "acc": 0.77916284, + "epoch": 1.9852141326061914, + "grad_norm": 4.5625, + "learning_rate": 1.425373787958817e-09, + "loss": 0.77773075, + "memory(GiB)": 138.1, + "step": 85090, + "train_speed(iter/s)": 0.200501 + }, + { + "acc": 0.79094009, + "epoch": 1.9854474401784803, + "grad_norm": 4.6875, + "learning_rate": 1.3806259393839282e-09, + "loss": 0.7560596, + "memory(GiB)": 138.1, + "step": 85100, + "train_speed(iter/s)": 0.200513 + }, + { + "acc": 0.77891779, + "epoch": 1.9856807477507692, + "grad_norm": 4.53125, + "learning_rate": 1.336591642271956e-09, + "loss": 0.79700637, + "memory(GiB)": 138.1, + "step": 85110, + "train_speed(iter/s)": 0.200526 + }, + { + "acc": 0.76863756, + "epoch": 1.985914055323058, + "grad_norm": 6.71875, + "learning_rate": 1.2932709029100933e-09, + "loss": 0.8220705, + "memory(GiB)": 138.1, + "step": 85120, + "train_speed(iter/s)": 0.200539 + }, + { + "acc": 0.77171407, + "epoch": 1.986147362895347, + "grad_norm": 5.625, + "learning_rate": 1.2506637274811717e-09, + "loss": 0.81749229, + "memory(GiB)": 138.1, + "step": 85130, + "train_speed(iter/s)": 0.200551 + }, + { + "acc": 0.78697386, + "epoch": 1.986380670467636, + "grad_norm": 7.96875, + "learning_rate": 1.2087701220681036e-09, + "loss": 0.76459389, + "memory(GiB)": 138.1, + "step": 85140, + "train_speed(iter/s)": 0.200565 + }, + { + "acc": 0.78453169, + "epoch": 1.9866139780399248, + "grad_norm": 7.8125, + "learning_rate": 1.1675900926511053e-09, + "loss": 0.77584524, + "memory(GiB)": 138.1, + "step": 85150, + "train_speed(iter/s)": 0.200577 + }, + { + "acc": 0.79571877, + "epoch": 1.9868472856122137, + "grad_norm": 4.5625, + "learning_rate": 1.1271236451082524e-09, + "loss": 0.72640424, + "memory(GiB)": 138.1, + "step": 85160, + "train_speed(iter/s)": 0.200589 + }, + { + "acc": 0.77864351, + "epoch": 1.9870805931845026, + "grad_norm": 7.5, + "learning_rate": 1.0873707852160354e-09, + "loss": 0.80453529, + "memory(GiB)": 138.1, + "step": 85170, + "train_speed(iter/s)": 0.200601 + }, + { + "acc": 0.76175656, + "epoch": 1.9873139007567915, + "grad_norm": 5.25, + "learning_rate": 1.0483315186499143e-09, + "loss": 0.84362011, + "memory(GiB)": 138.1, + "step": 85180, + "train_speed(iter/s)": 0.200613 + }, + { + "acc": 0.77368731, + "epoch": 1.9875472083290804, + "grad_norm": 3.71875, + "learning_rate": 1.0100058509815435e-09, + "loss": 0.80538511, + "memory(GiB)": 138.1, + "step": 85190, + "train_speed(iter/s)": 0.200625 + }, + { + "acc": 0.76618557, + "epoch": 1.987780515901369, + "grad_norm": 7.21875, + "learning_rate": 9.723937876832124e-10, + "loss": 0.82723331, + "memory(GiB)": 138.1, + "step": 85200, + "train_speed(iter/s)": 0.200638 + }, + { + "acc": 0.7858901, + "epoch": 1.9880138234736582, + "grad_norm": 5.0, + "learning_rate": 9.354953341234042e-10, + "loss": 0.75257692, + "memory(GiB)": 138.1, + "step": 85210, + "train_speed(iter/s)": 0.20065 + }, + { + "acc": 0.78722382, + "epoch": 1.9882471310459469, + "grad_norm": 5.625, + "learning_rate": 8.99310495569572e-10, + "loss": 0.74956484, + "memory(GiB)": 138.1, + "step": 85220, + "train_speed(iter/s)": 0.200662 + }, + { + "acc": 0.80738592, + "epoch": 1.988480438618236, + "grad_norm": 4.125, + "learning_rate": 8.638392771864734e-10, + "loss": 0.67543035, + "memory(GiB)": 138.1, + "step": 85230, + "train_speed(iter/s)": 0.200675 + }, + { + "acc": 0.79583402, + "epoch": 1.9887137461905247, + "grad_norm": 5.4375, + "learning_rate": 8.290816840383908e-10, + "loss": 0.73210516, + "memory(GiB)": 138.1, + "step": 85240, + "train_speed(iter/s)": 0.200687 + }, + { + "acc": 0.76078939, + "epoch": 1.9889470537628138, + "grad_norm": 6.4375, + "learning_rate": 7.950377210863558e-10, + "loss": 0.84969578, + "memory(GiB)": 138.1, + "step": 85250, + "train_speed(iter/s)": 0.2007 + }, + { + "acc": 0.77894077, + "epoch": 1.9891803613351025, + "grad_norm": 7.0625, + "learning_rate": 7.617073931909247e-10, + "loss": 0.79533377, + "memory(GiB)": 138.1, + "step": 85260, + "train_speed(iter/s)": 0.200713 + }, + { + "acc": 0.78382912, + "epoch": 1.9894136689073916, + "grad_norm": 5.21875, + "learning_rate": 7.29090705108848e-10, + "loss": 0.77288427, + "memory(GiB)": 138.1, + "step": 85270, + "train_speed(iter/s)": 0.200725 + }, + { + "acc": 0.75326347, + "epoch": 1.9896469764796803, + "grad_norm": 4.8125, + "learning_rate": 6.971876614969564e-10, + "loss": 0.89215813, + "memory(GiB)": 138.1, + "step": 85280, + "train_speed(iter/s)": 0.200737 + }, + { + "acc": 0.79840131, + "epoch": 1.9898802840519694, + "grad_norm": 6.84375, + "learning_rate": 6.659982669093845e-10, + "loss": 0.70803699, + "memory(GiB)": 138.1, + "step": 85290, + "train_speed(iter/s)": 0.20075 + }, + { + "acc": 0.78066187, + "epoch": 1.990113591624258, + "grad_norm": 4.90625, + "learning_rate": 6.355225257981268e-10, + "loss": 0.79505005, + "memory(GiB)": 138.1, + "step": 85300, + "train_speed(iter/s)": 0.200762 + }, + { + "acc": 0.78745785, + "epoch": 1.9903468991965472, + "grad_norm": 4.65625, + "learning_rate": 6.057604425135921e-10, + "loss": 0.74916191, + "memory(GiB)": 138.1, + "step": 85310, + "train_speed(iter/s)": 0.200774 + }, + { + "acc": 0.75778542, + "epoch": 1.9905802067688358, + "grad_norm": 5.96875, + "learning_rate": 5.76712021304604e-10, + "loss": 0.86030426, + "memory(GiB)": 138.1, + "step": 85320, + "train_speed(iter/s)": 0.200785 + }, + { + "acc": 0.76393342, + "epoch": 1.990813514341125, + "grad_norm": 6.6875, + "learning_rate": 5.483772663178455e-10, + "loss": 0.85037289, + "memory(GiB)": 138.1, + "step": 85330, + "train_speed(iter/s)": 0.200798 + }, + { + "acc": 0.77822199, + "epoch": 1.9910468219134136, + "grad_norm": 6.40625, + "learning_rate": 5.20756181597859e-10, + "loss": 0.80846586, + "memory(GiB)": 138.1, + "step": 85340, + "train_speed(iter/s)": 0.20081 + }, + { + "acc": 0.78142519, + "epoch": 1.9912801294857028, + "grad_norm": 6.0625, + "learning_rate": 4.938487710870465e-10, + "loss": 0.78094144, + "memory(GiB)": 138.1, + "step": 85350, + "train_speed(iter/s)": 0.200822 + }, + { + "acc": 0.77061892, + "epoch": 1.9915134370579914, + "grad_norm": 4.9375, + "learning_rate": 4.676550386273349e-10, + "loss": 0.82950497, + "memory(GiB)": 138.1, + "step": 85360, + "train_speed(iter/s)": 0.200834 + }, + { + "acc": 0.78219571, + "epoch": 1.9917467446302806, + "grad_norm": 6.28125, + "learning_rate": 4.421749879574e-10, + "loss": 0.76107216, + "memory(GiB)": 138.1, + "step": 85370, + "train_speed(iter/s)": 0.200847 + }, + { + "acc": 0.76850634, + "epoch": 1.9919800522025692, + "grad_norm": 5.09375, + "learning_rate": 4.174086227148877e-10, + "loss": 0.84992552, + "memory(GiB)": 138.1, + "step": 85380, + "train_speed(iter/s)": 0.200859 + }, + { + "acc": 0.76265931, + "epoch": 1.9922133597748581, + "grad_norm": 6.25, + "learning_rate": 3.9335594643419294e-10, + "loss": 0.84543304, + "memory(GiB)": 138.1, + "step": 85390, + "train_speed(iter/s)": 0.200872 + }, + { + "acc": 0.78111825, + "epoch": 1.992446667347147, + "grad_norm": 5.15625, + "learning_rate": 3.700169625503458e-10, + "loss": 0.79352822, + "memory(GiB)": 138.1, + "step": 85400, + "train_speed(iter/s)": 0.200884 + }, + { + "acc": 0.78446589, + "epoch": 1.992679974919436, + "grad_norm": 5.15625, + "learning_rate": 3.4739167439346024e-10, + "loss": 0.77862563, + "memory(GiB)": 138.1, + "step": 85410, + "train_speed(iter/s)": 0.200896 + }, + { + "acc": 0.78342505, + "epoch": 1.9929132824917248, + "grad_norm": 4.03125, + "learning_rate": 3.254800851948403e-10, + "loss": 0.78520279, + "memory(GiB)": 138.1, + "step": 85420, + "train_speed(iter/s)": 0.200908 + }, + { + "acc": 0.77086182, + "epoch": 1.9931465900640137, + "grad_norm": 5.59375, + "learning_rate": 3.042821980808741e-10, + "loss": 0.82681541, + "memory(GiB)": 138.1, + "step": 85430, + "train_speed(iter/s)": 0.20092 + }, + { + "acc": 0.76659908, + "epoch": 1.9933798976363026, + "grad_norm": 6.09375, + "learning_rate": 2.8379801607858473e-10, + "loss": 0.83367481, + "memory(GiB)": 138.1, + "step": 85440, + "train_speed(iter/s)": 0.200933 + }, + { + "acc": 0.77113953, + "epoch": 1.9936132052085915, + "grad_norm": 6.15625, + "learning_rate": 2.640275421111893e-10, + "loss": 0.82532921, + "memory(GiB)": 138.1, + "step": 85450, + "train_speed(iter/s)": 0.200945 + }, + { + "acc": 0.80559187, + "epoch": 1.9938465127808804, + "grad_norm": 4.71875, + "learning_rate": 2.449707790019851e-10, + "loss": 0.6829802, + "memory(GiB)": 138.1, + "step": 85460, + "train_speed(iter/s)": 0.200957 + }, + { + "acc": 0.77535009, + "epoch": 1.9940798203531693, + "grad_norm": 5.1875, + "learning_rate": 2.266277294704633e-10, + "loss": 0.80879288, + "memory(GiB)": 138.1, + "step": 85470, + "train_speed(iter/s)": 0.20097 + }, + { + "acc": 0.78569078, + "epoch": 1.9943131279254582, + "grad_norm": 7.46875, + "learning_rate": 2.0899839613508499e-10, + "loss": 0.76950636, + "memory(GiB)": 138.1, + "step": 85480, + "train_speed(iter/s)": 0.200981 + }, + { + "acc": 0.78273611, + "epoch": 1.9945464354977471, + "grad_norm": 7.84375, + "learning_rate": 1.9208278151328087e-10, + "loss": 0.78465424, + "memory(GiB)": 138.1, + "step": 85490, + "train_speed(iter/s)": 0.200993 + }, + { + "acc": 0.79737329, + "epoch": 1.994779743070036, + "grad_norm": 5.84375, + "learning_rate": 1.7588088801923088e-10, + "loss": 0.73678904, + "memory(GiB)": 138.1, + "step": 85500, + "train_speed(iter/s)": 0.201006 + }, + { + "epoch": 1.994779743070036, + "eval_acc": 0.744678847557264, + "eval_loss": 0.8044396638870239, + "eval_runtime": 1270.037, + "eval_samples_per_second": 28.339, + "eval_steps_per_second": 14.17, + "step": 85500 + } + ], + "logging_steps": 10, + "max_steps": 85722, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "total_flos": 2.0183064448975503e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}