{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 882, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011337868480725624, "grad_norm": 166.5214080810547, "learning_rate": 0.0, "loss": 3.3695, "num_tokens": 1145.0, "step": 1 }, { "epoch": 0.0022675736961451248, "grad_norm": 183.5157012939453, "learning_rate": 7.407407407407407e-07, "loss": 3.5725, "num_tokens": 2223.0, "step": 2 }, { "epoch": 0.003401360544217687, "grad_norm": 184.64498901367188, "learning_rate": 1.4814814814814815e-06, "loss": 3.6684, "num_tokens": 3315.0, "step": 3 }, { "epoch": 0.0045351473922902496, "grad_norm": 165.14901733398438, "learning_rate": 2.222222222222222e-06, "loss": 3.4216, "num_tokens": 4429.0, "step": 4 }, { "epoch": 0.005668934240362812, "grad_norm": 155.65533447265625, "learning_rate": 2.962962962962963e-06, "loss": 3.2972, "num_tokens": 5594.0, "step": 5 }, { "epoch": 0.006802721088435374, "grad_norm": 151.45179748535156, "learning_rate": 3.7037037037037037e-06, "loss": 3.4702, "num_tokens": 6751.0, "step": 6 }, { "epoch": 0.007936507936507936, "grad_norm": 167.11138916015625, "learning_rate": 4.444444444444444e-06, "loss": 3.6179, "num_tokens": 7768.0, "step": 7 }, { "epoch": 0.009070294784580499, "grad_norm": 151.14328002929688, "learning_rate": 5.185185185185185e-06, "loss": 3.2884, "num_tokens": 8885.0, "step": 8 }, { "epoch": 0.01020408163265306, "grad_norm": 144.55308532714844, "learning_rate": 5.925925925925926e-06, "loss": 3.3187, "num_tokens": 10034.0, "step": 9 }, { "epoch": 0.011337868480725623, "grad_norm": 138.56971740722656, "learning_rate": 6.666666666666667e-06, "loss": 3.1647, "num_tokens": 11110.0, "step": 10 }, { "epoch": 0.012471655328798186, "grad_norm": 148.52685546875, "learning_rate": 7.4074074074074075e-06, "loss": 3.2971, "num_tokens": 12163.0, "step": 11 }, { "epoch": 0.013605442176870748, "grad_norm": 133.64486694335938, "learning_rate": 8.148148148148148e-06, "loss": 3.1261, "num_tokens": 13214.0, "step": 12 }, { "epoch": 0.01473922902494331, "grad_norm": 117.43455505371094, "learning_rate": 8.888888888888888e-06, "loss": 2.8538, "num_tokens": 14332.0, "step": 13 }, { "epoch": 0.015873015873015872, "grad_norm": 122.57901763916016, "learning_rate": 9.62962962962963e-06, "loss": 2.8934, "num_tokens": 15486.0, "step": 14 }, { "epoch": 0.017006802721088437, "grad_norm": 139.67311096191406, "learning_rate": 1.037037037037037e-05, "loss": 2.6246, "num_tokens": 16505.0, "step": 15 }, { "epoch": 0.018140589569160998, "grad_norm": 111.19099426269531, "learning_rate": 1.1111111111111113e-05, "loss": 2.5889, "num_tokens": 17529.0, "step": 16 }, { "epoch": 0.01927437641723356, "grad_norm": 84.17432403564453, "learning_rate": 1.1851851851851852e-05, "loss": 2.5661, "num_tokens": 18625.0, "step": 17 }, { "epoch": 0.02040816326530612, "grad_norm": 75.19334411621094, "learning_rate": 1.2592592592592593e-05, "loss": 2.395, "num_tokens": 19757.0, "step": 18 }, { "epoch": 0.021541950113378686, "grad_norm": 70.94991302490234, "learning_rate": 1.3333333333333333e-05, "loss": 2.3632, "num_tokens": 20818.0, "step": 19 }, { "epoch": 0.022675736961451247, "grad_norm": 69.49271392822266, "learning_rate": 1.4074074074074075e-05, "loss": 2.3731, "num_tokens": 21939.0, "step": 20 }, { "epoch": 0.023809523809523808, "grad_norm": 61.87406539916992, "learning_rate": 1.4814814814814815e-05, "loss": 2.142, "num_tokens": 23014.0, "step": 21 }, { "epoch": 0.024943310657596373, "grad_norm": 50.746116638183594, "learning_rate": 1.555555555555556e-05, "loss": 2.1789, "num_tokens": 24169.0, "step": 22 }, { "epoch": 0.026077097505668934, "grad_norm": 50.25040817260742, "learning_rate": 1.6296296296296297e-05, "loss": 2.1423, "num_tokens": 25249.0, "step": 23 }, { "epoch": 0.027210884353741496, "grad_norm": 41.02925491333008, "learning_rate": 1.7037037037037038e-05, "loss": 2.0644, "num_tokens": 26313.0, "step": 24 }, { "epoch": 0.02834467120181406, "grad_norm": 39.495121002197266, "learning_rate": 1.7777777777777777e-05, "loss": 2.1007, "num_tokens": 27351.0, "step": 25 }, { "epoch": 0.02947845804988662, "grad_norm": 39.67973709106445, "learning_rate": 1.851851851851852e-05, "loss": 1.9308, "num_tokens": 28367.0, "step": 26 }, { "epoch": 0.030612244897959183, "grad_norm": 38.24416732788086, "learning_rate": 1.925925925925926e-05, "loss": 1.9548, "num_tokens": 29437.0, "step": 27 }, { "epoch": 0.031746031746031744, "grad_norm": 32.168609619140625, "learning_rate": 2e-05, "loss": 1.9028, "num_tokens": 30576.0, "step": 28 }, { "epoch": 0.032879818594104306, "grad_norm": 33.80866241455078, "learning_rate": 1.9976608187134504e-05, "loss": 1.8616, "num_tokens": 31772.0, "step": 29 }, { "epoch": 0.034013605442176874, "grad_norm": 37.89588928222656, "learning_rate": 1.9953216374269007e-05, "loss": 1.8158, "num_tokens": 32868.0, "step": 30 }, { "epoch": 0.035147392290249435, "grad_norm": 37.13409423828125, "learning_rate": 1.992982456140351e-05, "loss": 1.9149, "num_tokens": 34025.0, "step": 31 }, { "epoch": 0.036281179138321996, "grad_norm": 33.46371841430664, "learning_rate": 1.9906432748538015e-05, "loss": 1.8254, "num_tokens": 35175.0, "step": 32 }, { "epoch": 0.03741496598639456, "grad_norm": 32.30972671508789, "learning_rate": 1.9883040935672515e-05, "loss": 1.7033, "num_tokens": 36242.0, "step": 33 }, { "epoch": 0.03854875283446712, "grad_norm": 34.66704177856445, "learning_rate": 1.9859649122807017e-05, "loss": 1.7303, "num_tokens": 37352.0, "step": 34 }, { "epoch": 0.03968253968253968, "grad_norm": 34.120243072509766, "learning_rate": 1.9836257309941523e-05, "loss": 1.6841, "num_tokens": 38425.0, "step": 35 }, { "epoch": 0.04081632653061224, "grad_norm": 27.385013580322266, "learning_rate": 1.9812865497076026e-05, "loss": 1.5314, "num_tokens": 39617.0, "step": 36 }, { "epoch": 0.04195011337868481, "grad_norm": 29.181873321533203, "learning_rate": 1.9789473684210528e-05, "loss": 1.6671, "num_tokens": 40651.0, "step": 37 }, { "epoch": 0.04308390022675737, "grad_norm": 23.812429428100586, "learning_rate": 1.976608187134503e-05, "loss": 1.6136, "num_tokens": 41788.0, "step": 38 }, { "epoch": 0.04421768707482993, "grad_norm": 25.376712799072266, "learning_rate": 1.9742690058479533e-05, "loss": 1.6714, "num_tokens": 42935.0, "step": 39 }, { "epoch": 0.045351473922902494, "grad_norm": 22.803974151611328, "learning_rate": 1.9719298245614036e-05, "loss": 1.5118, "num_tokens": 44074.0, "step": 40 }, { "epoch": 0.046485260770975055, "grad_norm": 20.470184326171875, "learning_rate": 1.969590643274854e-05, "loss": 1.7483, "num_tokens": 45240.0, "step": 41 }, { "epoch": 0.047619047619047616, "grad_norm": 21.197111129760742, "learning_rate": 1.9672514619883044e-05, "loss": 1.5884, "num_tokens": 46381.0, "step": 42 }, { "epoch": 0.048752834467120185, "grad_norm": 23.196548461914062, "learning_rate": 1.9649122807017544e-05, "loss": 1.5301, "num_tokens": 47452.0, "step": 43 }, { "epoch": 0.049886621315192746, "grad_norm": 21.814697265625, "learning_rate": 1.962573099415205e-05, "loss": 1.5624, "num_tokens": 48609.0, "step": 44 }, { "epoch": 0.05102040816326531, "grad_norm": 20.672985076904297, "learning_rate": 1.9602339181286552e-05, "loss": 1.3742, "num_tokens": 49744.0, "step": 45 }, { "epoch": 0.05215419501133787, "grad_norm": 22.034076690673828, "learning_rate": 1.9578947368421055e-05, "loss": 1.6634, "num_tokens": 50850.0, "step": 46 }, { "epoch": 0.05328798185941043, "grad_norm": 23.442157745361328, "learning_rate": 1.9555555555555557e-05, "loss": 1.5867, "num_tokens": 51963.0, "step": 47 }, { "epoch": 0.05442176870748299, "grad_norm": 21.422443389892578, "learning_rate": 1.953216374269006e-05, "loss": 1.4898, "num_tokens": 53070.0, "step": 48 }, { "epoch": 0.05555555555555555, "grad_norm": 21.500629425048828, "learning_rate": 1.9508771929824562e-05, "loss": 1.5491, "num_tokens": 54154.0, "step": 49 }, { "epoch": 0.05668934240362812, "grad_norm": 21.832984924316406, "learning_rate": 1.9485380116959065e-05, "loss": 1.4155, "num_tokens": 55278.0, "step": 50 }, { "epoch": 0.05782312925170068, "grad_norm": 22.703264236450195, "learning_rate": 1.9461988304093568e-05, "loss": 1.6969, "num_tokens": 56381.0, "step": 51 }, { "epoch": 0.05895691609977324, "grad_norm": 21.087703704833984, "learning_rate": 1.9438596491228074e-05, "loss": 1.426, "num_tokens": 57413.0, "step": 52 }, { "epoch": 0.060090702947845805, "grad_norm": 21.750904083251953, "learning_rate": 1.9415204678362573e-05, "loss": 1.622, "num_tokens": 58555.0, "step": 53 }, { "epoch": 0.061224489795918366, "grad_norm": 22.866840362548828, "learning_rate": 1.939181286549708e-05, "loss": 1.4577, "num_tokens": 59581.0, "step": 54 }, { "epoch": 0.06235827664399093, "grad_norm": 23.061153411865234, "learning_rate": 1.936842105263158e-05, "loss": 1.5179, "num_tokens": 60776.0, "step": 55 }, { "epoch": 0.06349206349206349, "grad_norm": 20.190702438354492, "learning_rate": 1.9345029239766084e-05, "loss": 1.4721, "num_tokens": 61957.0, "step": 56 }, { "epoch": 0.06462585034013606, "grad_norm": 22.264616012573242, "learning_rate": 1.9321637426900586e-05, "loss": 1.4748, "num_tokens": 63043.0, "step": 57 }, { "epoch": 0.06575963718820861, "grad_norm": 24.544754028320312, "learning_rate": 1.929824561403509e-05, "loss": 1.6008, "num_tokens": 64129.0, "step": 58 }, { "epoch": 0.06689342403628118, "grad_norm": 23.750694274902344, "learning_rate": 1.927485380116959e-05, "loss": 1.4103, "num_tokens": 65256.0, "step": 59 }, { "epoch": 0.06802721088435375, "grad_norm": 22.61870765686035, "learning_rate": 1.9251461988304094e-05, "loss": 1.4834, "num_tokens": 66376.0, "step": 60 }, { "epoch": 0.0691609977324263, "grad_norm": 22.504253387451172, "learning_rate": 1.9228070175438597e-05, "loss": 1.4479, "num_tokens": 67572.0, "step": 61 }, { "epoch": 0.07029478458049887, "grad_norm": 24.895252227783203, "learning_rate": 1.9204678362573103e-05, "loss": 1.3843, "num_tokens": 68652.0, "step": 62 }, { "epoch": 0.07142857142857142, "grad_norm": 24.620946884155273, "learning_rate": 1.9181286549707602e-05, "loss": 1.4419, "num_tokens": 69693.0, "step": 63 }, { "epoch": 0.07256235827664399, "grad_norm": 23.75541877746582, "learning_rate": 1.9157894736842108e-05, "loss": 1.4852, "num_tokens": 70855.0, "step": 64 }, { "epoch": 0.07369614512471655, "grad_norm": 27.19347381591797, "learning_rate": 1.913450292397661e-05, "loss": 1.3611, "num_tokens": 71899.0, "step": 65 }, { "epoch": 0.07482993197278912, "grad_norm": 24.09107780456543, "learning_rate": 1.9111111111111113e-05, "loss": 1.3657, "num_tokens": 73092.0, "step": 66 }, { "epoch": 0.07596371882086168, "grad_norm": 22.136247634887695, "learning_rate": 1.9087719298245616e-05, "loss": 1.454, "num_tokens": 74232.0, "step": 67 }, { "epoch": 0.07709750566893424, "grad_norm": 22.34217071533203, "learning_rate": 1.9064327485380118e-05, "loss": 1.5185, "num_tokens": 75347.0, "step": 68 }, { "epoch": 0.0782312925170068, "grad_norm": 20.51241683959961, "learning_rate": 1.904093567251462e-05, "loss": 1.5013, "num_tokens": 76455.0, "step": 69 }, { "epoch": 0.07936507936507936, "grad_norm": 21.199934005737305, "learning_rate": 1.9017543859649123e-05, "loss": 1.5119, "num_tokens": 77648.0, "step": 70 }, { "epoch": 0.08049886621315193, "grad_norm": 23.09796905517578, "learning_rate": 1.8994152046783626e-05, "loss": 1.5551, "num_tokens": 78688.0, "step": 71 }, { "epoch": 0.08163265306122448, "grad_norm": 21.29806137084961, "learning_rate": 1.8970760233918132e-05, "loss": 1.2717, "num_tokens": 79760.0, "step": 72 }, { "epoch": 0.08276643990929705, "grad_norm": 20.702116012573242, "learning_rate": 1.894736842105263e-05, "loss": 1.4566, "num_tokens": 80889.0, "step": 73 }, { "epoch": 0.08390022675736962, "grad_norm": 19.729625701904297, "learning_rate": 1.8923976608187137e-05, "loss": 1.3565, "num_tokens": 81898.0, "step": 74 }, { "epoch": 0.08503401360544217, "grad_norm": 21.44884490966797, "learning_rate": 1.890058479532164e-05, "loss": 1.5702, "num_tokens": 83019.0, "step": 75 }, { "epoch": 0.08616780045351474, "grad_norm": 22.600431442260742, "learning_rate": 1.8877192982456142e-05, "loss": 1.5155, "num_tokens": 84034.0, "step": 76 }, { "epoch": 0.0873015873015873, "grad_norm": 18.412260055541992, "learning_rate": 1.8853801169590645e-05, "loss": 1.3648, "num_tokens": 85221.0, "step": 77 }, { "epoch": 0.08843537414965986, "grad_norm": 21.43720054626465, "learning_rate": 1.8830409356725147e-05, "loss": 1.4198, "num_tokens": 86291.0, "step": 78 }, { "epoch": 0.08956916099773243, "grad_norm": 24.085018157958984, "learning_rate": 1.880701754385965e-05, "loss": 1.5514, "num_tokens": 87273.0, "step": 79 }, { "epoch": 0.09070294784580499, "grad_norm": 21.004009246826172, "learning_rate": 1.8783625730994152e-05, "loss": 1.3665, "num_tokens": 88353.0, "step": 80 }, { "epoch": 0.09183673469387756, "grad_norm": 21.808130264282227, "learning_rate": 1.8760233918128655e-05, "loss": 1.4189, "num_tokens": 89389.0, "step": 81 }, { "epoch": 0.09297052154195011, "grad_norm": 23.251745223999023, "learning_rate": 1.873684210526316e-05, "loss": 1.3824, "num_tokens": 90434.0, "step": 82 }, { "epoch": 0.09410430839002268, "grad_norm": 19.73807144165039, "learning_rate": 1.871345029239766e-05, "loss": 1.2964, "num_tokens": 91566.0, "step": 83 }, { "epoch": 0.09523809523809523, "grad_norm": 20.76624298095703, "learning_rate": 1.8690058479532166e-05, "loss": 1.3478, "num_tokens": 92699.0, "step": 84 }, { "epoch": 0.0963718820861678, "grad_norm": 19.95400619506836, "learning_rate": 1.866666666666667e-05, "loss": 1.4581, "num_tokens": 93770.0, "step": 85 }, { "epoch": 0.09750566893424037, "grad_norm": 20.56294059753418, "learning_rate": 1.864327485380117e-05, "loss": 1.3797, "num_tokens": 94927.0, "step": 86 }, { "epoch": 0.09863945578231292, "grad_norm": 19.86748695373535, "learning_rate": 1.8619883040935674e-05, "loss": 1.5204, "num_tokens": 96064.0, "step": 87 }, { "epoch": 0.09977324263038549, "grad_norm": 19.751052856445312, "learning_rate": 1.8596491228070176e-05, "loss": 1.3639, "num_tokens": 97169.0, "step": 88 }, { "epoch": 0.10090702947845805, "grad_norm": 19.4849910736084, "learning_rate": 1.857309941520468e-05, "loss": 1.3627, "num_tokens": 98366.0, "step": 89 }, { "epoch": 0.10204081632653061, "grad_norm": 22.657968521118164, "learning_rate": 1.854970760233918e-05, "loss": 1.4533, "num_tokens": 99457.0, "step": 90 }, { "epoch": 0.10317460317460317, "grad_norm": 20.2347469329834, "learning_rate": 1.8526315789473684e-05, "loss": 1.4399, "num_tokens": 100599.0, "step": 91 }, { "epoch": 0.10430839002267574, "grad_norm": 19.844715118408203, "learning_rate": 1.850292397660819e-05, "loss": 1.229, "num_tokens": 101813.0, "step": 92 }, { "epoch": 0.1054421768707483, "grad_norm": 19.86066436767578, "learning_rate": 1.847953216374269e-05, "loss": 1.2761, "num_tokens": 102920.0, "step": 93 }, { "epoch": 0.10657596371882086, "grad_norm": 21.279823303222656, "learning_rate": 1.8456140350877195e-05, "loss": 1.3531, "num_tokens": 104083.0, "step": 94 }, { "epoch": 0.10770975056689343, "grad_norm": 19.693561553955078, "learning_rate": 1.8432748538011698e-05, "loss": 1.2868, "num_tokens": 105252.0, "step": 95 }, { "epoch": 0.10884353741496598, "grad_norm": 23.13196563720703, "learning_rate": 1.84093567251462e-05, "loss": 1.3683, "num_tokens": 106354.0, "step": 96 }, { "epoch": 0.10997732426303855, "grad_norm": 21.406845092773438, "learning_rate": 1.8385964912280703e-05, "loss": 1.3678, "num_tokens": 107532.0, "step": 97 }, { "epoch": 0.1111111111111111, "grad_norm": 21.883495330810547, "learning_rate": 1.8362573099415205e-05, "loss": 1.4525, "num_tokens": 108667.0, "step": 98 }, { "epoch": 0.11224489795918367, "grad_norm": 20.690698623657227, "learning_rate": 1.833918128654971e-05, "loss": 1.3653, "num_tokens": 109714.0, "step": 99 }, { "epoch": 0.11337868480725624, "grad_norm": 19.245071411132812, "learning_rate": 1.831578947368421e-05, "loss": 1.5926, "num_tokens": 110894.0, "step": 100 }, { "epoch": 0.1145124716553288, "grad_norm": 20.101728439331055, "learning_rate": 1.8292397660818713e-05, "loss": 1.4173, "num_tokens": 111990.0, "step": 101 }, { "epoch": 0.11564625850340136, "grad_norm": 20.703289031982422, "learning_rate": 1.826900584795322e-05, "loss": 1.3306, "num_tokens": 113085.0, "step": 102 }, { "epoch": 0.11678004535147392, "grad_norm": 18.79097557067871, "learning_rate": 1.824561403508772e-05, "loss": 1.4765, "num_tokens": 114209.0, "step": 103 }, { "epoch": 0.11791383219954649, "grad_norm": 18.59799575805664, "learning_rate": 1.8222222222222224e-05, "loss": 1.4152, "num_tokens": 115268.0, "step": 104 }, { "epoch": 0.11904761904761904, "grad_norm": 20.422094345092773, "learning_rate": 1.8198830409356727e-05, "loss": 1.3147, "num_tokens": 116382.0, "step": 105 }, { "epoch": 0.12018140589569161, "grad_norm": 19.0844783782959, "learning_rate": 1.817543859649123e-05, "loss": 1.4424, "num_tokens": 117504.0, "step": 106 }, { "epoch": 0.12131519274376418, "grad_norm": 19.89101791381836, "learning_rate": 1.8152046783625732e-05, "loss": 1.4456, "num_tokens": 118698.0, "step": 107 }, { "epoch": 0.12244897959183673, "grad_norm": 19.22842025756836, "learning_rate": 1.8128654970760235e-05, "loss": 1.4671, "num_tokens": 119791.0, "step": 108 }, { "epoch": 0.1235827664399093, "grad_norm": 19.693885803222656, "learning_rate": 1.810526315789474e-05, "loss": 1.5262, "num_tokens": 120996.0, "step": 109 }, { "epoch": 0.12471655328798185, "grad_norm": 18.969242095947266, "learning_rate": 1.808187134502924e-05, "loss": 1.3295, "num_tokens": 122066.0, "step": 110 }, { "epoch": 0.12585034013605442, "grad_norm": 18.591045379638672, "learning_rate": 1.8058479532163746e-05, "loss": 1.3131, "num_tokens": 123250.0, "step": 111 }, { "epoch": 0.12698412698412698, "grad_norm": 19.415027618408203, "learning_rate": 1.8035087719298248e-05, "loss": 1.3223, "num_tokens": 124337.0, "step": 112 }, { "epoch": 0.12811791383219956, "grad_norm": 20.726171493530273, "learning_rate": 1.8011695906432747e-05, "loss": 1.3237, "num_tokens": 125423.0, "step": 113 }, { "epoch": 0.1292517006802721, "grad_norm": 20.658830642700195, "learning_rate": 1.7988304093567253e-05, "loss": 1.4612, "num_tokens": 126544.0, "step": 114 }, { "epoch": 0.13038548752834467, "grad_norm": 19.018239974975586, "learning_rate": 1.7964912280701756e-05, "loss": 1.5008, "num_tokens": 127718.0, "step": 115 }, { "epoch": 0.13151927437641722, "grad_norm": 19.35009765625, "learning_rate": 1.794152046783626e-05, "loss": 1.4293, "num_tokens": 128897.0, "step": 116 }, { "epoch": 0.1326530612244898, "grad_norm": 19.17117691040039, "learning_rate": 1.791812865497076e-05, "loss": 1.4276, "num_tokens": 130034.0, "step": 117 }, { "epoch": 0.13378684807256236, "grad_norm": 19.1497745513916, "learning_rate": 1.7894736842105264e-05, "loss": 1.3727, "num_tokens": 131196.0, "step": 118 }, { "epoch": 0.1349206349206349, "grad_norm": 18.572772979736328, "learning_rate": 1.787134502923977e-05, "loss": 1.3874, "num_tokens": 132362.0, "step": 119 }, { "epoch": 0.1360544217687075, "grad_norm": 18.241405487060547, "learning_rate": 1.784795321637427e-05, "loss": 1.3882, "num_tokens": 133454.0, "step": 120 }, { "epoch": 0.13718820861678005, "grad_norm": 18.73154640197754, "learning_rate": 1.7824561403508775e-05, "loss": 1.4655, "num_tokens": 134552.0, "step": 121 }, { "epoch": 0.1383219954648526, "grad_norm": 17.963134765625, "learning_rate": 1.7801169590643277e-05, "loss": 1.4485, "num_tokens": 135670.0, "step": 122 }, { "epoch": 0.13945578231292516, "grad_norm": 21.91454315185547, "learning_rate": 1.7777777777777777e-05, "loss": 1.4819, "num_tokens": 136682.0, "step": 123 }, { "epoch": 0.14058956916099774, "grad_norm": 19.387649536132812, "learning_rate": 1.7754385964912283e-05, "loss": 1.3847, "num_tokens": 137759.0, "step": 124 }, { "epoch": 0.1417233560090703, "grad_norm": 20.603775024414062, "learning_rate": 1.7730994152046785e-05, "loss": 1.3973, "num_tokens": 138859.0, "step": 125 }, { "epoch": 0.14285714285714285, "grad_norm": 19.406925201416016, "learning_rate": 1.7707602339181288e-05, "loss": 1.2521, "num_tokens": 140019.0, "step": 126 }, { "epoch": 0.14399092970521543, "grad_norm": 20.93782615661621, "learning_rate": 1.768421052631579e-05, "loss": 1.3612, "num_tokens": 141070.0, "step": 127 }, { "epoch": 0.14512471655328799, "grad_norm": 18.799549102783203, "learning_rate": 1.7660818713450293e-05, "loss": 1.3536, "num_tokens": 142190.0, "step": 128 }, { "epoch": 0.14625850340136054, "grad_norm": 20.460613250732422, "learning_rate": 1.76374269005848e-05, "loss": 1.246, "num_tokens": 143354.0, "step": 129 }, { "epoch": 0.1473922902494331, "grad_norm": 19.515867233276367, "learning_rate": 1.7614035087719298e-05, "loss": 1.3296, "num_tokens": 144426.0, "step": 130 }, { "epoch": 0.14852607709750568, "grad_norm": 20.77712059020996, "learning_rate": 1.7590643274853804e-05, "loss": 1.3899, "num_tokens": 145548.0, "step": 131 }, { "epoch": 0.14965986394557823, "grad_norm": 22.63519287109375, "learning_rate": 1.7567251461988307e-05, "loss": 1.3826, "num_tokens": 146672.0, "step": 132 }, { "epoch": 0.15079365079365079, "grad_norm": 19.612306594848633, "learning_rate": 1.754385964912281e-05, "loss": 1.3528, "num_tokens": 147779.0, "step": 133 }, { "epoch": 0.15192743764172337, "grad_norm": 25.188501358032227, "learning_rate": 1.752046783625731e-05, "loss": 1.3508, "num_tokens": 148689.0, "step": 134 }, { "epoch": 0.15306122448979592, "grad_norm": 22.229259490966797, "learning_rate": 1.7497076023391814e-05, "loss": 1.5497, "num_tokens": 149900.0, "step": 135 }, { "epoch": 0.15419501133786848, "grad_norm": 21.376983642578125, "learning_rate": 1.7473684210526317e-05, "loss": 1.3985, "num_tokens": 151079.0, "step": 136 }, { "epoch": 0.15532879818594103, "grad_norm": 20.008968353271484, "learning_rate": 1.745029239766082e-05, "loss": 1.4268, "num_tokens": 152168.0, "step": 137 }, { "epoch": 0.1564625850340136, "grad_norm": 20.861085891723633, "learning_rate": 1.7426900584795322e-05, "loss": 1.2333, "num_tokens": 153262.0, "step": 138 }, { "epoch": 0.15759637188208617, "grad_norm": 18.783538818359375, "learning_rate": 1.7403508771929828e-05, "loss": 1.269, "num_tokens": 154398.0, "step": 139 }, { "epoch": 0.15873015873015872, "grad_norm": 18.933168411254883, "learning_rate": 1.7380116959064327e-05, "loss": 1.3084, "num_tokens": 155438.0, "step": 140 }, { "epoch": 0.1598639455782313, "grad_norm": 18.174386978149414, "learning_rate": 1.7356725146198833e-05, "loss": 1.3455, "num_tokens": 156535.0, "step": 141 }, { "epoch": 0.16099773242630386, "grad_norm": 19.143192291259766, "learning_rate": 1.7333333333333336e-05, "loss": 1.3789, "num_tokens": 157676.0, "step": 142 }, { "epoch": 0.1621315192743764, "grad_norm": 20.084741592407227, "learning_rate": 1.7309941520467838e-05, "loss": 1.3224, "num_tokens": 158849.0, "step": 143 }, { "epoch": 0.16326530612244897, "grad_norm": 20.131685256958008, "learning_rate": 1.728654970760234e-05, "loss": 1.5643, "num_tokens": 159943.0, "step": 144 }, { "epoch": 0.16439909297052155, "grad_norm": 21.717613220214844, "learning_rate": 1.7263157894736843e-05, "loss": 1.532, "num_tokens": 161152.0, "step": 145 }, { "epoch": 0.1655328798185941, "grad_norm": 18.25162696838379, "learning_rate": 1.7239766081871346e-05, "loss": 1.363, "num_tokens": 162350.0, "step": 146 }, { "epoch": 0.16666666666666666, "grad_norm": 21.760433197021484, "learning_rate": 1.721637426900585e-05, "loss": 1.2345, "num_tokens": 163374.0, "step": 147 }, { "epoch": 0.16780045351473924, "grad_norm": 17.82793617248535, "learning_rate": 1.719298245614035e-05, "loss": 1.3358, "num_tokens": 164537.0, "step": 148 }, { "epoch": 0.1689342403628118, "grad_norm": 20.227628707885742, "learning_rate": 1.7169590643274857e-05, "loss": 1.3032, "num_tokens": 165585.0, "step": 149 }, { "epoch": 0.17006802721088435, "grad_norm": 20.599544525146484, "learning_rate": 1.7146198830409356e-05, "loss": 1.3294, "num_tokens": 166600.0, "step": 150 }, { "epoch": 0.1712018140589569, "grad_norm": 20.629758834838867, "learning_rate": 1.7122807017543862e-05, "loss": 1.3442, "num_tokens": 167686.0, "step": 151 }, { "epoch": 0.17233560090702948, "grad_norm": 20.761476516723633, "learning_rate": 1.7099415204678365e-05, "loss": 1.4245, "num_tokens": 168793.0, "step": 152 }, { "epoch": 0.17346938775510204, "grad_norm": 21.658018112182617, "learning_rate": 1.7076023391812867e-05, "loss": 1.4516, "num_tokens": 169843.0, "step": 153 }, { "epoch": 0.1746031746031746, "grad_norm": 20.187335968017578, "learning_rate": 1.705263157894737e-05, "loss": 1.2961, "num_tokens": 170851.0, "step": 154 }, { "epoch": 0.17573696145124718, "grad_norm": 20.812149047851562, "learning_rate": 1.7029239766081872e-05, "loss": 1.4905, "num_tokens": 171960.0, "step": 155 }, { "epoch": 0.17687074829931973, "grad_norm": 19.095909118652344, "learning_rate": 1.7005847953216375e-05, "loss": 1.4067, "num_tokens": 173095.0, "step": 156 }, { "epoch": 0.17800453514739228, "grad_norm": 18.644315719604492, "learning_rate": 1.6982456140350878e-05, "loss": 1.3285, "num_tokens": 174221.0, "step": 157 }, { "epoch": 0.17913832199546487, "grad_norm": 19.706623077392578, "learning_rate": 1.695906432748538e-05, "loss": 1.3553, "num_tokens": 175405.0, "step": 158 }, { "epoch": 0.18027210884353742, "grad_norm": 19.92644500732422, "learning_rate": 1.6935672514619886e-05, "loss": 1.3019, "num_tokens": 176445.0, "step": 159 }, { "epoch": 0.18140589569160998, "grad_norm": 19.04597282409668, "learning_rate": 1.6912280701754385e-05, "loss": 1.3361, "num_tokens": 177648.0, "step": 160 }, { "epoch": 0.18253968253968253, "grad_norm": 17.983367919921875, "learning_rate": 1.688888888888889e-05, "loss": 1.3992, "num_tokens": 178831.0, "step": 161 }, { "epoch": 0.1836734693877551, "grad_norm": 17.700355529785156, "learning_rate": 1.6865497076023394e-05, "loss": 1.3004, "num_tokens": 179986.0, "step": 162 }, { "epoch": 0.18480725623582767, "grad_norm": 18.262815475463867, "learning_rate": 1.6842105263157896e-05, "loss": 1.4365, "num_tokens": 181201.0, "step": 163 }, { "epoch": 0.18594104308390022, "grad_norm": 20.517810821533203, "learning_rate": 1.68187134502924e-05, "loss": 1.5094, "num_tokens": 182310.0, "step": 164 }, { "epoch": 0.1870748299319728, "grad_norm": 17.606334686279297, "learning_rate": 1.67953216374269e-05, "loss": 1.3723, "num_tokens": 183432.0, "step": 165 }, { "epoch": 0.18820861678004536, "grad_norm": 20.512365341186523, "learning_rate": 1.6771929824561408e-05, "loss": 1.5234, "num_tokens": 184597.0, "step": 166 }, { "epoch": 0.1893424036281179, "grad_norm": 21.517213821411133, "learning_rate": 1.6748538011695907e-05, "loss": 1.4261, "num_tokens": 185668.0, "step": 167 }, { "epoch": 0.19047619047619047, "grad_norm": 18.336288452148438, "learning_rate": 1.672514619883041e-05, "loss": 1.3113, "num_tokens": 186778.0, "step": 168 }, { "epoch": 0.19160997732426305, "grad_norm": 17.611347198486328, "learning_rate": 1.6701754385964915e-05, "loss": 1.419, "num_tokens": 187963.0, "step": 169 }, { "epoch": 0.1927437641723356, "grad_norm": 18.618175506591797, "learning_rate": 1.6678362573099414e-05, "loss": 1.3611, "num_tokens": 189061.0, "step": 170 }, { "epoch": 0.19387755102040816, "grad_norm": 18.459095001220703, "learning_rate": 1.665497076023392e-05, "loss": 1.3561, "num_tokens": 190184.0, "step": 171 }, { "epoch": 0.19501133786848074, "grad_norm": 19.065645217895508, "learning_rate": 1.6631578947368423e-05, "loss": 1.2869, "num_tokens": 191228.0, "step": 172 }, { "epoch": 0.1961451247165533, "grad_norm": 18.562837600708008, "learning_rate": 1.6608187134502926e-05, "loss": 1.3189, "num_tokens": 192388.0, "step": 173 }, { "epoch": 0.19727891156462585, "grad_norm": 17.509126663208008, "learning_rate": 1.6584795321637428e-05, "loss": 1.3978, "num_tokens": 193528.0, "step": 174 }, { "epoch": 0.1984126984126984, "grad_norm": 17.223644256591797, "learning_rate": 1.656140350877193e-05, "loss": 1.3262, "num_tokens": 194550.0, "step": 175 }, { "epoch": 0.19954648526077098, "grad_norm": 17.391401290893555, "learning_rate": 1.6538011695906437e-05, "loss": 1.3271, "num_tokens": 195677.0, "step": 176 }, { "epoch": 0.20068027210884354, "grad_norm": 17.85859489440918, "learning_rate": 1.6514619883040936e-05, "loss": 1.3437, "num_tokens": 196912.0, "step": 177 }, { "epoch": 0.2018140589569161, "grad_norm": 17.89037322998047, "learning_rate": 1.649122807017544e-05, "loss": 1.377, "num_tokens": 198043.0, "step": 178 }, { "epoch": 0.20294784580498867, "grad_norm": 18.96498680114746, "learning_rate": 1.6467836257309944e-05, "loss": 1.5327, "num_tokens": 199157.0, "step": 179 }, { "epoch": 0.20408163265306123, "grad_norm": 19.338775634765625, "learning_rate": 1.6444444444444444e-05, "loss": 1.4341, "num_tokens": 200255.0, "step": 180 }, { "epoch": 0.20521541950113378, "grad_norm": 19.820980072021484, "learning_rate": 1.642105263157895e-05, "loss": 1.2833, "num_tokens": 201297.0, "step": 181 }, { "epoch": 0.20634920634920634, "grad_norm": 17.88483238220215, "learning_rate": 1.6397660818713452e-05, "loss": 1.2759, "num_tokens": 202492.0, "step": 182 }, { "epoch": 0.20748299319727892, "grad_norm": 19.055448532104492, "learning_rate": 1.6374269005847955e-05, "loss": 1.3843, "num_tokens": 203650.0, "step": 183 }, { "epoch": 0.20861678004535147, "grad_norm": 20.04726219177246, "learning_rate": 1.6350877192982457e-05, "loss": 1.2938, "num_tokens": 204723.0, "step": 184 }, { "epoch": 0.20975056689342403, "grad_norm": 19.267118453979492, "learning_rate": 1.632748538011696e-05, "loss": 1.3177, "num_tokens": 205792.0, "step": 185 }, { "epoch": 0.2108843537414966, "grad_norm": 19.38846778869629, "learning_rate": 1.6304093567251466e-05, "loss": 1.538, "num_tokens": 206878.0, "step": 186 }, { "epoch": 0.21201814058956917, "grad_norm": 17.611108779907227, "learning_rate": 1.6280701754385965e-05, "loss": 1.3847, "num_tokens": 208071.0, "step": 187 }, { "epoch": 0.21315192743764172, "grad_norm": 17.82870101928711, "learning_rate": 1.625730994152047e-05, "loss": 1.3566, "num_tokens": 209227.0, "step": 188 }, { "epoch": 0.21428571428571427, "grad_norm": 20.898141860961914, "learning_rate": 1.6233918128654974e-05, "loss": 1.4293, "num_tokens": 210372.0, "step": 189 }, { "epoch": 0.21541950113378686, "grad_norm": 17.881431579589844, "learning_rate": 1.6210526315789473e-05, "loss": 1.3483, "num_tokens": 211604.0, "step": 190 }, { "epoch": 0.2165532879818594, "grad_norm": 19.36078453063965, "learning_rate": 1.618713450292398e-05, "loss": 1.3726, "num_tokens": 212855.0, "step": 191 }, { "epoch": 0.21768707482993196, "grad_norm": 16.670745849609375, "learning_rate": 1.616374269005848e-05, "loss": 1.3198, "num_tokens": 213956.0, "step": 192 }, { "epoch": 0.21882086167800455, "grad_norm": 17.778671264648438, "learning_rate": 1.6140350877192984e-05, "loss": 1.5004, "num_tokens": 215120.0, "step": 193 }, { "epoch": 0.2199546485260771, "grad_norm": 18.08847427368164, "learning_rate": 1.6116959064327486e-05, "loss": 1.2994, "num_tokens": 216206.0, "step": 194 }, { "epoch": 0.22108843537414966, "grad_norm": 16.995229721069336, "learning_rate": 1.609356725146199e-05, "loss": 1.2784, "num_tokens": 217303.0, "step": 195 }, { "epoch": 0.2222222222222222, "grad_norm": 19.43770980834961, "learning_rate": 1.6070175438596495e-05, "loss": 1.446, "num_tokens": 218327.0, "step": 196 }, { "epoch": 0.2233560090702948, "grad_norm": 17.98920249938965, "learning_rate": 1.6046783625730994e-05, "loss": 1.2319, "num_tokens": 219485.0, "step": 197 }, { "epoch": 0.22448979591836735, "grad_norm": 18.416790008544922, "learning_rate": 1.60233918128655e-05, "loss": 1.5078, "num_tokens": 220708.0, "step": 198 }, { "epoch": 0.2256235827664399, "grad_norm": 19.166311264038086, "learning_rate": 1.6000000000000003e-05, "loss": 1.3471, "num_tokens": 221781.0, "step": 199 }, { "epoch": 0.22675736961451248, "grad_norm": 17.823318481445312, "learning_rate": 1.5976608187134505e-05, "loss": 1.3839, "num_tokens": 222829.0, "step": 200 }, { "epoch": 0.22789115646258504, "grad_norm": 21.60662078857422, "learning_rate": 1.5953216374269008e-05, "loss": 1.3423, "num_tokens": 224071.0, "step": 201 }, { "epoch": 0.2290249433106576, "grad_norm": 20.148921966552734, "learning_rate": 1.592982456140351e-05, "loss": 1.4631, "num_tokens": 225179.0, "step": 202 }, { "epoch": 0.23015873015873015, "grad_norm": 20.172704696655273, "learning_rate": 1.5906432748538013e-05, "loss": 1.312, "num_tokens": 226250.0, "step": 203 }, { "epoch": 0.23129251700680273, "grad_norm": 20.227020263671875, "learning_rate": 1.5883040935672516e-05, "loss": 1.3808, "num_tokens": 227380.0, "step": 204 }, { "epoch": 0.23242630385487528, "grad_norm": 16.813283920288086, "learning_rate": 1.5859649122807018e-05, "loss": 1.2912, "num_tokens": 228565.0, "step": 205 }, { "epoch": 0.23356009070294784, "grad_norm": 20.42585563659668, "learning_rate": 1.583625730994152e-05, "loss": 1.5254, "num_tokens": 229639.0, "step": 206 }, { "epoch": 0.23469387755102042, "grad_norm": 17.481595993041992, "learning_rate": 1.5812865497076023e-05, "loss": 1.3817, "num_tokens": 230753.0, "step": 207 }, { "epoch": 0.23582766439909297, "grad_norm": 19.45645523071289, "learning_rate": 1.578947368421053e-05, "loss": 1.5004, "num_tokens": 231866.0, "step": 208 }, { "epoch": 0.23696145124716553, "grad_norm": 18.422744750976562, "learning_rate": 1.5766081871345032e-05, "loss": 1.4017, "num_tokens": 233001.0, "step": 209 }, { "epoch": 0.23809523809523808, "grad_norm": 16.9028263092041, "learning_rate": 1.5742690058479534e-05, "loss": 1.3741, "num_tokens": 234113.0, "step": 210 }, { "epoch": 0.23922902494331066, "grad_norm": 17.101531982421875, "learning_rate": 1.5719298245614037e-05, "loss": 1.2564, "num_tokens": 235294.0, "step": 211 }, { "epoch": 0.24036281179138322, "grad_norm": 18.186054229736328, "learning_rate": 1.569590643274854e-05, "loss": 1.2791, "num_tokens": 236392.0, "step": 212 }, { "epoch": 0.24149659863945577, "grad_norm": 18.865562438964844, "learning_rate": 1.5672514619883042e-05, "loss": 1.3361, "num_tokens": 237522.0, "step": 213 }, { "epoch": 0.24263038548752835, "grad_norm": 19.406614303588867, "learning_rate": 1.5649122807017545e-05, "loss": 1.4308, "num_tokens": 238635.0, "step": 214 }, { "epoch": 0.2437641723356009, "grad_norm": 19.976255416870117, "learning_rate": 1.5625730994152047e-05, "loss": 1.2673, "num_tokens": 239690.0, "step": 215 }, { "epoch": 0.24489795918367346, "grad_norm": 20.151403427124023, "learning_rate": 1.560233918128655e-05, "loss": 1.3816, "num_tokens": 240712.0, "step": 216 }, { "epoch": 0.24603174603174602, "grad_norm": 17.595935821533203, "learning_rate": 1.5578947368421052e-05, "loss": 1.3355, "num_tokens": 241899.0, "step": 217 }, { "epoch": 0.2471655328798186, "grad_norm": 18.910552978515625, "learning_rate": 1.555555555555556e-05, "loss": 1.3723, "num_tokens": 242975.0, "step": 218 }, { "epoch": 0.24829931972789115, "grad_norm": 19.044044494628906, "learning_rate": 1.553216374269006e-05, "loss": 1.4088, "num_tokens": 244129.0, "step": 219 }, { "epoch": 0.2494331065759637, "grad_norm": 19.70855140686035, "learning_rate": 1.5508771929824563e-05, "loss": 1.4177, "num_tokens": 245290.0, "step": 220 }, { "epoch": 0.25056689342403626, "grad_norm": 20.583599090576172, "learning_rate": 1.5485380116959066e-05, "loss": 1.484, "num_tokens": 246386.0, "step": 221 }, { "epoch": 0.25170068027210885, "grad_norm": 18.10258674621582, "learning_rate": 1.546198830409357e-05, "loss": 1.3423, "num_tokens": 247440.0, "step": 222 }, { "epoch": 0.2528344671201814, "grad_norm": 20.333332061767578, "learning_rate": 1.543859649122807e-05, "loss": 1.2536, "num_tokens": 248514.0, "step": 223 }, { "epoch": 0.25396825396825395, "grad_norm": 17.73214340209961, "learning_rate": 1.5415204678362574e-05, "loss": 1.3212, "num_tokens": 249642.0, "step": 224 }, { "epoch": 0.25510204081632654, "grad_norm": 17.801233291625977, "learning_rate": 1.5391812865497076e-05, "loss": 1.2453, "num_tokens": 250712.0, "step": 225 }, { "epoch": 0.2562358276643991, "grad_norm": 17.496618270874023, "learning_rate": 1.536842105263158e-05, "loss": 1.3112, "num_tokens": 251897.0, "step": 226 }, { "epoch": 0.25736961451247165, "grad_norm": 17.244487762451172, "learning_rate": 1.534502923976608e-05, "loss": 1.2682, "num_tokens": 253031.0, "step": 227 }, { "epoch": 0.2585034013605442, "grad_norm": 19.099130630493164, "learning_rate": 1.5321637426900587e-05, "loss": 1.5014, "num_tokens": 254172.0, "step": 228 }, { "epoch": 0.25963718820861675, "grad_norm": 17.753938674926758, "learning_rate": 1.529824561403509e-05, "loss": 1.2489, "num_tokens": 255350.0, "step": 229 }, { "epoch": 0.26077097505668934, "grad_norm": 18.264495849609375, "learning_rate": 1.5274853801169593e-05, "loss": 1.2714, "num_tokens": 256554.0, "step": 230 }, { "epoch": 0.2619047619047619, "grad_norm": 17.892715454101562, "learning_rate": 1.5251461988304095e-05, "loss": 1.3647, "num_tokens": 257729.0, "step": 231 }, { "epoch": 0.26303854875283444, "grad_norm": 20.5532169342041, "learning_rate": 1.5228070175438598e-05, "loss": 1.3013, "num_tokens": 258806.0, "step": 232 }, { "epoch": 0.264172335600907, "grad_norm": 19.51557159423828, "learning_rate": 1.52046783625731e-05, "loss": 1.3447, "num_tokens": 259874.0, "step": 233 }, { "epoch": 0.2653061224489796, "grad_norm": 20.213157653808594, "learning_rate": 1.5181286549707603e-05, "loss": 1.3767, "num_tokens": 260959.0, "step": 234 }, { "epoch": 0.26643990929705214, "grad_norm": 17.533151626586914, "learning_rate": 1.5157894736842107e-05, "loss": 1.5549, "num_tokens": 262189.0, "step": 235 }, { "epoch": 0.2675736961451247, "grad_norm": 16.97377586364746, "learning_rate": 1.5134502923976608e-05, "loss": 1.3521, "num_tokens": 263381.0, "step": 236 }, { "epoch": 0.2687074829931973, "grad_norm": 18.186052322387695, "learning_rate": 1.5111111111111112e-05, "loss": 1.3888, "num_tokens": 264514.0, "step": 237 }, { "epoch": 0.2698412698412698, "grad_norm": 17.744230270385742, "learning_rate": 1.5087719298245615e-05, "loss": 1.4052, "num_tokens": 265629.0, "step": 238 }, { "epoch": 0.2709750566893424, "grad_norm": 17.492412567138672, "learning_rate": 1.5064327485380119e-05, "loss": 1.3202, "num_tokens": 266750.0, "step": 239 }, { "epoch": 0.272108843537415, "grad_norm": 18.4610538482666, "learning_rate": 1.504093567251462e-05, "loss": 1.3697, "num_tokens": 267781.0, "step": 240 }, { "epoch": 0.2732426303854875, "grad_norm": 18.794511795043945, "learning_rate": 1.5017543859649124e-05, "loss": 1.2962, "num_tokens": 268921.0, "step": 241 }, { "epoch": 0.2743764172335601, "grad_norm": 22.105533599853516, "learning_rate": 1.4994152046783627e-05, "loss": 1.3314, "num_tokens": 269985.0, "step": 242 }, { "epoch": 0.2755102040816326, "grad_norm": 19.732385635375977, "learning_rate": 1.497076023391813e-05, "loss": 1.4754, "num_tokens": 271085.0, "step": 243 }, { "epoch": 0.2766439909297052, "grad_norm": 16.8427734375, "learning_rate": 1.4947368421052632e-05, "loss": 1.4078, "num_tokens": 272207.0, "step": 244 }, { "epoch": 0.2777777777777778, "grad_norm": 18.812732696533203, "learning_rate": 1.4923976608187136e-05, "loss": 1.2987, "num_tokens": 273195.0, "step": 245 }, { "epoch": 0.2789115646258503, "grad_norm": 19.384742736816406, "learning_rate": 1.4900584795321637e-05, "loss": 1.4385, "num_tokens": 274395.0, "step": 246 }, { "epoch": 0.2800453514739229, "grad_norm": 20.1599063873291, "learning_rate": 1.4877192982456141e-05, "loss": 1.4454, "num_tokens": 275413.0, "step": 247 }, { "epoch": 0.2811791383219955, "grad_norm": 18.764848709106445, "learning_rate": 1.4853801169590644e-05, "loss": 1.216, "num_tokens": 276425.0, "step": 248 }, { "epoch": 0.282312925170068, "grad_norm": 19.58850860595703, "learning_rate": 1.4830409356725148e-05, "loss": 1.372, "num_tokens": 277674.0, "step": 249 }, { "epoch": 0.2834467120181406, "grad_norm": 18.369375228881836, "learning_rate": 1.4807017543859649e-05, "loss": 1.2592, "num_tokens": 278761.0, "step": 250 }, { "epoch": 0.28458049886621317, "grad_norm": 19.515583038330078, "learning_rate": 1.4783625730994153e-05, "loss": 1.4817, "num_tokens": 279900.0, "step": 251 }, { "epoch": 0.2857142857142857, "grad_norm": 18.146976470947266, "learning_rate": 1.4760233918128658e-05, "loss": 1.4193, "num_tokens": 281152.0, "step": 252 }, { "epoch": 0.2868480725623583, "grad_norm": 19.507518768310547, "learning_rate": 1.4736842105263159e-05, "loss": 1.419, "num_tokens": 282236.0, "step": 253 }, { "epoch": 0.28798185941043086, "grad_norm": 18.844755172729492, "learning_rate": 1.4713450292397661e-05, "loss": 1.3267, "num_tokens": 283294.0, "step": 254 }, { "epoch": 0.2891156462585034, "grad_norm": 17.970260620117188, "learning_rate": 1.4690058479532165e-05, "loss": 1.3213, "num_tokens": 284345.0, "step": 255 }, { "epoch": 0.29024943310657597, "grad_norm": 18.758127212524414, "learning_rate": 1.4666666666666666e-05, "loss": 1.4439, "num_tokens": 285419.0, "step": 256 }, { "epoch": 0.29138321995464855, "grad_norm": 18.686439514160156, "learning_rate": 1.464327485380117e-05, "loss": 1.3119, "num_tokens": 286535.0, "step": 257 }, { "epoch": 0.2925170068027211, "grad_norm": 19.03111457824707, "learning_rate": 1.4619883040935675e-05, "loss": 1.3626, "num_tokens": 287631.0, "step": 258 }, { "epoch": 0.29365079365079366, "grad_norm": 17.014270782470703, "learning_rate": 1.4596491228070177e-05, "loss": 1.4804, "num_tokens": 288754.0, "step": 259 }, { "epoch": 0.2947845804988662, "grad_norm": 16.614084243774414, "learning_rate": 1.4573099415204678e-05, "loss": 1.411, "num_tokens": 289958.0, "step": 260 }, { "epoch": 0.29591836734693877, "grad_norm": 18.345542907714844, "learning_rate": 1.4549707602339183e-05, "loss": 1.3021, "num_tokens": 291137.0, "step": 261 }, { "epoch": 0.29705215419501135, "grad_norm": 17.023345947265625, "learning_rate": 1.4526315789473687e-05, "loss": 1.3828, "num_tokens": 292312.0, "step": 262 }, { "epoch": 0.2981859410430839, "grad_norm": 19.762277603149414, "learning_rate": 1.4502923976608188e-05, "loss": 1.3128, "num_tokens": 293355.0, "step": 263 }, { "epoch": 0.29931972789115646, "grad_norm": 20.014009475708008, "learning_rate": 1.447953216374269e-05, "loss": 1.3058, "num_tokens": 294480.0, "step": 264 }, { "epoch": 0.30045351473922904, "grad_norm": 18.648479461669922, "learning_rate": 1.4456140350877195e-05, "loss": 1.4155, "num_tokens": 295543.0, "step": 265 }, { "epoch": 0.30158730158730157, "grad_norm": 18.497230529785156, "learning_rate": 1.4432748538011695e-05, "loss": 1.3598, "num_tokens": 296682.0, "step": 266 }, { "epoch": 0.30272108843537415, "grad_norm": 17.821691513061523, "learning_rate": 1.44093567251462e-05, "loss": 1.4373, "num_tokens": 297762.0, "step": 267 }, { "epoch": 0.30385487528344673, "grad_norm": 18.2099666595459, "learning_rate": 1.4385964912280704e-05, "loss": 1.3183, "num_tokens": 298916.0, "step": 268 }, { "epoch": 0.30498866213151926, "grad_norm": 18.04322624206543, "learning_rate": 1.4362573099415207e-05, "loss": 1.2767, "num_tokens": 299936.0, "step": 269 }, { "epoch": 0.30612244897959184, "grad_norm": 17.976163864135742, "learning_rate": 1.4339181286549707e-05, "loss": 1.4378, "num_tokens": 301074.0, "step": 270 }, { "epoch": 0.3072562358276644, "grad_norm": 18.689985275268555, "learning_rate": 1.4315789473684212e-05, "loss": 1.3617, "num_tokens": 302125.0, "step": 271 }, { "epoch": 0.30839002267573695, "grad_norm": 18.958097457885742, "learning_rate": 1.4292397660818716e-05, "loss": 1.3331, "num_tokens": 303263.0, "step": 272 }, { "epoch": 0.30952380952380953, "grad_norm": 18.772878646850586, "learning_rate": 1.4269005847953217e-05, "loss": 1.3565, "num_tokens": 304463.0, "step": 273 }, { "epoch": 0.31065759637188206, "grad_norm": 17.425918579101562, "learning_rate": 1.4245614035087721e-05, "loss": 1.332, "num_tokens": 305660.0, "step": 274 }, { "epoch": 0.31179138321995464, "grad_norm": 18.931303024291992, "learning_rate": 1.4222222222222224e-05, "loss": 1.358, "num_tokens": 306684.0, "step": 275 }, { "epoch": 0.3129251700680272, "grad_norm": 16.99065399169922, "learning_rate": 1.4198830409356725e-05, "loss": 1.3331, "num_tokens": 307813.0, "step": 276 }, { "epoch": 0.31405895691609975, "grad_norm": 18.153553009033203, "learning_rate": 1.4175438596491229e-05, "loss": 1.3584, "num_tokens": 308876.0, "step": 277 }, { "epoch": 0.31519274376417233, "grad_norm": 17.73149871826172, "learning_rate": 1.4152046783625733e-05, "loss": 1.2783, "num_tokens": 310012.0, "step": 278 }, { "epoch": 0.3163265306122449, "grad_norm": 19.030410766601562, "learning_rate": 1.4128654970760236e-05, "loss": 1.3519, "num_tokens": 311070.0, "step": 279 }, { "epoch": 0.31746031746031744, "grad_norm": 16.927932739257812, "learning_rate": 1.4105263157894738e-05, "loss": 1.2184, "num_tokens": 312154.0, "step": 280 }, { "epoch": 0.31859410430839, "grad_norm": 19.48080062866211, "learning_rate": 1.408187134502924e-05, "loss": 1.529, "num_tokens": 313253.0, "step": 281 }, { "epoch": 0.3197278911564626, "grad_norm": 18.875545501708984, "learning_rate": 1.4058479532163745e-05, "loss": 1.3173, "num_tokens": 314325.0, "step": 282 }, { "epoch": 0.32086167800453513, "grad_norm": 17.87661361694336, "learning_rate": 1.4035087719298246e-05, "loss": 1.3601, "num_tokens": 315399.0, "step": 283 }, { "epoch": 0.3219954648526077, "grad_norm": 16.739702224731445, "learning_rate": 1.401169590643275e-05, "loss": 1.2588, "num_tokens": 316543.0, "step": 284 }, { "epoch": 0.3231292517006803, "grad_norm": 17.692420959472656, "learning_rate": 1.3988304093567253e-05, "loss": 1.3191, "num_tokens": 317658.0, "step": 285 }, { "epoch": 0.3242630385487528, "grad_norm": 19.47262191772461, "learning_rate": 1.3964912280701755e-05, "loss": 1.3529, "num_tokens": 318773.0, "step": 286 }, { "epoch": 0.3253968253968254, "grad_norm": 18.594545364379883, "learning_rate": 1.3941520467836258e-05, "loss": 1.3961, "num_tokens": 319995.0, "step": 287 }, { "epoch": 0.32653061224489793, "grad_norm": 16.38534164428711, "learning_rate": 1.3918128654970762e-05, "loss": 1.2593, "num_tokens": 321078.0, "step": 288 }, { "epoch": 0.3276643990929705, "grad_norm": 19.34218978881836, "learning_rate": 1.3894736842105265e-05, "loss": 1.3406, "num_tokens": 322140.0, "step": 289 }, { "epoch": 0.3287981859410431, "grad_norm": 20.129104614257812, "learning_rate": 1.3871345029239767e-05, "loss": 1.3634, "num_tokens": 323326.0, "step": 290 }, { "epoch": 0.3299319727891156, "grad_norm": 18.582170486450195, "learning_rate": 1.384795321637427e-05, "loss": 1.2407, "num_tokens": 324391.0, "step": 291 }, { "epoch": 0.3310657596371882, "grad_norm": 18.887828826904297, "learning_rate": 1.3824561403508774e-05, "loss": 1.2504, "num_tokens": 325423.0, "step": 292 }, { "epoch": 0.3321995464852608, "grad_norm": 19.49580955505371, "learning_rate": 1.3801169590643275e-05, "loss": 1.2945, "num_tokens": 326471.0, "step": 293 }, { "epoch": 0.3333333333333333, "grad_norm": 21.09246063232422, "learning_rate": 1.377777777777778e-05, "loss": 1.3799, "num_tokens": 327570.0, "step": 294 }, { "epoch": 0.3344671201814059, "grad_norm": 18.326868057250977, "learning_rate": 1.3754385964912282e-05, "loss": 1.3228, "num_tokens": 328672.0, "step": 295 }, { "epoch": 0.3356009070294785, "grad_norm": 19.176008224487305, "learning_rate": 1.3730994152046784e-05, "loss": 1.3348, "num_tokens": 329696.0, "step": 296 }, { "epoch": 0.336734693877551, "grad_norm": 18.465486526489258, "learning_rate": 1.3707602339181287e-05, "loss": 1.2659, "num_tokens": 330810.0, "step": 297 }, { "epoch": 0.3378684807256236, "grad_norm": 16.499107360839844, "learning_rate": 1.3684210526315791e-05, "loss": 1.2951, "num_tokens": 331998.0, "step": 298 }, { "epoch": 0.33900226757369617, "grad_norm": 18.211158752441406, "learning_rate": 1.3660818713450294e-05, "loss": 1.243, "num_tokens": 333011.0, "step": 299 }, { "epoch": 0.3401360544217687, "grad_norm": 17.3167724609375, "learning_rate": 1.3637426900584796e-05, "loss": 1.3465, "num_tokens": 334104.0, "step": 300 }, { "epoch": 0.3412698412698413, "grad_norm": 18.3892822265625, "learning_rate": 1.3614035087719299e-05, "loss": 1.4763, "num_tokens": 335283.0, "step": 301 }, { "epoch": 0.3424036281179138, "grad_norm": 18.357229232788086, "learning_rate": 1.3590643274853803e-05, "loss": 1.4627, "num_tokens": 336363.0, "step": 302 }, { "epoch": 0.3435374149659864, "grad_norm": 18.55135726928711, "learning_rate": 1.3567251461988304e-05, "loss": 1.4521, "num_tokens": 337576.0, "step": 303 }, { "epoch": 0.34467120181405897, "grad_norm": 17.058040618896484, "learning_rate": 1.3543859649122808e-05, "loss": 1.2474, "num_tokens": 338641.0, "step": 304 }, { "epoch": 0.3458049886621315, "grad_norm": 18.360454559326172, "learning_rate": 1.3520467836257311e-05, "loss": 1.3871, "num_tokens": 339795.0, "step": 305 }, { "epoch": 0.3469387755102041, "grad_norm": 18.694433212280273, "learning_rate": 1.3497076023391814e-05, "loss": 1.3593, "num_tokens": 340889.0, "step": 306 }, { "epoch": 0.34807256235827666, "grad_norm": 17.909223556518555, "learning_rate": 1.3473684210526316e-05, "loss": 1.4255, "num_tokens": 342007.0, "step": 307 }, { "epoch": 0.3492063492063492, "grad_norm": 18.030197143554688, "learning_rate": 1.345029239766082e-05, "loss": 1.2348, "num_tokens": 343151.0, "step": 308 }, { "epoch": 0.35034013605442177, "grad_norm": 19.783491134643555, "learning_rate": 1.3426900584795323e-05, "loss": 1.4069, "num_tokens": 344208.0, "step": 309 }, { "epoch": 0.35147392290249435, "grad_norm": 18.20054054260254, "learning_rate": 1.3403508771929826e-05, "loss": 1.4214, "num_tokens": 345374.0, "step": 310 }, { "epoch": 0.3526077097505669, "grad_norm": 17.507898330688477, "learning_rate": 1.3380116959064328e-05, "loss": 1.396, "num_tokens": 346608.0, "step": 311 }, { "epoch": 0.35374149659863946, "grad_norm": 19.637102127075195, "learning_rate": 1.3356725146198832e-05, "loss": 1.4677, "num_tokens": 347714.0, "step": 312 }, { "epoch": 0.35487528344671204, "grad_norm": 18.308876037597656, "learning_rate": 1.3333333333333333e-05, "loss": 1.3536, "num_tokens": 348930.0, "step": 313 }, { "epoch": 0.35600907029478457, "grad_norm": 21.49268913269043, "learning_rate": 1.3309941520467838e-05, "loss": 1.4455, "num_tokens": 349999.0, "step": 314 }, { "epoch": 0.35714285714285715, "grad_norm": 18.466651916503906, "learning_rate": 1.328654970760234e-05, "loss": 1.418, "num_tokens": 351123.0, "step": 315 }, { "epoch": 0.35827664399092973, "grad_norm": 17.94565200805664, "learning_rate": 1.3263157894736843e-05, "loss": 1.3624, "num_tokens": 352245.0, "step": 316 }, { "epoch": 0.35941043083900226, "grad_norm": 18.44500160217285, "learning_rate": 1.3239766081871345e-05, "loss": 1.3552, "num_tokens": 353380.0, "step": 317 }, { "epoch": 0.36054421768707484, "grad_norm": 21.510616302490234, "learning_rate": 1.321637426900585e-05, "loss": 1.4998, "num_tokens": 354375.0, "step": 318 }, { "epoch": 0.36167800453514737, "grad_norm": 19.28470230102539, "learning_rate": 1.3192982456140354e-05, "loss": 1.3258, "num_tokens": 355502.0, "step": 319 }, { "epoch": 0.36281179138321995, "grad_norm": 18.8537540435791, "learning_rate": 1.3169590643274855e-05, "loss": 1.2844, "num_tokens": 356615.0, "step": 320 }, { "epoch": 0.36394557823129253, "grad_norm": 16.58262825012207, "learning_rate": 1.3146198830409357e-05, "loss": 1.2823, "num_tokens": 357688.0, "step": 321 }, { "epoch": 0.36507936507936506, "grad_norm": 16.598854064941406, "learning_rate": 1.3122807017543862e-05, "loss": 1.2514, "num_tokens": 358832.0, "step": 322 }, { "epoch": 0.36621315192743764, "grad_norm": 18.89130401611328, "learning_rate": 1.3099415204678362e-05, "loss": 1.3144, "num_tokens": 359904.0, "step": 323 }, { "epoch": 0.3673469387755102, "grad_norm": 16.702598571777344, "learning_rate": 1.3076023391812867e-05, "loss": 1.2213, "num_tokens": 361014.0, "step": 324 }, { "epoch": 0.36848072562358275, "grad_norm": 17.977264404296875, "learning_rate": 1.305263157894737e-05, "loss": 1.2883, "num_tokens": 362163.0, "step": 325 }, { "epoch": 0.36961451247165533, "grad_norm": 19.28291893005371, "learning_rate": 1.3029239766081872e-05, "loss": 1.3182, "num_tokens": 363222.0, "step": 326 }, { "epoch": 0.3707482993197279, "grad_norm": 17.178043365478516, "learning_rate": 1.3005847953216374e-05, "loss": 1.274, "num_tokens": 364332.0, "step": 327 }, { "epoch": 0.37188208616780044, "grad_norm": 18.939533233642578, "learning_rate": 1.2982456140350879e-05, "loss": 1.3883, "num_tokens": 365433.0, "step": 328 }, { "epoch": 0.373015873015873, "grad_norm": 19.40555191040039, "learning_rate": 1.2959064327485383e-05, "loss": 1.2875, "num_tokens": 366487.0, "step": 329 }, { "epoch": 0.3741496598639456, "grad_norm": 16.3048038482666, "learning_rate": 1.2935672514619884e-05, "loss": 1.3365, "num_tokens": 367618.0, "step": 330 }, { "epoch": 0.37528344671201813, "grad_norm": 17.931875228881836, "learning_rate": 1.2912280701754386e-05, "loss": 1.3292, "num_tokens": 368738.0, "step": 331 }, { "epoch": 0.3764172335600907, "grad_norm": 21.821672439575195, "learning_rate": 1.288888888888889e-05, "loss": 1.3869, "num_tokens": 369720.0, "step": 332 }, { "epoch": 0.37755102040816324, "grad_norm": 16.764142990112305, "learning_rate": 1.2865497076023392e-05, "loss": 1.3183, "num_tokens": 370885.0, "step": 333 }, { "epoch": 0.3786848072562358, "grad_norm": 17.28849220275879, "learning_rate": 1.2842105263157896e-05, "loss": 1.2476, "num_tokens": 371958.0, "step": 334 }, { "epoch": 0.3798185941043084, "grad_norm": 18.260684967041016, "learning_rate": 1.28187134502924e-05, "loss": 1.4425, "num_tokens": 373085.0, "step": 335 }, { "epoch": 0.38095238095238093, "grad_norm": 16.82544708251953, "learning_rate": 1.2795321637426901e-05, "loss": 1.3373, "num_tokens": 374210.0, "step": 336 }, { "epoch": 0.3820861678004535, "grad_norm": 17.36188316345215, "learning_rate": 1.2771929824561404e-05, "loss": 1.3534, "num_tokens": 375310.0, "step": 337 }, { "epoch": 0.3832199546485261, "grad_norm": 18.09803581237793, "learning_rate": 1.2748538011695908e-05, "loss": 1.2559, "num_tokens": 376372.0, "step": 338 }, { "epoch": 0.3843537414965986, "grad_norm": 19.373031616210938, "learning_rate": 1.2725146198830412e-05, "loss": 1.5019, "num_tokens": 377527.0, "step": 339 }, { "epoch": 0.3854875283446712, "grad_norm": 19.310766220092773, "learning_rate": 1.2701754385964913e-05, "loss": 1.2678, "num_tokens": 378604.0, "step": 340 }, { "epoch": 0.3866213151927438, "grad_norm": 18.202178955078125, "learning_rate": 1.2678362573099417e-05, "loss": 1.2171, "num_tokens": 379678.0, "step": 341 }, { "epoch": 0.3877551020408163, "grad_norm": 18.8320369720459, "learning_rate": 1.265497076023392e-05, "loss": 1.2849, "num_tokens": 380729.0, "step": 342 }, { "epoch": 0.3888888888888889, "grad_norm": 18.66080093383789, "learning_rate": 1.263157894736842e-05, "loss": 1.3137, "num_tokens": 381876.0, "step": 343 }, { "epoch": 0.3900226757369615, "grad_norm": 19.323078155517578, "learning_rate": 1.2608187134502925e-05, "loss": 1.3574, "num_tokens": 382962.0, "step": 344 }, { "epoch": 0.391156462585034, "grad_norm": 19.04428482055664, "learning_rate": 1.258479532163743e-05, "loss": 1.362, "num_tokens": 384096.0, "step": 345 }, { "epoch": 0.3922902494331066, "grad_norm": 20.02356719970703, "learning_rate": 1.256140350877193e-05, "loss": 1.2055, "num_tokens": 385175.0, "step": 346 }, { "epoch": 0.3934240362811791, "grad_norm": 17.03901481628418, "learning_rate": 1.2538011695906434e-05, "loss": 1.2646, "num_tokens": 386358.0, "step": 347 }, { "epoch": 0.3945578231292517, "grad_norm": 18.585233688354492, "learning_rate": 1.2514619883040937e-05, "loss": 1.317, "num_tokens": 387482.0, "step": 348 }, { "epoch": 0.3956916099773243, "grad_norm": 17.557876586914062, "learning_rate": 1.2491228070175441e-05, "loss": 1.3041, "num_tokens": 388662.0, "step": 349 }, { "epoch": 0.3968253968253968, "grad_norm": 20.155532836914062, "learning_rate": 1.2467836257309942e-05, "loss": 1.4664, "num_tokens": 389731.0, "step": 350 }, { "epoch": 0.3979591836734694, "grad_norm": 17.1430721282959, "learning_rate": 1.2444444444444446e-05, "loss": 1.2873, "num_tokens": 390921.0, "step": 351 }, { "epoch": 0.39909297052154197, "grad_norm": 17.54984474182129, "learning_rate": 1.2421052631578949e-05, "loss": 1.3858, "num_tokens": 392035.0, "step": 352 }, { "epoch": 0.4002267573696145, "grad_norm": 18.338607788085938, "learning_rate": 1.239766081871345e-05, "loss": 1.3534, "num_tokens": 393093.0, "step": 353 }, { "epoch": 0.4013605442176871, "grad_norm": 18.00188446044922, "learning_rate": 1.2374269005847954e-05, "loss": 1.4016, "num_tokens": 394239.0, "step": 354 }, { "epoch": 0.40249433106575966, "grad_norm": 19.884347915649414, "learning_rate": 1.2350877192982458e-05, "loss": 1.3107, "num_tokens": 395291.0, "step": 355 }, { "epoch": 0.4036281179138322, "grad_norm": 16.58745765686035, "learning_rate": 1.232748538011696e-05, "loss": 1.2492, "num_tokens": 396485.0, "step": 356 }, { "epoch": 0.40476190476190477, "grad_norm": 16.661865234375, "learning_rate": 1.2304093567251463e-05, "loss": 1.2897, "num_tokens": 397644.0, "step": 357 }, { "epoch": 0.40589569160997735, "grad_norm": 18.313318252563477, "learning_rate": 1.2280701754385966e-05, "loss": 1.1862, "num_tokens": 398789.0, "step": 358 }, { "epoch": 0.4070294784580499, "grad_norm": 17.393386840820312, "learning_rate": 1.225730994152047e-05, "loss": 1.3261, "num_tokens": 399878.0, "step": 359 }, { "epoch": 0.40816326530612246, "grad_norm": 17.21160316467285, "learning_rate": 1.2233918128654971e-05, "loss": 1.188, "num_tokens": 400987.0, "step": 360 }, { "epoch": 0.409297052154195, "grad_norm": 17.881792068481445, "learning_rate": 1.2210526315789475e-05, "loss": 1.4177, "num_tokens": 402056.0, "step": 361 }, { "epoch": 0.41043083900226757, "grad_norm": 18.00080108642578, "learning_rate": 1.2187134502923978e-05, "loss": 1.2907, "num_tokens": 403197.0, "step": 362 }, { "epoch": 0.41156462585034015, "grad_norm": 17.8870906829834, "learning_rate": 1.216374269005848e-05, "loss": 1.475, "num_tokens": 404342.0, "step": 363 }, { "epoch": 0.4126984126984127, "grad_norm": 18.225791931152344, "learning_rate": 1.2140350877192983e-05, "loss": 1.4006, "num_tokens": 405485.0, "step": 364 }, { "epoch": 0.41383219954648526, "grad_norm": 16.519243240356445, "learning_rate": 1.2116959064327487e-05, "loss": 1.3321, "num_tokens": 406682.0, "step": 365 }, { "epoch": 0.41496598639455784, "grad_norm": 18.153871536254883, "learning_rate": 1.2093567251461988e-05, "loss": 1.3119, "num_tokens": 407731.0, "step": 366 }, { "epoch": 0.41609977324263037, "grad_norm": 21.92259979248047, "learning_rate": 1.2070175438596493e-05, "loss": 1.3713, "num_tokens": 408773.0, "step": 367 }, { "epoch": 0.41723356009070295, "grad_norm": 18.638320922851562, "learning_rate": 1.2046783625730995e-05, "loss": 1.1941, "num_tokens": 409839.0, "step": 368 }, { "epoch": 0.41836734693877553, "grad_norm": 18.971038818359375, "learning_rate": 1.20233918128655e-05, "loss": 1.3417, "num_tokens": 410856.0, "step": 369 }, { "epoch": 0.41950113378684806, "grad_norm": 17.432235717773438, "learning_rate": 1.2e-05, "loss": 1.3059, "num_tokens": 411957.0, "step": 370 }, { "epoch": 0.42063492063492064, "grad_norm": 16.81254768371582, "learning_rate": 1.1976608187134505e-05, "loss": 1.2662, "num_tokens": 413070.0, "step": 371 }, { "epoch": 0.4217687074829932, "grad_norm": 18.428218841552734, "learning_rate": 1.1953216374269007e-05, "loss": 1.3241, "num_tokens": 414204.0, "step": 372 }, { "epoch": 0.42290249433106575, "grad_norm": 16.93047523498535, "learning_rate": 1.192982456140351e-05, "loss": 1.3024, "num_tokens": 415467.0, "step": 373 }, { "epoch": 0.42403628117913833, "grad_norm": 17.64056396484375, "learning_rate": 1.1906432748538012e-05, "loss": 1.3272, "num_tokens": 416638.0, "step": 374 }, { "epoch": 0.42517006802721086, "grad_norm": 17.135147094726562, "learning_rate": 1.1883040935672517e-05, "loss": 1.3108, "num_tokens": 417771.0, "step": 375 }, { "epoch": 0.42630385487528344, "grad_norm": 17.535350799560547, "learning_rate": 1.1859649122807017e-05, "loss": 1.3515, "num_tokens": 418899.0, "step": 376 }, { "epoch": 0.427437641723356, "grad_norm": 17.25712013244629, "learning_rate": 1.1836257309941522e-05, "loss": 1.3058, "num_tokens": 420087.0, "step": 377 }, { "epoch": 0.42857142857142855, "grad_norm": 17.746435165405273, "learning_rate": 1.1812865497076024e-05, "loss": 1.3141, "num_tokens": 421175.0, "step": 378 }, { "epoch": 0.42970521541950113, "grad_norm": 18.021026611328125, "learning_rate": 1.1789473684210527e-05, "loss": 1.3604, "num_tokens": 422301.0, "step": 379 }, { "epoch": 0.4308390022675737, "grad_norm": 16.63826560974121, "learning_rate": 1.176608187134503e-05, "loss": 1.2025, "num_tokens": 423479.0, "step": 380 }, { "epoch": 0.43197278911564624, "grad_norm": 18.336565017700195, "learning_rate": 1.1742690058479534e-05, "loss": 1.3859, "num_tokens": 424599.0, "step": 381 }, { "epoch": 0.4331065759637188, "grad_norm": 17.72231101989746, "learning_rate": 1.1719298245614036e-05, "loss": 1.2795, "num_tokens": 425638.0, "step": 382 }, { "epoch": 0.4342403628117914, "grad_norm": 17.340106964111328, "learning_rate": 1.1695906432748539e-05, "loss": 1.3613, "num_tokens": 426831.0, "step": 383 }, { "epoch": 0.43537414965986393, "grad_norm": 17.47897720336914, "learning_rate": 1.1672514619883041e-05, "loss": 1.3471, "num_tokens": 428008.0, "step": 384 }, { "epoch": 0.4365079365079365, "grad_norm": 16.47149658203125, "learning_rate": 1.1649122807017546e-05, "loss": 1.2196, "num_tokens": 429150.0, "step": 385 }, { "epoch": 0.4376417233560091, "grad_norm": 17.920820236206055, "learning_rate": 1.1625730994152047e-05, "loss": 1.3064, "num_tokens": 430401.0, "step": 386 }, { "epoch": 0.4387755102040816, "grad_norm": 17.858154296875, "learning_rate": 1.160233918128655e-05, "loss": 1.2954, "num_tokens": 431572.0, "step": 387 }, { "epoch": 0.4399092970521542, "grad_norm": 18.374351501464844, "learning_rate": 1.1578947368421053e-05, "loss": 1.4537, "num_tokens": 432748.0, "step": 388 }, { "epoch": 0.4410430839002268, "grad_norm": 19.84334945678711, "learning_rate": 1.1555555555555556e-05, "loss": 1.2758, "num_tokens": 433673.0, "step": 389 }, { "epoch": 0.4421768707482993, "grad_norm": 16.914400100708008, "learning_rate": 1.1532163742690059e-05, "loss": 1.2657, "num_tokens": 434803.0, "step": 390 }, { "epoch": 0.4433106575963719, "grad_norm": 18.99037742614746, "learning_rate": 1.1508771929824563e-05, "loss": 1.4169, "num_tokens": 435895.0, "step": 391 }, { "epoch": 0.4444444444444444, "grad_norm": 18.286720275878906, "learning_rate": 1.1485380116959065e-05, "loss": 1.4253, "num_tokens": 436983.0, "step": 392 }, { "epoch": 0.445578231292517, "grad_norm": 18.76189613342285, "learning_rate": 1.1461988304093568e-05, "loss": 1.441, "num_tokens": 438073.0, "step": 393 }, { "epoch": 0.4467120181405896, "grad_norm": 16.708738327026367, "learning_rate": 1.143859649122807e-05, "loss": 1.2524, "num_tokens": 439139.0, "step": 394 }, { "epoch": 0.4478458049886621, "grad_norm": 17.922731399536133, "learning_rate": 1.1415204678362575e-05, "loss": 1.358, "num_tokens": 440259.0, "step": 395 }, { "epoch": 0.4489795918367347, "grad_norm": 18.53734588623047, "learning_rate": 1.1391812865497076e-05, "loss": 1.4107, "num_tokens": 441416.0, "step": 396 }, { "epoch": 0.4501133786848073, "grad_norm": 18.188804626464844, "learning_rate": 1.136842105263158e-05, "loss": 1.3675, "num_tokens": 442549.0, "step": 397 }, { "epoch": 0.4512471655328798, "grad_norm": 16.795684814453125, "learning_rate": 1.1345029239766083e-05, "loss": 1.2673, "num_tokens": 443711.0, "step": 398 }, { "epoch": 0.4523809523809524, "grad_norm": 16.72913932800293, "learning_rate": 1.1321637426900585e-05, "loss": 1.3102, "num_tokens": 444840.0, "step": 399 }, { "epoch": 0.45351473922902497, "grad_norm": 17.20433235168457, "learning_rate": 1.1298245614035088e-05, "loss": 1.3778, "num_tokens": 445920.0, "step": 400 }, { "epoch": 0.4546485260770975, "grad_norm": 18.175119400024414, "learning_rate": 1.1274853801169592e-05, "loss": 1.4488, "num_tokens": 447140.0, "step": 401 }, { "epoch": 0.4557823129251701, "grad_norm": 17.975004196166992, "learning_rate": 1.1251461988304096e-05, "loss": 1.2597, "num_tokens": 448247.0, "step": 402 }, { "epoch": 0.45691609977324266, "grad_norm": 17.525251388549805, "learning_rate": 1.1228070175438597e-05, "loss": 1.2533, "num_tokens": 449442.0, "step": 403 }, { "epoch": 0.4580498866213152, "grad_norm": 20.157466888427734, "learning_rate": 1.12046783625731e-05, "loss": 1.2854, "num_tokens": 450561.0, "step": 404 }, { "epoch": 0.45918367346938777, "grad_norm": 16.90342140197754, "learning_rate": 1.1181286549707604e-05, "loss": 1.3678, "num_tokens": 451707.0, "step": 405 }, { "epoch": 0.4603174603174603, "grad_norm": 16.184585571289062, "learning_rate": 1.1157894736842105e-05, "loss": 1.3334, "num_tokens": 452849.0, "step": 406 }, { "epoch": 0.4614512471655329, "grad_norm": 18.51410675048828, "learning_rate": 1.1134502923976609e-05, "loss": 1.2296, "num_tokens": 453908.0, "step": 407 }, { "epoch": 0.46258503401360546, "grad_norm": 16.632823944091797, "learning_rate": 1.1111111111111113e-05, "loss": 1.2192, "num_tokens": 455072.0, "step": 408 }, { "epoch": 0.463718820861678, "grad_norm": 18.5230655670166, "learning_rate": 1.1087719298245614e-05, "loss": 1.4064, "num_tokens": 456203.0, "step": 409 }, { "epoch": 0.46485260770975056, "grad_norm": 17.45703125, "learning_rate": 1.1064327485380117e-05, "loss": 1.3585, "num_tokens": 457306.0, "step": 410 }, { "epoch": 0.46598639455782315, "grad_norm": 18.392507553100586, "learning_rate": 1.1040935672514621e-05, "loss": 1.3359, "num_tokens": 458435.0, "step": 411 }, { "epoch": 0.4671201814058957, "grad_norm": 17.494783401489258, "learning_rate": 1.1017543859649125e-05, "loss": 1.1005, "num_tokens": 459465.0, "step": 412 }, { "epoch": 0.46825396825396826, "grad_norm": 18.129724502563477, "learning_rate": 1.0994152046783626e-05, "loss": 1.4386, "num_tokens": 460630.0, "step": 413 }, { "epoch": 0.46938775510204084, "grad_norm": 17.525470733642578, "learning_rate": 1.0970760233918129e-05, "loss": 1.3187, "num_tokens": 461789.0, "step": 414 }, { "epoch": 0.47052154195011336, "grad_norm": 18.774728775024414, "learning_rate": 1.0947368421052633e-05, "loss": 1.3189, "num_tokens": 462919.0, "step": 415 }, { "epoch": 0.47165532879818595, "grad_norm": 18.843229293823242, "learning_rate": 1.0923976608187134e-05, "loss": 1.1756, "num_tokens": 463976.0, "step": 416 }, { "epoch": 0.47278911564625853, "grad_norm": 19.37541961669922, "learning_rate": 1.0900584795321638e-05, "loss": 1.3738, "num_tokens": 465105.0, "step": 417 }, { "epoch": 0.47392290249433106, "grad_norm": 18.707748413085938, "learning_rate": 1.0877192982456142e-05, "loss": 1.3039, "num_tokens": 466236.0, "step": 418 }, { "epoch": 0.47505668934240364, "grad_norm": 17.227943420410156, "learning_rate": 1.0853801169590643e-05, "loss": 1.3565, "num_tokens": 467384.0, "step": 419 }, { "epoch": 0.47619047619047616, "grad_norm": 16.37371826171875, "learning_rate": 1.0830409356725146e-05, "loss": 1.2327, "num_tokens": 468501.0, "step": 420 }, { "epoch": 0.47732426303854875, "grad_norm": 19.634498596191406, "learning_rate": 1.080701754385965e-05, "loss": 1.4242, "num_tokens": 469514.0, "step": 421 }, { "epoch": 0.47845804988662133, "grad_norm": 17.9482421875, "learning_rate": 1.0783625730994154e-05, "loss": 1.3825, "num_tokens": 470570.0, "step": 422 }, { "epoch": 0.47959183673469385, "grad_norm": 17.454389572143555, "learning_rate": 1.0760233918128655e-05, "loss": 1.3237, "num_tokens": 471783.0, "step": 423 }, { "epoch": 0.48072562358276644, "grad_norm": 20.609020233154297, "learning_rate": 1.073684210526316e-05, "loss": 1.2899, "num_tokens": 472934.0, "step": 424 }, { "epoch": 0.481859410430839, "grad_norm": 17.69285011291504, "learning_rate": 1.0713450292397662e-05, "loss": 1.3942, "num_tokens": 474128.0, "step": 425 }, { "epoch": 0.48299319727891155, "grad_norm": 17.683488845825195, "learning_rate": 1.0690058479532163e-05, "loss": 1.2815, "num_tokens": 475240.0, "step": 426 }, { "epoch": 0.48412698412698413, "grad_norm": 18.547361373901367, "learning_rate": 1.0666666666666667e-05, "loss": 1.4346, "num_tokens": 476354.0, "step": 427 }, { "epoch": 0.4852607709750567, "grad_norm": 18.398448944091797, "learning_rate": 1.0643274853801172e-05, "loss": 1.4309, "num_tokens": 477408.0, "step": 428 }, { "epoch": 0.48639455782312924, "grad_norm": 19.063720703125, "learning_rate": 1.0619883040935672e-05, "loss": 1.3739, "num_tokens": 478458.0, "step": 429 }, { "epoch": 0.4875283446712018, "grad_norm": 18.32535743713379, "learning_rate": 1.0596491228070177e-05, "loss": 1.278, "num_tokens": 479577.0, "step": 430 }, { "epoch": 0.4886621315192744, "grad_norm": 17.363754272460938, "learning_rate": 1.057309941520468e-05, "loss": 1.2988, "num_tokens": 480596.0, "step": 431 }, { "epoch": 0.4897959183673469, "grad_norm": 18.99915885925293, "learning_rate": 1.0549707602339184e-05, "loss": 1.3827, "num_tokens": 481636.0, "step": 432 }, { "epoch": 0.4909297052154195, "grad_norm": 17.312416076660156, "learning_rate": 1.0526315789473684e-05, "loss": 1.3422, "num_tokens": 482749.0, "step": 433 }, { "epoch": 0.49206349206349204, "grad_norm": 18.068572998046875, "learning_rate": 1.0502923976608189e-05, "loss": 1.3069, "num_tokens": 483894.0, "step": 434 }, { "epoch": 0.4931972789115646, "grad_norm": 17.16152572631836, "learning_rate": 1.0479532163742691e-05, "loss": 1.4533, "num_tokens": 485099.0, "step": 435 }, { "epoch": 0.4943310657596372, "grad_norm": 17.189016342163086, "learning_rate": 1.0456140350877194e-05, "loss": 1.3927, "num_tokens": 486238.0, "step": 436 }, { "epoch": 0.4954648526077097, "grad_norm": 19.20362091064453, "learning_rate": 1.0432748538011696e-05, "loss": 1.318, "num_tokens": 487264.0, "step": 437 }, { "epoch": 0.4965986394557823, "grad_norm": 17.41990089416504, "learning_rate": 1.04093567251462e-05, "loss": 1.3175, "num_tokens": 488457.0, "step": 438 }, { "epoch": 0.4977324263038549, "grad_norm": 17.663455963134766, "learning_rate": 1.0385964912280702e-05, "loss": 1.3584, "num_tokens": 489594.0, "step": 439 }, { "epoch": 0.4988662131519274, "grad_norm": 17.151172637939453, "learning_rate": 1.0362573099415206e-05, "loss": 1.3883, "num_tokens": 490762.0, "step": 440 }, { "epoch": 0.5, "grad_norm": 17.005578994750977, "learning_rate": 1.0339181286549708e-05, "loss": 1.3144, "num_tokens": 491869.0, "step": 441 }, { "epoch": 0.5011337868480725, "grad_norm": 16.874614715576172, "learning_rate": 1.0315789473684213e-05, "loss": 1.3956, "num_tokens": 492996.0, "step": 442 }, { "epoch": 0.5022675736961452, "grad_norm": 20.913042068481445, "learning_rate": 1.0292397660818714e-05, "loss": 1.4668, "num_tokens": 494026.0, "step": 443 }, { "epoch": 0.5034013605442177, "grad_norm": 16.352046966552734, "learning_rate": 1.0269005847953218e-05, "loss": 1.3393, "num_tokens": 495187.0, "step": 444 }, { "epoch": 0.5045351473922902, "grad_norm": 17.492143630981445, "learning_rate": 1.024561403508772e-05, "loss": 1.2858, "num_tokens": 496377.0, "step": 445 }, { "epoch": 0.5056689342403629, "grad_norm": 18.121971130371094, "learning_rate": 1.0222222222222223e-05, "loss": 1.3492, "num_tokens": 497515.0, "step": 446 }, { "epoch": 0.5068027210884354, "grad_norm": 17.30265235900879, "learning_rate": 1.0198830409356726e-05, "loss": 1.2293, "num_tokens": 498679.0, "step": 447 }, { "epoch": 0.5079365079365079, "grad_norm": 17.755374908447266, "learning_rate": 1.017543859649123e-05, "loss": 1.3147, "num_tokens": 499695.0, "step": 448 }, { "epoch": 0.5090702947845805, "grad_norm": 18.442646026611328, "learning_rate": 1.015204678362573e-05, "loss": 1.2659, "num_tokens": 500855.0, "step": 449 }, { "epoch": 0.5102040816326531, "grad_norm": 17.08477783203125, "learning_rate": 1.0128654970760235e-05, "loss": 1.3338, "num_tokens": 501968.0, "step": 450 }, { "epoch": 0.5113378684807256, "grad_norm": 21.1053409576416, "learning_rate": 1.0105263157894738e-05, "loss": 1.3168, "num_tokens": 502939.0, "step": 451 }, { "epoch": 0.5124716553287982, "grad_norm": 19.28374481201172, "learning_rate": 1.0081871345029242e-05, "loss": 1.2795, "num_tokens": 503993.0, "step": 452 }, { "epoch": 0.5136054421768708, "grad_norm": 18.68455696105957, "learning_rate": 1.0058479532163743e-05, "loss": 1.2891, "num_tokens": 505157.0, "step": 453 }, { "epoch": 0.5147392290249433, "grad_norm": 20.081592559814453, "learning_rate": 1.0035087719298247e-05, "loss": 1.2741, "num_tokens": 506313.0, "step": 454 }, { "epoch": 0.5158730158730159, "grad_norm": 19.956872940063477, "learning_rate": 1.001169590643275e-05, "loss": 1.3576, "num_tokens": 507464.0, "step": 455 }, { "epoch": 0.5170068027210885, "grad_norm": 17.171306610107422, "learning_rate": 9.988304093567252e-06, "loss": 1.3906, "num_tokens": 508542.0, "step": 456 }, { "epoch": 0.518140589569161, "grad_norm": 16.775720596313477, "learning_rate": 9.964912280701755e-06, "loss": 1.3633, "num_tokens": 509696.0, "step": 457 }, { "epoch": 0.5192743764172335, "grad_norm": 18.696247100830078, "learning_rate": 9.941520467836257e-06, "loss": 1.4039, "num_tokens": 510844.0, "step": 458 }, { "epoch": 0.5204081632653061, "grad_norm": 18.641944885253906, "learning_rate": 9.918128654970762e-06, "loss": 1.3362, "num_tokens": 511905.0, "step": 459 }, { "epoch": 0.5215419501133787, "grad_norm": 17.106464385986328, "learning_rate": 9.894736842105264e-06, "loss": 1.2452, "num_tokens": 513025.0, "step": 460 }, { "epoch": 0.5226757369614512, "grad_norm": 18.024394989013672, "learning_rate": 9.871345029239767e-06, "loss": 1.2359, "num_tokens": 514096.0, "step": 461 }, { "epoch": 0.5238095238095238, "grad_norm": 20.150434494018555, "learning_rate": 9.84795321637427e-06, "loss": 1.3942, "num_tokens": 515114.0, "step": 462 }, { "epoch": 0.5249433106575964, "grad_norm": 18.314773559570312, "learning_rate": 9.824561403508772e-06, "loss": 1.3264, "num_tokens": 516218.0, "step": 463 }, { "epoch": 0.5260770975056689, "grad_norm": 18.184812545776367, "learning_rate": 9.801169590643276e-06, "loss": 1.3662, "num_tokens": 517329.0, "step": 464 }, { "epoch": 0.5272108843537415, "grad_norm": 18.09324073791504, "learning_rate": 9.777777777777779e-06, "loss": 1.3919, "num_tokens": 518435.0, "step": 465 }, { "epoch": 0.528344671201814, "grad_norm": 17.663949966430664, "learning_rate": 9.754385964912281e-06, "loss": 1.2261, "num_tokens": 519509.0, "step": 466 }, { "epoch": 0.5294784580498866, "grad_norm": 18.685440063476562, "learning_rate": 9.730994152046784e-06, "loss": 1.4532, "num_tokens": 520620.0, "step": 467 }, { "epoch": 0.5306122448979592, "grad_norm": 17.659133911132812, "learning_rate": 9.707602339181286e-06, "loss": 1.1997, "num_tokens": 521806.0, "step": 468 }, { "epoch": 0.5317460317460317, "grad_norm": 17.235734939575195, "learning_rate": 9.68421052631579e-06, "loss": 1.4246, "num_tokens": 522945.0, "step": 469 }, { "epoch": 0.5328798185941043, "grad_norm": 16.777973175048828, "learning_rate": 9.660818713450293e-06, "loss": 1.3323, "num_tokens": 524157.0, "step": 470 }, { "epoch": 0.5340136054421769, "grad_norm": 18.61255645751953, "learning_rate": 9.637426900584796e-06, "loss": 1.3773, "num_tokens": 525245.0, "step": 471 }, { "epoch": 0.5351473922902494, "grad_norm": 18.647743225097656, "learning_rate": 9.614035087719298e-06, "loss": 1.3659, "num_tokens": 526398.0, "step": 472 }, { "epoch": 0.536281179138322, "grad_norm": 16.510019302368164, "learning_rate": 9.590643274853801e-06, "loss": 1.4054, "num_tokens": 527591.0, "step": 473 }, { "epoch": 0.5374149659863946, "grad_norm": 18.070676803588867, "learning_rate": 9.567251461988305e-06, "loss": 1.2677, "num_tokens": 528722.0, "step": 474 }, { "epoch": 0.5385487528344671, "grad_norm": 16.97783088684082, "learning_rate": 9.543859649122808e-06, "loss": 1.2604, "num_tokens": 529926.0, "step": 475 }, { "epoch": 0.5396825396825397, "grad_norm": 18.71210479736328, "learning_rate": 9.52046783625731e-06, "loss": 1.3034, "num_tokens": 531002.0, "step": 476 }, { "epoch": 0.5408163265306123, "grad_norm": 18.67724609375, "learning_rate": 9.497076023391813e-06, "loss": 1.3777, "num_tokens": 532063.0, "step": 477 }, { "epoch": 0.5419501133786848, "grad_norm": 18.577939987182617, "learning_rate": 9.473684210526315e-06, "loss": 1.2123, "num_tokens": 533122.0, "step": 478 }, { "epoch": 0.5430839002267573, "grad_norm": 17.068965911865234, "learning_rate": 9.45029239766082e-06, "loss": 1.3347, "num_tokens": 534354.0, "step": 479 }, { "epoch": 0.54421768707483, "grad_norm": 18.063716888427734, "learning_rate": 9.426900584795322e-06, "loss": 1.222, "num_tokens": 535447.0, "step": 480 }, { "epoch": 0.5453514739229025, "grad_norm": 17.992395401000977, "learning_rate": 9.403508771929825e-06, "loss": 1.4205, "num_tokens": 536573.0, "step": 481 }, { "epoch": 0.546485260770975, "grad_norm": 17.32271385192871, "learning_rate": 9.380116959064327e-06, "loss": 1.34, "num_tokens": 537707.0, "step": 482 }, { "epoch": 0.5476190476190477, "grad_norm": 18.503665924072266, "learning_rate": 9.35672514619883e-06, "loss": 1.3766, "num_tokens": 538818.0, "step": 483 }, { "epoch": 0.5487528344671202, "grad_norm": 19.996448516845703, "learning_rate": 9.333333333333334e-06, "loss": 1.4571, "num_tokens": 539908.0, "step": 484 }, { "epoch": 0.5498866213151927, "grad_norm": 17.211458206176758, "learning_rate": 9.309941520467837e-06, "loss": 1.3144, "num_tokens": 541045.0, "step": 485 }, { "epoch": 0.5510204081632653, "grad_norm": 17.301494598388672, "learning_rate": 9.28654970760234e-06, "loss": 1.3193, "num_tokens": 542124.0, "step": 486 }, { "epoch": 0.5521541950113379, "grad_norm": 19.131010055541992, "learning_rate": 9.263157894736842e-06, "loss": 1.2611, "num_tokens": 543220.0, "step": 487 }, { "epoch": 0.5532879818594104, "grad_norm": 15.8623685836792, "learning_rate": 9.239766081871345e-06, "loss": 1.2676, "num_tokens": 544354.0, "step": 488 }, { "epoch": 0.5544217687074829, "grad_norm": 18.77271842956543, "learning_rate": 9.216374269005849e-06, "loss": 1.3869, "num_tokens": 545379.0, "step": 489 }, { "epoch": 0.5555555555555556, "grad_norm": 16.649028778076172, "learning_rate": 9.192982456140351e-06, "loss": 1.2387, "num_tokens": 546460.0, "step": 490 }, { "epoch": 0.5566893424036281, "grad_norm": 14.686641693115234, "learning_rate": 9.169590643274856e-06, "loss": 1.2256, "num_tokens": 547604.0, "step": 491 }, { "epoch": 0.5578231292517006, "grad_norm": 18.15571403503418, "learning_rate": 9.146198830409357e-06, "loss": 1.3377, "num_tokens": 548661.0, "step": 492 }, { "epoch": 0.5589569160997733, "grad_norm": 19.204980850219727, "learning_rate": 9.12280701754386e-06, "loss": 1.4665, "num_tokens": 549763.0, "step": 493 }, { "epoch": 0.5600907029478458, "grad_norm": 16.642202377319336, "learning_rate": 9.099415204678363e-06, "loss": 1.2248, "num_tokens": 550930.0, "step": 494 }, { "epoch": 0.5612244897959183, "grad_norm": 17.54848861694336, "learning_rate": 9.076023391812866e-06, "loss": 1.3887, "num_tokens": 552031.0, "step": 495 }, { "epoch": 0.562358276643991, "grad_norm": 19.24024200439453, "learning_rate": 9.05263157894737e-06, "loss": 1.3567, "num_tokens": 553188.0, "step": 496 }, { "epoch": 0.5634920634920635, "grad_norm": 16.196155548095703, "learning_rate": 9.029239766081873e-06, "loss": 1.2491, "num_tokens": 554282.0, "step": 497 }, { "epoch": 0.564625850340136, "grad_norm": 19.248933792114258, "learning_rate": 9.005847953216374e-06, "loss": 1.3093, "num_tokens": 555422.0, "step": 498 }, { "epoch": 0.5657596371882087, "grad_norm": 17.538129806518555, "learning_rate": 8.982456140350878e-06, "loss": 1.493, "num_tokens": 556575.0, "step": 499 }, { "epoch": 0.5668934240362812, "grad_norm": 18.086639404296875, "learning_rate": 8.95906432748538e-06, "loss": 1.4094, "num_tokens": 557680.0, "step": 500 }, { "epoch": 0.5680272108843537, "grad_norm": 18.94271469116211, "learning_rate": 8.935672514619885e-06, "loss": 1.3086, "num_tokens": 558791.0, "step": 501 }, { "epoch": 0.5691609977324263, "grad_norm": 17.548765182495117, "learning_rate": 8.912280701754387e-06, "loss": 1.1368, "num_tokens": 559835.0, "step": 502 }, { "epoch": 0.5702947845804989, "grad_norm": 18.749052047729492, "learning_rate": 8.888888888888888e-06, "loss": 1.2716, "num_tokens": 560975.0, "step": 503 }, { "epoch": 0.5714285714285714, "grad_norm": 17.855331420898438, "learning_rate": 8.865497076023393e-06, "loss": 1.2367, "num_tokens": 562132.0, "step": 504 }, { "epoch": 0.572562358276644, "grad_norm": 18.3868465423584, "learning_rate": 8.842105263157895e-06, "loss": 1.2448, "num_tokens": 563198.0, "step": 505 }, { "epoch": 0.5736961451247166, "grad_norm": 16.713043212890625, "learning_rate": 8.8187134502924e-06, "loss": 1.2055, "num_tokens": 564323.0, "step": 506 }, { "epoch": 0.5748299319727891, "grad_norm": 19.583770751953125, "learning_rate": 8.795321637426902e-06, "loss": 1.4543, "num_tokens": 565410.0, "step": 507 }, { "epoch": 0.5759637188208617, "grad_norm": 17.730728149414062, "learning_rate": 8.771929824561405e-06, "loss": 1.2798, "num_tokens": 566583.0, "step": 508 }, { "epoch": 0.5770975056689343, "grad_norm": 17.315866470336914, "learning_rate": 8.748538011695907e-06, "loss": 1.261, "num_tokens": 567705.0, "step": 509 }, { "epoch": 0.5782312925170068, "grad_norm": 16.953575134277344, "learning_rate": 8.72514619883041e-06, "loss": 1.1939, "num_tokens": 568800.0, "step": 510 }, { "epoch": 0.5793650793650794, "grad_norm": 17.619497299194336, "learning_rate": 8.701754385964914e-06, "loss": 1.2446, "num_tokens": 569913.0, "step": 511 }, { "epoch": 0.5804988662131519, "grad_norm": 18.483022689819336, "learning_rate": 8.678362573099417e-06, "loss": 1.2713, "num_tokens": 571017.0, "step": 512 }, { "epoch": 0.5816326530612245, "grad_norm": 16.36237335205078, "learning_rate": 8.654970760233919e-06, "loss": 1.2186, "num_tokens": 572126.0, "step": 513 }, { "epoch": 0.5827664399092971, "grad_norm": 19.040184020996094, "learning_rate": 8.631578947368422e-06, "loss": 1.4335, "num_tokens": 573264.0, "step": 514 }, { "epoch": 0.5839002267573696, "grad_norm": 17.468027114868164, "learning_rate": 8.608187134502924e-06, "loss": 1.2643, "num_tokens": 574289.0, "step": 515 }, { "epoch": 0.5850340136054422, "grad_norm": 17.661149978637695, "learning_rate": 8.584795321637429e-06, "loss": 1.3698, "num_tokens": 575350.0, "step": 516 }, { "epoch": 0.5861678004535147, "grad_norm": 16.933862686157227, "learning_rate": 8.561403508771931e-06, "loss": 1.3584, "num_tokens": 576453.0, "step": 517 }, { "epoch": 0.5873015873015873, "grad_norm": 19.191181182861328, "learning_rate": 8.538011695906434e-06, "loss": 1.3657, "num_tokens": 577497.0, "step": 518 }, { "epoch": 0.5884353741496599, "grad_norm": 16.853866577148438, "learning_rate": 8.514619883040936e-06, "loss": 1.3159, "num_tokens": 578563.0, "step": 519 }, { "epoch": 0.5895691609977324, "grad_norm": 17.907520294189453, "learning_rate": 8.491228070175439e-06, "loss": 1.4287, "num_tokens": 579721.0, "step": 520 }, { "epoch": 0.590702947845805, "grad_norm": 17.38332748413086, "learning_rate": 8.467836257309943e-06, "loss": 1.3333, "num_tokens": 580835.0, "step": 521 }, { "epoch": 0.5918367346938775, "grad_norm": 17.52203941345215, "learning_rate": 8.444444444444446e-06, "loss": 1.342, "num_tokens": 582031.0, "step": 522 }, { "epoch": 0.5929705215419501, "grad_norm": 17.766830444335938, "learning_rate": 8.421052631578948e-06, "loss": 1.3092, "num_tokens": 583095.0, "step": 523 }, { "epoch": 0.5941043083900227, "grad_norm": 17.00160026550293, "learning_rate": 8.39766081871345e-06, "loss": 1.2767, "num_tokens": 584233.0, "step": 524 }, { "epoch": 0.5952380952380952, "grad_norm": 16.158689498901367, "learning_rate": 8.374269005847953e-06, "loss": 1.232, "num_tokens": 585302.0, "step": 525 }, { "epoch": 0.5963718820861678, "grad_norm": 17.967693328857422, "learning_rate": 8.350877192982458e-06, "loss": 1.2765, "num_tokens": 586371.0, "step": 526 }, { "epoch": 0.5975056689342404, "grad_norm": 17.455415725708008, "learning_rate": 8.32748538011696e-06, "loss": 1.3754, "num_tokens": 587512.0, "step": 527 }, { "epoch": 0.5986394557823129, "grad_norm": 18.405559539794922, "learning_rate": 8.304093567251463e-06, "loss": 1.4917, "num_tokens": 588596.0, "step": 528 }, { "epoch": 0.5997732426303855, "grad_norm": 18.642154693603516, "learning_rate": 8.280701754385965e-06, "loss": 1.247, "num_tokens": 589699.0, "step": 529 }, { "epoch": 0.6009070294784581, "grad_norm": 16.603816986083984, "learning_rate": 8.257309941520468e-06, "loss": 1.4079, "num_tokens": 590891.0, "step": 530 }, { "epoch": 0.6020408163265306, "grad_norm": 18.229610443115234, "learning_rate": 8.233918128654972e-06, "loss": 1.3164, "num_tokens": 592028.0, "step": 531 }, { "epoch": 0.6031746031746031, "grad_norm": 17.328216552734375, "learning_rate": 8.210526315789475e-06, "loss": 1.2742, "num_tokens": 593213.0, "step": 532 }, { "epoch": 0.6043083900226758, "grad_norm": 17.84442901611328, "learning_rate": 8.187134502923977e-06, "loss": 1.5234, "num_tokens": 594442.0, "step": 533 }, { "epoch": 0.6054421768707483, "grad_norm": 16.71651268005371, "learning_rate": 8.16374269005848e-06, "loss": 1.3295, "num_tokens": 595601.0, "step": 534 }, { "epoch": 0.6065759637188208, "grad_norm": 18.578290939331055, "learning_rate": 8.140350877192983e-06, "loss": 1.3404, "num_tokens": 596802.0, "step": 535 }, { "epoch": 0.6077097505668935, "grad_norm": 17.07627296447754, "learning_rate": 8.116959064327487e-06, "loss": 1.3295, "num_tokens": 597920.0, "step": 536 }, { "epoch": 0.608843537414966, "grad_norm": 18.019412994384766, "learning_rate": 8.09356725146199e-06, "loss": 1.2342, "num_tokens": 598998.0, "step": 537 }, { "epoch": 0.6099773242630385, "grad_norm": 17.841215133666992, "learning_rate": 8.070175438596492e-06, "loss": 1.4689, "num_tokens": 600212.0, "step": 538 }, { "epoch": 0.6111111111111112, "grad_norm": 18.639312744140625, "learning_rate": 8.046783625730994e-06, "loss": 1.2426, "num_tokens": 601248.0, "step": 539 }, { "epoch": 0.6122448979591837, "grad_norm": 16.199872970581055, "learning_rate": 8.023391812865497e-06, "loss": 1.2998, "num_tokens": 602367.0, "step": 540 }, { "epoch": 0.6133786848072562, "grad_norm": 16.76833152770996, "learning_rate": 8.000000000000001e-06, "loss": 1.3297, "num_tokens": 603524.0, "step": 541 }, { "epoch": 0.6145124716553289, "grad_norm": 19.57050895690918, "learning_rate": 7.976608187134504e-06, "loss": 1.3923, "num_tokens": 604618.0, "step": 542 }, { "epoch": 0.6156462585034014, "grad_norm": 17.667238235473633, "learning_rate": 7.953216374269006e-06, "loss": 1.1509, "num_tokens": 605667.0, "step": 543 }, { "epoch": 0.6167800453514739, "grad_norm": 17.75442123413086, "learning_rate": 7.929824561403509e-06, "loss": 1.3366, "num_tokens": 606768.0, "step": 544 }, { "epoch": 0.6179138321995464, "grad_norm": 18.858787536621094, "learning_rate": 7.906432748538012e-06, "loss": 1.403, "num_tokens": 607836.0, "step": 545 }, { "epoch": 0.6190476190476191, "grad_norm": 18.159116744995117, "learning_rate": 7.883040935672516e-06, "loss": 1.3661, "num_tokens": 608915.0, "step": 546 }, { "epoch": 0.6201814058956916, "grad_norm": 18.243877410888672, "learning_rate": 7.859649122807018e-06, "loss": 1.3329, "num_tokens": 609979.0, "step": 547 }, { "epoch": 0.6213151927437641, "grad_norm": 17.846384048461914, "learning_rate": 7.836257309941521e-06, "loss": 1.3189, "num_tokens": 611160.0, "step": 548 }, { "epoch": 0.6224489795918368, "grad_norm": 18.264671325683594, "learning_rate": 7.812865497076024e-06, "loss": 1.3237, "num_tokens": 612235.0, "step": 549 }, { "epoch": 0.6235827664399093, "grad_norm": 19.40984535217285, "learning_rate": 7.789473684210526e-06, "loss": 1.5031, "num_tokens": 613310.0, "step": 550 }, { "epoch": 0.6247165532879818, "grad_norm": 17.302501678466797, "learning_rate": 7.76608187134503e-06, "loss": 1.3535, "num_tokens": 614403.0, "step": 551 }, { "epoch": 0.6258503401360545, "grad_norm": 18.083362579345703, "learning_rate": 7.742690058479533e-06, "loss": 1.2787, "num_tokens": 615531.0, "step": 552 }, { "epoch": 0.626984126984127, "grad_norm": 17.231828689575195, "learning_rate": 7.719298245614036e-06, "loss": 1.3806, "num_tokens": 616780.0, "step": 553 }, { "epoch": 0.6281179138321995, "grad_norm": 17.220237731933594, "learning_rate": 7.695906432748538e-06, "loss": 1.2054, "num_tokens": 617962.0, "step": 554 }, { "epoch": 0.6292517006802721, "grad_norm": 18.67436981201172, "learning_rate": 7.67251461988304e-06, "loss": 1.426, "num_tokens": 619151.0, "step": 555 }, { "epoch": 0.6303854875283447, "grad_norm": 18.85828971862793, "learning_rate": 7.649122807017545e-06, "loss": 1.3415, "num_tokens": 620263.0, "step": 556 }, { "epoch": 0.6315192743764172, "grad_norm": 17.4761962890625, "learning_rate": 7.625730994152048e-06, "loss": 1.4598, "num_tokens": 621484.0, "step": 557 }, { "epoch": 0.6326530612244898, "grad_norm": 21.024747848510742, "learning_rate": 7.60233918128655e-06, "loss": 1.5124, "num_tokens": 622514.0, "step": 558 }, { "epoch": 0.6337868480725624, "grad_norm": 16.487855911254883, "learning_rate": 7.578947368421054e-06, "loss": 1.3914, "num_tokens": 623693.0, "step": 559 }, { "epoch": 0.6349206349206349, "grad_norm": 16.862720489501953, "learning_rate": 7.555555555555556e-06, "loss": 1.4403, "num_tokens": 624782.0, "step": 560 }, { "epoch": 0.6360544217687075, "grad_norm": 19.488327026367188, "learning_rate": 7.5321637426900596e-06, "loss": 1.3281, "num_tokens": 625875.0, "step": 561 }, { "epoch": 0.63718820861678, "grad_norm": 17.39655876159668, "learning_rate": 7.508771929824562e-06, "loss": 1.3988, "num_tokens": 627019.0, "step": 562 }, { "epoch": 0.6383219954648526, "grad_norm": 17.570581436157227, "learning_rate": 7.485380116959065e-06, "loss": 1.2679, "num_tokens": 628139.0, "step": 563 }, { "epoch": 0.6394557823129252, "grad_norm": 16.7895565032959, "learning_rate": 7.461988304093568e-06, "loss": 1.3567, "num_tokens": 629274.0, "step": 564 }, { "epoch": 0.6405895691609977, "grad_norm": 21.051969528198242, "learning_rate": 7.438596491228071e-06, "loss": 1.2586, "num_tokens": 630243.0, "step": 565 }, { "epoch": 0.6417233560090703, "grad_norm": 17.872528076171875, "learning_rate": 7.415204678362574e-06, "loss": 1.3342, "num_tokens": 631375.0, "step": 566 }, { "epoch": 0.6428571428571429, "grad_norm": 16.647104263305664, "learning_rate": 7.391812865497077e-06, "loss": 1.3369, "num_tokens": 632486.0, "step": 567 }, { "epoch": 0.6439909297052154, "grad_norm": 18.516103744506836, "learning_rate": 7.368421052631579e-06, "loss": 1.2058, "num_tokens": 633618.0, "step": 568 }, { "epoch": 0.645124716553288, "grad_norm": 17.799129486083984, "learning_rate": 7.345029239766083e-06, "loss": 1.3548, "num_tokens": 634700.0, "step": 569 }, { "epoch": 0.6462585034013606, "grad_norm": 17.39976692199707, "learning_rate": 7.321637426900585e-06, "loss": 1.1805, "num_tokens": 635812.0, "step": 570 }, { "epoch": 0.6473922902494331, "grad_norm": 17.5079345703125, "learning_rate": 7.298245614035089e-06, "loss": 1.2699, "num_tokens": 636943.0, "step": 571 }, { "epoch": 0.6485260770975056, "grad_norm": 15.763401985168457, "learning_rate": 7.274853801169591e-06, "loss": 1.2167, "num_tokens": 638032.0, "step": 572 }, { "epoch": 0.6496598639455783, "grad_norm": 19.949460983276367, "learning_rate": 7.251461988304094e-06, "loss": 1.3599, "num_tokens": 639185.0, "step": 573 }, { "epoch": 0.6507936507936508, "grad_norm": 19.457237243652344, "learning_rate": 7.228070175438597e-06, "loss": 1.3079, "num_tokens": 640432.0, "step": 574 }, { "epoch": 0.6519274376417233, "grad_norm": 17.709962844848633, "learning_rate": 7.2046783625731e-06, "loss": 1.2054, "num_tokens": 641575.0, "step": 575 }, { "epoch": 0.6530612244897959, "grad_norm": 18.247371673583984, "learning_rate": 7.181286549707603e-06, "loss": 1.478, "num_tokens": 642791.0, "step": 576 }, { "epoch": 0.6541950113378685, "grad_norm": 17.01307487487793, "learning_rate": 7.157894736842106e-06, "loss": 1.266, "num_tokens": 643851.0, "step": 577 }, { "epoch": 0.655328798185941, "grad_norm": 18.099470138549805, "learning_rate": 7.134502923976608e-06, "loss": 1.2111, "num_tokens": 644917.0, "step": 578 }, { "epoch": 0.6564625850340136, "grad_norm": 19.330289840698242, "learning_rate": 7.111111111111112e-06, "loss": 1.4143, "num_tokens": 645970.0, "step": 579 }, { "epoch": 0.6575963718820862, "grad_norm": 17.244670867919922, "learning_rate": 7.087719298245614e-06, "loss": 1.2975, "num_tokens": 647052.0, "step": 580 }, { "epoch": 0.6587301587301587, "grad_norm": 17.826627731323242, "learning_rate": 7.064327485380118e-06, "loss": 1.3101, "num_tokens": 648119.0, "step": 581 }, { "epoch": 0.6598639455782312, "grad_norm": 17.412382125854492, "learning_rate": 7.04093567251462e-06, "loss": 1.2225, "num_tokens": 649162.0, "step": 582 }, { "epoch": 0.6609977324263039, "grad_norm": 18.4223575592041, "learning_rate": 7.017543859649123e-06, "loss": 1.2326, "num_tokens": 650283.0, "step": 583 }, { "epoch": 0.6621315192743764, "grad_norm": 16.951290130615234, "learning_rate": 6.994152046783626e-06, "loss": 1.2765, "num_tokens": 651440.0, "step": 584 }, { "epoch": 0.6632653061224489, "grad_norm": 18.959163665771484, "learning_rate": 6.970760233918129e-06, "loss": 1.3595, "num_tokens": 652549.0, "step": 585 }, { "epoch": 0.6643990929705216, "grad_norm": 18.92232894897461, "learning_rate": 6.947368421052632e-06, "loss": 1.4333, "num_tokens": 653590.0, "step": 586 }, { "epoch": 0.6655328798185941, "grad_norm": 17.55598258972168, "learning_rate": 6.923976608187135e-06, "loss": 1.215, "num_tokens": 654714.0, "step": 587 }, { "epoch": 0.6666666666666666, "grad_norm": 17.422454833984375, "learning_rate": 6.9005847953216375e-06, "loss": 1.3603, "num_tokens": 655964.0, "step": 588 }, { "epoch": 0.6678004535147393, "grad_norm": 19.232685089111328, "learning_rate": 6.877192982456141e-06, "loss": 1.2864, "num_tokens": 657065.0, "step": 589 }, { "epoch": 0.6689342403628118, "grad_norm": 16.191415786743164, "learning_rate": 6.8538011695906435e-06, "loss": 1.2982, "num_tokens": 658255.0, "step": 590 }, { "epoch": 0.6700680272108843, "grad_norm": 19.965181350708008, "learning_rate": 6.830409356725147e-06, "loss": 1.3911, "num_tokens": 659325.0, "step": 591 }, { "epoch": 0.671201814058957, "grad_norm": 17.01615333557129, "learning_rate": 6.8070175438596495e-06, "loss": 1.3868, "num_tokens": 660485.0, "step": 592 }, { "epoch": 0.6723356009070295, "grad_norm": 18.37344741821289, "learning_rate": 6.783625730994152e-06, "loss": 1.2454, "num_tokens": 661513.0, "step": 593 }, { "epoch": 0.673469387755102, "grad_norm": 17.15907859802246, "learning_rate": 6.7602339181286555e-06, "loss": 1.3227, "num_tokens": 662618.0, "step": 594 }, { "epoch": 0.6746031746031746, "grad_norm": 17.135700225830078, "learning_rate": 6.736842105263158e-06, "loss": 1.1176, "num_tokens": 663737.0, "step": 595 }, { "epoch": 0.6757369614512472, "grad_norm": 18.115428924560547, "learning_rate": 6.7134502923976615e-06, "loss": 1.3098, "num_tokens": 664857.0, "step": 596 }, { "epoch": 0.6768707482993197, "grad_norm": 17.724384307861328, "learning_rate": 6.690058479532164e-06, "loss": 1.2835, "num_tokens": 665954.0, "step": 597 }, { "epoch": 0.6780045351473923, "grad_norm": 18.069942474365234, "learning_rate": 6.666666666666667e-06, "loss": 1.346, "num_tokens": 667045.0, "step": 598 }, { "epoch": 0.6791383219954649, "grad_norm": 17.33573341369629, "learning_rate": 6.64327485380117e-06, "loss": 1.451, "num_tokens": 668272.0, "step": 599 }, { "epoch": 0.6802721088435374, "grad_norm": 17.663867950439453, "learning_rate": 6.619883040935673e-06, "loss": 1.3737, "num_tokens": 669382.0, "step": 600 }, { "epoch": 0.68140589569161, "grad_norm": 17.320974349975586, "learning_rate": 6.596491228070177e-06, "loss": 1.375, "num_tokens": 670453.0, "step": 601 }, { "epoch": 0.6825396825396826, "grad_norm": 18.231542587280273, "learning_rate": 6.573099415204679e-06, "loss": 1.311, "num_tokens": 671545.0, "step": 602 }, { "epoch": 0.6836734693877551, "grad_norm": 18.052446365356445, "learning_rate": 6.549707602339181e-06, "loss": 1.3127, "num_tokens": 672697.0, "step": 603 }, { "epoch": 0.6848072562358276, "grad_norm": 18.89512062072754, "learning_rate": 6.526315789473685e-06, "loss": 1.3622, "num_tokens": 673798.0, "step": 604 }, { "epoch": 0.6859410430839002, "grad_norm": 17.570951461791992, "learning_rate": 6.502923976608187e-06, "loss": 1.2218, "num_tokens": 674822.0, "step": 605 }, { "epoch": 0.6870748299319728, "grad_norm": 18.972171783447266, "learning_rate": 6.4795321637426915e-06, "loss": 1.3582, "num_tokens": 675953.0, "step": 606 }, { "epoch": 0.6882086167800453, "grad_norm": 17.643735885620117, "learning_rate": 6.456140350877193e-06, "loss": 1.4052, "num_tokens": 677069.0, "step": 607 }, { "epoch": 0.6893424036281179, "grad_norm": 17.334129333496094, "learning_rate": 6.432748538011696e-06, "loss": 1.3848, "num_tokens": 678207.0, "step": 608 }, { "epoch": 0.6904761904761905, "grad_norm": 17.356857299804688, "learning_rate": 6.4093567251462e-06, "loss": 1.3585, "num_tokens": 679333.0, "step": 609 }, { "epoch": 0.691609977324263, "grad_norm": 17.62862205505371, "learning_rate": 6.385964912280702e-06, "loss": 1.3518, "num_tokens": 680472.0, "step": 610 }, { "epoch": 0.6927437641723356, "grad_norm": 17.393722534179688, "learning_rate": 6.362573099415206e-06, "loss": 1.3005, "num_tokens": 681550.0, "step": 611 }, { "epoch": 0.6938775510204082, "grad_norm": 17.79046058654785, "learning_rate": 6.339181286549709e-06, "loss": 1.4226, "num_tokens": 682665.0, "step": 612 }, { "epoch": 0.6950113378684807, "grad_norm": 18.60688591003418, "learning_rate": 6.31578947368421e-06, "loss": 1.2972, "num_tokens": 683797.0, "step": 613 }, { "epoch": 0.6961451247165533, "grad_norm": 18.402790069580078, "learning_rate": 6.292397660818715e-06, "loss": 1.3646, "num_tokens": 684941.0, "step": 614 }, { "epoch": 0.6972789115646258, "grad_norm": 17.006811141967773, "learning_rate": 6.269005847953217e-06, "loss": 1.316, "num_tokens": 686074.0, "step": 615 }, { "epoch": 0.6984126984126984, "grad_norm": 18.089576721191406, "learning_rate": 6.245614035087721e-06, "loss": 1.2821, "num_tokens": 687271.0, "step": 616 }, { "epoch": 0.699546485260771, "grad_norm": 17.52067756652832, "learning_rate": 6.222222222222223e-06, "loss": 1.2666, "num_tokens": 688341.0, "step": 617 }, { "epoch": 0.7006802721088435, "grad_norm": 19.406370162963867, "learning_rate": 6.198830409356725e-06, "loss": 1.3695, "num_tokens": 689435.0, "step": 618 }, { "epoch": 0.7018140589569161, "grad_norm": 16.33395004272461, "learning_rate": 6.175438596491229e-06, "loss": 1.2681, "num_tokens": 690569.0, "step": 619 }, { "epoch": 0.7029478458049887, "grad_norm": 18.202117919921875, "learning_rate": 6.152046783625732e-06, "loss": 1.3287, "num_tokens": 691672.0, "step": 620 }, { "epoch": 0.7040816326530612, "grad_norm": 18.300907135009766, "learning_rate": 6.128654970760235e-06, "loss": 1.2402, "num_tokens": 692947.0, "step": 621 }, { "epoch": 0.7052154195011338, "grad_norm": 18.698211669921875, "learning_rate": 6.105263157894738e-06, "loss": 1.2784, "num_tokens": 694052.0, "step": 622 }, { "epoch": 0.7063492063492064, "grad_norm": 18.063148498535156, "learning_rate": 6.08187134502924e-06, "loss": 1.3122, "num_tokens": 695248.0, "step": 623 }, { "epoch": 0.7074829931972789, "grad_norm": 17.882890701293945, "learning_rate": 6.058479532163744e-06, "loss": 1.3528, "num_tokens": 696356.0, "step": 624 }, { "epoch": 0.7086167800453514, "grad_norm": 17.1732120513916, "learning_rate": 6.035087719298246e-06, "loss": 1.3255, "num_tokens": 697517.0, "step": 625 }, { "epoch": 0.7097505668934241, "grad_norm": 17.36437225341797, "learning_rate": 6.01169590643275e-06, "loss": 1.2752, "num_tokens": 698582.0, "step": 626 }, { "epoch": 0.7108843537414966, "grad_norm": 16.8319091796875, "learning_rate": 5.988304093567252e-06, "loss": 1.3584, "num_tokens": 699745.0, "step": 627 }, { "epoch": 0.7120181405895691, "grad_norm": 18.70180320739746, "learning_rate": 5.964912280701755e-06, "loss": 1.3451, "num_tokens": 700778.0, "step": 628 }, { "epoch": 0.7131519274376418, "grad_norm": 19.382570266723633, "learning_rate": 5.941520467836258e-06, "loss": 1.2424, "num_tokens": 701913.0, "step": 629 }, { "epoch": 0.7142857142857143, "grad_norm": 17.256017684936523, "learning_rate": 5.918128654970761e-06, "loss": 1.2081, "num_tokens": 703034.0, "step": 630 }, { "epoch": 0.7154195011337868, "grad_norm": 16.574491500854492, "learning_rate": 5.8947368421052634e-06, "loss": 1.2736, "num_tokens": 704183.0, "step": 631 }, { "epoch": 0.7165532879818595, "grad_norm": 17.02162742614746, "learning_rate": 5.871345029239767e-06, "loss": 1.2391, "num_tokens": 705285.0, "step": 632 }, { "epoch": 0.717687074829932, "grad_norm": 19.27235221862793, "learning_rate": 5.847953216374269e-06, "loss": 1.2753, "num_tokens": 706394.0, "step": 633 }, { "epoch": 0.7188208616780045, "grad_norm": 20.503198623657227, "learning_rate": 5.824561403508773e-06, "loss": 1.4612, "num_tokens": 707507.0, "step": 634 }, { "epoch": 0.719954648526077, "grad_norm": 15.761817932128906, "learning_rate": 5.801169590643275e-06, "loss": 1.4017, "num_tokens": 708763.0, "step": 635 }, { "epoch": 0.7210884353741497, "grad_norm": 18.245853424072266, "learning_rate": 5.777777777777778e-06, "loss": 1.3035, "num_tokens": 709792.0, "step": 636 }, { "epoch": 0.7222222222222222, "grad_norm": 18.005619049072266, "learning_rate": 5.754385964912281e-06, "loss": 1.3574, "num_tokens": 710874.0, "step": 637 }, { "epoch": 0.7233560090702947, "grad_norm": 16.287179946899414, "learning_rate": 5.730994152046784e-06, "loss": 1.2426, "num_tokens": 712053.0, "step": 638 }, { "epoch": 0.7244897959183674, "grad_norm": 18.33644676208496, "learning_rate": 5.707602339181287e-06, "loss": 1.2237, "num_tokens": 713114.0, "step": 639 }, { "epoch": 0.7256235827664399, "grad_norm": 17.326969146728516, "learning_rate": 5.68421052631579e-06, "loss": 1.2535, "num_tokens": 714201.0, "step": 640 }, { "epoch": 0.7267573696145124, "grad_norm": 18.668529510498047, "learning_rate": 5.6608187134502925e-06, "loss": 1.4466, "num_tokens": 715281.0, "step": 641 }, { "epoch": 0.7278911564625851, "grad_norm": 17.105432510375977, "learning_rate": 5.637426900584796e-06, "loss": 1.2936, "num_tokens": 716404.0, "step": 642 }, { "epoch": 0.7290249433106576, "grad_norm": 17.338350296020508, "learning_rate": 5.6140350877192985e-06, "loss": 1.3548, "num_tokens": 717488.0, "step": 643 }, { "epoch": 0.7301587301587301, "grad_norm": 19.157217025756836, "learning_rate": 5.590643274853802e-06, "loss": 1.364, "num_tokens": 718571.0, "step": 644 }, { "epoch": 0.7312925170068028, "grad_norm": 17.105701446533203, "learning_rate": 5.5672514619883045e-06, "loss": 1.3607, "num_tokens": 719716.0, "step": 645 }, { "epoch": 0.7324263038548753, "grad_norm": 18.03299903869629, "learning_rate": 5.543859649122807e-06, "loss": 1.4081, "num_tokens": 720905.0, "step": 646 }, { "epoch": 0.7335600907029478, "grad_norm": 19.90826416015625, "learning_rate": 5.5204678362573105e-06, "loss": 1.4574, "num_tokens": 722076.0, "step": 647 }, { "epoch": 0.7346938775510204, "grad_norm": 18.08298683166504, "learning_rate": 5.497076023391813e-06, "loss": 1.1945, "num_tokens": 723183.0, "step": 648 }, { "epoch": 0.735827664399093, "grad_norm": 19.74321174621582, "learning_rate": 5.4736842105263165e-06, "loss": 1.4482, "num_tokens": 724329.0, "step": 649 }, { "epoch": 0.7369614512471655, "grad_norm": 16.616405487060547, "learning_rate": 5.450292397660819e-06, "loss": 1.3461, "num_tokens": 725447.0, "step": 650 }, { "epoch": 0.7380952380952381, "grad_norm": 17.868993759155273, "learning_rate": 5.426900584795322e-06, "loss": 1.2215, "num_tokens": 726477.0, "step": 651 }, { "epoch": 0.7392290249433107, "grad_norm": 17.35908317565918, "learning_rate": 5.403508771929825e-06, "loss": 1.4151, "num_tokens": 727606.0, "step": 652 }, { "epoch": 0.7403628117913832, "grad_norm": 16.30072021484375, "learning_rate": 5.380116959064328e-06, "loss": 1.3346, "num_tokens": 728718.0, "step": 653 }, { "epoch": 0.7414965986394558, "grad_norm": 18.497737884521484, "learning_rate": 5.356725146198831e-06, "loss": 1.4455, "num_tokens": 729745.0, "step": 654 }, { "epoch": 0.7426303854875284, "grad_norm": 17.67164421081543, "learning_rate": 5.333333333333334e-06, "loss": 1.2419, "num_tokens": 730924.0, "step": 655 }, { "epoch": 0.7437641723356009, "grad_norm": 17.78976821899414, "learning_rate": 5.309941520467836e-06, "loss": 1.1909, "num_tokens": 731980.0, "step": 656 }, { "epoch": 0.7448979591836735, "grad_norm": 18.883560180664062, "learning_rate": 5.28654970760234e-06, "loss": 1.3632, "num_tokens": 733073.0, "step": 657 }, { "epoch": 0.746031746031746, "grad_norm": 15.361091613769531, "learning_rate": 5.263157894736842e-06, "loss": 1.2674, "num_tokens": 734355.0, "step": 658 }, { "epoch": 0.7471655328798186, "grad_norm": 17.777999877929688, "learning_rate": 5.239766081871346e-06, "loss": 1.2996, "num_tokens": 735425.0, "step": 659 }, { "epoch": 0.7482993197278912, "grad_norm": 17.9564266204834, "learning_rate": 5.216374269005848e-06, "loss": 1.2512, "num_tokens": 736522.0, "step": 660 }, { "epoch": 0.7494331065759637, "grad_norm": 16.769588470458984, "learning_rate": 5.192982456140351e-06, "loss": 1.3217, "num_tokens": 737633.0, "step": 661 }, { "epoch": 0.7505668934240363, "grad_norm": 17.888736724853516, "learning_rate": 5.169590643274854e-06, "loss": 1.3336, "num_tokens": 738753.0, "step": 662 }, { "epoch": 0.7517006802721088, "grad_norm": 17.738611221313477, "learning_rate": 5.146198830409357e-06, "loss": 1.3217, "num_tokens": 739872.0, "step": 663 }, { "epoch": 0.7528344671201814, "grad_norm": 17.487979888916016, "learning_rate": 5.12280701754386e-06, "loss": 1.3311, "num_tokens": 741008.0, "step": 664 }, { "epoch": 0.753968253968254, "grad_norm": 17.191387176513672, "learning_rate": 5.099415204678363e-06, "loss": 1.3959, "num_tokens": 742227.0, "step": 665 }, { "epoch": 0.7551020408163265, "grad_norm": 17.804203033447266, "learning_rate": 5.076023391812865e-06, "loss": 1.2648, "num_tokens": 743294.0, "step": 666 }, { "epoch": 0.7562358276643991, "grad_norm": 17.26160430908203, "learning_rate": 5.052631578947369e-06, "loss": 1.2544, "num_tokens": 744498.0, "step": 667 }, { "epoch": 0.7573696145124716, "grad_norm": 17.819164276123047, "learning_rate": 5.029239766081871e-06, "loss": 1.3092, "num_tokens": 745753.0, "step": 668 }, { "epoch": 0.7585034013605442, "grad_norm": 16.67906951904297, "learning_rate": 5.005847953216375e-06, "loss": 1.2453, "num_tokens": 746899.0, "step": 669 }, { "epoch": 0.7596371882086168, "grad_norm": 18.667606353759766, "learning_rate": 4.982456140350877e-06, "loss": 1.3695, "num_tokens": 747982.0, "step": 670 }, { "epoch": 0.7607709750566893, "grad_norm": 18.80492401123047, "learning_rate": 4.959064327485381e-06, "loss": 1.1923, "num_tokens": 749043.0, "step": 671 }, { "epoch": 0.7619047619047619, "grad_norm": 16.101959228515625, "learning_rate": 4.935672514619883e-06, "loss": 1.254, "num_tokens": 750230.0, "step": 672 }, { "epoch": 0.7630385487528345, "grad_norm": 18.902116775512695, "learning_rate": 4.912280701754386e-06, "loss": 1.3446, "num_tokens": 751311.0, "step": 673 }, { "epoch": 0.764172335600907, "grad_norm": 17.63637924194336, "learning_rate": 4.888888888888889e-06, "loss": 1.3398, "num_tokens": 752533.0, "step": 674 }, { "epoch": 0.7653061224489796, "grad_norm": 18.541616439819336, "learning_rate": 4.865497076023392e-06, "loss": 1.2407, "num_tokens": 753621.0, "step": 675 }, { "epoch": 0.7664399092970522, "grad_norm": 17.306049346923828, "learning_rate": 4.842105263157895e-06, "loss": 1.2745, "num_tokens": 754720.0, "step": 676 }, { "epoch": 0.7675736961451247, "grad_norm": 19.852903366088867, "learning_rate": 4.818713450292398e-06, "loss": 1.3204, "num_tokens": 755780.0, "step": 677 }, { "epoch": 0.7687074829931972, "grad_norm": 17.333070755004883, "learning_rate": 4.7953216374269005e-06, "loss": 1.291, "num_tokens": 756845.0, "step": 678 }, { "epoch": 0.7698412698412699, "grad_norm": 19.042404174804688, "learning_rate": 4.771929824561404e-06, "loss": 1.3215, "num_tokens": 758014.0, "step": 679 }, { "epoch": 0.7709750566893424, "grad_norm": 17.769058227539062, "learning_rate": 4.7485380116959065e-06, "loss": 1.325, "num_tokens": 759116.0, "step": 680 }, { "epoch": 0.7721088435374149, "grad_norm": 16.49578857421875, "learning_rate": 4.72514619883041e-06, "loss": 1.2833, "num_tokens": 760209.0, "step": 681 }, { "epoch": 0.7732426303854876, "grad_norm": 17.752059936523438, "learning_rate": 4.7017543859649125e-06, "loss": 1.3906, "num_tokens": 761296.0, "step": 682 }, { "epoch": 0.7743764172335601, "grad_norm": 18.974281311035156, "learning_rate": 4.678362573099415e-06, "loss": 1.3296, "num_tokens": 762380.0, "step": 683 }, { "epoch": 0.7755102040816326, "grad_norm": 17.466659545898438, "learning_rate": 4.6549707602339184e-06, "loss": 1.2385, "num_tokens": 763416.0, "step": 684 }, { "epoch": 0.7766439909297053, "grad_norm": 18.69021224975586, "learning_rate": 4.631578947368421e-06, "loss": 1.2305, "num_tokens": 764519.0, "step": 685 }, { "epoch": 0.7777777777777778, "grad_norm": 16.898147583007812, "learning_rate": 4.6081871345029244e-06, "loss": 1.2576, "num_tokens": 765685.0, "step": 686 }, { "epoch": 0.7789115646258503, "grad_norm": 15.894250869750977, "learning_rate": 4.584795321637428e-06, "loss": 1.2783, "num_tokens": 766859.0, "step": 687 }, { "epoch": 0.780045351473923, "grad_norm": 17.941795349121094, "learning_rate": 4.56140350877193e-06, "loss": 1.2587, "num_tokens": 768070.0, "step": 688 }, { "epoch": 0.7811791383219955, "grad_norm": 18.966236114501953, "learning_rate": 4.538011695906433e-06, "loss": 1.4167, "num_tokens": 769163.0, "step": 689 }, { "epoch": 0.782312925170068, "grad_norm": 17.873804092407227, "learning_rate": 4.5146198830409364e-06, "loss": 1.4227, "num_tokens": 770331.0, "step": 690 }, { "epoch": 0.7834467120181405, "grad_norm": 17.66129493713379, "learning_rate": 4.491228070175439e-06, "loss": 1.2546, "num_tokens": 771348.0, "step": 691 }, { "epoch": 0.7845804988662132, "grad_norm": 17.359886169433594, "learning_rate": 4.467836257309942e-06, "loss": 1.1979, "num_tokens": 772423.0, "step": 692 }, { "epoch": 0.7857142857142857, "grad_norm": 19.223737716674805, "learning_rate": 4.444444444444444e-06, "loss": 1.2858, "num_tokens": 773491.0, "step": 693 }, { "epoch": 0.7868480725623582, "grad_norm": 17.888168334960938, "learning_rate": 4.4210526315789476e-06, "loss": 1.3819, "num_tokens": 774638.0, "step": 694 }, { "epoch": 0.7879818594104309, "grad_norm": 18.950334548950195, "learning_rate": 4.397660818713451e-06, "loss": 1.5627, "num_tokens": 775775.0, "step": 695 }, { "epoch": 0.7891156462585034, "grad_norm": 16.627885818481445, "learning_rate": 4.3742690058479536e-06, "loss": 1.3738, "num_tokens": 776879.0, "step": 696 }, { "epoch": 0.7902494331065759, "grad_norm": 17.247802734375, "learning_rate": 4.350877192982457e-06, "loss": 1.3818, "num_tokens": 778025.0, "step": 697 }, { "epoch": 0.7913832199546486, "grad_norm": 18.670366287231445, "learning_rate": 4.3274853801169596e-06, "loss": 1.3931, "num_tokens": 779182.0, "step": 698 }, { "epoch": 0.7925170068027211, "grad_norm": 18.606996536254883, "learning_rate": 4.304093567251462e-06, "loss": 1.2984, "num_tokens": 780292.0, "step": 699 }, { "epoch": 0.7936507936507936, "grad_norm": 19.796794891357422, "learning_rate": 4.2807017543859656e-06, "loss": 1.2146, "num_tokens": 781293.0, "step": 700 }, { "epoch": 0.7947845804988662, "grad_norm": 17.024425506591797, "learning_rate": 4.257309941520468e-06, "loss": 1.3486, "num_tokens": 782489.0, "step": 701 }, { "epoch": 0.7959183673469388, "grad_norm": 17.996366500854492, "learning_rate": 4.2339181286549715e-06, "loss": 1.3199, "num_tokens": 783575.0, "step": 702 }, { "epoch": 0.7970521541950113, "grad_norm": 18.175334930419922, "learning_rate": 4.210526315789474e-06, "loss": 1.4026, "num_tokens": 784656.0, "step": 703 }, { "epoch": 0.7981859410430839, "grad_norm": 17.05471420288086, "learning_rate": 4.187134502923977e-06, "loss": 1.2263, "num_tokens": 785780.0, "step": 704 }, { "epoch": 0.7993197278911565, "grad_norm": 16.355682373046875, "learning_rate": 4.16374269005848e-06, "loss": 1.2353, "num_tokens": 786993.0, "step": 705 }, { "epoch": 0.800453514739229, "grad_norm": 18.620145797729492, "learning_rate": 4.140350877192983e-06, "loss": 1.1782, "num_tokens": 788042.0, "step": 706 }, { "epoch": 0.8015873015873016, "grad_norm": 18.0267276763916, "learning_rate": 4.116959064327486e-06, "loss": 1.3622, "num_tokens": 789182.0, "step": 707 }, { "epoch": 0.8027210884353742, "grad_norm": 17.341079711914062, "learning_rate": 4.093567251461989e-06, "loss": 1.3555, "num_tokens": 790326.0, "step": 708 }, { "epoch": 0.8038548752834467, "grad_norm": 18.06015968322754, "learning_rate": 4.070175438596491e-06, "loss": 1.3924, "num_tokens": 791473.0, "step": 709 }, { "epoch": 0.8049886621315193, "grad_norm": 17.291038513183594, "learning_rate": 4.046783625730995e-06, "loss": 1.263, "num_tokens": 792646.0, "step": 710 }, { "epoch": 0.8061224489795918, "grad_norm": 17.47237205505371, "learning_rate": 4.023391812865497e-06, "loss": 1.361, "num_tokens": 793797.0, "step": 711 }, { "epoch": 0.8072562358276644, "grad_norm": 19.356576919555664, "learning_rate": 4.000000000000001e-06, "loss": 1.331, "num_tokens": 794861.0, "step": 712 }, { "epoch": 0.808390022675737, "grad_norm": 17.498414993286133, "learning_rate": 3.976608187134503e-06, "loss": 1.2078, "num_tokens": 796010.0, "step": 713 }, { "epoch": 0.8095238095238095, "grad_norm": 18.645160675048828, "learning_rate": 3.953216374269006e-06, "loss": 1.3656, "num_tokens": 797073.0, "step": 714 }, { "epoch": 0.8106575963718821, "grad_norm": 17.478775024414062, "learning_rate": 3.929824561403509e-06, "loss": 1.2949, "num_tokens": 798194.0, "step": 715 }, { "epoch": 0.8117913832199547, "grad_norm": 17.29451560974121, "learning_rate": 3.906432748538012e-06, "loss": 1.3011, "num_tokens": 799326.0, "step": 716 }, { "epoch": 0.8129251700680272, "grad_norm": 17.526670455932617, "learning_rate": 3.883040935672515e-06, "loss": 1.3374, "num_tokens": 800462.0, "step": 717 }, { "epoch": 0.8140589569160998, "grad_norm": 16.313770294189453, "learning_rate": 3.859649122807018e-06, "loss": 1.37, "num_tokens": 801660.0, "step": 718 }, { "epoch": 0.8151927437641724, "grad_norm": 18.832130432128906, "learning_rate": 3.83625730994152e-06, "loss": 1.3389, "num_tokens": 802681.0, "step": 719 }, { "epoch": 0.8163265306122449, "grad_norm": 18.339519500732422, "learning_rate": 3.812865497076024e-06, "loss": 1.3476, "num_tokens": 803845.0, "step": 720 }, { "epoch": 0.8174603174603174, "grad_norm": 18.63218879699707, "learning_rate": 3.789473684210527e-06, "loss": 1.3003, "num_tokens": 804919.0, "step": 721 }, { "epoch": 0.81859410430839, "grad_norm": 18.538555145263672, "learning_rate": 3.7660818713450298e-06, "loss": 1.3368, "num_tokens": 806066.0, "step": 722 }, { "epoch": 0.8197278911564626, "grad_norm": 18.34773826599121, "learning_rate": 3.7426900584795324e-06, "loss": 1.4031, "num_tokens": 807298.0, "step": 723 }, { "epoch": 0.8208616780045351, "grad_norm": 16.661283493041992, "learning_rate": 3.7192982456140354e-06, "loss": 1.2901, "num_tokens": 808431.0, "step": 724 }, { "epoch": 0.8219954648526077, "grad_norm": 17.518285751342773, "learning_rate": 3.6959064327485384e-06, "loss": 1.3681, "num_tokens": 809525.0, "step": 725 }, { "epoch": 0.8231292517006803, "grad_norm": 17.644357681274414, "learning_rate": 3.6725146198830414e-06, "loss": 1.3171, "num_tokens": 810602.0, "step": 726 }, { "epoch": 0.8242630385487528, "grad_norm": 17.630014419555664, "learning_rate": 3.6491228070175443e-06, "loss": 1.3163, "num_tokens": 811652.0, "step": 727 }, { "epoch": 0.8253968253968254, "grad_norm": 18.530231475830078, "learning_rate": 3.625730994152047e-06, "loss": 1.3366, "num_tokens": 812716.0, "step": 728 }, { "epoch": 0.826530612244898, "grad_norm": 18.179412841796875, "learning_rate": 3.60233918128655e-06, "loss": 1.2376, "num_tokens": 813814.0, "step": 729 }, { "epoch": 0.8276643990929705, "grad_norm": 16.12605857849121, "learning_rate": 3.578947368421053e-06, "loss": 1.2287, "num_tokens": 814929.0, "step": 730 }, { "epoch": 0.828798185941043, "grad_norm": 18.375032424926758, "learning_rate": 3.555555555555556e-06, "loss": 1.2336, "num_tokens": 815979.0, "step": 731 }, { "epoch": 0.8299319727891157, "grad_norm": 17.65633773803711, "learning_rate": 3.532163742690059e-06, "loss": 1.3779, "num_tokens": 817290.0, "step": 732 }, { "epoch": 0.8310657596371882, "grad_norm": 17.891504287719727, "learning_rate": 3.5087719298245615e-06, "loss": 1.3849, "num_tokens": 818392.0, "step": 733 }, { "epoch": 0.8321995464852607, "grad_norm": 18.025636672973633, "learning_rate": 3.4853801169590645e-06, "loss": 1.3142, "num_tokens": 819609.0, "step": 734 }, { "epoch": 0.8333333333333334, "grad_norm": 22.43589973449707, "learning_rate": 3.4619883040935675e-06, "loss": 1.4138, "num_tokens": 820656.0, "step": 735 }, { "epoch": 0.8344671201814059, "grad_norm": 16.9960880279541, "learning_rate": 3.4385964912280705e-06, "loss": 1.248, "num_tokens": 821762.0, "step": 736 }, { "epoch": 0.8356009070294784, "grad_norm": 20.340120315551758, "learning_rate": 3.4152046783625735e-06, "loss": 1.4243, "num_tokens": 822936.0, "step": 737 }, { "epoch": 0.8367346938775511, "grad_norm": 17.92860984802246, "learning_rate": 3.391812865497076e-06, "loss": 1.3371, "num_tokens": 824105.0, "step": 738 }, { "epoch": 0.8378684807256236, "grad_norm": 21.30694007873535, "learning_rate": 3.368421052631579e-06, "loss": 1.4731, "num_tokens": 825116.0, "step": 739 }, { "epoch": 0.8390022675736961, "grad_norm": 19.64236068725586, "learning_rate": 3.345029239766082e-06, "loss": 1.345, "num_tokens": 826109.0, "step": 740 }, { "epoch": 0.8401360544217688, "grad_norm": 18.508100509643555, "learning_rate": 3.321637426900585e-06, "loss": 1.2692, "num_tokens": 827151.0, "step": 741 }, { "epoch": 0.8412698412698413, "grad_norm": 18.74950408935547, "learning_rate": 3.2982456140350885e-06, "loss": 1.3716, "num_tokens": 828251.0, "step": 742 }, { "epoch": 0.8424036281179138, "grad_norm": 18.321237564086914, "learning_rate": 3.2748538011695906e-06, "loss": 1.152, "num_tokens": 829406.0, "step": 743 }, { "epoch": 0.8435374149659864, "grad_norm": 17.518526077270508, "learning_rate": 3.2514619883040936e-06, "loss": 1.2453, "num_tokens": 830517.0, "step": 744 }, { "epoch": 0.844671201814059, "grad_norm": 16.979219436645508, "learning_rate": 3.2280701754385966e-06, "loss": 1.4154, "num_tokens": 831751.0, "step": 745 }, { "epoch": 0.8458049886621315, "grad_norm": 18.43494987487793, "learning_rate": 3.2046783625731e-06, "loss": 1.4215, "num_tokens": 832836.0, "step": 746 }, { "epoch": 0.8469387755102041, "grad_norm": 17.778759002685547, "learning_rate": 3.181286549707603e-06, "loss": 1.4827, "num_tokens": 834005.0, "step": 747 }, { "epoch": 0.8480725623582767, "grad_norm": 18.422061920166016, "learning_rate": 3.157894736842105e-06, "loss": 1.2687, "num_tokens": 835035.0, "step": 748 }, { "epoch": 0.8492063492063492, "grad_norm": 19.745197296142578, "learning_rate": 3.1345029239766086e-06, "loss": 1.3371, "num_tokens": 836174.0, "step": 749 }, { "epoch": 0.8503401360544217, "grad_norm": 18.293498992919922, "learning_rate": 3.1111111111111116e-06, "loss": 1.344, "num_tokens": 837345.0, "step": 750 }, { "epoch": 0.8514739229024944, "grad_norm": 19.394824981689453, "learning_rate": 3.0877192982456146e-06, "loss": 1.2452, "num_tokens": 838470.0, "step": 751 }, { "epoch": 0.8526077097505669, "grad_norm": 17.224674224853516, "learning_rate": 3.0643274853801176e-06, "loss": 1.1512, "num_tokens": 839542.0, "step": 752 }, { "epoch": 0.8537414965986394, "grad_norm": 17.437070846557617, "learning_rate": 3.04093567251462e-06, "loss": 1.3275, "num_tokens": 840615.0, "step": 753 }, { "epoch": 0.854875283446712, "grad_norm": 16.678813934326172, "learning_rate": 3.017543859649123e-06, "loss": 1.3281, "num_tokens": 841800.0, "step": 754 }, { "epoch": 0.8560090702947846, "grad_norm": 19.362855911254883, "learning_rate": 2.994152046783626e-06, "loss": 1.3722, "num_tokens": 842943.0, "step": 755 }, { "epoch": 0.8571428571428571, "grad_norm": 17.840465545654297, "learning_rate": 2.970760233918129e-06, "loss": 1.4083, "num_tokens": 844094.0, "step": 756 }, { "epoch": 0.8582766439909297, "grad_norm": 17.949708938598633, "learning_rate": 2.9473684210526317e-06, "loss": 1.3102, "num_tokens": 845227.0, "step": 757 }, { "epoch": 0.8594104308390023, "grad_norm": 16.19768714904785, "learning_rate": 2.9239766081871347e-06, "loss": 1.2165, "num_tokens": 846379.0, "step": 758 }, { "epoch": 0.8605442176870748, "grad_norm": 17.489797592163086, "learning_rate": 2.9005847953216377e-06, "loss": 1.2462, "num_tokens": 847478.0, "step": 759 }, { "epoch": 0.8616780045351474, "grad_norm": 20.77772331237793, "learning_rate": 2.8771929824561407e-06, "loss": 1.3926, "num_tokens": 848561.0, "step": 760 }, { "epoch": 0.86281179138322, "grad_norm": 16.928083419799805, "learning_rate": 2.8538011695906437e-06, "loss": 1.3433, "num_tokens": 849697.0, "step": 761 }, { "epoch": 0.8639455782312925, "grad_norm": 18.48739242553711, "learning_rate": 2.8304093567251463e-06, "loss": 1.3172, "num_tokens": 850827.0, "step": 762 }, { "epoch": 0.8650793650793651, "grad_norm": 17.768648147583008, "learning_rate": 2.8070175438596493e-06, "loss": 1.3516, "num_tokens": 851967.0, "step": 763 }, { "epoch": 0.8662131519274376, "grad_norm": 17.650531768798828, "learning_rate": 2.7836257309941523e-06, "loss": 1.3642, "num_tokens": 853111.0, "step": 764 }, { "epoch": 0.8673469387755102, "grad_norm": 17.471181869506836, "learning_rate": 2.7602339181286553e-06, "loss": 1.3457, "num_tokens": 854225.0, "step": 765 }, { "epoch": 0.8684807256235828, "grad_norm": 17.436655044555664, "learning_rate": 2.7368421052631583e-06, "loss": 1.4339, "num_tokens": 855400.0, "step": 766 }, { "epoch": 0.8696145124716553, "grad_norm": 17.393081665039062, "learning_rate": 2.713450292397661e-06, "loss": 1.2138, "num_tokens": 856571.0, "step": 767 }, { "epoch": 0.8707482993197279, "grad_norm": 17.323787689208984, "learning_rate": 2.690058479532164e-06, "loss": 1.2477, "num_tokens": 857782.0, "step": 768 }, { "epoch": 0.8718820861678005, "grad_norm": 19.412883758544922, "learning_rate": 2.666666666666667e-06, "loss": 1.4087, "num_tokens": 858959.0, "step": 769 }, { "epoch": 0.873015873015873, "grad_norm": 16.870290756225586, "learning_rate": 2.64327485380117e-06, "loss": 1.349, "num_tokens": 860124.0, "step": 770 }, { "epoch": 0.8741496598639455, "grad_norm": 17.62514877319336, "learning_rate": 2.619883040935673e-06, "loss": 1.2714, "num_tokens": 861312.0, "step": 771 }, { "epoch": 0.8752834467120182, "grad_norm": 17.352182388305664, "learning_rate": 2.5964912280701754e-06, "loss": 1.2743, "num_tokens": 862379.0, "step": 772 }, { "epoch": 0.8764172335600907, "grad_norm": 18.01154136657715, "learning_rate": 2.5730994152046784e-06, "loss": 1.2533, "num_tokens": 863526.0, "step": 773 }, { "epoch": 0.8775510204081632, "grad_norm": 17.778823852539062, "learning_rate": 2.5497076023391814e-06, "loss": 1.4575, "num_tokens": 864663.0, "step": 774 }, { "epoch": 0.8786848072562359, "grad_norm": 18.24393653869629, "learning_rate": 2.5263157894736844e-06, "loss": 1.4635, "num_tokens": 865755.0, "step": 775 }, { "epoch": 0.8798185941043084, "grad_norm": 17.437816619873047, "learning_rate": 2.5029239766081874e-06, "loss": 1.3825, "num_tokens": 866867.0, "step": 776 }, { "epoch": 0.8809523809523809, "grad_norm": 18.154260635375977, "learning_rate": 2.4795321637426904e-06, "loss": 1.3668, "num_tokens": 867947.0, "step": 777 }, { "epoch": 0.8820861678004536, "grad_norm": 17.912960052490234, "learning_rate": 2.456140350877193e-06, "loss": 1.3158, "num_tokens": 869047.0, "step": 778 }, { "epoch": 0.8832199546485261, "grad_norm": 17.279184341430664, "learning_rate": 2.432748538011696e-06, "loss": 1.2668, "num_tokens": 870112.0, "step": 779 }, { "epoch": 0.8843537414965986, "grad_norm": 19.037927627563477, "learning_rate": 2.409356725146199e-06, "loss": 1.349, "num_tokens": 871206.0, "step": 780 }, { "epoch": 0.8854875283446711, "grad_norm": 17.318397521972656, "learning_rate": 2.385964912280702e-06, "loss": 1.2397, "num_tokens": 872313.0, "step": 781 }, { "epoch": 0.8866213151927438, "grad_norm": 17.00590705871582, "learning_rate": 2.362573099415205e-06, "loss": 1.3055, "num_tokens": 873407.0, "step": 782 }, { "epoch": 0.8877551020408163, "grad_norm": 17.461261749267578, "learning_rate": 2.3391812865497075e-06, "loss": 1.2394, "num_tokens": 874507.0, "step": 783 }, { "epoch": 0.8888888888888888, "grad_norm": 18.512025833129883, "learning_rate": 2.3157894736842105e-06, "loss": 1.3984, "num_tokens": 875594.0, "step": 784 }, { "epoch": 0.8900226757369615, "grad_norm": 17.244552612304688, "learning_rate": 2.292397660818714e-06, "loss": 1.3724, "num_tokens": 876759.0, "step": 785 }, { "epoch": 0.891156462585034, "grad_norm": 19.48406982421875, "learning_rate": 2.2690058479532165e-06, "loss": 1.3684, "num_tokens": 877790.0, "step": 786 }, { "epoch": 0.8922902494331065, "grad_norm": 18.64250373840332, "learning_rate": 2.2456140350877195e-06, "loss": 1.3401, "num_tokens": 878974.0, "step": 787 }, { "epoch": 0.8934240362811792, "grad_norm": 17.458181381225586, "learning_rate": 2.222222222222222e-06, "loss": 1.2793, "num_tokens": 880052.0, "step": 788 }, { "epoch": 0.8945578231292517, "grad_norm": 16.405527114868164, "learning_rate": 2.1988304093567255e-06, "loss": 1.272, "num_tokens": 881239.0, "step": 789 }, { "epoch": 0.8956916099773242, "grad_norm": 18.208492279052734, "learning_rate": 2.1754385964912285e-06, "loss": 1.3809, "num_tokens": 882281.0, "step": 790 }, { "epoch": 0.8968253968253969, "grad_norm": 17.376855850219727, "learning_rate": 2.152046783625731e-06, "loss": 1.3318, "num_tokens": 883346.0, "step": 791 }, { "epoch": 0.8979591836734694, "grad_norm": 21.826501846313477, "learning_rate": 2.128654970760234e-06, "loss": 1.4085, "num_tokens": 884352.0, "step": 792 }, { "epoch": 0.8990929705215419, "grad_norm": 17.40865135192871, "learning_rate": 2.105263157894737e-06, "loss": 1.4137, "num_tokens": 885502.0, "step": 793 }, { "epoch": 0.9002267573696145, "grad_norm": 19.549072265625, "learning_rate": 2.08187134502924e-06, "loss": 1.4434, "num_tokens": 886626.0, "step": 794 }, { "epoch": 0.9013605442176871, "grad_norm": 17.216943740844727, "learning_rate": 2.058479532163743e-06, "loss": 1.3574, "num_tokens": 887757.0, "step": 795 }, { "epoch": 0.9024943310657596, "grad_norm": 17.735605239868164, "learning_rate": 2.0350877192982456e-06, "loss": 1.3134, "num_tokens": 888939.0, "step": 796 }, { "epoch": 0.9036281179138322, "grad_norm": 16.386260986328125, "learning_rate": 2.0116959064327486e-06, "loss": 1.2122, "num_tokens": 890157.0, "step": 797 }, { "epoch": 0.9047619047619048, "grad_norm": 17.275033950805664, "learning_rate": 1.9883040935672516e-06, "loss": 1.5137, "num_tokens": 891398.0, "step": 798 }, { "epoch": 0.9058956916099773, "grad_norm": 24.554729461669922, "learning_rate": 1.9649122807017546e-06, "loss": 1.1789, "num_tokens": 892408.0, "step": 799 }, { "epoch": 0.9070294784580499, "grad_norm": 17.58880043029785, "learning_rate": 1.9415204678362576e-06, "loss": 1.2835, "num_tokens": 893592.0, "step": 800 }, { "epoch": 0.9081632653061225, "grad_norm": 17.943017959594727, "learning_rate": 1.91812865497076e-06, "loss": 1.2932, "num_tokens": 894697.0, "step": 801 }, { "epoch": 0.909297052154195, "grad_norm": 16.920461654663086, "learning_rate": 1.8947368421052634e-06, "loss": 1.4332, "num_tokens": 895818.0, "step": 802 }, { "epoch": 0.9104308390022676, "grad_norm": 17.876169204711914, "learning_rate": 1.8713450292397662e-06, "loss": 1.3829, "num_tokens": 896885.0, "step": 803 }, { "epoch": 0.9115646258503401, "grad_norm": 17.48735237121582, "learning_rate": 1.8479532163742692e-06, "loss": 1.2773, "num_tokens": 898032.0, "step": 804 }, { "epoch": 0.9126984126984127, "grad_norm": 16.804933547973633, "learning_rate": 1.8245614035087722e-06, "loss": 1.329, "num_tokens": 899163.0, "step": 805 }, { "epoch": 0.9138321995464853, "grad_norm": 16.497831344604492, "learning_rate": 1.801169590643275e-06, "loss": 1.2618, "num_tokens": 900304.0, "step": 806 }, { "epoch": 0.9149659863945578, "grad_norm": 18.51173210144043, "learning_rate": 1.777777777777778e-06, "loss": 1.2372, "num_tokens": 901464.0, "step": 807 }, { "epoch": 0.9160997732426304, "grad_norm": 17.494199752807617, "learning_rate": 1.7543859649122807e-06, "loss": 1.3628, "num_tokens": 902652.0, "step": 808 }, { "epoch": 0.9172335600907029, "grad_norm": 17.723461151123047, "learning_rate": 1.7309941520467837e-06, "loss": 1.2888, "num_tokens": 903725.0, "step": 809 }, { "epoch": 0.9183673469387755, "grad_norm": 16.55974578857422, "learning_rate": 1.7076023391812867e-06, "loss": 1.2621, "num_tokens": 904819.0, "step": 810 }, { "epoch": 0.9195011337868481, "grad_norm": 16.68052101135254, "learning_rate": 1.6842105263157895e-06, "loss": 1.3277, "num_tokens": 905994.0, "step": 811 }, { "epoch": 0.9206349206349206, "grad_norm": 17.137619018554688, "learning_rate": 1.6608187134502925e-06, "loss": 1.2731, "num_tokens": 907165.0, "step": 812 }, { "epoch": 0.9217687074829932, "grad_norm": 24.02943992614746, "learning_rate": 1.6374269005847953e-06, "loss": 1.3854, "num_tokens": 908315.0, "step": 813 }, { "epoch": 0.9229024943310657, "grad_norm": 19.52529525756836, "learning_rate": 1.6140350877192983e-06, "loss": 1.4027, "num_tokens": 909504.0, "step": 814 }, { "epoch": 0.9240362811791383, "grad_norm": 20.507009506225586, "learning_rate": 1.5906432748538015e-06, "loss": 1.4615, "num_tokens": 910579.0, "step": 815 }, { "epoch": 0.9251700680272109, "grad_norm": 17.278417587280273, "learning_rate": 1.5672514619883043e-06, "loss": 1.3415, "num_tokens": 911639.0, "step": 816 }, { "epoch": 0.9263038548752834, "grad_norm": 18.250825881958008, "learning_rate": 1.5438596491228073e-06, "loss": 1.3772, "num_tokens": 912701.0, "step": 817 }, { "epoch": 0.927437641723356, "grad_norm": 17.6839656829834, "learning_rate": 1.52046783625731e-06, "loss": 1.4774, "num_tokens": 913766.0, "step": 818 }, { "epoch": 0.9285714285714286, "grad_norm": 19.005475997924805, "learning_rate": 1.497076023391813e-06, "loss": 1.3713, "num_tokens": 914798.0, "step": 819 }, { "epoch": 0.9297052154195011, "grad_norm": 17.462848663330078, "learning_rate": 1.4736842105263159e-06, "loss": 1.2611, "num_tokens": 915953.0, "step": 820 }, { "epoch": 0.9308390022675737, "grad_norm": 16.43487548828125, "learning_rate": 1.4502923976608189e-06, "loss": 1.177, "num_tokens": 917153.0, "step": 821 }, { "epoch": 0.9319727891156463, "grad_norm": 18.54863166809082, "learning_rate": 1.4269005847953219e-06, "loss": 1.3797, "num_tokens": 918239.0, "step": 822 }, { "epoch": 0.9331065759637188, "grad_norm": 21.09725570678711, "learning_rate": 1.4035087719298246e-06, "loss": 1.2588, "num_tokens": 919263.0, "step": 823 }, { "epoch": 0.9342403628117913, "grad_norm": 17.768924713134766, "learning_rate": 1.3801169590643276e-06, "loss": 1.321, "num_tokens": 920398.0, "step": 824 }, { "epoch": 0.935374149659864, "grad_norm": 18.642379760742188, "learning_rate": 1.3567251461988304e-06, "loss": 1.3286, "num_tokens": 921477.0, "step": 825 }, { "epoch": 0.9365079365079365, "grad_norm": 18.273826599121094, "learning_rate": 1.3333333333333334e-06, "loss": 1.4168, "num_tokens": 922580.0, "step": 826 }, { "epoch": 0.937641723356009, "grad_norm": 16.867956161499023, "learning_rate": 1.3099415204678364e-06, "loss": 1.2582, "num_tokens": 923731.0, "step": 827 }, { "epoch": 0.9387755102040817, "grad_norm": 16.043872833251953, "learning_rate": 1.2865497076023392e-06, "loss": 1.2788, "num_tokens": 924867.0, "step": 828 }, { "epoch": 0.9399092970521542, "grad_norm": 16.63450813293457, "learning_rate": 1.2631578947368422e-06, "loss": 1.2313, "num_tokens": 925946.0, "step": 829 }, { "epoch": 0.9410430839002267, "grad_norm": 16.532018661499023, "learning_rate": 1.2397660818713452e-06, "loss": 1.3195, "num_tokens": 927211.0, "step": 830 }, { "epoch": 0.9421768707482994, "grad_norm": 17.918317794799805, "learning_rate": 1.216374269005848e-06, "loss": 1.2505, "num_tokens": 928388.0, "step": 831 }, { "epoch": 0.9433106575963719, "grad_norm": 18.10270881652832, "learning_rate": 1.192982456140351e-06, "loss": 1.3362, "num_tokens": 929505.0, "step": 832 }, { "epoch": 0.9444444444444444, "grad_norm": 20.00605583190918, "learning_rate": 1.1695906432748538e-06, "loss": 1.1922, "num_tokens": 930491.0, "step": 833 }, { "epoch": 0.9455782312925171, "grad_norm": 17.267173767089844, "learning_rate": 1.146198830409357e-06, "loss": 1.28, "num_tokens": 931679.0, "step": 834 }, { "epoch": 0.9467120181405896, "grad_norm": 18.59246063232422, "learning_rate": 1.1228070175438598e-06, "loss": 1.2352, "num_tokens": 932826.0, "step": 835 }, { "epoch": 0.9478458049886621, "grad_norm": 17.44966697692871, "learning_rate": 1.0994152046783627e-06, "loss": 1.2873, "num_tokens": 933959.0, "step": 836 }, { "epoch": 0.9489795918367347, "grad_norm": 17.430469512939453, "learning_rate": 1.0760233918128655e-06, "loss": 1.3142, "num_tokens": 935084.0, "step": 837 }, { "epoch": 0.9501133786848073, "grad_norm": 18.634151458740234, "learning_rate": 1.0526315789473685e-06, "loss": 1.4169, "num_tokens": 936207.0, "step": 838 }, { "epoch": 0.9512471655328798, "grad_norm": 16.79730796813965, "learning_rate": 1.0292397660818715e-06, "loss": 1.2891, "num_tokens": 937398.0, "step": 839 }, { "epoch": 0.9523809523809523, "grad_norm": 17.4179630279541, "learning_rate": 1.0058479532163743e-06, "loss": 1.4084, "num_tokens": 938566.0, "step": 840 }, { "epoch": 0.953514739229025, "grad_norm": 17.723804473876953, "learning_rate": 9.824561403508773e-07, "loss": 1.4206, "num_tokens": 939691.0, "step": 841 }, { "epoch": 0.9546485260770975, "grad_norm": 17.70682144165039, "learning_rate": 9.5906432748538e-07, "loss": 1.3328, "num_tokens": 940827.0, "step": 842 }, { "epoch": 0.95578231292517, "grad_norm": 17.740203857421875, "learning_rate": 9.356725146198831e-07, "loss": 1.3175, "num_tokens": 941874.0, "step": 843 }, { "epoch": 0.9569160997732427, "grad_norm": 17.87767219543457, "learning_rate": 9.122807017543861e-07, "loss": 1.1565, "num_tokens": 942976.0, "step": 844 }, { "epoch": 0.9580498866213152, "grad_norm": 19.07649803161621, "learning_rate": 8.88888888888889e-07, "loss": 1.334, "num_tokens": 944151.0, "step": 845 }, { "epoch": 0.9591836734693877, "grad_norm": 19.345378875732422, "learning_rate": 8.654970760233919e-07, "loss": 1.3327, "num_tokens": 945188.0, "step": 846 }, { "epoch": 0.9603174603174603, "grad_norm": 17.86591911315918, "learning_rate": 8.421052631578948e-07, "loss": 1.3708, "num_tokens": 946295.0, "step": 847 }, { "epoch": 0.9614512471655329, "grad_norm": 18.993871688842773, "learning_rate": 8.187134502923977e-07, "loss": 1.3972, "num_tokens": 947340.0, "step": 848 }, { "epoch": 0.9625850340136054, "grad_norm": 16.64432144165039, "learning_rate": 7.953216374269008e-07, "loss": 1.3542, "num_tokens": 948481.0, "step": 849 }, { "epoch": 0.963718820861678, "grad_norm": 19.02566909790039, "learning_rate": 7.719298245614036e-07, "loss": 1.192, "num_tokens": 949547.0, "step": 850 }, { "epoch": 0.9648526077097506, "grad_norm": 18.46371841430664, "learning_rate": 7.485380116959065e-07, "loss": 1.3366, "num_tokens": 950588.0, "step": 851 }, { "epoch": 0.9659863945578231, "grad_norm": 17.63727378845215, "learning_rate": 7.251461988304094e-07, "loss": 1.3576, "num_tokens": 951774.0, "step": 852 }, { "epoch": 0.9671201814058957, "grad_norm": 19.05787467956543, "learning_rate": 7.017543859649123e-07, "loss": 1.3228, "num_tokens": 952871.0, "step": 853 }, { "epoch": 0.9682539682539683, "grad_norm": 16.482786178588867, "learning_rate": 6.783625730994152e-07, "loss": 1.2648, "num_tokens": 954031.0, "step": 854 }, { "epoch": 0.9693877551020408, "grad_norm": 18.610177993774414, "learning_rate": 6.549707602339182e-07, "loss": 1.2641, "num_tokens": 955054.0, "step": 855 }, { "epoch": 0.9705215419501134, "grad_norm": 17.269947052001953, "learning_rate": 6.315789473684211e-07, "loss": 1.2354, "num_tokens": 956174.0, "step": 856 }, { "epoch": 0.971655328798186, "grad_norm": 17.863712310791016, "learning_rate": 6.08187134502924e-07, "loss": 1.4282, "num_tokens": 957337.0, "step": 857 }, { "epoch": 0.9727891156462585, "grad_norm": 18.089345932006836, "learning_rate": 5.847953216374269e-07, "loss": 1.3272, "num_tokens": 958396.0, "step": 858 }, { "epoch": 0.9739229024943311, "grad_norm": 17.56983184814453, "learning_rate": 5.614035087719299e-07, "loss": 1.366, "num_tokens": 959510.0, "step": 859 }, { "epoch": 0.9750566893424036, "grad_norm": 17.639245986938477, "learning_rate": 5.380116959064328e-07, "loss": 1.3214, "num_tokens": 960614.0, "step": 860 }, { "epoch": 0.9761904761904762, "grad_norm": 17.455869674682617, "learning_rate": 5.146198830409358e-07, "loss": 1.2943, "num_tokens": 961707.0, "step": 861 }, { "epoch": 0.9773242630385488, "grad_norm": 18.021507263183594, "learning_rate": 4.912280701754387e-07, "loss": 1.3798, "num_tokens": 962890.0, "step": 862 }, { "epoch": 0.9784580498866213, "grad_norm": 17.802026748657227, "learning_rate": 4.6783625730994155e-07, "loss": 1.3296, "num_tokens": 964107.0, "step": 863 }, { "epoch": 0.9795918367346939, "grad_norm": 17.913936614990234, "learning_rate": 4.444444444444445e-07, "loss": 1.3029, "num_tokens": 965201.0, "step": 864 }, { "epoch": 0.9807256235827665, "grad_norm": 18.913482666015625, "learning_rate": 4.210526315789474e-07, "loss": 1.2526, "num_tokens": 966250.0, "step": 865 }, { "epoch": 0.981859410430839, "grad_norm": 17.606752395629883, "learning_rate": 3.976608187134504e-07, "loss": 1.3328, "num_tokens": 967412.0, "step": 866 }, { "epoch": 0.9829931972789115, "grad_norm": 15.853782653808594, "learning_rate": 3.7426900584795327e-07, "loss": 1.2751, "num_tokens": 968670.0, "step": 867 }, { "epoch": 0.9841269841269841, "grad_norm": 19.007450103759766, "learning_rate": 3.5087719298245616e-07, "loss": 1.2168, "num_tokens": 969688.0, "step": 868 }, { "epoch": 0.9852607709750567, "grad_norm": 19.20392417907715, "learning_rate": 3.274853801169591e-07, "loss": 1.3602, "num_tokens": 970730.0, "step": 869 }, { "epoch": 0.9863945578231292, "grad_norm": 18.164915084838867, "learning_rate": 3.04093567251462e-07, "loss": 1.2965, "num_tokens": 971851.0, "step": 870 }, { "epoch": 0.9875283446712018, "grad_norm": 17.47812843322754, "learning_rate": 2.8070175438596494e-07, "loss": 1.3139, "num_tokens": 972955.0, "step": 871 }, { "epoch": 0.9886621315192744, "grad_norm": 16.587425231933594, "learning_rate": 2.573099415204679e-07, "loss": 1.349, "num_tokens": 974119.0, "step": 872 }, { "epoch": 0.9897959183673469, "grad_norm": 17.267658233642578, "learning_rate": 2.3391812865497077e-07, "loss": 1.3955, "num_tokens": 975268.0, "step": 873 }, { "epoch": 0.9909297052154195, "grad_norm": 16.937604904174805, "learning_rate": 2.105263157894737e-07, "loss": 1.2055, "num_tokens": 976366.0, "step": 874 }, { "epoch": 0.9920634920634921, "grad_norm": 19.098716735839844, "learning_rate": 1.8713450292397663e-07, "loss": 1.1838, "num_tokens": 977432.0, "step": 875 }, { "epoch": 0.9931972789115646, "grad_norm": 19.085880279541016, "learning_rate": 1.6374269005847955e-07, "loss": 1.2994, "num_tokens": 978586.0, "step": 876 }, { "epoch": 0.9943310657596371, "grad_norm": 17.368186950683594, "learning_rate": 1.4035087719298247e-07, "loss": 1.2564, "num_tokens": 979670.0, "step": 877 }, { "epoch": 0.9954648526077098, "grad_norm": 19.070894241333008, "learning_rate": 1.1695906432748539e-07, "loss": 1.2338, "num_tokens": 980667.0, "step": 878 }, { "epoch": 0.9965986394557823, "grad_norm": 18.711523056030273, "learning_rate": 9.356725146198832e-08, "loss": 1.379, "num_tokens": 981825.0, "step": 879 }, { "epoch": 0.9977324263038548, "grad_norm": 18.97348403930664, "learning_rate": 7.017543859649123e-08, "loss": 1.3963, "num_tokens": 982883.0, "step": 880 }, { "epoch": 0.9988662131519275, "grad_norm": 19.39207649230957, "learning_rate": 4.678362573099416e-08, "loss": 1.3719, "num_tokens": 983976.0, "step": 881 }, { "epoch": 1.0, "grad_norm": 18.084861755371094, "learning_rate": 2.339181286549708e-08, "loss": 1.2492, "num_tokens": 984517.0, "step": 882 }, { "epoch": 1.0, "step": 882, "total_flos": 1.907048611787571e+16, "train_loss": 1.394604523841486, "train_runtime": 215.8683, "train_samples_per_second": 32.654, "train_steps_per_second": 4.086 } ], "logging_steps": 1, "max_steps": 882, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.907048611787571e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }