{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9991726094322524, "eval_steps": 200, "global_step": 3172, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006303928135219258, "grad_norm": 0.23900407552719116, "learning_rate": 1.40625e-06, "loss": 0.6566, "step": 1 }, { "epoch": 0.0012607856270438517, "grad_norm": 0.26570916175842285, "learning_rate": 2.8125e-06, "loss": 0.691, "step": 2 }, { "epoch": 0.0018911784405657775, "grad_norm": 0.2456347942352295, "learning_rate": 4.21875e-06, "loss": 0.7193, "step": 3 }, { "epoch": 0.0025215712540877034, "grad_norm": 0.2497250735759735, "learning_rate": 5.625e-06, "loss": 0.63, "step": 4 }, { "epoch": 0.0031519640676096294, "grad_norm": 0.2710248827934265, "learning_rate": 7.031250000000001e-06, "loss": 0.6894, "step": 5 }, { "epoch": 0.003782356881131555, "grad_norm": 0.24278788268566132, "learning_rate": 8.4375e-06, "loss": 0.6779, "step": 6 }, { "epoch": 0.004412749694653481, "grad_norm": 0.27044716477394104, "learning_rate": 9.84375e-06, "loss": 0.7366, "step": 7 }, { "epoch": 0.005043142508175407, "grad_norm": 0.22668607532978058, "learning_rate": 1.125e-05, "loss": 0.6066, "step": 8 }, { "epoch": 0.005673535321697332, "grad_norm": 0.2294527143239975, "learning_rate": 1.2656250000000001e-05, "loss": 0.7224, "step": 9 }, { "epoch": 0.006303928135219259, "grad_norm": 0.22373972833156586, "learning_rate": 1.4062500000000001e-05, "loss": 0.6607, "step": 10 }, { "epoch": 0.0069343209487411845, "grad_norm": 0.22508320212364197, "learning_rate": 1.546875e-05, "loss": 0.7065, "step": 11 }, { "epoch": 0.00756471376226311, "grad_norm": 0.22706764936447144, "learning_rate": 1.6875e-05, "loss": 0.648, "step": 12 }, { "epoch": 0.008195106575785037, "grad_norm": 0.21693125367164612, "learning_rate": 1.828125e-05, "loss": 0.6044, "step": 13 }, { "epoch": 0.008825499389306962, "grad_norm": 0.1876903772354126, "learning_rate": 1.96875e-05, "loss": 0.5912, "step": 14 }, { "epoch": 0.009455892202828888, "grad_norm": 0.18589751422405243, "learning_rate": 2.109375e-05, "loss": 0.6084, "step": 15 }, { "epoch": 0.010086285016350813, "grad_norm": 0.21171413362026215, "learning_rate": 2.25e-05, "loss": 0.5924, "step": 16 }, { "epoch": 0.010716677829872739, "grad_norm": 0.20840738713741302, "learning_rate": 2.3906250000000002e-05, "loss": 0.5527, "step": 17 }, { "epoch": 0.011347070643394665, "grad_norm": 0.20482507348060608, "learning_rate": 2.5312500000000002e-05, "loss": 0.6516, "step": 18 }, { "epoch": 0.01197746345691659, "grad_norm": 0.25472748279571533, "learning_rate": 2.6718750000000002e-05, "loss": 0.6341, "step": 19 }, { "epoch": 0.012607856270438518, "grad_norm": 0.23628972470760345, "learning_rate": 2.8125000000000003e-05, "loss": 0.5931, "step": 20 }, { "epoch": 0.013238249083960443, "grad_norm": 0.19538824260234833, "learning_rate": 2.9531250000000003e-05, "loss": 0.5447, "step": 21 }, { "epoch": 0.013868641897482369, "grad_norm": 0.17556734383106232, "learning_rate": 3.09375e-05, "loss": 0.5111, "step": 22 }, { "epoch": 0.014499034711004295, "grad_norm": 0.19505266845226288, "learning_rate": 3.2343750000000004e-05, "loss": 0.5879, "step": 23 }, { "epoch": 0.01512942752452622, "grad_norm": 0.1797773689031601, "learning_rate": 3.375e-05, "loss": 0.5436, "step": 24 }, { "epoch": 0.015759820338048146, "grad_norm": 0.2432139664888382, "learning_rate": 3.5156250000000004e-05, "loss": 0.669, "step": 25 }, { "epoch": 0.016390213151570073, "grad_norm": 0.19737608730793, "learning_rate": 3.65625e-05, "loss": 0.5622, "step": 26 }, { "epoch": 0.017020605965091997, "grad_norm": 0.17237292230129242, "learning_rate": 3.7968750000000005e-05, "loss": 0.6074, "step": 27 }, { "epoch": 0.017650998778613924, "grad_norm": 0.17980431020259857, "learning_rate": 3.9375e-05, "loss": 0.4692, "step": 28 }, { "epoch": 0.01828139159213585, "grad_norm": 0.18721145391464233, "learning_rate": 4.0781250000000005e-05, "loss": 0.6626, "step": 29 }, { "epoch": 0.018911784405657776, "grad_norm": 0.18952064216136932, "learning_rate": 4.21875e-05, "loss": 0.5898, "step": 30 }, { "epoch": 0.019542177219179703, "grad_norm": 0.16960866749286652, "learning_rate": 4.3593750000000006e-05, "loss": 0.488, "step": 31 }, { "epoch": 0.020172570032701627, "grad_norm": 0.15774346888065338, "learning_rate": 4.5e-05, "loss": 0.538, "step": 32 }, { "epoch": 0.020802962846223554, "grad_norm": 0.1621467024087906, "learning_rate": 4.499998873858572e-05, "loss": 0.4839, "step": 33 }, { "epoch": 0.021433355659745478, "grad_norm": 0.17073126137256622, "learning_rate": 4.4999954954354146e-05, "loss": 0.5118, "step": 34 }, { "epoch": 0.022063748473267406, "grad_norm": 0.1898810863494873, "learning_rate": 4.4999898647339106e-05, "loss": 0.564, "step": 35 }, { "epoch": 0.02269414128678933, "grad_norm": 0.1633978635072708, "learning_rate": 4.4999819817596954e-05, "loss": 0.5113, "step": 36 }, { "epoch": 0.023324534100311257, "grad_norm": 0.17059823870658875, "learning_rate": 4.4999718465206607e-05, "loss": 0.5268, "step": 37 }, { "epoch": 0.02395492691383318, "grad_norm": 0.1455584019422531, "learning_rate": 4.499959459026952e-05, "loss": 0.4203, "step": 38 }, { "epoch": 0.024585319727355108, "grad_norm": 0.14489693939685822, "learning_rate": 4.499944819290968e-05, "loss": 0.4815, "step": 39 }, { "epoch": 0.025215712540877035, "grad_norm": 0.16840510070323944, "learning_rate": 4.4999279273273656e-05, "loss": 0.5266, "step": 40 }, { "epoch": 0.02584610535439896, "grad_norm": 0.13356979191303253, "learning_rate": 4.4999087831530515e-05, "loss": 0.5271, "step": 41 }, { "epoch": 0.026476498167920887, "grad_norm": 0.14093619585037231, "learning_rate": 4.4998873867871916e-05, "loss": 0.4549, "step": 42 }, { "epoch": 0.02710689098144281, "grad_norm": 0.16740302741527557, "learning_rate": 4.4998637382512016e-05, "loss": 0.5066, "step": 43 }, { "epoch": 0.027737283794964738, "grad_norm": 0.15250779688358307, "learning_rate": 4.4998378375687565e-05, "loss": 0.4683, "step": 44 }, { "epoch": 0.028367676608486662, "grad_norm": 0.1856726109981537, "learning_rate": 4.4998096847657815e-05, "loss": 0.5442, "step": 45 }, { "epoch": 0.02899806942200859, "grad_norm": 0.18478403985500336, "learning_rate": 4.499779279870459e-05, "loss": 0.6015, "step": 46 }, { "epoch": 0.029628462235530516, "grad_norm": 0.211555615067482, "learning_rate": 4.499746622913224e-05, "loss": 0.4835, "step": 47 }, { "epoch": 0.03025885504905244, "grad_norm": 0.16362585127353668, "learning_rate": 4.499711713926767e-05, "loss": 0.5241, "step": 48 }, { "epoch": 0.030889247862574368, "grad_norm": 0.16383248567581177, "learning_rate": 4.499674552946032e-05, "loss": 0.4803, "step": 49 }, { "epoch": 0.03151964067609629, "grad_norm": 0.17049849033355713, "learning_rate": 4.4996351400082177e-05, "loss": 0.5351, "step": 50 }, { "epoch": 0.032150033489618215, "grad_norm": 0.1521013379096985, "learning_rate": 4.4995934751527774e-05, "loss": 0.4387, "step": 51 }, { "epoch": 0.032780426303140146, "grad_norm": 0.1371651589870453, "learning_rate": 4.499549558421419e-05, "loss": 0.4201, "step": 52 }, { "epoch": 0.03341081911666207, "grad_norm": 0.18806736171245575, "learning_rate": 4.4995033898581017e-05, "loss": 0.4741, "step": 53 }, { "epoch": 0.034041211930183994, "grad_norm": 0.17703312635421753, "learning_rate": 4.499454969509043e-05, "loss": 0.5133, "step": 54 }, { "epoch": 0.034671604743705925, "grad_norm": 0.18577425181865692, "learning_rate": 4.499404297422711e-05, "loss": 0.4545, "step": 55 }, { "epoch": 0.03530199755722785, "grad_norm": 0.1458091139793396, "learning_rate": 4.4993513736498296e-05, "loss": 0.4347, "step": 56 }, { "epoch": 0.03593239037074977, "grad_norm": 0.1535828560590744, "learning_rate": 4.499296198243376e-05, "loss": 0.447, "step": 57 }, { "epoch": 0.0365627831842717, "grad_norm": 0.16728930175304413, "learning_rate": 4.4992387712585824e-05, "loss": 0.5249, "step": 58 }, { "epoch": 0.03719317599779363, "grad_norm": 0.17529086768627167, "learning_rate": 4.499179092752934e-05, "loss": 0.5676, "step": 59 }, { "epoch": 0.03782356881131555, "grad_norm": 0.155259370803833, "learning_rate": 4.4991171627861686e-05, "loss": 0.4816, "step": 60 }, { "epoch": 0.038453961624837475, "grad_norm": 0.17953960597515106, "learning_rate": 4.4990529814202806e-05, "loss": 0.4924, "step": 61 }, { "epoch": 0.039084354438359406, "grad_norm": 0.19025997817516327, "learning_rate": 4.4989865487195156e-05, "loss": 0.4962, "step": 62 }, { "epoch": 0.03971474725188133, "grad_norm": 0.22502540051937103, "learning_rate": 4.498917864750374e-05, "loss": 0.5378, "step": 63 }, { "epoch": 0.040345140065403254, "grad_norm": 0.15279769897460938, "learning_rate": 4.498846929581608e-05, "loss": 0.4582, "step": 64 }, { "epoch": 0.04097553287892518, "grad_norm": 0.15880808234214783, "learning_rate": 4.498773743284227e-05, "loss": 0.5224, "step": 65 }, { "epoch": 0.04160592569244711, "grad_norm": 0.16909122467041016, "learning_rate": 4.498698305931491e-05, "loss": 0.4602, "step": 66 }, { "epoch": 0.04223631850596903, "grad_norm": 0.16990695893764496, "learning_rate": 4.498620617598913e-05, "loss": 0.4741, "step": 67 }, { "epoch": 0.042866711319490956, "grad_norm": 0.16151531040668488, "learning_rate": 4.49854067836426e-05, "loss": 0.4053, "step": 68 }, { "epoch": 0.04349710413301288, "grad_norm": 0.18641473352909088, "learning_rate": 4.498458488307554e-05, "loss": 0.4993, "step": 69 }, { "epoch": 0.04412749694653481, "grad_norm": 0.1472584754228592, "learning_rate": 4.498374047511067e-05, "loss": 0.4444, "step": 70 }, { "epoch": 0.044757889760056735, "grad_norm": 0.13417768478393555, "learning_rate": 4.498287356059326e-05, "loss": 0.4316, "step": 71 }, { "epoch": 0.04538828257357866, "grad_norm": 0.1857917308807373, "learning_rate": 4.4981984140391097e-05, "loss": 0.463, "step": 72 }, { "epoch": 0.04601867538710059, "grad_norm": 0.16314299404621124, "learning_rate": 4.4981072215394514e-05, "loss": 0.5322, "step": 73 }, { "epoch": 0.046649068200622514, "grad_norm": 0.1854691356420517, "learning_rate": 4.498013778651636e-05, "loss": 0.4225, "step": 74 }, { "epoch": 0.04727946101414444, "grad_norm": 0.19411827623844147, "learning_rate": 4.4979180854692005e-05, "loss": 0.4597, "step": 75 }, { "epoch": 0.04790985382766636, "grad_norm": 0.14335975050926208, "learning_rate": 4.4978201420879354e-05, "loss": 0.4483, "step": 76 }, { "epoch": 0.04854024664118829, "grad_norm": 0.16315117478370667, "learning_rate": 4.4977199486058836e-05, "loss": 0.4745, "step": 77 }, { "epoch": 0.049170639454710216, "grad_norm": 0.15294815599918365, "learning_rate": 4.49761750512334e-05, "loss": 0.392, "step": 78 }, { "epoch": 0.04980103226823214, "grad_norm": 0.165781170129776, "learning_rate": 4.497512811742853e-05, "loss": 0.401, "step": 79 }, { "epoch": 0.05043142508175407, "grad_norm": 0.17177148163318634, "learning_rate": 4.497405868569221e-05, "loss": 0.49, "step": 80 }, { "epoch": 0.051061817895275995, "grad_norm": 0.15722817182540894, "learning_rate": 4.497296675709497e-05, "loss": 0.4317, "step": 81 }, { "epoch": 0.05169221070879792, "grad_norm": 0.14051659405231476, "learning_rate": 4.4971852332729826e-05, "loss": 0.4655, "step": 82 }, { "epoch": 0.05232260352231984, "grad_norm": 0.1831762194633484, "learning_rate": 4.4970715413712346e-05, "loss": 0.4518, "step": 83 }, { "epoch": 0.05295299633584177, "grad_norm": 0.225497305393219, "learning_rate": 4.4969556001180605e-05, "loss": 0.5071, "step": 84 }, { "epoch": 0.0535833891493637, "grad_norm": 0.18495135009288788, "learning_rate": 4.496837409629519e-05, "loss": 0.491, "step": 85 }, { "epoch": 0.05421378196288562, "grad_norm": 0.1926666647195816, "learning_rate": 4.4967169700239204e-05, "loss": 0.4815, "step": 86 }, { "epoch": 0.05484417477640755, "grad_norm": 0.2034398317337036, "learning_rate": 4.496594281421826e-05, "loss": 0.5421, "step": 87 }, { "epoch": 0.055474567589929476, "grad_norm": 0.18834415078163147, "learning_rate": 4.4964693439460494e-05, "loss": 0.5217, "step": 88 }, { "epoch": 0.0561049604034514, "grad_norm": 0.16215714812278748, "learning_rate": 4.4963421577216546e-05, "loss": 0.5103, "step": 89 }, { "epoch": 0.056735353216973324, "grad_norm": 0.15685607492923737, "learning_rate": 4.496212722875958e-05, "loss": 0.54, "step": 90 }, { "epoch": 0.057365746030495254, "grad_norm": 0.18496496975421906, "learning_rate": 4.496081039538524e-05, "loss": 0.4635, "step": 91 }, { "epoch": 0.05799613884401718, "grad_norm": 0.17332017421722412, "learning_rate": 4.4959471078411706e-05, "loss": 0.4878, "step": 92 }, { "epoch": 0.0586265316575391, "grad_norm": 0.16615472733974457, "learning_rate": 4.4958109279179644e-05, "loss": 0.45, "step": 93 }, { "epoch": 0.05925692447106103, "grad_norm": 0.17992667853832245, "learning_rate": 4.495672499905225e-05, "loss": 0.4838, "step": 94 }, { "epoch": 0.05988731728458296, "grad_norm": 0.15236912667751312, "learning_rate": 4.49553182394152e-05, "loss": 0.4665, "step": 95 }, { "epoch": 0.06051771009810488, "grad_norm": 0.17254213988780975, "learning_rate": 4.495388900167668e-05, "loss": 0.5133, "step": 96 }, { "epoch": 0.061148102911626805, "grad_norm": 0.15270383656024933, "learning_rate": 4.4952437287267376e-05, "loss": 0.4934, "step": 97 }, { "epoch": 0.061778495725148735, "grad_norm": 0.14834915101528168, "learning_rate": 4.4950963097640486e-05, "loss": 0.4598, "step": 98 }, { "epoch": 0.06240888853867066, "grad_norm": 0.14422562718391418, "learning_rate": 4.4949466434271685e-05, "loss": 0.4134, "step": 99 }, { "epoch": 0.06303928135219258, "grad_norm": 0.16636601090431213, "learning_rate": 4.494794729865916e-05, "loss": 0.4554, "step": 100 }, { "epoch": 0.06366967416571451, "grad_norm": 0.17716509103775024, "learning_rate": 4.4946405692323586e-05, "loss": 0.4314, "step": 101 }, { "epoch": 0.06430006697923643, "grad_norm": 0.17074547708034515, "learning_rate": 4.494484161680813e-05, "loss": 0.4192, "step": 102 }, { "epoch": 0.06493045979275837, "grad_norm": 0.14279553294181824, "learning_rate": 4.494325507367846e-05, "loss": 0.3965, "step": 103 }, { "epoch": 0.06556085260628029, "grad_norm": 0.17137177288532257, "learning_rate": 4.4941646064522724e-05, "loss": 0.4707, "step": 104 }, { "epoch": 0.06619124541980222, "grad_norm": 0.1495380699634552, "learning_rate": 4.4940014590951575e-05, "loss": 0.4655, "step": 105 }, { "epoch": 0.06682163823332414, "grad_norm": 0.15420159697532654, "learning_rate": 4.493836065459813e-05, "loss": 0.5068, "step": 106 }, { "epoch": 0.06745203104684606, "grad_norm": 0.1462813764810562, "learning_rate": 4.4936684257118e-05, "loss": 0.3858, "step": 107 }, { "epoch": 0.06808242386036799, "grad_norm": 0.18409891426563263, "learning_rate": 4.4934985400189296e-05, "loss": 0.4831, "step": 108 }, { "epoch": 0.06871281667388991, "grad_norm": 0.1710345447063446, "learning_rate": 4.493326408551259e-05, "loss": 0.4393, "step": 109 }, { "epoch": 0.06934320948741185, "grad_norm": 0.17826968431472778, "learning_rate": 4.493152031481095e-05, "loss": 0.4414, "step": 110 }, { "epoch": 0.06997360230093377, "grad_norm": 0.16518640518188477, "learning_rate": 4.492975408982991e-05, "loss": 0.4779, "step": 111 }, { "epoch": 0.0706039951144557, "grad_norm": 0.2125261276960373, "learning_rate": 4.492796541233749e-05, "loss": 0.4996, "step": 112 }, { "epoch": 0.07123438792797762, "grad_norm": 0.16218961775302887, "learning_rate": 4.4926154284124176e-05, "loss": 0.4495, "step": 113 }, { "epoch": 0.07186478074149955, "grad_norm": 0.16842223703861237, "learning_rate": 4.492432070700295e-05, "loss": 0.4078, "step": 114 }, { "epoch": 0.07249517355502147, "grad_norm": 0.16266340017318726, "learning_rate": 4.492246468280922e-05, "loss": 0.482, "step": 115 }, { "epoch": 0.0731255663685434, "grad_norm": 0.157426655292511, "learning_rate": 4.492058621340093e-05, "loss": 0.4512, "step": 116 }, { "epoch": 0.07375595918206533, "grad_norm": 0.15796181559562683, "learning_rate": 4.4918685300658424e-05, "loss": 0.4002, "step": 117 }, { "epoch": 0.07438635199558725, "grad_norm": 0.15874150395393372, "learning_rate": 4.4916761946484566e-05, "loss": 0.4356, "step": 118 }, { "epoch": 0.07501674480910918, "grad_norm": 0.1592918336391449, "learning_rate": 4.4914816152804645e-05, "loss": 0.4492, "step": 119 }, { "epoch": 0.0756471376226311, "grad_norm": 0.1482664942741394, "learning_rate": 4.491284792156644e-05, "loss": 0.4358, "step": 120 }, { "epoch": 0.07627753043615303, "grad_norm": 0.16430510580539703, "learning_rate": 4.491085725474018e-05, "loss": 0.4256, "step": 121 }, { "epoch": 0.07690792324967495, "grad_norm": 0.16677382588386536, "learning_rate": 4.4908844154318534e-05, "loss": 0.4289, "step": 122 }, { "epoch": 0.07753831606319687, "grad_norm": 0.213206946849823, "learning_rate": 4.490680862231667e-05, "loss": 0.5544, "step": 123 }, { "epoch": 0.07816870887671881, "grad_norm": 0.15621592104434967, "learning_rate": 4.4904750660772165e-05, "loss": 0.4191, "step": 124 }, { "epoch": 0.07879910169024074, "grad_norm": 0.19092267751693726, "learning_rate": 4.490267027174508e-05, "loss": 0.4882, "step": 125 }, { "epoch": 0.07942949450376266, "grad_norm": 0.22929497063159943, "learning_rate": 4.4900567457317904e-05, "loss": 0.5008, "step": 126 }, { "epoch": 0.08005988731728458, "grad_norm": 0.18289487063884735, "learning_rate": 4.48984422195956e-05, "loss": 0.4999, "step": 127 }, { "epoch": 0.08069028013080651, "grad_norm": 0.19063183665275574, "learning_rate": 4.489629456070555e-05, "loss": 0.539, "step": 128 }, { "epoch": 0.08132067294432843, "grad_norm": 0.15307193994522095, "learning_rate": 4.48941244827976e-05, "loss": 0.4312, "step": 129 }, { "epoch": 0.08195106575785036, "grad_norm": 0.1701776683330536, "learning_rate": 4.4891931988044025e-05, "loss": 0.4498, "step": 130 }, { "epoch": 0.0825814585713723, "grad_norm": 0.1710740327835083, "learning_rate": 4.488971707863954e-05, "loss": 0.4704, "step": 131 }, { "epoch": 0.08321185138489422, "grad_norm": 0.17142195999622345, "learning_rate": 4.4887479756801315e-05, "loss": 0.4366, "step": 132 }, { "epoch": 0.08384224419841614, "grad_norm": 0.1602955162525177, "learning_rate": 4.488522002476893e-05, "loss": 0.4615, "step": 133 }, { "epoch": 0.08447263701193806, "grad_norm": 0.19058693945407867, "learning_rate": 4.488293788480441e-05, "loss": 0.4678, "step": 134 }, { "epoch": 0.08510302982545999, "grad_norm": 0.1879880428314209, "learning_rate": 4.488063333919222e-05, "loss": 0.4998, "step": 135 }, { "epoch": 0.08573342263898191, "grad_norm": 0.15370860695838928, "learning_rate": 4.4878306390239236e-05, "loss": 0.4588, "step": 136 }, { "epoch": 0.08636381545250384, "grad_norm": 0.18422965705394745, "learning_rate": 4.487595704027476e-05, "loss": 0.4493, "step": 137 }, { "epoch": 0.08699420826602576, "grad_norm": 0.15197817981243134, "learning_rate": 4.487358529165054e-05, "loss": 0.4298, "step": 138 }, { "epoch": 0.0876246010795477, "grad_norm": 0.19497714936733246, "learning_rate": 4.4871191146740733e-05, "loss": 0.4643, "step": 139 }, { "epoch": 0.08825499389306962, "grad_norm": 0.16512389481067657, "learning_rate": 4.4868774607941894e-05, "loss": 0.4278, "step": 140 }, { "epoch": 0.08888538670659155, "grad_norm": 0.19072328507900238, "learning_rate": 4.4866335677673036e-05, "loss": 0.4674, "step": 141 }, { "epoch": 0.08951577952011347, "grad_norm": 0.18181082606315613, "learning_rate": 4.486387435837555e-05, "loss": 0.5149, "step": 142 }, { "epoch": 0.0901461723336354, "grad_norm": 0.15561337769031525, "learning_rate": 4.486139065251326e-05, "loss": 0.3933, "step": 143 }, { "epoch": 0.09077656514715732, "grad_norm": 0.16787169873714447, "learning_rate": 4.485888456257238e-05, "loss": 0.3935, "step": 144 }, { "epoch": 0.09140695796067924, "grad_norm": 0.19648166000843048, "learning_rate": 4.485635609106156e-05, "loss": 0.4691, "step": 145 }, { "epoch": 0.09203735077420118, "grad_norm": 0.16800902783870697, "learning_rate": 4.485380524051182e-05, "loss": 0.4609, "step": 146 }, { "epoch": 0.0926677435877231, "grad_norm": 0.1822555959224701, "learning_rate": 4.485123201347661e-05, "loss": 0.4519, "step": 147 }, { "epoch": 0.09329813640124503, "grad_norm": 0.18529456853866577, "learning_rate": 4.484863641253176e-05, "loss": 0.4701, "step": 148 }, { "epoch": 0.09392852921476695, "grad_norm": 0.17306600511074066, "learning_rate": 4.484601844027551e-05, "loss": 0.4405, "step": 149 }, { "epoch": 0.09455892202828887, "grad_norm": 0.16045962274074554, "learning_rate": 4.484337809932849e-05, "loss": 0.3381, "step": 150 }, { "epoch": 0.0951893148418108, "grad_norm": 0.20732329785823822, "learning_rate": 4.484071539233371e-05, "loss": 0.5217, "step": 151 }, { "epoch": 0.09581970765533272, "grad_norm": 0.17176291346549988, "learning_rate": 4.483803032195659e-05, "loss": 0.5006, "step": 152 }, { "epoch": 0.09645010046885466, "grad_norm": 0.18901783227920532, "learning_rate": 4.483532289088492e-05, "loss": 0.5125, "step": 153 }, { "epoch": 0.09708049328237658, "grad_norm": 0.16525357961654663, "learning_rate": 4.4832593101828874e-05, "loss": 0.4842, "step": 154 }, { "epoch": 0.09771088609589851, "grad_norm": 0.16560229659080505, "learning_rate": 4.4829840957521014e-05, "loss": 0.4463, "step": 155 }, { "epoch": 0.09834127890942043, "grad_norm": 0.18390001356601715, "learning_rate": 4.482706646071629e-05, "loss": 0.4678, "step": 156 }, { "epoch": 0.09897167172294236, "grad_norm": 0.17232733964920044, "learning_rate": 4.4824269614191985e-05, "loss": 0.4734, "step": 157 }, { "epoch": 0.09960206453646428, "grad_norm": 0.18861058354377747, "learning_rate": 4.482145042074781e-05, "loss": 0.4508, "step": 158 }, { "epoch": 0.1002324573499862, "grad_norm": 0.20135930180549622, "learning_rate": 4.481860888320581e-05, "loss": 0.5127, "step": 159 }, { "epoch": 0.10086285016350814, "grad_norm": 0.15137121081352234, "learning_rate": 4.4815745004410394e-05, "loss": 0.4318, "step": 160 }, { "epoch": 0.10149324297703007, "grad_norm": 0.16308103501796722, "learning_rate": 4.481285878722835e-05, "loss": 0.5055, "step": 161 }, { "epoch": 0.10212363579055199, "grad_norm": 0.14730596542358398, "learning_rate": 4.480995023454885e-05, "loss": 0.4132, "step": 162 }, { "epoch": 0.10275402860407391, "grad_norm": 0.1659645140171051, "learning_rate": 4.4807019349283365e-05, "loss": 0.4884, "step": 163 }, { "epoch": 0.10338442141759584, "grad_norm": 0.17068709433078766, "learning_rate": 4.4804066134365765e-05, "loss": 0.4674, "step": 164 }, { "epoch": 0.10401481423111776, "grad_norm": 0.1809260994195938, "learning_rate": 4.480109059275227e-05, "loss": 0.4232, "step": 165 }, { "epoch": 0.10464520704463968, "grad_norm": 0.17435210943222046, "learning_rate": 4.479809272742143e-05, "loss": 0.4616, "step": 166 }, { "epoch": 0.10527559985816162, "grad_norm": 0.15691553056240082, "learning_rate": 4.479507254137415e-05, "loss": 0.3953, "step": 167 }, { "epoch": 0.10590599267168355, "grad_norm": 0.1464596688747406, "learning_rate": 4.479203003763369e-05, "loss": 0.4254, "step": 168 }, { "epoch": 0.10653638548520547, "grad_norm": 0.16407829523086548, "learning_rate": 4.478896521924564e-05, "loss": 0.4266, "step": 169 }, { "epoch": 0.1071667782987274, "grad_norm": 0.15943580865859985, "learning_rate": 4.478587808927792e-05, "loss": 0.3904, "step": 170 }, { "epoch": 0.10779717111224932, "grad_norm": 0.18695223331451416, "learning_rate": 4.4782768650820794e-05, "loss": 0.5015, "step": 171 }, { "epoch": 0.10842756392577124, "grad_norm": 0.17117992043495178, "learning_rate": 4.477963690698687e-05, "loss": 0.3751, "step": 172 }, { "epoch": 0.10905795673929317, "grad_norm": 0.1756797730922699, "learning_rate": 4.477648286091104e-05, "loss": 0.4789, "step": 173 }, { "epoch": 0.1096883495528151, "grad_norm": 0.16904820501804352, "learning_rate": 4.477330651575057e-05, "loss": 0.5275, "step": 174 }, { "epoch": 0.11031874236633703, "grad_norm": 0.1578703373670578, "learning_rate": 4.4770107874685035e-05, "loss": 0.4494, "step": 175 }, { "epoch": 0.11094913517985895, "grad_norm": 0.16940182447433472, "learning_rate": 4.4766886940916305e-05, "loss": 0.4139, "step": 176 }, { "epoch": 0.11157952799338088, "grad_norm": 0.15721531212329865, "learning_rate": 4.476364371766859e-05, "loss": 0.3958, "step": 177 }, { "epoch": 0.1122099208069028, "grad_norm": 0.17659851908683777, "learning_rate": 4.47603782081884e-05, "loss": 0.4595, "step": 178 }, { "epoch": 0.11284031362042472, "grad_norm": 0.1890040636062622, "learning_rate": 4.475709041574456e-05, "loss": 0.4771, "step": 179 }, { "epoch": 0.11347070643394665, "grad_norm": 0.1727362871170044, "learning_rate": 4.47537803436282e-05, "loss": 0.499, "step": 180 }, { "epoch": 0.11410109924746858, "grad_norm": 0.18926569819450378, "learning_rate": 4.475044799515275e-05, "loss": 0.4292, "step": 181 }, { "epoch": 0.11473149206099051, "grad_norm": 0.17432500422000885, "learning_rate": 4.474709337365393e-05, "loss": 0.4015, "step": 182 }, { "epoch": 0.11536188487451243, "grad_norm": 0.1681092083454132, "learning_rate": 4.474371648248979e-05, "loss": 0.4705, "step": 183 }, { "epoch": 0.11599227768803436, "grad_norm": 0.14282703399658203, "learning_rate": 4.4740317325040615e-05, "loss": 0.3439, "step": 184 }, { "epoch": 0.11662267050155628, "grad_norm": 0.16238975524902344, "learning_rate": 4.473689590470903e-05, "loss": 0.3679, "step": 185 }, { "epoch": 0.1172530633150782, "grad_norm": 0.15502974390983582, "learning_rate": 4.4733452224919926e-05, "loss": 0.4495, "step": 186 }, { "epoch": 0.11788345612860013, "grad_norm": 0.18629074096679688, "learning_rate": 4.472998628912047e-05, "loss": 0.4752, "step": 187 }, { "epoch": 0.11851384894212207, "grad_norm": 0.19327090680599213, "learning_rate": 4.4726498100780116e-05, "loss": 0.4649, "step": 188 }, { "epoch": 0.11914424175564399, "grad_norm": 0.18306037783622742, "learning_rate": 4.47229876633906e-05, "loss": 0.4112, "step": 189 }, { "epoch": 0.11977463456916591, "grad_norm": 0.13204658031463623, "learning_rate": 4.471945498046591e-05, "loss": 0.3987, "step": 190 }, { "epoch": 0.12040502738268784, "grad_norm": 0.13157318532466888, "learning_rate": 4.471590005554233e-05, "loss": 0.3778, "step": 191 }, { "epoch": 0.12103542019620976, "grad_norm": 0.140708789229393, "learning_rate": 4.471232289217837e-05, "loss": 0.3759, "step": 192 }, { "epoch": 0.12166581300973169, "grad_norm": 0.15047132968902588, "learning_rate": 4.4708723493954836e-05, "loss": 0.3844, "step": 193 }, { "epoch": 0.12229620582325361, "grad_norm": 0.1584540754556656, "learning_rate": 4.4705101864474775e-05, "loss": 0.4779, "step": 194 }, { "epoch": 0.12292659863677555, "grad_norm": 0.16349102556705475, "learning_rate": 4.4701458007363485e-05, "loss": 0.3726, "step": 195 }, { "epoch": 0.12355699145029747, "grad_norm": 0.14389654994010925, "learning_rate": 4.4697791926268535e-05, "loss": 0.4332, "step": 196 }, { "epoch": 0.1241873842638194, "grad_norm": 0.15718668699264526, "learning_rate": 4.4694103624859706e-05, "loss": 0.3834, "step": 197 }, { "epoch": 0.12481777707734132, "grad_norm": 0.15873458981513977, "learning_rate": 4.4690393106829065e-05, "loss": 0.4657, "step": 198 }, { "epoch": 0.12544816989086324, "grad_norm": 0.17128777503967285, "learning_rate": 4.468666037589088e-05, "loss": 0.5291, "step": 199 }, { "epoch": 0.12607856270438517, "grad_norm": 0.17686426639556885, "learning_rate": 4.4682905435781666e-05, "loss": 0.3717, "step": 200 }, { "epoch": 0.12607856270438517, "eval_loss": 0.47015270590782166, "eval_runtime": 226.3613, "eval_samples_per_second": 4.418, "eval_steps_per_second": 4.418, "step": 200 }, { "epoch": 0.1267089555179071, "grad_norm": 0.18956759572029114, "learning_rate": 4.4679128290260173e-05, "loss": 0.4833, "step": 201 }, { "epoch": 0.12733934833142901, "grad_norm": 0.17402750253677368, "learning_rate": 4.4675328943107394e-05, "loss": 0.3957, "step": 202 }, { "epoch": 0.12796974114495094, "grad_norm": 0.1294248402118683, "learning_rate": 4.467150739812652e-05, "loss": 0.3687, "step": 203 }, { "epoch": 0.12860013395847286, "grad_norm": 0.15464654564857483, "learning_rate": 4.466766365914297e-05, "loss": 0.4203, "step": 204 }, { "epoch": 0.12923052677199479, "grad_norm": 0.1686786562204361, "learning_rate": 4.4663797730004384e-05, "loss": 0.4615, "step": 205 }, { "epoch": 0.12986091958551674, "grad_norm": 0.18524964153766632, "learning_rate": 4.4659909614580625e-05, "loss": 0.4875, "step": 206 }, { "epoch": 0.13049131239903866, "grad_norm": 0.14806324243545532, "learning_rate": 4.4655999316763744e-05, "loss": 0.4546, "step": 207 }, { "epoch": 0.13112170521256059, "grad_norm": 0.14976917207241058, "learning_rate": 4.465206684046801e-05, "loss": 0.4205, "step": 208 }, { "epoch": 0.1317520980260825, "grad_norm": 0.21427291631698608, "learning_rate": 4.4648112189629884e-05, "loss": 0.4247, "step": 209 }, { "epoch": 0.13238249083960443, "grad_norm": 0.1890469193458557, "learning_rate": 4.4644135368208025e-05, "loss": 0.4702, "step": 210 }, { "epoch": 0.13301288365312636, "grad_norm": 0.14907729625701904, "learning_rate": 4.4640136380183296e-05, "loss": 0.3715, "step": 211 }, { "epoch": 0.13364327646664828, "grad_norm": 0.1610517054796219, "learning_rate": 4.463611522955875e-05, "loss": 0.4277, "step": 212 }, { "epoch": 0.1342736692801702, "grad_norm": 0.16458484530448914, "learning_rate": 4.463207192035961e-05, "loss": 0.4274, "step": 213 }, { "epoch": 0.13490406209369213, "grad_norm": 0.1576109677553177, "learning_rate": 4.462800645663328e-05, "loss": 0.3847, "step": 214 }, { "epoch": 0.13553445490721405, "grad_norm": 0.1626528799533844, "learning_rate": 4.462391884244936e-05, "loss": 0.475, "step": 215 }, { "epoch": 0.13616484772073598, "grad_norm": 0.19211681187152863, "learning_rate": 4.461980908189961e-05, "loss": 0.4966, "step": 216 }, { "epoch": 0.1367952405342579, "grad_norm": 0.13622134923934937, "learning_rate": 4.461567717909796e-05, "loss": 0.4061, "step": 217 }, { "epoch": 0.13742563334777982, "grad_norm": 0.18433628976345062, "learning_rate": 4.46115231381805e-05, "loss": 0.4502, "step": 218 }, { "epoch": 0.13805602616130175, "grad_norm": 0.1514837145805359, "learning_rate": 4.460734696330548e-05, "loss": 0.4209, "step": 219 }, { "epoch": 0.1386864189748237, "grad_norm": 0.18339303135871887, "learning_rate": 4.4603148658653326e-05, "loss": 0.4564, "step": 220 }, { "epoch": 0.13931681178834562, "grad_norm": 0.18273302912712097, "learning_rate": 4.45989282284266e-05, "loss": 0.4833, "step": 221 }, { "epoch": 0.13994720460186755, "grad_norm": 0.15928077697753906, "learning_rate": 4.459468567685001e-05, "loss": 0.4816, "step": 222 }, { "epoch": 0.14057759741538947, "grad_norm": 0.16953788697719574, "learning_rate": 4.459042100817041e-05, "loss": 0.4381, "step": 223 }, { "epoch": 0.1412079902289114, "grad_norm": 0.19472193717956543, "learning_rate": 4.4586134226656804e-05, "loss": 0.4603, "step": 224 }, { "epoch": 0.14183838304243332, "grad_norm": 0.12984232604503632, "learning_rate": 4.458182533660031e-05, "loss": 0.3997, "step": 225 }, { "epoch": 0.14246877585595524, "grad_norm": 0.12314029783010483, "learning_rate": 4.4577494342314205e-05, "loss": 0.3807, "step": 226 }, { "epoch": 0.14309916866947717, "grad_norm": 0.16256219148635864, "learning_rate": 4.457314124813387e-05, "loss": 0.4091, "step": 227 }, { "epoch": 0.1437295614829991, "grad_norm": 0.1522122621536255, "learning_rate": 4.456876605841681e-05, "loss": 0.4658, "step": 228 }, { "epoch": 0.14435995429652101, "grad_norm": 0.15753582119941711, "learning_rate": 4.4564368777542665e-05, "loss": 0.4563, "step": 229 }, { "epoch": 0.14499034711004294, "grad_norm": 0.15977296233177185, "learning_rate": 4.4559949409913166e-05, "loss": 0.398, "step": 230 }, { "epoch": 0.14562073992356486, "grad_norm": 0.13135172426700592, "learning_rate": 4.455550795995218e-05, "loss": 0.4045, "step": 231 }, { "epoch": 0.1462511327370868, "grad_norm": 0.1841035783290863, "learning_rate": 4.4551044432105646e-05, "loss": 0.4978, "step": 232 }, { "epoch": 0.1468815255506087, "grad_norm": 0.13868924975395203, "learning_rate": 4.454655883084163e-05, "loss": 0.4585, "step": 233 }, { "epoch": 0.14751191836413066, "grad_norm": 0.17346960306167603, "learning_rate": 4.454205116065029e-05, "loss": 0.4473, "step": 234 }, { "epoch": 0.14814231117765259, "grad_norm": 0.1713954508304596, "learning_rate": 4.453752142604385e-05, "loss": 0.4346, "step": 235 }, { "epoch": 0.1487727039911745, "grad_norm": 0.17876474559307098, "learning_rate": 4.453296963155666e-05, "loss": 0.5037, "step": 236 }, { "epoch": 0.14940309680469643, "grad_norm": 0.14628243446350098, "learning_rate": 4.452839578174512e-05, "loss": 0.416, "step": 237 }, { "epoch": 0.15003348961821836, "grad_norm": 0.17380249500274658, "learning_rate": 4.452379988118774e-05, "loss": 0.4427, "step": 238 }, { "epoch": 0.15066388243174028, "grad_norm": 0.17569275200366974, "learning_rate": 4.4519181934485054e-05, "loss": 0.4676, "step": 239 }, { "epoch": 0.1512942752452622, "grad_norm": 0.16404937207698822, "learning_rate": 4.451454194625973e-05, "loss": 0.425, "step": 240 }, { "epoch": 0.15192466805878413, "grad_norm": 0.15239328145980835, "learning_rate": 4.450987992115642e-05, "loss": 0.3907, "step": 241 }, { "epoch": 0.15255506087230605, "grad_norm": 0.16638530790805817, "learning_rate": 4.450519586384192e-05, "loss": 0.3964, "step": 242 }, { "epoch": 0.15318545368582798, "grad_norm": 0.16477054357528687, "learning_rate": 4.4500489779005016e-05, "loss": 0.4793, "step": 243 }, { "epoch": 0.1538158464993499, "grad_norm": 0.19650088250637054, "learning_rate": 4.4495761671356574e-05, "loss": 0.5265, "step": 244 }, { "epoch": 0.15444623931287182, "grad_norm": 0.1595885008573532, "learning_rate": 4.4491011545629495e-05, "loss": 0.4893, "step": 245 }, { "epoch": 0.15507663212639375, "grad_norm": 0.17801198363304138, "learning_rate": 4.4486239406578736e-05, "loss": 0.5026, "step": 246 }, { "epoch": 0.15570702493991567, "grad_norm": 0.16517923772335052, "learning_rate": 4.4481445258981266e-05, "loss": 0.3776, "step": 247 }, { "epoch": 0.15633741775343762, "grad_norm": 0.1921938955783844, "learning_rate": 4.44766291076361e-05, "loss": 0.4077, "step": 248 }, { "epoch": 0.15696781056695955, "grad_norm": 0.15358708798885345, "learning_rate": 4.4471790957364284e-05, "loss": 0.4195, "step": 249 }, { "epoch": 0.15759820338048147, "grad_norm": 0.16735200583934784, "learning_rate": 4.446693081300887e-05, "loss": 0.4189, "step": 250 }, { "epoch": 0.1582285961940034, "grad_norm": 0.16738048195838928, "learning_rate": 4.446204867943493e-05, "loss": 0.4908, "step": 251 }, { "epoch": 0.15885898900752532, "grad_norm": 0.15153180062770844, "learning_rate": 4.445714456152955e-05, "loss": 0.5082, "step": 252 }, { "epoch": 0.15948938182104724, "grad_norm": 0.17264169454574585, "learning_rate": 4.445221846420183e-05, "loss": 0.4502, "step": 253 }, { "epoch": 0.16011977463456917, "grad_norm": 0.17027878761291504, "learning_rate": 4.4447270392382865e-05, "loss": 0.4815, "step": 254 }, { "epoch": 0.1607501674480911, "grad_norm": 0.17320281267166138, "learning_rate": 4.444230035102575e-05, "loss": 0.5007, "step": 255 }, { "epoch": 0.16138056026161302, "grad_norm": 0.1453806608915329, "learning_rate": 4.4437308345105554e-05, "loss": 0.4388, "step": 256 }, { "epoch": 0.16201095307513494, "grad_norm": 0.1596660614013672, "learning_rate": 4.443229437961937e-05, "loss": 0.5086, "step": 257 }, { "epoch": 0.16264134588865686, "grad_norm": 0.1486014723777771, "learning_rate": 4.442725845958623e-05, "loss": 0.4556, "step": 258 }, { "epoch": 0.1632717387021788, "grad_norm": 0.15127785503864288, "learning_rate": 4.442220059004717e-05, "loss": 0.467, "step": 259 }, { "epoch": 0.1639021315157007, "grad_norm": 0.1469348967075348, "learning_rate": 4.4417120776065207e-05, "loss": 0.4327, "step": 260 }, { "epoch": 0.16453252432922263, "grad_norm": 0.18044887483119965, "learning_rate": 4.441201902272529e-05, "loss": 0.4523, "step": 261 }, { "epoch": 0.1651629171427446, "grad_norm": 0.17343007028102875, "learning_rate": 4.440689533513435e-05, "loss": 0.4976, "step": 262 }, { "epoch": 0.1657933099562665, "grad_norm": 0.17072178423404694, "learning_rate": 4.440174971842128e-05, "loss": 0.4717, "step": 263 }, { "epoch": 0.16642370276978843, "grad_norm": 0.1547555923461914, "learning_rate": 4.439658217773691e-05, "loss": 0.3654, "step": 264 }, { "epoch": 0.16705409558331036, "grad_norm": 0.15340355038642883, "learning_rate": 4.4391392718254044e-05, "loss": 0.4425, "step": 265 }, { "epoch": 0.16768448839683228, "grad_norm": 0.1566683053970337, "learning_rate": 4.438618134516738e-05, "loss": 0.399, "step": 266 }, { "epoch": 0.1683148812103542, "grad_norm": 0.19167183339595795, "learning_rate": 4.438094806369359e-05, "loss": 0.4727, "step": 267 }, { "epoch": 0.16894527402387613, "grad_norm": 0.1682709902524948, "learning_rate": 4.4375692879071265e-05, "loss": 0.4369, "step": 268 }, { "epoch": 0.16957566683739805, "grad_norm": 0.19236905872821808, "learning_rate": 4.437041579656092e-05, "loss": 0.4765, "step": 269 }, { "epoch": 0.17020605965091998, "grad_norm": 0.20469754934310913, "learning_rate": 4.4365116821445e-05, "loss": 0.5317, "step": 270 }, { "epoch": 0.1708364524644419, "grad_norm": 0.15680178999900818, "learning_rate": 4.435979595902785e-05, "loss": 0.4359, "step": 271 }, { "epoch": 0.17146684527796383, "grad_norm": 0.1784251481294632, "learning_rate": 4.4354453214635725e-05, "loss": 0.5123, "step": 272 }, { "epoch": 0.17209723809148575, "grad_norm": 0.14888904988765717, "learning_rate": 4.4349088593616796e-05, "loss": 0.4192, "step": 273 }, { "epoch": 0.17272763090500767, "grad_norm": 0.18458880484104156, "learning_rate": 4.434370210134113e-05, "loss": 0.5188, "step": 274 }, { "epoch": 0.1733580237185296, "grad_norm": 0.15294192731380463, "learning_rate": 4.433829374320068e-05, "loss": 0.4913, "step": 275 }, { "epoch": 0.17398841653205152, "grad_norm": 0.15933866798877716, "learning_rate": 4.433286352460929e-05, "loss": 0.4916, "step": 276 }, { "epoch": 0.17461880934557347, "grad_norm": 0.21269841492176056, "learning_rate": 4.43274114510027e-05, "loss": 0.5095, "step": 277 }, { "epoch": 0.1752492021590954, "grad_norm": 0.1545397788286209, "learning_rate": 4.4321937527838496e-05, "loss": 0.4158, "step": 278 }, { "epoch": 0.17587959497261732, "grad_norm": 0.1311575025320053, "learning_rate": 4.431644176059617e-05, "loss": 0.3272, "step": 279 }, { "epoch": 0.17650998778613924, "grad_norm": 0.18615727126598358, "learning_rate": 4.431092415477706e-05, "loss": 0.4475, "step": 280 }, { "epoch": 0.17714038059966117, "grad_norm": 0.15645916759967804, "learning_rate": 4.430538471590437e-05, "loss": 0.4821, "step": 281 }, { "epoch": 0.1777707734131831, "grad_norm": 0.15182305872440338, "learning_rate": 4.429982344952316e-05, "loss": 0.3472, "step": 282 }, { "epoch": 0.17840116622670502, "grad_norm": 0.1452929824590683, "learning_rate": 4.429424036120034e-05, "loss": 0.3909, "step": 283 }, { "epoch": 0.17903155904022694, "grad_norm": 0.13374878466129303, "learning_rate": 4.4288635456524654e-05, "loss": 0.396, "step": 284 }, { "epoch": 0.17966195185374886, "grad_norm": 0.17100194096565247, "learning_rate": 4.428300874110671e-05, "loss": 0.5189, "step": 285 }, { "epoch": 0.1802923446672708, "grad_norm": 0.21815529465675354, "learning_rate": 4.427736022057892e-05, "loss": 0.4698, "step": 286 }, { "epoch": 0.1809227374807927, "grad_norm": 0.13070501387119293, "learning_rate": 4.427168990059554e-05, "loss": 0.4313, "step": 287 }, { "epoch": 0.18155313029431464, "grad_norm": 0.15077956020832062, "learning_rate": 4.426599778683263e-05, "loss": 0.4386, "step": 288 }, { "epoch": 0.18218352310783656, "grad_norm": 0.16122879087924957, "learning_rate": 4.426028388498811e-05, "loss": 0.4387, "step": 289 }, { "epoch": 0.18281391592135848, "grad_norm": 0.16982218623161316, "learning_rate": 4.4254548200781644e-05, "loss": 0.5156, "step": 290 }, { "epoch": 0.18344430873488043, "grad_norm": 0.16496285796165466, "learning_rate": 4.4248790739954764e-05, "loss": 0.4783, "step": 291 }, { "epoch": 0.18407470154840236, "grad_norm": 0.15385004878044128, "learning_rate": 4.424301150827075e-05, "loss": 0.4368, "step": 292 }, { "epoch": 0.18470509436192428, "grad_norm": 0.16213051974773407, "learning_rate": 4.423721051151472e-05, "loss": 0.4644, "step": 293 }, { "epoch": 0.1853354871754462, "grad_norm": 0.13787910342216492, "learning_rate": 4.423138775549353e-05, "loss": 0.4552, "step": 294 }, { "epoch": 0.18596587998896813, "grad_norm": 0.18587398529052734, "learning_rate": 4.422554324603586e-05, "loss": 0.5059, "step": 295 }, { "epoch": 0.18659627280249005, "grad_norm": 0.15263552963733673, "learning_rate": 4.421967698899215e-05, "loss": 0.4196, "step": 296 }, { "epoch": 0.18722666561601198, "grad_norm": 0.1526150405406952, "learning_rate": 4.421378899023461e-05, "loss": 0.403, "step": 297 }, { "epoch": 0.1878570584295339, "grad_norm": 0.15569239854812622, "learning_rate": 4.420787925565721e-05, "loss": 0.4181, "step": 298 }, { "epoch": 0.18848745124305583, "grad_norm": 0.15740294754505157, "learning_rate": 4.420194779117567e-05, "loss": 0.4911, "step": 299 }, { "epoch": 0.18911784405657775, "grad_norm": 0.1633409857749939, "learning_rate": 4.419599460272748e-05, "loss": 0.4817, "step": 300 }, { "epoch": 0.18974823687009967, "grad_norm": 0.14763297140598297, "learning_rate": 4.419001969627188e-05, "loss": 0.4, "step": 301 }, { "epoch": 0.1903786296836216, "grad_norm": 0.15027102828025818, "learning_rate": 4.4184023077789824e-05, "loss": 0.4388, "step": 302 }, { "epoch": 0.19100902249714352, "grad_norm": 0.1539442092180252, "learning_rate": 4.417800475328402e-05, "loss": 0.4624, "step": 303 }, { "epoch": 0.19163941531066545, "grad_norm": 0.17854462563991547, "learning_rate": 4.41719647287789e-05, "loss": 0.4952, "step": 304 }, { "epoch": 0.1922698081241874, "grad_norm": 0.1574363261461258, "learning_rate": 4.4165903010320614e-05, "loss": 0.4102, "step": 305 }, { "epoch": 0.19290020093770932, "grad_norm": 0.14707572758197784, "learning_rate": 4.4159819603977035e-05, "loss": 0.3848, "step": 306 }, { "epoch": 0.19353059375123124, "grad_norm": 0.1398138850927353, "learning_rate": 4.4153714515837734e-05, "loss": 0.3968, "step": 307 }, { "epoch": 0.19416098656475317, "grad_norm": 0.16176103055477142, "learning_rate": 4.4147587752014e-05, "loss": 0.4948, "step": 308 }, { "epoch": 0.1947913793782751, "grad_norm": 0.1668642908334732, "learning_rate": 4.414143931863881e-05, "loss": 0.4635, "step": 309 }, { "epoch": 0.19542177219179702, "grad_norm": 0.14107371866703033, "learning_rate": 4.413526922186683e-05, "loss": 0.3535, "step": 310 }, { "epoch": 0.19605216500531894, "grad_norm": 0.15369193255901337, "learning_rate": 4.412907746787444e-05, "loss": 0.3719, "step": 311 }, { "epoch": 0.19668255781884086, "grad_norm": 0.16373153030872345, "learning_rate": 4.412286406285964e-05, "loss": 0.4232, "step": 312 }, { "epoch": 0.1973129506323628, "grad_norm": 0.13650447130203247, "learning_rate": 4.411662901304217e-05, "loss": 0.4114, "step": 313 }, { "epoch": 0.1979433434458847, "grad_norm": 0.17574451863765717, "learning_rate": 4.411037232466339e-05, "loss": 0.5067, "step": 314 }, { "epoch": 0.19857373625940664, "grad_norm": 0.19951093196868896, "learning_rate": 4.410409400398634e-05, "loss": 0.5274, "step": 315 }, { "epoch": 0.19920412907292856, "grad_norm": 0.15199385583400726, "learning_rate": 4.409779405729572e-05, "loss": 0.4369, "step": 316 }, { "epoch": 0.19983452188645048, "grad_norm": 0.1495632529258728, "learning_rate": 4.409147249089786e-05, "loss": 0.4449, "step": 317 }, { "epoch": 0.2004649146999724, "grad_norm": 0.16323602199554443, "learning_rate": 4.408512931112073e-05, "loss": 0.42, "step": 318 }, { "epoch": 0.20109530751349436, "grad_norm": 0.16794022917747498, "learning_rate": 4.407876452431397e-05, "loss": 0.4548, "step": 319 }, { "epoch": 0.20172570032701628, "grad_norm": 0.1724035143852234, "learning_rate": 4.40723781368488e-05, "loss": 0.4969, "step": 320 }, { "epoch": 0.2023560931405382, "grad_norm": 0.16559875011444092, "learning_rate": 4.40659701551181e-05, "loss": 0.4681, "step": 321 }, { "epoch": 0.20298648595406013, "grad_norm": 0.1622600555419922, "learning_rate": 4.405954058553635e-05, "loss": 0.5081, "step": 322 }, { "epoch": 0.20361687876758205, "grad_norm": 0.1619727909564972, "learning_rate": 4.405308943453965e-05, "loss": 0.4478, "step": 323 }, { "epoch": 0.20424727158110398, "grad_norm": 0.12752674520015717, "learning_rate": 4.404661670858568e-05, "loss": 0.3779, "step": 324 }, { "epoch": 0.2048776643946259, "grad_norm": 0.163992777466774, "learning_rate": 4.404012241415375e-05, "loss": 0.3743, "step": 325 }, { "epoch": 0.20550805720814783, "grad_norm": 0.16608774662017822, "learning_rate": 4.403360655774474e-05, "loss": 0.4472, "step": 326 }, { "epoch": 0.20613845002166975, "grad_norm": 0.1689046025276184, "learning_rate": 4.4027069145881116e-05, "loss": 0.4166, "step": 327 }, { "epoch": 0.20676884283519167, "grad_norm": 0.180549755692482, "learning_rate": 4.402051018510691e-05, "loss": 0.4872, "step": 328 }, { "epoch": 0.2073992356487136, "grad_norm": 0.18701449036598206, "learning_rate": 4.4013929681987766e-05, "loss": 0.4188, "step": 329 }, { "epoch": 0.20802962846223552, "grad_norm": 0.161424919962883, "learning_rate": 4.400732764311084e-05, "loss": 0.4628, "step": 330 }, { "epoch": 0.20866002127575745, "grad_norm": 0.140961155295372, "learning_rate": 4.400070407508487e-05, "loss": 0.4086, "step": 331 }, { "epoch": 0.20929041408927937, "grad_norm": 0.12989573180675507, "learning_rate": 4.399405898454015e-05, "loss": 0.4245, "step": 332 }, { "epoch": 0.20992080690280132, "grad_norm": 0.1708124876022339, "learning_rate": 4.398739237812852e-05, "loss": 0.4754, "step": 333 }, { "epoch": 0.21055119971632325, "grad_norm": 0.1742548793554306, "learning_rate": 4.3980704262523335e-05, "loss": 0.4322, "step": 334 }, { "epoch": 0.21118159252984517, "grad_norm": 0.12426616251468658, "learning_rate": 4.3973994644419515e-05, "loss": 0.3293, "step": 335 }, { "epoch": 0.2118119853433671, "grad_norm": 0.12249540537595749, "learning_rate": 4.396726353053346e-05, "loss": 0.3001, "step": 336 }, { "epoch": 0.21244237815688902, "grad_norm": 0.19791898131370544, "learning_rate": 4.396051092760314e-05, "loss": 0.4886, "step": 337 }, { "epoch": 0.21307277097041094, "grad_norm": 0.18151599168777466, "learning_rate": 4.3953736842388006e-05, "loss": 0.4644, "step": 338 }, { "epoch": 0.21370316378393286, "grad_norm": 0.15587259829044342, "learning_rate": 4.3946941281669e-05, "loss": 0.4522, "step": 339 }, { "epoch": 0.2143335565974548, "grad_norm": 0.14205743372440338, "learning_rate": 4.394012425224858e-05, "loss": 0.4457, "step": 340 }, { "epoch": 0.2149639494109767, "grad_norm": 0.1723662167787552, "learning_rate": 4.39332857609507e-05, "loss": 0.4654, "step": 341 }, { "epoch": 0.21559434222449864, "grad_norm": 0.169973224401474, "learning_rate": 4.392642581462079e-05, "loss": 0.4172, "step": 342 }, { "epoch": 0.21622473503802056, "grad_norm": 0.20729491114616394, "learning_rate": 4.391954442012576e-05, "loss": 0.5264, "step": 343 }, { "epoch": 0.21685512785154248, "grad_norm": 0.14070910215377808, "learning_rate": 4.3912641584353965e-05, "loss": 0.3569, "step": 344 }, { "epoch": 0.2174855206650644, "grad_norm": 0.1460694670677185, "learning_rate": 4.390571731421527e-05, "loss": 0.3603, "step": 345 }, { "epoch": 0.21811591347858633, "grad_norm": 0.17333248257637024, "learning_rate": 4.389877161664096e-05, "loss": 0.4305, "step": 346 }, { "epoch": 0.21874630629210828, "grad_norm": 0.17414788901805878, "learning_rate": 4.3891804498583776e-05, "loss": 0.4788, "step": 347 }, { "epoch": 0.2193766991056302, "grad_norm": 0.13664431869983673, "learning_rate": 4.388481596701791e-05, "loss": 0.3736, "step": 348 }, { "epoch": 0.22000709191915213, "grad_norm": 0.17150871455669403, "learning_rate": 4.387780602893898e-05, "loss": 0.4915, "step": 349 }, { "epoch": 0.22063748473267406, "grad_norm": 0.15720312297344208, "learning_rate": 4.387077469136404e-05, "loss": 0.4359, "step": 350 }, { "epoch": 0.22126787754619598, "grad_norm": 0.15001578629016876, "learning_rate": 4.386372196133157e-05, "loss": 0.4145, "step": 351 }, { "epoch": 0.2218982703597179, "grad_norm": 0.18784606456756592, "learning_rate": 4.385664784590144e-05, "loss": 0.5409, "step": 352 }, { "epoch": 0.22252866317323983, "grad_norm": 0.18099607527256012, "learning_rate": 4.384955235215495e-05, "loss": 0.427, "step": 353 }, { "epoch": 0.22315905598676175, "grad_norm": 0.13089902698993683, "learning_rate": 4.3842435487194786e-05, "loss": 0.471, "step": 354 }, { "epoch": 0.22378944880028367, "grad_norm": 0.1351536512374878, "learning_rate": 4.383529725814504e-05, "loss": 0.4013, "step": 355 }, { "epoch": 0.2244198416138056, "grad_norm": 0.14495325088500977, "learning_rate": 4.3828137672151194e-05, "loss": 0.4244, "step": 356 }, { "epoch": 0.22505023442732752, "grad_norm": 0.1470225751399994, "learning_rate": 4.382095673638008e-05, "loss": 0.424, "step": 357 }, { "epoch": 0.22568062724084945, "grad_norm": 0.1563224196434021, "learning_rate": 4.3813754458019925e-05, "loss": 0.4162, "step": 358 }, { "epoch": 0.22631102005437137, "grad_norm": 0.13134637475013733, "learning_rate": 4.3806530844280324e-05, "loss": 0.3668, "step": 359 }, { "epoch": 0.2269414128678933, "grad_norm": 0.14569929242134094, "learning_rate": 4.379928590239221e-05, "loss": 0.3828, "step": 360 }, { "epoch": 0.22757180568141525, "grad_norm": 0.138423353433609, "learning_rate": 4.379201963960788e-05, "loss": 0.4409, "step": 361 }, { "epoch": 0.22820219849493717, "grad_norm": 0.16363699734210968, "learning_rate": 4.378473206320097e-05, "loss": 0.449, "step": 362 }, { "epoch": 0.2288325913084591, "grad_norm": 0.14801304042339325, "learning_rate": 4.3777423180466436e-05, "loss": 0.4623, "step": 363 }, { "epoch": 0.22946298412198102, "grad_norm": 0.20460020005702972, "learning_rate": 4.3770092998720605e-05, "loss": 0.5039, "step": 364 }, { "epoch": 0.23009337693550294, "grad_norm": 0.17865395545959473, "learning_rate": 4.376274152530107e-05, "loss": 0.4483, "step": 365 }, { "epoch": 0.23072376974902487, "grad_norm": 0.17483441531658173, "learning_rate": 4.3755368767566766e-05, "loss": 0.3916, "step": 366 }, { "epoch": 0.2313541625625468, "grad_norm": 0.13383352756500244, "learning_rate": 4.3747974732897945e-05, "loss": 0.379, "step": 367 }, { "epoch": 0.2319845553760687, "grad_norm": 0.1292165219783783, "learning_rate": 4.374055942869614e-05, "loss": 0.3773, "step": 368 }, { "epoch": 0.23261494818959064, "grad_norm": 0.1722308248281479, "learning_rate": 4.373312286238417e-05, "loss": 0.4104, "step": 369 }, { "epoch": 0.23324534100311256, "grad_norm": 0.13454218208789825, "learning_rate": 4.372566504140615e-05, "loss": 0.4483, "step": 370 }, { "epoch": 0.23387573381663448, "grad_norm": 0.16832982003688812, "learning_rate": 4.3718185973227477e-05, "loss": 0.3664, "step": 371 }, { "epoch": 0.2345061266301564, "grad_norm": 0.18032234907150269, "learning_rate": 4.371068566533479e-05, "loss": 0.4919, "step": 372 }, { "epoch": 0.23513651944367833, "grad_norm": 0.16197332739830017, "learning_rate": 4.3703164125236024e-05, "loss": 0.4078, "step": 373 }, { "epoch": 0.23576691225720026, "grad_norm": 0.14080944657325745, "learning_rate": 4.369562136046034e-05, "loss": 0.4235, "step": 374 }, { "epoch": 0.23639730507072218, "grad_norm": 0.13689732551574707, "learning_rate": 4.368805737855816e-05, "loss": 0.4384, "step": 375 }, { "epoch": 0.23702769788424413, "grad_norm": 0.13256917893886566, "learning_rate": 4.3680472187101135e-05, "loss": 0.3782, "step": 376 }, { "epoch": 0.23765809069776606, "grad_norm": 0.16238057613372803, "learning_rate": 4.367286579368216e-05, "loss": 0.4169, "step": 377 }, { "epoch": 0.23828848351128798, "grad_norm": 0.18068745732307434, "learning_rate": 4.366523820591535e-05, "loss": 0.4064, "step": 378 }, { "epoch": 0.2389188763248099, "grad_norm": 0.15773656964302063, "learning_rate": 4.365758943143602e-05, "loss": 0.4131, "step": 379 }, { "epoch": 0.23954926913833183, "grad_norm": 0.1459636241197586, "learning_rate": 4.36499194779007e-05, "loss": 0.4, "step": 380 }, { "epoch": 0.24017966195185375, "grad_norm": 0.15780167281627655, "learning_rate": 4.364222835298714e-05, "loss": 0.3767, "step": 381 }, { "epoch": 0.24081005476537568, "grad_norm": 0.19021441042423248, "learning_rate": 4.3634516064394264e-05, "loss": 0.4205, "step": 382 }, { "epoch": 0.2414404475788976, "grad_norm": 0.1581646203994751, "learning_rate": 4.362678261984218e-05, "loss": 0.4128, "step": 383 }, { "epoch": 0.24207084039241952, "grad_norm": 0.17075695097446442, "learning_rate": 4.361902802707219e-05, "loss": 0.4132, "step": 384 }, { "epoch": 0.24270123320594145, "grad_norm": 0.14225608110427856, "learning_rate": 4.361125229384674e-05, "loss": 0.3943, "step": 385 }, { "epoch": 0.24333162601946337, "grad_norm": 0.14167103171348572, "learning_rate": 4.3603455427949465e-05, "loss": 0.4041, "step": 386 }, { "epoch": 0.2439620188329853, "grad_norm": 0.16991916298866272, "learning_rate": 4.3595637437185133e-05, "loss": 0.4617, "step": 387 }, { "epoch": 0.24459241164650722, "grad_norm": 0.1786823570728302, "learning_rate": 4.358779832937968e-05, "loss": 0.5302, "step": 388 }, { "epoch": 0.24522280446002914, "grad_norm": 0.17845723032951355, "learning_rate": 4.3579938112380154e-05, "loss": 0.4843, "step": 389 }, { "epoch": 0.2458531972735511, "grad_norm": 0.14564692974090576, "learning_rate": 4.357205679405475e-05, "loss": 0.3841, "step": 390 }, { "epoch": 0.24648359008707302, "grad_norm": 0.1389242708683014, "learning_rate": 4.356415438229279e-05, "loss": 0.3282, "step": 391 }, { "epoch": 0.24711398290059494, "grad_norm": 0.15760691463947296, "learning_rate": 4.355623088500469e-05, "loss": 0.3952, "step": 392 }, { "epoch": 0.24774437571411687, "grad_norm": 0.24291901290416718, "learning_rate": 4.354828631012199e-05, "loss": 0.5629, "step": 393 }, { "epoch": 0.2483747685276388, "grad_norm": 0.1695348471403122, "learning_rate": 4.354032066559735e-05, "loss": 0.4149, "step": 394 }, { "epoch": 0.2490051613411607, "grad_norm": 0.16605013608932495, "learning_rate": 4.353233395940445e-05, "loss": 0.4163, "step": 395 }, { "epoch": 0.24963555415468264, "grad_norm": 0.1307298243045807, "learning_rate": 4.3524326199538144e-05, "loss": 0.3642, "step": 396 }, { "epoch": 0.2502659469682046, "grad_norm": 0.15121614933013916, "learning_rate": 4.351629739401429e-05, "loss": 0.3987, "step": 397 }, { "epoch": 0.2508963397817265, "grad_norm": 0.13745561242103577, "learning_rate": 4.3508247550869856e-05, "loss": 0.39, "step": 398 }, { "epoch": 0.25152673259524844, "grad_norm": 0.14871643483638763, "learning_rate": 4.350017667816283e-05, "loss": 0.3458, "step": 399 }, { "epoch": 0.25215712540877033, "grad_norm": 0.14765329658985138, "learning_rate": 4.34920847839723e-05, "loss": 0.3782, "step": 400 }, { "epoch": 0.25215712540877033, "eval_loss": 0.45531120896339417, "eval_runtime": 222.4652, "eval_samples_per_second": 4.495, "eval_steps_per_second": 4.495, "step": 400 }, { "epoch": 0.2527875182222923, "grad_norm": 0.18058902025222778, "learning_rate": 4.348397187639835e-05, "loss": 0.4872, "step": 401 }, { "epoch": 0.2534179110358142, "grad_norm": 0.1355857402086258, "learning_rate": 4.347583796356213e-05, "loss": 0.3838, "step": 402 }, { "epoch": 0.25404830384933613, "grad_norm": 0.14737853407859802, "learning_rate": 4.3467683053605806e-05, "loss": 0.4694, "step": 403 }, { "epoch": 0.25467869666285803, "grad_norm": 0.14057587087154388, "learning_rate": 4.3459507154692555e-05, "loss": 0.4103, "step": 404 }, { "epoch": 0.25530908947638, "grad_norm": 0.1664663404226303, "learning_rate": 4.3451310275006576e-05, "loss": 0.5004, "step": 405 }, { "epoch": 0.2559394822899019, "grad_norm": 0.12404580414295197, "learning_rate": 4.344309242275306e-05, "loss": 0.3744, "step": 406 }, { "epoch": 0.25656987510342383, "grad_norm": 0.13321265578269958, "learning_rate": 4.343485360615821e-05, "loss": 0.3566, "step": 407 }, { "epoch": 0.2572002679169457, "grad_norm": 0.17092850804328918, "learning_rate": 4.342659383346918e-05, "loss": 0.4143, "step": 408 }, { "epoch": 0.2578306607304677, "grad_norm": 0.14100387692451477, "learning_rate": 4.341831311295415e-05, "loss": 0.4269, "step": 409 }, { "epoch": 0.25846105354398957, "grad_norm": 0.1483694463968277, "learning_rate": 4.341001145290222e-05, "loss": 0.4244, "step": 410 }, { "epoch": 0.2590914463575115, "grad_norm": 0.1616392880678177, "learning_rate": 4.340168886162348e-05, "loss": 0.4426, "step": 411 }, { "epoch": 0.2597218391710335, "grad_norm": 0.1394892781972885, "learning_rate": 4.3393345347448964e-05, "loss": 0.4675, "step": 412 }, { "epoch": 0.26035223198455537, "grad_norm": 0.14571578800678253, "learning_rate": 4.338498091873066e-05, "loss": 0.3836, "step": 413 }, { "epoch": 0.2609826247980773, "grad_norm": 0.16842153668403625, "learning_rate": 4.3376595583841474e-05, "loss": 0.4068, "step": 414 }, { "epoch": 0.2616130176115992, "grad_norm": 0.16881997883319855, "learning_rate": 4.336818935117525e-05, "loss": 0.4047, "step": 415 }, { "epoch": 0.26224341042512117, "grad_norm": 0.15377576649188995, "learning_rate": 4.335976222914675e-05, "loss": 0.4123, "step": 416 }, { "epoch": 0.26287380323864307, "grad_norm": 0.1499597281217575, "learning_rate": 4.3351314226191656e-05, "loss": 0.4568, "step": 417 }, { "epoch": 0.263504196052165, "grad_norm": 0.13230498135089874, "learning_rate": 4.334284535076653e-05, "loss": 0.3797, "step": 418 }, { "epoch": 0.2641345888656869, "grad_norm": 0.14309710264205933, "learning_rate": 4.333435561134885e-05, "loss": 0.458, "step": 419 }, { "epoch": 0.26476498167920887, "grad_norm": 0.18320833146572113, "learning_rate": 4.332584501643695e-05, "loss": 0.4548, "step": 420 }, { "epoch": 0.26539537449273076, "grad_norm": 0.17113445699214935, "learning_rate": 4.331731357455009e-05, "loss": 0.4286, "step": 421 }, { "epoch": 0.2660257673062527, "grad_norm": 0.1291307806968689, "learning_rate": 4.3308761294228334e-05, "loss": 0.3992, "step": 422 }, { "epoch": 0.2666561601197746, "grad_norm": 0.13772858679294586, "learning_rate": 4.330018818403266e-05, "loss": 0.4343, "step": 423 }, { "epoch": 0.26728655293329656, "grad_norm": 0.16687078773975372, "learning_rate": 4.329159425254487e-05, "loss": 0.4587, "step": 424 }, { "epoch": 0.2679169457468185, "grad_norm": 0.13438096642494202, "learning_rate": 4.328297950836762e-05, "loss": 0.355, "step": 425 }, { "epoch": 0.2685473385603404, "grad_norm": 0.1611950397491455, "learning_rate": 4.3274343960124407e-05, "loss": 0.3935, "step": 426 }, { "epoch": 0.26917773137386236, "grad_norm": 0.143065944314003, "learning_rate": 4.3265687616459506e-05, "loss": 0.4134, "step": 427 }, { "epoch": 0.26980812418738426, "grad_norm": 0.14825253188610077, "learning_rate": 4.325701048603808e-05, "loss": 0.4583, "step": 428 }, { "epoch": 0.2704385170009062, "grad_norm": 0.17021827399730682, "learning_rate": 4.324831257754603e-05, "loss": 0.5149, "step": 429 }, { "epoch": 0.2710689098144281, "grad_norm": 0.1331031769514084, "learning_rate": 4.3239593899690125e-05, "loss": 0.3283, "step": 430 }, { "epoch": 0.27169930262795006, "grad_norm": 0.1685783565044403, "learning_rate": 4.323085446119785e-05, "loss": 0.52, "step": 431 }, { "epoch": 0.27232969544147195, "grad_norm": 0.15166236460208893, "learning_rate": 4.3222094270817546e-05, "loss": 0.4498, "step": 432 }, { "epoch": 0.2729600882549939, "grad_norm": 0.15902966260910034, "learning_rate": 4.321331333731828e-05, "loss": 0.4609, "step": 433 }, { "epoch": 0.2735904810685158, "grad_norm": 0.17017829418182373, "learning_rate": 4.3204511669489874e-05, "loss": 0.4214, "step": 434 }, { "epoch": 0.27422087388203775, "grad_norm": 0.163391575217247, "learning_rate": 4.319568927614295e-05, "loss": 0.4652, "step": 435 }, { "epoch": 0.27485126669555965, "grad_norm": 0.16209104657173157, "learning_rate": 4.318684616610885e-05, "loss": 0.4726, "step": 436 }, { "epoch": 0.2754816595090816, "grad_norm": 0.16812391579151154, "learning_rate": 4.3177982348239656e-05, "loss": 0.4635, "step": 437 }, { "epoch": 0.2761120523226035, "grad_norm": 0.16276055574417114, "learning_rate": 4.316909783140816e-05, "loss": 0.4833, "step": 438 }, { "epoch": 0.27674244513612545, "grad_norm": 0.1471432000398636, "learning_rate": 4.316019262450792e-05, "loss": 0.4585, "step": 439 }, { "epoch": 0.2773728379496474, "grad_norm": 0.1366051733493805, "learning_rate": 4.315126673645316e-05, "loss": 0.3718, "step": 440 }, { "epoch": 0.2780032307631693, "grad_norm": 0.14559337496757507, "learning_rate": 4.314232017617883e-05, "loss": 0.4245, "step": 441 }, { "epoch": 0.27863362357669125, "grad_norm": 0.14588779211044312, "learning_rate": 4.313335295264058e-05, "loss": 0.4753, "step": 442 }, { "epoch": 0.27926401639021314, "grad_norm": 0.16295523941516876, "learning_rate": 4.31243650748147e-05, "loss": 0.4217, "step": 443 }, { "epoch": 0.2798944092037351, "grad_norm": 0.1639915555715561, "learning_rate": 4.311535655169822e-05, "loss": 0.3766, "step": 444 }, { "epoch": 0.280524802017257, "grad_norm": 0.140822172164917, "learning_rate": 4.310632739230879e-05, "loss": 0.3774, "step": 445 }, { "epoch": 0.28115519483077894, "grad_norm": 0.14565636217594147, "learning_rate": 4.309727760568473e-05, "loss": 0.4208, "step": 446 }, { "epoch": 0.28178558764430084, "grad_norm": 0.14595231413841248, "learning_rate": 4.3088207200884995e-05, "loss": 0.4299, "step": 447 }, { "epoch": 0.2824159804578228, "grad_norm": 0.1536986231803894, "learning_rate": 4.307911618698922e-05, "loss": 0.4555, "step": 448 }, { "epoch": 0.2830463732713447, "grad_norm": 0.12186324596405029, "learning_rate": 4.307000457309763e-05, "loss": 0.4347, "step": 449 }, { "epoch": 0.28367676608486664, "grad_norm": 0.1660432517528534, "learning_rate": 4.3060872368331074e-05, "loss": 0.4679, "step": 450 }, { "epoch": 0.28430715889838853, "grad_norm": 0.15520548820495605, "learning_rate": 4.3051719581831036e-05, "loss": 0.4418, "step": 451 }, { "epoch": 0.2849375517119105, "grad_norm": 0.14485125243663788, "learning_rate": 4.304254622275958e-05, "loss": 0.4048, "step": 452 }, { "epoch": 0.28556794452543244, "grad_norm": 0.15754371881484985, "learning_rate": 4.3033352300299386e-05, "loss": 0.5319, "step": 453 }, { "epoch": 0.28619833733895433, "grad_norm": 0.17553724348545074, "learning_rate": 4.302413782365369e-05, "loss": 0.4475, "step": 454 }, { "epoch": 0.2868287301524763, "grad_norm": 0.1662897914648056, "learning_rate": 4.3014902802046315e-05, "loss": 0.4004, "step": 455 }, { "epoch": 0.2874591229659982, "grad_norm": 0.15161022543907166, "learning_rate": 4.300564724472167e-05, "loss": 0.4386, "step": 456 }, { "epoch": 0.28808951577952013, "grad_norm": 0.1543479859828949, "learning_rate": 4.2996371160944684e-05, "loss": 0.5125, "step": 457 }, { "epoch": 0.28871990859304203, "grad_norm": 0.14676740765571594, "learning_rate": 4.298707456000087e-05, "loss": 0.4285, "step": 458 }, { "epoch": 0.289350301406564, "grad_norm": 0.14895333349704742, "learning_rate": 4.297775745119625e-05, "loss": 0.4448, "step": 459 }, { "epoch": 0.2899806942200859, "grad_norm": 0.12872503697872162, "learning_rate": 4.296841984385739e-05, "loss": 0.3532, "step": 460 }, { "epoch": 0.29061108703360783, "grad_norm": 0.15547753870487213, "learning_rate": 4.295906174733137e-05, "loss": 0.4387, "step": 461 }, { "epoch": 0.2912414798471297, "grad_norm": 0.12598636746406555, "learning_rate": 4.294968317098578e-05, "loss": 0.3872, "step": 462 }, { "epoch": 0.2918718726606517, "grad_norm": 0.1377699226140976, "learning_rate": 4.2940284124208704e-05, "loss": 0.3701, "step": 463 }, { "epoch": 0.2925022654741736, "grad_norm": 0.1805099993944168, "learning_rate": 4.293086461640875e-05, "loss": 0.488, "step": 464 }, { "epoch": 0.2931326582876955, "grad_norm": 0.12898395955562592, "learning_rate": 4.2921424657014954e-05, "loss": 0.319, "step": 465 }, { "epoch": 0.2937630511012174, "grad_norm": 0.16145475208759308, "learning_rate": 4.291196425547687e-05, "loss": 0.4519, "step": 466 }, { "epoch": 0.29439344391473937, "grad_norm": 0.1678912341594696, "learning_rate": 4.290248342126451e-05, "loss": 0.4225, "step": 467 }, { "epoch": 0.2950238367282613, "grad_norm": 0.1473826915025711, "learning_rate": 4.289298216386829e-05, "loss": 0.3601, "step": 468 }, { "epoch": 0.2956542295417832, "grad_norm": 0.1619638204574585, "learning_rate": 4.288346049279914e-05, "loss": 0.4531, "step": 469 }, { "epoch": 0.29628462235530517, "grad_norm": 0.17025795578956604, "learning_rate": 4.287391841758838e-05, "loss": 0.4791, "step": 470 }, { "epoch": 0.29691501516882707, "grad_norm": 0.15568993985652924, "learning_rate": 4.286435594778776e-05, "loss": 0.4427, "step": 471 }, { "epoch": 0.297545407982349, "grad_norm": 0.1638522893190384, "learning_rate": 4.285477309296948e-05, "loss": 0.4029, "step": 472 }, { "epoch": 0.2981758007958709, "grad_norm": 0.1754094958305359, "learning_rate": 4.284516986272607e-05, "loss": 0.3822, "step": 473 }, { "epoch": 0.29880619360939287, "grad_norm": 0.15131361782550812, "learning_rate": 4.283554626667055e-05, "loss": 0.3986, "step": 474 }, { "epoch": 0.29943658642291476, "grad_norm": 0.1398685723543167, "learning_rate": 4.282590231443625e-05, "loss": 0.4285, "step": 475 }, { "epoch": 0.3000669792364367, "grad_norm": 0.15816210210323334, "learning_rate": 4.281623801567693e-05, "loss": 0.4436, "step": 476 }, { "epoch": 0.3006973720499586, "grad_norm": 0.16846077144145966, "learning_rate": 4.2806553380066674e-05, "loss": 0.3985, "step": 477 }, { "epoch": 0.30132776486348056, "grad_norm": 0.17154532670974731, "learning_rate": 4.279684841729996e-05, "loss": 0.4526, "step": 478 }, { "epoch": 0.30195815767700246, "grad_norm": 0.13398754596710205, "learning_rate": 4.2787123137091576e-05, "loss": 0.3518, "step": 479 }, { "epoch": 0.3025885504905244, "grad_norm": 0.14154496788978577, "learning_rate": 4.277737754917669e-05, "loss": 0.3814, "step": 480 }, { "epoch": 0.3032189433040463, "grad_norm": 0.1629718393087387, "learning_rate": 4.276761166331077e-05, "loss": 0.4, "step": 481 }, { "epoch": 0.30384933611756826, "grad_norm": 0.14869852364063263, "learning_rate": 4.2757825489269616e-05, "loss": 0.3429, "step": 482 }, { "epoch": 0.3044797289310902, "grad_norm": 0.15486106276512146, "learning_rate": 4.2748019036849325e-05, "loss": 0.3757, "step": 483 }, { "epoch": 0.3051101217446121, "grad_norm": 0.1632099747657776, "learning_rate": 4.273819231586629e-05, "loss": 0.3948, "step": 484 }, { "epoch": 0.30574051455813406, "grad_norm": 0.17350728809833527, "learning_rate": 4.272834533615723e-05, "loss": 0.4351, "step": 485 }, { "epoch": 0.30637090737165595, "grad_norm": 0.14235197007656097, "learning_rate": 4.271847810757908e-05, "loss": 0.3961, "step": 486 }, { "epoch": 0.3070013001851779, "grad_norm": 0.1834893822669983, "learning_rate": 4.270859064000911e-05, "loss": 0.4808, "step": 487 }, { "epoch": 0.3076316929986998, "grad_norm": 0.13989755511283875, "learning_rate": 4.2698682943344804e-05, "loss": 0.4272, "step": 488 }, { "epoch": 0.30826208581222175, "grad_norm": 0.1976575404405594, "learning_rate": 4.268875502750391e-05, "loss": 0.469, "step": 489 }, { "epoch": 0.30889247862574365, "grad_norm": 0.17637616395950317, "learning_rate": 4.267880690242443e-05, "loss": 0.4712, "step": 490 }, { "epoch": 0.3095228714392656, "grad_norm": 0.16208115220069885, "learning_rate": 4.266883857806456e-05, "loss": 0.4406, "step": 491 }, { "epoch": 0.3101532642527875, "grad_norm": 0.14250457286834717, "learning_rate": 4.2658850064402763e-05, "loss": 0.4061, "step": 492 }, { "epoch": 0.31078365706630945, "grad_norm": 0.15330681204795837, "learning_rate": 4.264884137143767e-05, "loss": 0.3706, "step": 493 }, { "epoch": 0.31141404987983135, "grad_norm": 0.13295991718769073, "learning_rate": 4.263881250918814e-05, "loss": 0.4272, "step": 494 }, { "epoch": 0.3120444426933533, "grad_norm": 0.143403097987175, "learning_rate": 4.2628763487693205e-05, "loss": 0.3909, "step": 495 }, { "epoch": 0.31267483550687525, "grad_norm": 0.17583061754703522, "learning_rate": 4.261869431701208e-05, "loss": 0.4029, "step": 496 }, { "epoch": 0.31330522832039714, "grad_norm": 0.134292870759964, "learning_rate": 4.260860500722415e-05, "loss": 0.3743, "step": 497 }, { "epoch": 0.3139356211339191, "grad_norm": 0.14064282178878784, "learning_rate": 4.259849556842898e-05, "loss": 0.3862, "step": 498 }, { "epoch": 0.314566013947441, "grad_norm": 0.2020834982395172, "learning_rate": 4.258836601074624e-05, "loss": 0.4301, "step": 499 }, { "epoch": 0.31519640676096294, "grad_norm": 0.17847569286823273, "learning_rate": 4.257821634431578e-05, "loss": 0.4587, "step": 500 }, { "epoch": 0.31582679957448484, "grad_norm": 0.16131030023097992, "learning_rate": 4.256804657929756e-05, "loss": 0.4235, "step": 501 }, { "epoch": 0.3164571923880068, "grad_norm": 0.11063003540039062, "learning_rate": 4.255785672587166e-05, "loss": 0.3409, "step": 502 }, { "epoch": 0.3170875852015287, "grad_norm": 0.16887235641479492, "learning_rate": 4.2547646794238276e-05, "loss": 0.4017, "step": 503 }, { "epoch": 0.31771797801505064, "grad_norm": 0.1624932438135147, "learning_rate": 4.253741679461771e-05, "loss": 0.4041, "step": 504 }, { "epoch": 0.31834837082857254, "grad_norm": 0.17934207618236542, "learning_rate": 4.2527166737250324e-05, "loss": 0.4966, "step": 505 }, { "epoch": 0.3189787636420945, "grad_norm": 0.15029865503311157, "learning_rate": 4.2516896632396576e-05, "loss": 0.4555, "step": 506 }, { "epoch": 0.3196091564556164, "grad_norm": 0.14951272308826447, "learning_rate": 4.2506606490337006e-05, "loss": 0.4224, "step": 507 }, { "epoch": 0.32023954926913833, "grad_norm": 0.15957953035831451, "learning_rate": 4.249629632137218e-05, "loss": 0.4615, "step": 508 }, { "epoch": 0.32086994208266023, "grad_norm": 0.18410411477088928, "learning_rate": 4.2485966135822736e-05, "loss": 0.522, "step": 509 }, { "epoch": 0.3215003348961822, "grad_norm": 0.13453824818134308, "learning_rate": 4.247561594402934e-05, "loss": 0.3849, "step": 510 }, { "epoch": 0.32213072770970413, "grad_norm": 0.1895299255847931, "learning_rate": 4.246524575635269e-05, "loss": 0.4298, "step": 511 }, { "epoch": 0.32276112052322603, "grad_norm": 0.17183034121990204, "learning_rate": 4.245485558317348e-05, "loss": 0.4133, "step": 512 }, { "epoch": 0.323391513336748, "grad_norm": 0.1502702534198761, "learning_rate": 4.244444543489244e-05, "loss": 0.4127, "step": 513 }, { "epoch": 0.3240219061502699, "grad_norm": 0.11526153981685638, "learning_rate": 4.243401532193027e-05, "loss": 0.3994, "step": 514 }, { "epoch": 0.32465229896379183, "grad_norm": 0.13428835570812225, "learning_rate": 4.242356525472767e-05, "loss": 0.4021, "step": 515 }, { "epoch": 0.3252826917773137, "grad_norm": 0.16996537148952484, "learning_rate": 4.241309524374531e-05, "loss": 0.4596, "step": 516 }, { "epoch": 0.3259130845908357, "grad_norm": 0.19075356423854828, "learning_rate": 4.240260529946383e-05, "loss": 0.5052, "step": 517 }, { "epoch": 0.3265434774043576, "grad_norm": 0.16824814677238464, "learning_rate": 4.239209543238381e-05, "loss": 0.4298, "step": 518 }, { "epoch": 0.3271738702178795, "grad_norm": 0.1488647311925888, "learning_rate": 4.238156565302578e-05, "loss": 0.3637, "step": 519 }, { "epoch": 0.3278042630314014, "grad_norm": 0.14669464528560638, "learning_rate": 4.2371015971930204e-05, "loss": 0.3611, "step": 520 }, { "epoch": 0.3284346558449234, "grad_norm": 0.14535008370876312, "learning_rate": 4.236044639965747e-05, "loss": 0.4349, "step": 521 }, { "epoch": 0.32906504865844527, "grad_norm": 0.14576545357704163, "learning_rate": 4.234985694678787e-05, "loss": 0.3882, "step": 522 }, { "epoch": 0.3296954414719672, "grad_norm": 0.16440042853355408, "learning_rate": 4.23392476239216e-05, "loss": 0.4597, "step": 523 }, { "epoch": 0.3303258342854892, "grad_norm": 0.15812115371227264, "learning_rate": 4.232861844167875e-05, "loss": 0.4408, "step": 524 }, { "epoch": 0.33095622709901107, "grad_norm": 0.15020525455474854, "learning_rate": 4.231796941069929e-05, "loss": 0.3864, "step": 525 }, { "epoch": 0.331586619912533, "grad_norm": 0.14310987293720245, "learning_rate": 4.230730054164305e-05, "loss": 0.4158, "step": 526 }, { "epoch": 0.3322170127260549, "grad_norm": 0.1488959640264511, "learning_rate": 4.229661184518973e-05, "loss": 0.3834, "step": 527 }, { "epoch": 0.33284740553957687, "grad_norm": 0.15532805025577545, "learning_rate": 4.228590333203886e-05, "loss": 0.383, "step": 528 }, { "epoch": 0.33347779835309876, "grad_norm": 0.16951856017112732, "learning_rate": 4.227517501290983e-05, "loss": 0.4878, "step": 529 }, { "epoch": 0.3341081911666207, "grad_norm": 0.1473962813615799, "learning_rate": 4.226442689854184e-05, "loss": 0.4814, "step": 530 }, { "epoch": 0.3347385839801426, "grad_norm": 0.15749426186084747, "learning_rate": 4.225365899969391e-05, "loss": 0.4194, "step": 531 }, { "epoch": 0.33536897679366456, "grad_norm": 0.16457794606685638, "learning_rate": 4.2242871327144855e-05, "loss": 0.4758, "step": 532 }, { "epoch": 0.33599936960718646, "grad_norm": 0.19138643145561218, "learning_rate": 4.2232063891693305e-05, "loss": 0.4677, "step": 533 }, { "epoch": 0.3366297624207084, "grad_norm": 0.15384329855442047, "learning_rate": 4.222123670415766e-05, "loss": 0.4107, "step": 534 }, { "epoch": 0.3372601552342303, "grad_norm": 0.1445811688899994, "learning_rate": 4.2210389775376076e-05, "loss": 0.3981, "step": 535 }, { "epoch": 0.33789054804775226, "grad_norm": 0.14559048414230347, "learning_rate": 4.219952311620651e-05, "loss": 0.4463, "step": 536 }, { "epoch": 0.33852094086127416, "grad_norm": 0.1466781497001648, "learning_rate": 4.2188636737526634e-05, "loss": 0.464, "step": 537 }, { "epoch": 0.3391513336747961, "grad_norm": 0.14743733406066895, "learning_rate": 4.217773065023388e-05, "loss": 0.4083, "step": 538 }, { "epoch": 0.33978172648831806, "grad_norm": 0.15406803786754608, "learning_rate": 4.2166804865245384e-05, "loss": 0.4374, "step": 539 }, { "epoch": 0.34041211930183995, "grad_norm": 0.1378432959318161, "learning_rate": 4.2155859393498034e-05, "loss": 0.3851, "step": 540 }, { "epoch": 0.3410425121153619, "grad_norm": 0.15630073845386505, "learning_rate": 4.2144894245948404e-05, "loss": 0.39, "step": 541 }, { "epoch": 0.3416729049288838, "grad_norm": 0.1572435051202774, "learning_rate": 4.213390943357275e-05, "loss": 0.4445, "step": 542 }, { "epoch": 0.34230329774240575, "grad_norm": 0.16819582879543304, "learning_rate": 4.2122904967367056e-05, "loss": 0.4271, "step": 543 }, { "epoch": 0.34293369055592765, "grad_norm": 0.16458523273468018, "learning_rate": 4.211188085834694e-05, "loss": 0.4699, "step": 544 }, { "epoch": 0.3435640833694496, "grad_norm": 0.1598554253578186, "learning_rate": 4.210083711754769e-05, "loss": 0.4468, "step": 545 }, { "epoch": 0.3441944761829715, "grad_norm": 0.18264490365982056, "learning_rate": 4.208977375602426e-05, "loss": 0.3785, "step": 546 }, { "epoch": 0.34482486899649345, "grad_norm": 0.1308012455701828, "learning_rate": 4.2078690784851236e-05, "loss": 0.3724, "step": 547 }, { "epoch": 0.34545526181001535, "grad_norm": 0.14296194911003113, "learning_rate": 4.206758821512283e-05, "loss": 0.389, "step": 548 }, { "epoch": 0.3460856546235373, "grad_norm": 0.14228825271129608, "learning_rate": 4.205646605795289e-05, "loss": 0.3776, "step": 549 }, { "epoch": 0.3467160474370592, "grad_norm": 0.1453482061624527, "learning_rate": 4.204532432447484e-05, "loss": 0.4125, "step": 550 }, { "epoch": 0.34734644025058115, "grad_norm": 0.1499478816986084, "learning_rate": 4.203416302584172e-05, "loss": 0.378, "step": 551 }, { "epoch": 0.34797683306410304, "grad_norm": 0.16379188001155853, "learning_rate": 4.202298217322616e-05, "loss": 0.4554, "step": 552 }, { "epoch": 0.348607225877625, "grad_norm": 0.16106747090816498, "learning_rate": 4.201178177782036e-05, "loss": 0.3583, "step": 553 }, { "epoch": 0.34923761869114694, "grad_norm": 0.13125036656856537, "learning_rate": 4.2000561850836056e-05, "loss": 0.403, "step": 554 }, { "epoch": 0.34986801150466884, "grad_norm": 0.15825971961021423, "learning_rate": 4.19893224035046e-05, "loss": 0.3882, "step": 555 }, { "epoch": 0.3504984043181908, "grad_norm": 0.12176766246557236, "learning_rate": 4.19780634470768e-05, "loss": 0.3691, "step": 556 }, { "epoch": 0.3511287971317127, "grad_norm": 0.14360551536083221, "learning_rate": 4.196678499282307e-05, "loss": 0.4186, "step": 557 }, { "epoch": 0.35175918994523464, "grad_norm": 0.15045621991157532, "learning_rate": 4.195548705203328e-05, "loss": 0.4297, "step": 558 }, { "epoch": 0.35238958275875654, "grad_norm": 0.15988411009311676, "learning_rate": 4.194416963601685e-05, "loss": 0.4631, "step": 559 }, { "epoch": 0.3530199755722785, "grad_norm": 0.15470652282238007, "learning_rate": 4.193283275610268e-05, "loss": 0.3725, "step": 560 }, { "epoch": 0.3536503683858004, "grad_norm": 0.15270668268203735, "learning_rate": 4.192147642363915e-05, "loss": 0.4065, "step": 561 }, { "epoch": 0.35428076119932234, "grad_norm": 0.14180682599544525, "learning_rate": 4.19101006499941e-05, "loss": 0.4172, "step": 562 }, { "epoch": 0.35491115401284423, "grad_norm": 0.1411629319190979, "learning_rate": 4.189870544655487e-05, "loss": 0.394, "step": 563 }, { "epoch": 0.3555415468263662, "grad_norm": 0.1397046446800232, "learning_rate": 4.1887290824728205e-05, "loss": 0.3998, "step": 564 }, { "epoch": 0.3561719396398881, "grad_norm": 0.11284170299768448, "learning_rate": 4.1875856795940316e-05, "loss": 0.3202, "step": 565 }, { "epoch": 0.35680233245341003, "grad_norm": 0.13387620449066162, "learning_rate": 4.186440337163684e-05, "loss": 0.4024, "step": 566 }, { "epoch": 0.357432725266932, "grad_norm": 0.12860415875911713, "learning_rate": 4.18529305632828e-05, "loss": 0.4669, "step": 567 }, { "epoch": 0.3580631180804539, "grad_norm": 0.1481974571943283, "learning_rate": 4.184143838236267e-05, "loss": 0.4428, "step": 568 }, { "epoch": 0.35869351089397583, "grad_norm": 0.15754854679107666, "learning_rate": 4.182992684038028e-05, "loss": 0.4612, "step": 569 }, { "epoch": 0.3593239037074977, "grad_norm": 0.17479227483272552, "learning_rate": 4.181839594885884e-05, "loss": 0.47, "step": 570 }, { "epoch": 0.3599542965210197, "grad_norm": 0.15785165131092072, "learning_rate": 4.180684571934097e-05, "loss": 0.4033, "step": 571 }, { "epoch": 0.3605846893345416, "grad_norm": 0.16270500421524048, "learning_rate": 4.179527616338859e-05, "loss": 0.4469, "step": 572 }, { "epoch": 0.3612150821480635, "grad_norm": 0.14534880220890045, "learning_rate": 4.1783687292583004e-05, "loss": 0.3873, "step": 573 }, { "epoch": 0.3618454749615854, "grad_norm": 0.12993444502353668, "learning_rate": 4.177207911852485e-05, "loss": 0.3789, "step": 574 }, { "epoch": 0.3624758677751074, "grad_norm": 0.12628543376922607, "learning_rate": 4.176045165283407e-05, "loss": 0.3762, "step": 575 }, { "epoch": 0.36310626058862927, "grad_norm": 0.15444335341453552, "learning_rate": 4.1748804907149925e-05, "loss": 0.3623, "step": 576 }, { "epoch": 0.3637366534021512, "grad_norm": 0.17257718741893768, "learning_rate": 4.173713889313099e-05, "loss": 0.4264, "step": 577 }, { "epoch": 0.3643670462156731, "grad_norm": 0.1389797180891037, "learning_rate": 4.1725453622455094e-05, "loss": 0.3736, "step": 578 }, { "epoch": 0.36499743902919507, "grad_norm": 0.14798113703727722, "learning_rate": 4.171374910681938e-05, "loss": 0.4604, "step": 579 }, { "epoch": 0.36562783184271697, "grad_norm": 0.16598446667194366, "learning_rate": 4.170202535794024e-05, "loss": 0.4466, "step": 580 }, { "epoch": 0.3662582246562389, "grad_norm": 0.14121782779693604, "learning_rate": 4.169028238755331e-05, "loss": 0.3377, "step": 581 }, { "epoch": 0.36688861746976087, "grad_norm": 0.13968297839164734, "learning_rate": 4.1678520207413485e-05, "loss": 0.4261, "step": 582 }, { "epoch": 0.36751901028328277, "grad_norm": 0.14780119061470032, "learning_rate": 4.166673882929487e-05, "loss": 0.4008, "step": 583 }, { "epoch": 0.3681494030968047, "grad_norm": 0.1448272168636322, "learning_rate": 4.1654938264990785e-05, "loss": 0.399, "step": 584 }, { "epoch": 0.3687797959103266, "grad_norm": 0.1570119857788086, "learning_rate": 4.16431185263138e-05, "loss": 0.4486, "step": 585 }, { "epoch": 0.36941018872384856, "grad_norm": 0.13365842401981354, "learning_rate": 4.1631279625095616e-05, "loss": 0.4559, "step": 586 }, { "epoch": 0.37004058153737046, "grad_norm": 0.142498180270195, "learning_rate": 4.161942157318716e-05, "loss": 0.3981, "step": 587 }, { "epoch": 0.3706709743508924, "grad_norm": 0.15279634296894073, "learning_rate": 4.160754438245851e-05, "loss": 0.3982, "step": 588 }, { "epoch": 0.3713013671644143, "grad_norm": 0.1295880526304245, "learning_rate": 4.159564806479891e-05, "loss": 0.3687, "step": 589 }, { "epoch": 0.37193175997793626, "grad_norm": 0.18762393295764923, "learning_rate": 4.1583732632116755e-05, "loss": 0.4631, "step": 590 }, { "epoch": 0.37256215279145816, "grad_norm": 0.12456995248794556, "learning_rate": 4.1571798096339544e-05, "loss": 0.4014, "step": 591 }, { "epoch": 0.3731925456049801, "grad_norm": 0.1376989632844925, "learning_rate": 4.1559844469413945e-05, "loss": 0.3587, "step": 592 }, { "epoch": 0.373822938418502, "grad_norm": 0.16048815846443176, "learning_rate": 4.154787176330571e-05, "loss": 0.4018, "step": 593 }, { "epoch": 0.37445333123202396, "grad_norm": 0.13161379098892212, "learning_rate": 4.153587998999969e-05, "loss": 0.4113, "step": 594 }, { "epoch": 0.3750837240455459, "grad_norm": 0.1387777328491211, "learning_rate": 4.152386916149981e-05, "loss": 0.3606, "step": 595 }, { "epoch": 0.3757141168590678, "grad_norm": 0.17377012968063354, "learning_rate": 4.1511839289829106e-05, "loss": 0.4833, "step": 596 }, { "epoch": 0.37634450967258976, "grad_norm": 0.1923179179430008, "learning_rate": 4.1499790387029646e-05, "loss": 0.4308, "step": 597 }, { "epoch": 0.37697490248611165, "grad_norm": 0.15764404833316803, "learning_rate": 4.1487722465162555e-05, "loss": 0.4458, "step": 598 }, { "epoch": 0.3776052952996336, "grad_norm": 0.1681838035583496, "learning_rate": 4.1475635536308e-05, "loss": 0.4405, "step": 599 }, { "epoch": 0.3782356881131555, "grad_norm": 0.154836967587471, "learning_rate": 4.146352961256518e-05, "loss": 0.4381, "step": 600 }, { "epoch": 0.3782356881131555, "eval_loss": 0.444912314414978, "eval_runtime": 222.7209, "eval_samples_per_second": 4.49, "eval_steps_per_second": 4.49, "step": 600 }, { "epoch": 0.37886608092667745, "grad_norm": 0.16694478690624237, "learning_rate": 4.14514047060523e-05, "loss": 0.4219, "step": 601 }, { "epoch": 0.37949647374019935, "grad_norm": 0.17993447184562683, "learning_rate": 4.1439260828906556e-05, "loss": 0.4688, "step": 602 }, { "epoch": 0.3801268665537213, "grad_norm": 0.14644283056259155, "learning_rate": 4.142709799328417e-05, "loss": 0.4383, "step": 603 }, { "epoch": 0.3807572593672432, "grad_norm": 0.1624111384153366, "learning_rate": 4.141491621136029e-05, "loss": 0.4815, "step": 604 }, { "epoch": 0.38138765218076515, "grad_norm": 0.14999568462371826, "learning_rate": 4.140271549532908e-05, "loss": 0.4006, "step": 605 }, { "epoch": 0.38201804499428704, "grad_norm": 0.1663047969341278, "learning_rate": 4.139049585740363e-05, "loss": 0.4411, "step": 606 }, { "epoch": 0.382648437807809, "grad_norm": 0.1374640315771103, "learning_rate": 4.137825730981597e-05, "loss": 0.3797, "step": 607 }, { "epoch": 0.3832788306213309, "grad_norm": 0.16739337146282196, "learning_rate": 4.136599986481706e-05, "loss": 0.3494, "step": 608 }, { "epoch": 0.38390922343485284, "grad_norm": 0.15691962838172913, "learning_rate": 4.1353723534676795e-05, "loss": 0.3653, "step": 609 }, { "epoch": 0.3845396162483748, "grad_norm": 0.14026010036468506, "learning_rate": 4.1341428331683954e-05, "loss": 0.425, "step": 610 }, { "epoch": 0.3851700090618967, "grad_norm": 0.13732515275478363, "learning_rate": 4.132911426814621e-05, "loss": 0.3926, "step": 611 }, { "epoch": 0.38580040187541864, "grad_norm": 0.21322603523731232, "learning_rate": 4.1316781356390134e-05, "loss": 0.5112, "step": 612 }, { "epoch": 0.38643079468894054, "grad_norm": 0.16545470058918, "learning_rate": 4.130442960876113e-05, "loss": 0.4436, "step": 613 }, { "epoch": 0.3870611875024625, "grad_norm": 0.1380622237920761, "learning_rate": 4.129205903762351e-05, "loss": 0.4221, "step": 614 }, { "epoch": 0.3876915803159844, "grad_norm": 0.14784802496433258, "learning_rate": 4.127966965536036e-05, "loss": 0.3305, "step": 615 }, { "epoch": 0.38832197312950634, "grad_norm": 0.1654745489358902, "learning_rate": 4.126726147437366e-05, "loss": 0.4587, "step": 616 }, { "epoch": 0.38895236594302823, "grad_norm": 0.16123948991298676, "learning_rate": 4.1254834507084165e-05, "loss": 0.4208, "step": 617 }, { "epoch": 0.3895827587565502, "grad_norm": 0.1417972445487976, "learning_rate": 4.124238876593145e-05, "loss": 0.349, "step": 618 }, { "epoch": 0.3902131515700721, "grad_norm": 0.13607749342918396, "learning_rate": 4.122992426337389e-05, "loss": 0.3971, "step": 619 }, { "epoch": 0.39084354438359403, "grad_norm": 0.16371870040893555, "learning_rate": 4.1217441011888644e-05, "loss": 0.4607, "step": 620 }, { "epoch": 0.39147393719711593, "grad_norm": 0.15990233421325684, "learning_rate": 4.120493902397161e-05, "loss": 0.4536, "step": 621 }, { "epoch": 0.3921043300106379, "grad_norm": 0.1312706023454666, "learning_rate": 4.119241831213747e-05, "loss": 0.3943, "step": 622 }, { "epoch": 0.39273472282415983, "grad_norm": 0.16973647475242615, "learning_rate": 4.117987888891963e-05, "loss": 0.4315, "step": 623 }, { "epoch": 0.39336511563768173, "grad_norm": 0.1553269475698471, "learning_rate": 4.116732076687026e-05, "loss": 0.451, "step": 624 }, { "epoch": 0.3939955084512037, "grad_norm": 0.13316068053245544, "learning_rate": 4.1154743958560195e-05, "loss": 0.3701, "step": 625 }, { "epoch": 0.3946259012647256, "grad_norm": 0.142917200922966, "learning_rate": 4.114214847657902e-05, "loss": 0.4027, "step": 626 }, { "epoch": 0.3952562940782475, "grad_norm": 0.16515448689460754, "learning_rate": 4.112953433353499e-05, "loss": 0.4192, "step": 627 }, { "epoch": 0.3958866868917694, "grad_norm": 0.1329452246427536, "learning_rate": 4.1116901542055044e-05, "loss": 0.3855, "step": 628 }, { "epoch": 0.3965170797052914, "grad_norm": 0.19352348148822784, "learning_rate": 4.1104250114784805e-05, "loss": 0.4795, "step": 629 }, { "epoch": 0.39714747251881327, "grad_norm": 0.1591757833957672, "learning_rate": 4.109158006438853e-05, "loss": 0.4383, "step": 630 }, { "epoch": 0.3977778653323352, "grad_norm": 0.15093408524990082, "learning_rate": 4.107889140354911e-05, "loss": 0.428, "step": 631 }, { "epoch": 0.3984082581458571, "grad_norm": 0.13541729748249054, "learning_rate": 4.1066184144968096e-05, "loss": 0.3743, "step": 632 }, { "epoch": 0.39903865095937907, "grad_norm": 0.15193648636341095, "learning_rate": 4.105345830136563e-05, "loss": 0.4469, "step": 633 }, { "epoch": 0.39966904377290097, "grad_norm": 0.1544993370771408, "learning_rate": 4.104071388548048e-05, "loss": 0.3963, "step": 634 }, { "epoch": 0.4002994365864229, "grad_norm": 0.17132483422756195, "learning_rate": 4.102795091006998e-05, "loss": 0.4785, "step": 635 }, { "epoch": 0.4009298293999448, "grad_norm": 0.18319329619407654, "learning_rate": 4.1015169387910056e-05, "loss": 0.398, "step": 636 }, { "epoch": 0.40156022221346677, "grad_norm": 0.1142900213599205, "learning_rate": 4.100236933179521e-05, "loss": 0.3647, "step": 637 }, { "epoch": 0.4021906150269887, "grad_norm": 0.1454845666885376, "learning_rate": 4.098955075453847e-05, "loss": 0.4765, "step": 638 }, { "epoch": 0.4028210078405106, "grad_norm": 0.15059219300746918, "learning_rate": 4.097671366897143e-05, "loss": 0.382, "step": 639 }, { "epoch": 0.40345140065403257, "grad_norm": 0.15379232168197632, "learning_rate": 4.0963858087944186e-05, "loss": 0.3915, "step": 640 }, { "epoch": 0.40408179346755446, "grad_norm": 0.14436917006969452, "learning_rate": 4.095098402432538e-05, "loss": 0.4359, "step": 641 }, { "epoch": 0.4047121862810764, "grad_norm": 0.16968920826911926, "learning_rate": 4.0938091491002125e-05, "loss": 0.3922, "step": 642 }, { "epoch": 0.4053425790945983, "grad_norm": 0.14354845881462097, "learning_rate": 4.092518050088005e-05, "loss": 0.4116, "step": 643 }, { "epoch": 0.40597297190812026, "grad_norm": 0.13624794781208038, "learning_rate": 4.091225106688322e-05, "loss": 0.4232, "step": 644 }, { "epoch": 0.40660336472164216, "grad_norm": 0.16531537473201752, "learning_rate": 4.089930320195421e-05, "loss": 0.4418, "step": 645 }, { "epoch": 0.4072337575351641, "grad_norm": 0.1489722579717636, "learning_rate": 4.088633691905402e-05, "loss": 0.41, "step": 646 }, { "epoch": 0.407864150348686, "grad_norm": 0.140624538064003, "learning_rate": 4.0873352231162064e-05, "loss": 0.4101, "step": 647 }, { "epoch": 0.40849454316220796, "grad_norm": 0.14307831227779388, "learning_rate": 4.0860349151276235e-05, "loss": 0.3795, "step": 648 }, { "epoch": 0.40912493597572985, "grad_norm": 0.1908441036939621, "learning_rate": 4.0847327692412787e-05, "loss": 0.4538, "step": 649 }, { "epoch": 0.4097553287892518, "grad_norm": 0.1263931393623352, "learning_rate": 4.08342878676064e-05, "loss": 0.3504, "step": 650 }, { "epoch": 0.4103857216027737, "grad_norm": 0.14092887938022614, "learning_rate": 4.0821229689910125e-05, "loss": 0.4195, "step": 651 }, { "epoch": 0.41101611441629565, "grad_norm": 0.1607782244682312, "learning_rate": 4.0808153172395394e-05, "loss": 0.3918, "step": 652 }, { "epoch": 0.4116465072298176, "grad_norm": 0.12833623588085175, "learning_rate": 4.079505832815198e-05, "loss": 0.3821, "step": 653 }, { "epoch": 0.4122769000433395, "grad_norm": 0.14452293515205383, "learning_rate": 4.078194517028802e-05, "loss": 0.4111, "step": 654 }, { "epoch": 0.41290729285686145, "grad_norm": 0.17976075410842896, "learning_rate": 4.076881371192998e-05, "loss": 0.473, "step": 655 }, { "epoch": 0.41353768567038335, "grad_norm": 0.1213824599981308, "learning_rate": 4.0755663966222645e-05, "loss": 0.3589, "step": 656 }, { "epoch": 0.4141680784839053, "grad_norm": 0.13618458807468414, "learning_rate": 4.0742495946329086e-05, "loss": 0.3873, "step": 657 }, { "epoch": 0.4147984712974272, "grad_norm": 0.12898501753807068, "learning_rate": 4.07293096654307e-05, "loss": 0.3933, "step": 658 }, { "epoch": 0.41542886411094915, "grad_norm": 0.1469496637582779, "learning_rate": 4.071610513672713e-05, "loss": 0.4204, "step": 659 }, { "epoch": 0.41605925692447104, "grad_norm": 0.14399708807468414, "learning_rate": 4.070288237343632e-05, "loss": 0.4844, "step": 660 }, { "epoch": 0.416689649737993, "grad_norm": 0.14005786180496216, "learning_rate": 4.068964138879444e-05, "loss": 0.4463, "step": 661 }, { "epoch": 0.4173200425515149, "grad_norm": 0.15872123837471008, "learning_rate": 4.0676382196055906e-05, "loss": 0.4447, "step": 662 }, { "epoch": 0.41795043536503684, "grad_norm": 0.1603812277317047, "learning_rate": 4.066310480849337e-05, "loss": 0.3953, "step": 663 }, { "epoch": 0.41858082817855874, "grad_norm": 0.1350301206111908, "learning_rate": 4.064980923939769e-05, "loss": 0.415, "step": 664 }, { "epoch": 0.4192112209920807, "grad_norm": 0.16543471813201904, "learning_rate": 4.063649550207792e-05, "loss": 0.4775, "step": 665 }, { "epoch": 0.41984161380560264, "grad_norm": 0.1276547908782959, "learning_rate": 4.0623163609861316e-05, "loss": 0.375, "step": 666 }, { "epoch": 0.42047200661912454, "grad_norm": 0.14537298679351807, "learning_rate": 4.0609813576093285e-05, "loss": 0.3697, "step": 667 }, { "epoch": 0.4211023994326465, "grad_norm": 0.14902625977993011, "learning_rate": 4.059644541413741e-05, "loss": 0.4501, "step": 668 }, { "epoch": 0.4217327922461684, "grad_norm": 0.1425814926624298, "learning_rate": 4.058305913737542e-05, "loss": 0.3701, "step": 669 }, { "epoch": 0.42236318505969034, "grad_norm": 0.1408214420080185, "learning_rate": 4.0569654759207164e-05, "loss": 0.357, "step": 670 }, { "epoch": 0.42299357787321223, "grad_norm": 0.15346527099609375, "learning_rate": 4.055623229305063e-05, "loss": 0.3679, "step": 671 }, { "epoch": 0.4236239706867342, "grad_norm": 0.14200004935264587, "learning_rate": 4.0542791752341904e-05, "loss": 0.374, "step": 672 }, { "epoch": 0.4242543635002561, "grad_norm": 0.13923034071922302, "learning_rate": 4.0529333150535145e-05, "loss": 0.4081, "step": 673 }, { "epoch": 0.42488475631377803, "grad_norm": 0.1957700550556183, "learning_rate": 4.051585650110264e-05, "loss": 0.5211, "step": 674 }, { "epoch": 0.42551514912729993, "grad_norm": 0.1250585913658142, "learning_rate": 4.0502361817534686e-05, "loss": 0.3656, "step": 675 }, { "epoch": 0.4261455419408219, "grad_norm": 0.14980201423168182, "learning_rate": 4.048884911333967e-05, "loss": 0.3976, "step": 676 }, { "epoch": 0.4267759347543438, "grad_norm": 0.14705874025821686, "learning_rate": 4.0475318402043996e-05, "loss": 0.4219, "step": 677 }, { "epoch": 0.42740632756786573, "grad_norm": 0.17089523375034332, "learning_rate": 4.046176969719212e-05, "loss": 0.5121, "step": 678 }, { "epoch": 0.4280367203813876, "grad_norm": 0.14580880105495453, "learning_rate": 4.0448203012346486e-05, "loss": 0.3927, "step": 679 }, { "epoch": 0.4286671131949096, "grad_norm": 0.14165879786014557, "learning_rate": 4.043461836108754e-05, "loss": 0.3613, "step": 680 }, { "epoch": 0.42929750600843153, "grad_norm": 0.17066994309425354, "learning_rate": 4.0421015757013714e-05, "loss": 0.4222, "step": 681 }, { "epoch": 0.4299278988219534, "grad_norm": 0.1570414900779724, "learning_rate": 4.040739521374143e-05, "loss": 0.42, "step": 682 }, { "epoch": 0.4305582916354754, "grad_norm": 0.16085675358772278, "learning_rate": 4.0393756744905025e-05, "loss": 0.3998, "step": 683 }, { "epoch": 0.4311886844489973, "grad_norm": 0.13752546906471252, "learning_rate": 4.038010036415683e-05, "loss": 0.4599, "step": 684 }, { "epoch": 0.4318190772625192, "grad_norm": 0.14098380506038666, "learning_rate": 4.036642608516707e-05, "loss": 0.4026, "step": 685 }, { "epoch": 0.4324494700760411, "grad_norm": 0.14366577565670013, "learning_rate": 4.035273392162389e-05, "loss": 0.469, "step": 686 }, { "epoch": 0.43307986288956307, "grad_norm": 0.14651672542095184, "learning_rate": 4.0339023887233366e-05, "loss": 0.4062, "step": 687 }, { "epoch": 0.43371025570308497, "grad_norm": 0.14978663623332977, "learning_rate": 4.032529599571942e-05, "loss": 0.3823, "step": 688 }, { "epoch": 0.4343406485166069, "grad_norm": 0.17241071164608002, "learning_rate": 4.031155026082389e-05, "loss": 0.418, "step": 689 }, { "epoch": 0.4349710413301288, "grad_norm": 0.18321415781974792, "learning_rate": 4.029778669630645e-05, "loss": 0.4246, "step": 690 }, { "epoch": 0.43560143414365077, "grad_norm": 0.12590643763542175, "learning_rate": 4.028400531594463e-05, "loss": 0.3959, "step": 691 }, { "epoch": 0.43623182695717266, "grad_norm": 0.1435738354921341, "learning_rate": 4.027020613353379e-05, "loss": 0.3956, "step": 692 }, { "epoch": 0.4368622197706946, "grad_norm": 0.15861615538597107, "learning_rate": 4.025638916288711e-05, "loss": 0.4415, "step": 693 }, { "epoch": 0.43749261258421657, "grad_norm": 0.14547286927700043, "learning_rate": 4.02425544178356e-05, "loss": 0.3686, "step": 694 }, { "epoch": 0.43812300539773846, "grad_norm": 0.13145437836647034, "learning_rate": 4.0228701912228015e-05, "loss": 0.3877, "step": 695 }, { "epoch": 0.4387533982112604, "grad_norm": 0.14630243182182312, "learning_rate": 4.021483165993093e-05, "loss": 0.3912, "step": 696 }, { "epoch": 0.4393837910247823, "grad_norm": 0.17564375698566437, "learning_rate": 4.020094367482867e-05, "loss": 0.4798, "step": 697 }, { "epoch": 0.44001418383830426, "grad_norm": 0.1785062700510025, "learning_rate": 4.0187037970823304e-05, "loss": 0.4223, "step": 698 }, { "epoch": 0.44064457665182616, "grad_norm": 0.14428603649139404, "learning_rate": 4.017311456183465e-05, "loss": 0.3492, "step": 699 }, { "epoch": 0.4412749694653481, "grad_norm": 0.15606176853179932, "learning_rate": 4.015917346180025e-05, "loss": 0.4348, "step": 700 }, { "epoch": 0.44190536227887, "grad_norm": 0.1602056622505188, "learning_rate": 4.014521468467533e-05, "loss": 0.4422, "step": 701 }, { "epoch": 0.44253575509239196, "grad_norm": 0.11997710913419724, "learning_rate": 4.013123824443284e-05, "loss": 0.4088, "step": 702 }, { "epoch": 0.44316614790591385, "grad_norm": 0.12170203775167465, "learning_rate": 4.011724415506341e-05, "loss": 0.3383, "step": 703 }, { "epoch": 0.4437965407194358, "grad_norm": 0.13026146590709686, "learning_rate": 4.0103232430575314e-05, "loss": 0.4095, "step": 704 }, { "epoch": 0.4444269335329577, "grad_norm": 0.1414802074432373, "learning_rate": 4.00892030849945e-05, "loss": 0.3865, "step": 705 }, { "epoch": 0.44505732634647965, "grad_norm": 0.1612880527973175, "learning_rate": 4.007515613236455e-05, "loss": 0.4371, "step": 706 }, { "epoch": 0.44568771916000155, "grad_norm": 0.13235218822956085, "learning_rate": 4.0061091586746665e-05, "loss": 0.4008, "step": 707 }, { "epoch": 0.4463181119735235, "grad_norm": 0.12737032771110535, "learning_rate": 4.0047009462219655e-05, "loss": 0.3763, "step": 708 }, { "epoch": 0.44694850478704545, "grad_norm": 0.13643917441368103, "learning_rate": 4.003290977287994e-05, "loss": 0.394, "step": 709 }, { "epoch": 0.44757889760056735, "grad_norm": 0.19716039299964905, "learning_rate": 4.001879253284152e-05, "loss": 0.5007, "step": 710 }, { "epoch": 0.4482092904140893, "grad_norm": 0.1554064154624939, "learning_rate": 4.0004657756235944e-05, "loss": 0.4375, "step": 711 }, { "epoch": 0.4488396832276112, "grad_norm": 0.17151057720184326, "learning_rate": 3.999050545721234e-05, "loss": 0.3899, "step": 712 }, { "epoch": 0.44947007604113315, "grad_norm": 0.1316985934972763, "learning_rate": 3.997633564993736e-05, "loss": 0.3585, "step": 713 }, { "epoch": 0.45010046885465504, "grad_norm": 0.16340312361717224, "learning_rate": 3.996214834859519e-05, "loss": 0.4085, "step": 714 }, { "epoch": 0.450730861668177, "grad_norm": 0.11996844410896301, "learning_rate": 3.994794356738753e-05, "loss": 0.361, "step": 715 }, { "epoch": 0.4513612544816989, "grad_norm": 0.14515595138072968, "learning_rate": 3.993372132053356e-05, "loss": 0.4109, "step": 716 }, { "epoch": 0.45199164729522084, "grad_norm": 0.14319062232971191, "learning_rate": 3.991948162226999e-05, "loss": 0.3465, "step": 717 }, { "epoch": 0.45262204010874274, "grad_norm": 0.16570813953876495, "learning_rate": 3.990522448685092e-05, "loss": 0.4548, "step": 718 }, { "epoch": 0.4532524329222647, "grad_norm": 0.17157788574695587, "learning_rate": 3.989094992854798e-05, "loss": 0.4332, "step": 719 }, { "epoch": 0.4538828257357866, "grad_norm": 0.15220706164836884, "learning_rate": 3.987665796165021e-05, "loss": 0.3821, "step": 720 }, { "epoch": 0.45451321854930854, "grad_norm": 0.13397742807865143, "learning_rate": 3.986234860046407e-05, "loss": 0.3754, "step": 721 }, { "epoch": 0.4551436113628305, "grad_norm": 0.14851902425289154, "learning_rate": 3.984802185931344e-05, "loss": 0.432, "step": 722 }, { "epoch": 0.4557740041763524, "grad_norm": 0.16764099895954132, "learning_rate": 3.98336777525396e-05, "loss": 0.4495, "step": 723 }, { "epoch": 0.45640439698987434, "grad_norm": 0.15480101108551025, "learning_rate": 3.9819316294501217e-05, "loss": 0.4505, "step": 724 }, { "epoch": 0.45703478980339624, "grad_norm": 0.15106666088104248, "learning_rate": 3.9804937499574296e-05, "loss": 0.4407, "step": 725 }, { "epoch": 0.4576651826169182, "grad_norm": 0.16314244270324707, "learning_rate": 3.979054138215225e-05, "loss": 0.3506, "step": 726 }, { "epoch": 0.4582955754304401, "grad_norm": 0.14463144540786743, "learning_rate": 3.977612795664579e-05, "loss": 0.4328, "step": 727 }, { "epoch": 0.45892596824396203, "grad_norm": 0.16658100485801697, "learning_rate": 3.976169723748296e-05, "loss": 0.4492, "step": 728 }, { "epoch": 0.45955636105748393, "grad_norm": 0.1776987612247467, "learning_rate": 3.9747249239109126e-05, "loss": 0.4352, "step": 729 }, { "epoch": 0.4601867538710059, "grad_norm": 0.1609821766614914, "learning_rate": 3.973278397598695e-05, "loss": 0.4014, "step": 730 }, { "epoch": 0.4608171466845278, "grad_norm": 0.16667957603931427, "learning_rate": 3.971830146259637e-05, "loss": 0.422, "step": 731 }, { "epoch": 0.46144753949804973, "grad_norm": 0.1358121931552887, "learning_rate": 3.970380171343458e-05, "loss": 0.3901, "step": 732 }, { "epoch": 0.4620779323115716, "grad_norm": 0.12884113192558289, "learning_rate": 3.968928474301606e-05, "loss": 0.3777, "step": 733 }, { "epoch": 0.4627083251250936, "grad_norm": 0.14654743671417236, "learning_rate": 3.9674750565872495e-05, "loss": 0.4344, "step": 734 }, { "epoch": 0.4633387179386155, "grad_norm": 0.15058912336826324, "learning_rate": 3.966019919655282e-05, "loss": 0.4133, "step": 735 }, { "epoch": 0.4639691107521374, "grad_norm": 0.13025817275047302, "learning_rate": 3.964563064962316e-05, "loss": 0.3639, "step": 736 }, { "epoch": 0.4645995035656594, "grad_norm": 0.146138533949852, "learning_rate": 3.9631044939666836e-05, "loss": 0.37, "step": 737 }, { "epoch": 0.4652298963791813, "grad_norm": 0.14348472654819489, "learning_rate": 3.961644208128438e-05, "loss": 0.4255, "step": 738 }, { "epoch": 0.4658602891927032, "grad_norm": 0.14459875226020813, "learning_rate": 3.9601822089093444e-05, "loss": 0.412, "step": 739 }, { "epoch": 0.4664906820062251, "grad_norm": 0.17654073238372803, "learning_rate": 3.9587184977728866e-05, "loss": 0.4538, "step": 740 }, { "epoch": 0.4671210748197471, "grad_norm": 0.13895660638809204, "learning_rate": 3.957253076184261e-05, "loss": 0.425, "step": 741 }, { "epoch": 0.46775146763326897, "grad_norm": 0.1470203697681427, "learning_rate": 3.9557859456103746e-05, "loss": 0.4476, "step": 742 }, { "epoch": 0.4683818604467909, "grad_norm": 0.14507576823234558, "learning_rate": 3.954317107519848e-05, "loss": 0.4101, "step": 743 }, { "epoch": 0.4690122532603128, "grad_norm": 0.129420667886734, "learning_rate": 3.9528465633830076e-05, "loss": 0.3631, "step": 744 }, { "epoch": 0.46964264607383477, "grad_norm": 0.12379541993141174, "learning_rate": 3.951374314671893e-05, "loss": 0.3204, "step": 745 }, { "epoch": 0.47027303888735666, "grad_norm": 0.14565017819404602, "learning_rate": 3.949900362860243e-05, "loss": 0.3734, "step": 746 }, { "epoch": 0.4709034317008786, "grad_norm": 0.1342315524816513, "learning_rate": 3.948424709423507e-05, "loss": 0.4007, "step": 747 }, { "epoch": 0.4715338245144005, "grad_norm": 0.13480636477470398, "learning_rate": 3.946947355838835e-05, "loss": 0.4612, "step": 748 }, { "epoch": 0.47216421732792246, "grad_norm": 0.13085278868675232, "learning_rate": 3.94546830358508e-05, "loss": 0.4104, "step": 749 }, { "epoch": 0.47279461014144436, "grad_norm": 0.16482636332511902, "learning_rate": 3.943987554142794e-05, "loss": 0.4542, "step": 750 }, { "epoch": 0.4734250029549663, "grad_norm": 0.15673352777957916, "learning_rate": 3.94250510899423e-05, "loss": 0.3808, "step": 751 }, { "epoch": 0.47405539576848826, "grad_norm": 0.15453855693340302, "learning_rate": 3.9410209696233356e-05, "loss": 0.3816, "step": 752 }, { "epoch": 0.47468578858201016, "grad_norm": 0.15016509592533112, "learning_rate": 3.939535137515758e-05, "loss": 0.39, "step": 753 }, { "epoch": 0.4753161813955321, "grad_norm": 0.13890962302684784, "learning_rate": 3.9380476141588355e-05, "loss": 0.412, "step": 754 }, { "epoch": 0.475946574209054, "grad_norm": 0.1330081820487976, "learning_rate": 3.9365584010416e-05, "loss": 0.3242, "step": 755 }, { "epoch": 0.47657696702257596, "grad_norm": 0.1311984807252884, "learning_rate": 3.935067499654777e-05, "loss": 0.3583, "step": 756 }, { "epoch": 0.47720735983609786, "grad_norm": 0.14548636972904205, "learning_rate": 3.9335749114907816e-05, "loss": 0.3684, "step": 757 }, { "epoch": 0.4778377526496198, "grad_norm": 0.1494605988264084, "learning_rate": 3.932080638043714e-05, "loss": 0.4363, "step": 758 }, { "epoch": 0.4784681454631417, "grad_norm": 0.126301571726799, "learning_rate": 3.930584680809364e-05, "loss": 0.3807, "step": 759 }, { "epoch": 0.47909853827666365, "grad_norm": 0.1573512852191925, "learning_rate": 3.9290870412852076e-05, "loss": 0.4454, "step": 760 }, { "epoch": 0.47972893109018555, "grad_norm": 0.16862505674362183, "learning_rate": 3.927587720970404e-05, "loss": 0.4264, "step": 761 }, { "epoch": 0.4803593239037075, "grad_norm": 0.16232459247112274, "learning_rate": 3.9260867213657934e-05, "loss": 0.3882, "step": 762 }, { "epoch": 0.4809897167172294, "grad_norm": 0.12893806397914886, "learning_rate": 3.9245840439739e-05, "loss": 0.3486, "step": 763 }, { "epoch": 0.48162010953075135, "grad_norm": 0.2132662981748581, "learning_rate": 3.923079690298924e-05, "loss": 0.5119, "step": 764 }, { "epoch": 0.4822505023442733, "grad_norm": 0.15508213639259338, "learning_rate": 3.921573661846747e-05, "loss": 0.3704, "step": 765 }, { "epoch": 0.4828808951577952, "grad_norm": 0.12923885881900787, "learning_rate": 3.920065960124925e-05, "loss": 0.4451, "step": 766 }, { "epoch": 0.48351128797131715, "grad_norm": 0.12894199788570404, "learning_rate": 3.918556586642689e-05, "loss": 0.3758, "step": 767 }, { "epoch": 0.48414168078483905, "grad_norm": 0.16268831491470337, "learning_rate": 3.917045542910944e-05, "loss": 0.4023, "step": 768 }, { "epoch": 0.484772073598361, "grad_norm": 0.14713963866233826, "learning_rate": 3.915532830442267e-05, "loss": 0.4138, "step": 769 }, { "epoch": 0.4854024664118829, "grad_norm": 0.14321781694889069, "learning_rate": 3.914018450750905e-05, "loss": 0.3415, "step": 770 }, { "epoch": 0.48603285922540485, "grad_norm": 0.15645352005958557, "learning_rate": 3.912502405352776e-05, "loss": 0.4201, "step": 771 }, { "epoch": 0.48666325203892674, "grad_norm": 0.15770842134952545, "learning_rate": 3.9109846957654605e-05, "loss": 0.3694, "step": 772 }, { "epoch": 0.4872936448524487, "grad_norm": 0.1686742752790451, "learning_rate": 3.9094653235082104e-05, "loss": 0.384, "step": 773 }, { "epoch": 0.4879240376659706, "grad_norm": 0.1598564088344574, "learning_rate": 3.9079442901019384e-05, "loss": 0.4244, "step": 774 }, { "epoch": 0.48855443047949254, "grad_norm": 0.14372321963310242, "learning_rate": 3.90642159706922e-05, "loss": 0.4358, "step": 775 }, { "epoch": 0.48918482329301444, "grad_norm": 0.13739392161369324, "learning_rate": 3.904897245934297e-05, "loss": 0.4267, "step": 776 }, { "epoch": 0.4898152161065364, "grad_norm": 0.16683433949947357, "learning_rate": 3.903371238223062e-05, "loss": 0.3607, "step": 777 }, { "epoch": 0.4904456089200583, "grad_norm": 0.16543376445770264, "learning_rate": 3.901843575463076e-05, "loss": 0.4113, "step": 778 }, { "epoch": 0.49107600173358024, "grad_norm": 0.1504923403263092, "learning_rate": 3.900314259183548e-05, "loss": 0.4765, "step": 779 }, { "epoch": 0.4917063945471022, "grad_norm": 0.16944220662117004, "learning_rate": 3.8987832909153484e-05, "loss": 0.4301, "step": 780 }, { "epoch": 0.4923367873606241, "grad_norm": 0.13818518817424774, "learning_rate": 3.8972506721909975e-05, "loss": 0.487, "step": 781 }, { "epoch": 0.49296718017414604, "grad_norm": 0.15189217031002045, "learning_rate": 3.895716404544669e-05, "loss": 0.3897, "step": 782 }, { "epoch": 0.49359757298766793, "grad_norm": 0.1592871993780136, "learning_rate": 3.894180489512187e-05, "loss": 0.3851, "step": 783 }, { "epoch": 0.4942279658011899, "grad_norm": 0.15947246551513672, "learning_rate": 3.8926429286310255e-05, "loss": 0.3861, "step": 784 }, { "epoch": 0.4948583586147118, "grad_norm": 0.1814534068107605, "learning_rate": 3.891103723440306e-05, "loss": 0.3973, "step": 785 }, { "epoch": 0.49548875142823373, "grad_norm": 0.12373247742652893, "learning_rate": 3.8895628754807936e-05, "loss": 0.3791, "step": 786 }, { "epoch": 0.4961191442417556, "grad_norm": 0.18603551387786865, "learning_rate": 3.888020386294902e-05, "loss": 0.3958, "step": 787 }, { "epoch": 0.4967495370552776, "grad_norm": 0.16354039311408997, "learning_rate": 3.886476257426683e-05, "loss": 0.3913, "step": 788 }, { "epoch": 0.4973799298687995, "grad_norm": 0.1754021793603897, "learning_rate": 3.8849304904218335e-05, "loss": 0.4388, "step": 789 }, { "epoch": 0.4980103226823214, "grad_norm": 0.16621164977550507, "learning_rate": 3.883383086827688e-05, "loss": 0.4342, "step": 790 }, { "epoch": 0.4986407154958433, "grad_norm": 0.18488451838493347, "learning_rate": 3.8818340481932206e-05, "loss": 0.4792, "step": 791 }, { "epoch": 0.4992711083093653, "grad_norm": 0.14420366287231445, "learning_rate": 3.8802833760690424e-05, "loss": 0.361, "step": 792 }, { "epoch": 0.4999015011228872, "grad_norm": 0.16407188773155212, "learning_rate": 3.878731072007398e-05, "loss": 0.3657, "step": 793 }, { "epoch": 0.5005318939364092, "grad_norm": 0.13707882165908813, "learning_rate": 3.877177137562166e-05, "loss": 0.4247, "step": 794 }, { "epoch": 0.501162286749931, "grad_norm": 0.13158880174160004, "learning_rate": 3.875621574288858e-05, "loss": 0.4068, "step": 795 }, { "epoch": 0.501792679563453, "grad_norm": 0.16040393710136414, "learning_rate": 3.874064383744616e-05, "loss": 0.4172, "step": 796 }, { "epoch": 0.5024230723769749, "grad_norm": 0.15780168771743774, "learning_rate": 3.8725055674882096e-05, "loss": 0.4144, "step": 797 }, { "epoch": 0.5030534651904969, "grad_norm": 0.14710697531700134, "learning_rate": 3.870945127080037e-05, "loss": 0.3704, "step": 798 }, { "epoch": 0.5036838580040187, "grad_norm": 0.18354450166225433, "learning_rate": 3.869383064082122e-05, "loss": 0.4544, "step": 799 }, { "epoch": 0.5043142508175407, "grad_norm": 0.14307357370853424, "learning_rate": 3.867819380058112e-05, "loss": 0.3872, "step": 800 }, { "epoch": 0.5043142508175407, "eval_loss": 0.4390900135040283, "eval_runtime": 222.7922, "eval_samples_per_second": 4.488, "eval_steps_per_second": 4.488, "step": 800 }, { "epoch": 0.5049446436310626, "grad_norm": 0.1444072276353836, "learning_rate": 3.866254076573278e-05, "loss": 0.4046, "step": 801 }, { "epoch": 0.5055750364445846, "grad_norm": 0.13857287168502808, "learning_rate": 3.864687155194512e-05, "loss": 0.4448, "step": 802 }, { "epoch": 0.5062054292581064, "grad_norm": 0.16345997154712677, "learning_rate": 3.8631186174903236e-05, "loss": 0.3848, "step": 803 }, { "epoch": 0.5068358220716284, "grad_norm": 0.13920150697231293, "learning_rate": 3.861548465030845e-05, "loss": 0.3894, "step": 804 }, { "epoch": 0.5074662148851503, "grad_norm": 0.15711119771003723, "learning_rate": 3.8599766993878173e-05, "loss": 0.433, "step": 805 }, { "epoch": 0.5080966076986723, "grad_norm": 0.16589997708797455, "learning_rate": 3.858403322134605e-05, "loss": 0.4011, "step": 806 }, { "epoch": 0.5087270005121942, "grad_norm": 0.15303505957126617, "learning_rate": 3.8568283348461794e-05, "loss": 0.4127, "step": 807 }, { "epoch": 0.5093573933257161, "grad_norm": 0.13526608049869537, "learning_rate": 3.855251739099127e-05, "loss": 0.3545, "step": 808 }, { "epoch": 0.509987786139238, "grad_norm": 0.14198428392410278, "learning_rate": 3.8536735364716425e-05, "loss": 0.4364, "step": 809 }, { "epoch": 0.51061817895276, "grad_norm": 0.14397798478603363, "learning_rate": 3.852093728543529e-05, "loss": 0.4014, "step": 810 }, { "epoch": 0.5112485717662819, "grad_norm": 0.13944432139396667, "learning_rate": 3.8505123168961987e-05, "loss": 0.4326, "step": 811 }, { "epoch": 0.5118789645798038, "grad_norm": 0.157311350107193, "learning_rate": 3.848929303112668e-05, "loss": 0.453, "step": 812 }, { "epoch": 0.5125093573933257, "grad_norm": 0.14742758870124817, "learning_rate": 3.8473446887775546e-05, "loss": 0.5064, "step": 813 }, { "epoch": 0.5131397502068477, "grad_norm": 0.1551235318183899, "learning_rate": 3.845758475477083e-05, "loss": 0.4191, "step": 814 }, { "epoch": 0.5137701430203696, "grad_norm": 0.13968229293823242, "learning_rate": 3.844170664799074e-05, "loss": 0.3774, "step": 815 }, { "epoch": 0.5144005358338914, "grad_norm": 0.14213190972805023, "learning_rate": 3.8425812583329514e-05, "loss": 0.411, "step": 816 }, { "epoch": 0.5150309286474134, "grad_norm": 0.1836906224489212, "learning_rate": 3.840990257669732e-05, "loss": 0.4508, "step": 817 }, { "epoch": 0.5156613214609354, "grad_norm": 0.13046447932720184, "learning_rate": 3.8393976644020325e-05, "loss": 0.3677, "step": 818 }, { "epoch": 0.5162917142744573, "grad_norm": 0.1387125700712204, "learning_rate": 3.837803480124061e-05, "loss": 0.3913, "step": 819 }, { "epoch": 0.5169221070879791, "grad_norm": 0.1600324660539627, "learning_rate": 3.8362077064316195e-05, "loss": 0.3776, "step": 820 }, { "epoch": 0.5175524999015011, "grad_norm": 0.12740956246852875, "learning_rate": 3.834610344922101e-05, "loss": 0.4008, "step": 821 }, { "epoch": 0.518182892715023, "grad_norm": 0.13033899664878845, "learning_rate": 3.833011397194487e-05, "loss": 0.4032, "step": 822 }, { "epoch": 0.518813285528545, "grad_norm": 0.15081040561199188, "learning_rate": 3.831410864849348e-05, "loss": 0.3461, "step": 823 }, { "epoch": 0.519443678342067, "grad_norm": 0.13452240824699402, "learning_rate": 3.829808749488841e-05, "loss": 0.3746, "step": 824 }, { "epoch": 0.5200740711555888, "grad_norm": 0.19129051268100739, "learning_rate": 3.8282050527167056e-05, "loss": 0.483, "step": 825 }, { "epoch": 0.5207044639691107, "grad_norm": 0.1633075773715973, "learning_rate": 3.826599776138266e-05, "loss": 0.4815, "step": 826 }, { "epoch": 0.5213348567826327, "grad_norm": 0.1542254090309143, "learning_rate": 3.824992921360428e-05, "loss": 0.4134, "step": 827 }, { "epoch": 0.5219652495961546, "grad_norm": 0.1658627986907959, "learning_rate": 3.8233844899916765e-05, "loss": 0.4263, "step": 828 }, { "epoch": 0.5225956424096765, "grad_norm": 0.15097402036190033, "learning_rate": 3.821774483642075e-05, "loss": 0.4252, "step": 829 }, { "epoch": 0.5232260352231984, "grad_norm": 0.14764727652072906, "learning_rate": 3.8201629039232624e-05, "loss": 0.4535, "step": 830 }, { "epoch": 0.5238564280367204, "grad_norm": 0.1661219298839569, "learning_rate": 3.818549752448454e-05, "loss": 0.4581, "step": 831 }, { "epoch": 0.5244868208502423, "grad_norm": 0.16364407539367676, "learning_rate": 3.816935030832439e-05, "loss": 0.415, "step": 832 }, { "epoch": 0.5251172136637642, "grad_norm": 0.14157086610794067, "learning_rate": 3.815318740691576e-05, "loss": 0.3201, "step": 833 }, { "epoch": 0.5257476064772861, "grad_norm": 0.1508474200963974, "learning_rate": 3.8137008836437954e-05, "loss": 0.4632, "step": 834 }, { "epoch": 0.5263779992908081, "grad_norm": 0.16357192397117615, "learning_rate": 3.8120814613085965e-05, "loss": 0.4291, "step": 835 }, { "epoch": 0.52700839210433, "grad_norm": 0.1774810403585434, "learning_rate": 3.810460475307044e-05, "loss": 0.4183, "step": 836 }, { "epoch": 0.527638784917852, "grad_norm": 0.14707089960575104, "learning_rate": 3.808837927261768e-05, "loss": 0.4873, "step": 837 }, { "epoch": 0.5282691777313738, "grad_norm": 0.17293687164783478, "learning_rate": 3.8072138187969637e-05, "loss": 0.3572, "step": 838 }, { "epoch": 0.5288995705448958, "grad_norm": 0.15435566008090973, "learning_rate": 3.805588151538387e-05, "loss": 0.4386, "step": 839 }, { "epoch": 0.5295299633584177, "grad_norm": 0.13135646283626556, "learning_rate": 3.803960927113355e-05, "loss": 0.3141, "step": 840 }, { "epoch": 0.5301603561719397, "grad_norm": 0.1465950310230255, "learning_rate": 3.802332147150742e-05, "loss": 0.4001, "step": 841 }, { "epoch": 0.5307907489854615, "grad_norm": 0.15999934077262878, "learning_rate": 3.800701813280982e-05, "loss": 0.4008, "step": 842 }, { "epoch": 0.5314211417989835, "grad_norm": 0.20061102509498596, "learning_rate": 3.7990699271360624e-05, "loss": 0.496, "step": 843 }, { "epoch": 0.5320515346125054, "grad_norm": 0.15036503970623016, "learning_rate": 3.7974364903495244e-05, "loss": 0.4079, "step": 844 }, { "epoch": 0.5326819274260274, "grad_norm": 0.14602142572402954, "learning_rate": 3.795801504556464e-05, "loss": 0.3936, "step": 845 }, { "epoch": 0.5333123202395492, "grad_norm": 0.15145988762378693, "learning_rate": 3.7941649713935235e-05, "loss": 0.4181, "step": 846 }, { "epoch": 0.5339427130530712, "grad_norm": 0.155500590801239, "learning_rate": 3.792526892498897e-05, "loss": 0.4588, "step": 847 }, { "epoch": 0.5345731058665931, "grad_norm": 0.15288515388965607, "learning_rate": 3.790887269512326e-05, "loss": 0.3514, "step": 848 }, { "epoch": 0.5352034986801151, "grad_norm": 0.13540484011173248, "learning_rate": 3.7892461040750975e-05, "loss": 0.41, "step": 849 }, { "epoch": 0.535833891493637, "grad_norm": 0.1624019742012024, "learning_rate": 3.787603397830042e-05, "loss": 0.4208, "step": 850 }, { "epoch": 0.5364642843071589, "grad_norm": 0.14963330328464508, "learning_rate": 3.785959152421531e-05, "loss": 0.4226, "step": 851 }, { "epoch": 0.5370946771206808, "grad_norm": 0.1439056098461151, "learning_rate": 3.784313369495479e-05, "loss": 0.3824, "step": 852 }, { "epoch": 0.5377250699342028, "grad_norm": 0.13627079129219055, "learning_rate": 3.78266605069934e-05, "loss": 0.3943, "step": 853 }, { "epoch": 0.5383554627477247, "grad_norm": 0.12575384974479675, "learning_rate": 3.7810171976821014e-05, "loss": 0.4188, "step": 854 }, { "epoch": 0.5389858555612466, "grad_norm": 0.11646067351102829, "learning_rate": 3.779366812094292e-05, "loss": 0.391, "step": 855 }, { "epoch": 0.5396162483747685, "grad_norm": 0.19494909048080444, "learning_rate": 3.777714895587971e-05, "loss": 0.4874, "step": 856 }, { "epoch": 0.5402466411882905, "grad_norm": 0.1547946184873581, "learning_rate": 3.7760614498167294e-05, "loss": 0.4467, "step": 857 }, { "epoch": 0.5408770340018124, "grad_norm": 0.12604691088199615, "learning_rate": 3.7744064764356926e-05, "loss": 0.3483, "step": 858 }, { "epoch": 0.5415074268153343, "grad_norm": 0.18869976699352264, "learning_rate": 3.772749977101512e-05, "loss": 0.4451, "step": 859 }, { "epoch": 0.5421378196288562, "grad_norm": 0.16798147559165955, "learning_rate": 3.771091953472368e-05, "loss": 0.4239, "step": 860 }, { "epoch": 0.5427682124423782, "grad_norm": 0.136243999004364, "learning_rate": 3.769432407207966e-05, "loss": 0.3578, "step": 861 }, { "epoch": 0.5433986052559001, "grad_norm": 0.1705186516046524, "learning_rate": 3.767771339969537e-05, "loss": 0.4837, "step": 862 }, { "epoch": 0.544028998069422, "grad_norm": 0.1542588621377945, "learning_rate": 3.766108753419832e-05, "loss": 0.3601, "step": 863 }, { "epoch": 0.5446593908829439, "grad_norm": 0.15121257305145264, "learning_rate": 3.764444649223126e-05, "loss": 0.3634, "step": 864 }, { "epoch": 0.5452897836964659, "grad_norm": 0.1671563982963562, "learning_rate": 3.762779029045211e-05, "loss": 0.3991, "step": 865 }, { "epoch": 0.5459201765099878, "grad_norm": 0.16092930734157562, "learning_rate": 3.7611118945533974e-05, "loss": 0.4546, "step": 866 }, { "epoch": 0.5465505693235098, "grad_norm": 0.1620696634054184, "learning_rate": 3.759443247416511e-05, "loss": 0.417, "step": 867 }, { "epoch": 0.5471809621370316, "grad_norm": 0.1499662548303604, "learning_rate": 3.7577730893048906e-05, "loss": 0.4409, "step": 868 }, { "epoch": 0.5478113549505536, "grad_norm": 0.14857828617095947, "learning_rate": 3.7561014218903915e-05, "loss": 0.392, "step": 869 }, { "epoch": 0.5484417477640755, "grad_norm": 0.1388327032327652, "learning_rate": 3.754428246846375e-05, "loss": 0.4249, "step": 870 }, { "epoch": 0.5490721405775975, "grad_norm": 0.12996181845664978, "learning_rate": 3.752753565847715e-05, "loss": 0.4354, "step": 871 }, { "epoch": 0.5497025333911193, "grad_norm": 0.14674219489097595, "learning_rate": 3.75107738057079e-05, "loss": 0.3574, "step": 872 }, { "epoch": 0.5503329262046412, "grad_norm": 0.14478866755962372, "learning_rate": 3.7493996926934886e-05, "loss": 0.4462, "step": 873 }, { "epoch": 0.5509633190181632, "grad_norm": 0.17569395899772644, "learning_rate": 3.7477205038952e-05, "loss": 0.4337, "step": 874 }, { "epoch": 0.5515937118316852, "grad_norm": 0.19228047132492065, "learning_rate": 3.7460398158568145e-05, "loss": 0.4272, "step": 875 }, { "epoch": 0.552224104645207, "grad_norm": 0.16651074588298798, "learning_rate": 3.7443576302607273e-05, "loss": 0.4306, "step": 876 }, { "epoch": 0.5528544974587289, "grad_norm": 0.15477438271045685, "learning_rate": 3.74267394879083e-05, "loss": 0.4356, "step": 877 }, { "epoch": 0.5534848902722509, "grad_norm": 0.13447880744934082, "learning_rate": 3.740988773132513e-05, "loss": 0.3399, "step": 878 }, { "epoch": 0.5541152830857728, "grad_norm": 0.1480575054883957, "learning_rate": 3.739302104972662e-05, "loss": 0.3658, "step": 879 }, { "epoch": 0.5547456758992948, "grad_norm": 0.1701166033744812, "learning_rate": 3.737613945999656e-05, "loss": 0.3752, "step": 880 }, { "epoch": 0.5553760687128166, "grad_norm": 0.16578589379787445, "learning_rate": 3.7359242979033655e-05, "loss": 0.378, "step": 881 }, { "epoch": 0.5560064615263386, "grad_norm": 0.1562201976776123, "learning_rate": 3.7342331623751554e-05, "loss": 0.3944, "step": 882 }, { "epoch": 0.5566368543398605, "grad_norm": 0.18078267574310303, "learning_rate": 3.732540541107876e-05, "loss": 0.4377, "step": 883 }, { "epoch": 0.5572672471533825, "grad_norm": 0.13611632585525513, "learning_rate": 3.730846435795864e-05, "loss": 0.3743, "step": 884 }, { "epoch": 0.5578976399669043, "grad_norm": 0.1585019826889038, "learning_rate": 3.729150848134947e-05, "loss": 0.4132, "step": 885 }, { "epoch": 0.5585280327804263, "grad_norm": 0.14006251096725464, "learning_rate": 3.727453779822431e-05, "loss": 0.369, "step": 886 }, { "epoch": 0.5591584255939482, "grad_norm": 0.1613488793373108, "learning_rate": 3.7257552325571055e-05, "loss": 0.4728, "step": 887 }, { "epoch": 0.5597888184074702, "grad_norm": 0.1657625436782837, "learning_rate": 3.7240552080392425e-05, "loss": 0.4398, "step": 888 }, { "epoch": 0.560419211220992, "grad_norm": 0.15457575023174286, "learning_rate": 3.722353707970591e-05, "loss": 0.4068, "step": 889 }, { "epoch": 0.561049604034514, "grad_norm": 0.13711117208003998, "learning_rate": 3.720650734054378e-05, "loss": 0.3746, "step": 890 }, { "epoch": 0.5616799968480359, "grad_norm": 0.1615670621395111, "learning_rate": 3.7189462879953035e-05, "loss": 0.4717, "step": 891 }, { "epoch": 0.5623103896615579, "grad_norm": 0.17041678726673126, "learning_rate": 3.717240371499545e-05, "loss": 0.4522, "step": 892 }, { "epoch": 0.5629407824750798, "grad_norm": 0.14367422461509705, "learning_rate": 3.715532986274748e-05, "loss": 0.3958, "step": 893 }, { "epoch": 0.5635711752886017, "grad_norm": 0.1326402723789215, "learning_rate": 3.713824134030031e-05, "loss": 0.3662, "step": 894 }, { "epoch": 0.5642015681021236, "grad_norm": 0.15629824995994568, "learning_rate": 3.7121138164759804e-05, "loss": 0.4241, "step": 895 }, { "epoch": 0.5648319609156456, "grad_norm": 0.14097222685813904, "learning_rate": 3.710402035324648e-05, "loss": 0.3501, "step": 896 }, { "epoch": 0.5654623537291675, "grad_norm": 0.1376744508743286, "learning_rate": 3.7086887922895525e-05, "loss": 0.3405, "step": 897 }, { "epoch": 0.5660927465426894, "grad_norm": 0.1320733278989792, "learning_rate": 3.706974089085675e-05, "loss": 0.3882, "step": 898 }, { "epoch": 0.5667231393562113, "grad_norm": 0.15197254717350006, "learning_rate": 3.705257927429458e-05, "loss": 0.3226, "step": 899 }, { "epoch": 0.5673535321697333, "grad_norm": 0.16653065383434296, "learning_rate": 3.703540309038805e-05, "loss": 0.4761, "step": 900 }, { "epoch": 0.5679839249832552, "grad_norm": 0.14119040966033936, "learning_rate": 3.7018212356330766e-05, "loss": 0.4173, "step": 901 }, { "epoch": 0.5686143177967771, "grad_norm": 0.12635064125061035, "learning_rate": 3.700100708933091e-05, "loss": 0.3189, "step": 902 }, { "epoch": 0.569244710610299, "grad_norm": 0.15768444538116455, "learning_rate": 3.698378730661121e-05, "loss": 0.4362, "step": 903 }, { "epoch": 0.569875103423821, "grad_norm": 0.1234264075756073, "learning_rate": 3.6966553025408906e-05, "loss": 0.3721, "step": 904 }, { "epoch": 0.5705054962373429, "grad_norm": 0.2133094221353531, "learning_rate": 3.694930426297577e-05, "loss": 0.5438, "step": 905 }, { "epoch": 0.5711358890508649, "grad_norm": 0.15444080531597137, "learning_rate": 3.693204103657807e-05, "loss": 0.4884, "step": 906 }, { "epoch": 0.5717662818643867, "grad_norm": 0.14413847029209137, "learning_rate": 3.691476336349654e-05, "loss": 0.3625, "step": 907 }, { "epoch": 0.5723966746779087, "grad_norm": 0.14719709753990173, "learning_rate": 3.689747126102639e-05, "loss": 0.4258, "step": 908 }, { "epoch": 0.5730270674914306, "grad_norm": 0.16312751173973083, "learning_rate": 3.6880164746477256e-05, "loss": 0.3715, "step": 909 }, { "epoch": 0.5736574603049526, "grad_norm": 0.15884438157081604, "learning_rate": 3.686284383717323e-05, "loss": 0.3683, "step": 910 }, { "epoch": 0.5742878531184744, "grad_norm": 0.1744864583015442, "learning_rate": 3.6845508550452784e-05, "loss": 0.4181, "step": 911 }, { "epoch": 0.5749182459319964, "grad_norm": 0.16491571068763733, "learning_rate": 3.682815890366879e-05, "loss": 0.4146, "step": 912 }, { "epoch": 0.5755486387455183, "grad_norm": 0.13504120707511902, "learning_rate": 3.6810794914188505e-05, "loss": 0.3924, "step": 913 }, { "epoch": 0.5761790315590403, "grad_norm": 0.16097284853458405, "learning_rate": 3.679341659939353e-05, "loss": 0.3781, "step": 914 }, { "epoch": 0.5768094243725621, "grad_norm": 0.1496012806892395, "learning_rate": 3.677602397667982e-05, "loss": 0.3979, "step": 915 }, { "epoch": 0.5774398171860841, "grad_norm": 0.14697562158107758, "learning_rate": 3.675861706345763e-05, "loss": 0.4282, "step": 916 }, { "epoch": 0.578070209999606, "grad_norm": 0.15116776525974274, "learning_rate": 3.674119587715156e-05, "loss": 0.3885, "step": 917 }, { "epoch": 0.578700602813128, "grad_norm": 0.20044632256031036, "learning_rate": 3.6723760435200446e-05, "loss": 0.5018, "step": 918 }, { "epoch": 0.5793309956266498, "grad_norm": 0.1666855663061142, "learning_rate": 3.670631075505743e-05, "loss": 0.4903, "step": 919 }, { "epoch": 0.5799613884401718, "grad_norm": 0.14125750958919525, "learning_rate": 3.66888468541899e-05, "loss": 0.4394, "step": 920 }, { "epoch": 0.5805917812536937, "grad_norm": 0.14239953458309174, "learning_rate": 3.6671368750079464e-05, "loss": 0.4216, "step": 921 }, { "epoch": 0.5812221740672157, "grad_norm": 0.15734338760375977, "learning_rate": 3.6653876460221975e-05, "loss": 0.4036, "step": 922 }, { "epoch": 0.5818525668807376, "grad_norm": 0.15128806233406067, "learning_rate": 3.6636370002127454e-05, "loss": 0.4421, "step": 923 }, { "epoch": 0.5824829596942595, "grad_norm": 0.1520044207572937, "learning_rate": 3.661884939332014e-05, "loss": 0.4424, "step": 924 }, { "epoch": 0.5831133525077814, "grad_norm": 0.1292216032743454, "learning_rate": 3.66013146513384e-05, "loss": 0.3393, "step": 925 }, { "epoch": 0.5837437453213034, "grad_norm": 0.13135753571987152, "learning_rate": 3.658376579373478e-05, "loss": 0.4032, "step": 926 }, { "epoch": 0.5843741381348253, "grad_norm": 0.15352147817611694, "learning_rate": 3.656620283807594e-05, "loss": 0.4581, "step": 927 }, { "epoch": 0.5850045309483471, "grad_norm": 0.131231427192688, "learning_rate": 3.654862580194265e-05, "loss": 0.3887, "step": 928 }, { "epoch": 0.5856349237618691, "grad_norm": 0.14569136500358582, "learning_rate": 3.6531034702929786e-05, "loss": 0.3301, "step": 929 }, { "epoch": 0.586265316575391, "grad_norm": 0.12439321726560593, "learning_rate": 3.651342955864629e-05, "loss": 0.3648, "step": 930 }, { "epoch": 0.586895709388913, "grad_norm": 0.16859029233455658, "learning_rate": 3.6495810386715173e-05, "loss": 0.3427, "step": 931 }, { "epoch": 0.5875261022024348, "grad_norm": 0.15363626182079315, "learning_rate": 3.647817720477348e-05, "loss": 0.4368, "step": 932 }, { "epoch": 0.5881564950159568, "grad_norm": 0.15837733447551727, "learning_rate": 3.6460530030472284e-05, "loss": 0.4757, "step": 933 }, { "epoch": 0.5887868878294787, "grad_norm": 0.13571250438690186, "learning_rate": 3.644286888147666e-05, "loss": 0.4153, "step": 934 }, { "epoch": 0.5894172806430007, "grad_norm": 0.15629878640174866, "learning_rate": 3.642519377546568e-05, "loss": 0.3564, "step": 935 }, { "epoch": 0.5900476734565226, "grad_norm": 0.18650545179843903, "learning_rate": 3.640750473013239e-05, "loss": 0.453, "step": 936 }, { "epoch": 0.5906780662700445, "grad_norm": 0.12352810800075531, "learning_rate": 3.638980176318378e-05, "loss": 0.3529, "step": 937 }, { "epoch": 0.5913084590835664, "grad_norm": 0.15796920657157898, "learning_rate": 3.637208489234077e-05, "loss": 0.4279, "step": 938 }, { "epoch": 0.5919388518970884, "grad_norm": 0.12148329615592957, "learning_rate": 3.635435413533821e-05, "loss": 0.329, "step": 939 }, { "epoch": 0.5925692447106103, "grad_norm": 0.13628233969211578, "learning_rate": 3.633660950992485e-05, "loss": 0.3973, "step": 940 }, { "epoch": 0.5931996375241322, "grad_norm": 0.16014662384986877, "learning_rate": 3.631885103386331e-05, "loss": 0.4577, "step": 941 }, { "epoch": 0.5938300303376541, "grad_norm": 0.15572448074817657, "learning_rate": 3.63010787249301e-05, "loss": 0.3086, "step": 942 }, { "epoch": 0.5944604231511761, "grad_norm": 0.13830454647541046, "learning_rate": 3.6283292600915554e-05, "loss": 0.3489, "step": 943 }, { "epoch": 0.595090815964698, "grad_norm": 0.19943471252918243, "learning_rate": 3.626549267962383e-05, "loss": 0.4587, "step": 944 }, { "epoch": 0.5957212087782199, "grad_norm": 0.17432811856269836, "learning_rate": 3.624767897887293e-05, "loss": 0.4835, "step": 945 }, { "epoch": 0.5963516015917418, "grad_norm": 0.16056682169437408, "learning_rate": 3.622985151649461e-05, "loss": 0.4116, "step": 946 }, { "epoch": 0.5969819944052638, "grad_norm": 0.1521116942167282, "learning_rate": 3.621201031033443e-05, "loss": 0.3751, "step": 947 }, { "epoch": 0.5976123872187857, "grad_norm": 0.15403449535369873, "learning_rate": 3.6194155378251694e-05, "loss": 0.4144, "step": 948 }, { "epoch": 0.5982427800323077, "grad_norm": 0.15799763798713684, "learning_rate": 3.617628673811945e-05, "loss": 0.3858, "step": 949 }, { "epoch": 0.5988731728458295, "grad_norm": 0.1562921553850174, "learning_rate": 3.615840440782447e-05, "loss": 0.3842, "step": 950 }, { "epoch": 0.5995035656593515, "grad_norm": 0.1535552740097046, "learning_rate": 3.6140508405267236e-05, "loss": 0.4072, "step": 951 }, { "epoch": 0.6001339584728734, "grad_norm": 0.14467324316501617, "learning_rate": 3.612259874836188e-05, "loss": 0.3785, "step": 952 }, { "epoch": 0.6007643512863954, "grad_norm": 0.17959435284137726, "learning_rate": 3.610467545503627e-05, "loss": 0.4085, "step": 953 }, { "epoch": 0.6013947440999172, "grad_norm": 0.1359519064426422, "learning_rate": 3.608673854323185e-05, "loss": 0.3582, "step": 954 }, { "epoch": 0.6020251369134392, "grad_norm": 0.1733047515153885, "learning_rate": 3.606878803090375e-05, "loss": 0.403, "step": 955 }, { "epoch": 0.6026555297269611, "grad_norm": 0.1633727103471756, "learning_rate": 3.6050823936020684e-05, "loss": 0.3801, "step": 956 }, { "epoch": 0.6032859225404831, "grad_norm": 0.1312204748392105, "learning_rate": 3.603284627656498e-05, "loss": 0.35, "step": 957 }, { "epoch": 0.6039163153540049, "grad_norm": 0.15054136514663696, "learning_rate": 3.601485507053254e-05, "loss": 0.3937, "step": 958 }, { "epoch": 0.6045467081675269, "grad_norm": 0.14886727929115295, "learning_rate": 3.5996850335932803e-05, "loss": 0.3875, "step": 959 }, { "epoch": 0.6051771009810488, "grad_norm": 0.15318524837493896, "learning_rate": 3.59788320907888e-05, "loss": 0.4236, "step": 960 }, { "epoch": 0.6058074937945708, "grad_norm": 0.12440776824951172, "learning_rate": 3.5960800353137044e-05, "loss": 0.4105, "step": 961 }, { "epoch": 0.6064378866080926, "grad_norm": 0.13105151057243347, "learning_rate": 3.594275514102757e-05, "loss": 0.3824, "step": 962 }, { "epoch": 0.6070682794216146, "grad_norm": 0.14685268700122833, "learning_rate": 3.59246964725239e-05, "loss": 0.4375, "step": 963 }, { "epoch": 0.6076986722351365, "grad_norm": 0.13163559138774872, "learning_rate": 3.5906624365703024e-05, "loss": 0.3601, "step": 964 }, { "epoch": 0.6083290650486585, "grad_norm": 0.17752410471439362, "learning_rate": 3.588853883865538e-05, "loss": 0.4102, "step": 965 }, { "epoch": 0.6089594578621804, "grad_norm": 0.13089518249034882, "learning_rate": 3.587043990948485e-05, "loss": 0.4087, "step": 966 }, { "epoch": 0.6095898506757023, "grad_norm": 0.13238897919654846, "learning_rate": 3.5852327596308724e-05, "loss": 0.4105, "step": 967 }, { "epoch": 0.6102202434892242, "grad_norm": 0.17239437997341156, "learning_rate": 3.583420191725769e-05, "loss": 0.4815, "step": 968 }, { "epoch": 0.6108506363027462, "grad_norm": 0.1444268673658371, "learning_rate": 3.5816062890475824e-05, "loss": 0.4018, "step": 969 }, { "epoch": 0.6114810291162681, "grad_norm": 0.14324289560317993, "learning_rate": 3.579791053412056e-05, "loss": 0.3432, "step": 970 }, { "epoch": 0.61211142192979, "grad_norm": 0.17291143536567688, "learning_rate": 3.5779744866362645e-05, "loss": 0.4084, "step": 971 }, { "epoch": 0.6127418147433119, "grad_norm": 0.1407811939716339, "learning_rate": 3.576156590538621e-05, "loss": 0.3445, "step": 972 }, { "epoch": 0.6133722075568339, "grad_norm": 0.158295676112175, "learning_rate": 3.5743373669388644e-05, "loss": 0.3933, "step": 973 }, { "epoch": 0.6140026003703558, "grad_norm": 0.1508656144142151, "learning_rate": 3.572516817658065e-05, "loss": 0.4026, "step": 974 }, { "epoch": 0.6146329931838777, "grad_norm": 0.12717203795909882, "learning_rate": 3.570694944518618e-05, "loss": 0.395, "step": 975 }, { "epoch": 0.6152633859973996, "grad_norm": 0.14242561161518097, "learning_rate": 3.568871749344246e-05, "loss": 0.347, "step": 976 }, { "epoch": 0.6158937788109216, "grad_norm": 0.15714222192764282, "learning_rate": 3.5670472339599944e-05, "loss": 0.4108, "step": 977 }, { "epoch": 0.6165241716244435, "grad_norm": 0.18858179450035095, "learning_rate": 3.565221400192227e-05, "loss": 0.4192, "step": 978 }, { "epoch": 0.6171545644379655, "grad_norm": 0.12744100391864777, "learning_rate": 3.5633942498686344e-05, "loss": 0.4141, "step": 979 }, { "epoch": 0.6177849572514873, "grad_norm": 0.12514697015285492, "learning_rate": 3.5615657848182175e-05, "loss": 0.3693, "step": 980 }, { "epoch": 0.6184153500650093, "grad_norm": 0.11576073616743088, "learning_rate": 3.5597360068712976e-05, "loss": 0.3417, "step": 981 }, { "epoch": 0.6190457428785312, "grad_norm": 0.14353668689727783, "learning_rate": 3.557904917859508e-05, "loss": 0.368, "step": 982 }, { "epoch": 0.6196761356920532, "grad_norm": 0.15207786858081818, "learning_rate": 3.556072519615797e-05, "loss": 0.3417, "step": 983 }, { "epoch": 0.620306528505575, "grad_norm": 0.22169116139411926, "learning_rate": 3.554238813974423e-05, "loss": 0.5027, "step": 984 }, { "epoch": 0.620936921319097, "grad_norm": 0.15463010966777802, "learning_rate": 3.5524038027709495e-05, "loss": 0.3696, "step": 985 }, { "epoch": 0.6215673141326189, "grad_norm": 0.19080068171024323, "learning_rate": 3.550567487842251e-05, "loss": 0.4981, "step": 986 }, { "epoch": 0.6221977069461408, "grad_norm": 0.14745832979679108, "learning_rate": 3.548729871026505e-05, "loss": 0.3288, "step": 987 }, { "epoch": 0.6228280997596627, "grad_norm": 0.16360951960086823, "learning_rate": 3.5468909541631925e-05, "loss": 0.4344, "step": 988 }, { "epoch": 0.6234584925731846, "grad_norm": 0.14530690014362335, "learning_rate": 3.545050739093097e-05, "loss": 0.3505, "step": 989 }, { "epoch": 0.6240888853867066, "grad_norm": 0.1620369851589203, "learning_rate": 3.5432092276583004e-05, "loss": 0.3794, "step": 990 }, { "epoch": 0.6247192782002285, "grad_norm": 0.15294252336025238, "learning_rate": 3.541366421702183e-05, "loss": 0.4205, "step": 991 }, { "epoch": 0.6253496710137505, "grad_norm": 0.1411382108926773, "learning_rate": 3.5395223230694186e-05, "loss": 0.4292, "step": 992 }, { "epoch": 0.6259800638272723, "grad_norm": 0.15150560438632965, "learning_rate": 3.537676933605979e-05, "loss": 0.3929, "step": 993 }, { "epoch": 0.6266104566407943, "grad_norm": 0.15619826316833496, "learning_rate": 3.535830255159125e-05, "loss": 0.3956, "step": 994 }, { "epoch": 0.6272408494543162, "grad_norm": 0.1517878621816635, "learning_rate": 3.533982289577409e-05, "loss": 0.3902, "step": 995 }, { "epoch": 0.6278712422678382, "grad_norm": 0.13279902935028076, "learning_rate": 3.5321330387106714e-05, "loss": 0.4051, "step": 996 }, { "epoch": 0.62850163508136, "grad_norm": 0.15745455026626587, "learning_rate": 3.5302825044100396e-05, "loss": 0.4214, "step": 997 }, { "epoch": 0.629132027894882, "grad_norm": 0.13961806893348694, "learning_rate": 3.528430688527925e-05, "loss": 0.3664, "step": 998 }, { "epoch": 0.6297624207084039, "grad_norm": 0.18050052225589752, "learning_rate": 3.5265775929180224e-05, "loss": 0.505, "step": 999 }, { "epoch": 0.6303928135219259, "grad_norm": 0.13805602490901947, "learning_rate": 3.5247232194353076e-05, "loss": 0.3696, "step": 1000 }, { "epoch": 0.6303928135219259, "eval_loss": 0.4341304302215576, "eval_runtime": 222.67, "eval_samples_per_second": 4.491, "eval_steps_per_second": 4.491, "step": 1000 }, { "epoch": 0.6310232063354477, "grad_norm": 0.1473533809185028, "learning_rate": 3.522867569936036e-05, "loss": 0.3384, "step": 1001 }, { "epoch": 0.6316535991489697, "grad_norm": 0.17155957221984863, "learning_rate": 3.5210106462777386e-05, "loss": 0.4029, "step": 1002 }, { "epoch": 0.6322839919624916, "grad_norm": 0.1242462620139122, "learning_rate": 3.519152450319225e-05, "loss": 0.3844, "step": 1003 }, { "epoch": 0.6329143847760136, "grad_norm": 0.15569186210632324, "learning_rate": 3.517292983920575e-05, "loss": 0.3944, "step": 1004 }, { "epoch": 0.6335447775895354, "grad_norm": 0.14550328254699707, "learning_rate": 3.5154322489431415e-05, "loss": 0.3772, "step": 1005 }, { "epoch": 0.6341751704030574, "grad_norm": 0.13598184287548065, "learning_rate": 3.5135702472495485e-05, "loss": 0.3739, "step": 1006 }, { "epoch": 0.6348055632165793, "grad_norm": 0.17206841707229614, "learning_rate": 3.511706980703686e-05, "loss": 0.4588, "step": 1007 }, { "epoch": 0.6354359560301013, "grad_norm": 0.1455783098936081, "learning_rate": 3.509842451170712e-05, "loss": 0.4551, "step": 1008 }, { "epoch": 0.6360663488436232, "grad_norm": 0.15823356807231903, "learning_rate": 3.507976660517047e-05, "loss": 0.3975, "step": 1009 }, { "epoch": 0.6366967416571451, "grad_norm": 0.1431056261062622, "learning_rate": 3.506109610610374e-05, "loss": 0.4013, "step": 1010 }, { "epoch": 0.637327134470667, "grad_norm": 0.14573250710964203, "learning_rate": 3.504241303319639e-05, "loss": 0.4071, "step": 1011 }, { "epoch": 0.637957527284189, "grad_norm": 0.15625494718551636, "learning_rate": 3.5023717405150436e-05, "loss": 0.4716, "step": 1012 }, { "epoch": 0.6385879200977109, "grad_norm": 0.17605744302272797, "learning_rate": 3.500500924068048e-05, "loss": 0.4658, "step": 1013 }, { "epoch": 0.6392183129112328, "grad_norm": 0.1206241101026535, "learning_rate": 3.4986288558513665e-05, "loss": 0.3339, "step": 1014 }, { "epoch": 0.6398487057247547, "grad_norm": 0.15223607420921326, "learning_rate": 3.4967555377389665e-05, "loss": 0.4369, "step": 1015 }, { "epoch": 0.6404790985382767, "grad_norm": 0.14725369215011597, "learning_rate": 3.494880971606068e-05, "loss": 0.4466, "step": 1016 }, { "epoch": 0.6411094913517986, "grad_norm": 0.14819376170635223, "learning_rate": 3.493005159329137e-05, "loss": 0.3763, "step": 1017 }, { "epoch": 0.6417398841653205, "grad_norm": 0.13168439269065857, "learning_rate": 3.491128102785891e-05, "loss": 0.3794, "step": 1018 }, { "epoch": 0.6423702769788424, "grad_norm": 0.17571082711219788, "learning_rate": 3.48924980385529e-05, "loss": 0.4371, "step": 1019 }, { "epoch": 0.6430006697923644, "grad_norm": 0.154144749045372, "learning_rate": 3.487370264417539e-05, "loss": 0.4427, "step": 1020 }, { "epoch": 0.6436310626058863, "grad_norm": 0.2012409120798111, "learning_rate": 3.4854894863540834e-05, "loss": 0.5475, "step": 1021 }, { "epoch": 0.6442614554194083, "grad_norm": 0.14221590757369995, "learning_rate": 3.4836074715476105e-05, "loss": 0.475, "step": 1022 }, { "epoch": 0.6448918482329301, "grad_norm": 0.12871815264225006, "learning_rate": 3.481724221882045e-05, "loss": 0.3711, "step": 1023 }, { "epoch": 0.6455222410464521, "grad_norm": 0.16108739376068115, "learning_rate": 3.479839739242547e-05, "loss": 0.4116, "step": 1024 }, { "epoch": 0.646152633859974, "grad_norm": 0.16816940903663635, "learning_rate": 3.47795402551551e-05, "loss": 0.4328, "step": 1025 }, { "epoch": 0.646783026673496, "grad_norm": 0.156079962849617, "learning_rate": 3.476067082588562e-05, "loss": 0.388, "step": 1026 }, { "epoch": 0.6474134194870178, "grad_norm": 0.1362450271844864, "learning_rate": 3.474178912350561e-05, "loss": 0.4717, "step": 1027 }, { "epoch": 0.6480438123005398, "grad_norm": 0.14149360358715057, "learning_rate": 3.4722895166915925e-05, "loss": 0.3397, "step": 1028 }, { "epoch": 0.6486742051140617, "grad_norm": 0.141607403755188, "learning_rate": 3.470398897502968e-05, "loss": 0.4298, "step": 1029 }, { "epoch": 0.6493045979275837, "grad_norm": 0.1447194516658783, "learning_rate": 3.4685070566772265e-05, "loss": 0.4161, "step": 1030 }, { "epoch": 0.6499349907411055, "grad_norm": 0.16443675756454468, "learning_rate": 3.4666139961081275e-05, "loss": 0.4223, "step": 1031 }, { "epoch": 0.6505653835546275, "grad_norm": 0.1416953057050705, "learning_rate": 3.4647197176906525e-05, "loss": 0.4227, "step": 1032 }, { "epoch": 0.6511957763681494, "grad_norm": 0.14385829865932465, "learning_rate": 3.462824223321002e-05, "loss": 0.3944, "step": 1033 }, { "epoch": 0.6518261691816714, "grad_norm": 0.13178785145282745, "learning_rate": 3.460927514896593e-05, "loss": 0.3748, "step": 1034 }, { "epoch": 0.6524565619951933, "grad_norm": 0.17496028542518616, "learning_rate": 3.459029594316058e-05, "loss": 0.4521, "step": 1035 }, { "epoch": 0.6530869548087151, "grad_norm": 0.13025426864624023, "learning_rate": 3.457130463479245e-05, "loss": 0.3935, "step": 1036 }, { "epoch": 0.6537173476222371, "grad_norm": 0.17129521071910858, "learning_rate": 3.4552301242872095e-05, "loss": 0.4211, "step": 1037 }, { "epoch": 0.654347740435759, "grad_norm": 0.14599891006946564, "learning_rate": 3.4533285786422206e-05, "loss": 0.3644, "step": 1038 }, { "epoch": 0.654978133249281, "grad_norm": 0.14229898154735565, "learning_rate": 3.451425828447752e-05, "loss": 0.3399, "step": 1039 }, { "epoch": 0.6556085260628028, "grad_norm": 0.13468730449676514, "learning_rate": 3.4495218756084855e-05, "loss": 0.4602, "step": 1040 }, { "epoch": 0.6562389188763248, "grad_norm": 0.14791233837604523, "learning_rate": 3.447616722030305e-05, "loss": 0.4587, "step": 1041 }, { "epoch": 0.6568693116898467, "grad_norm": 0.1709253489971161, "learning_rate": 3.445710369620297e-05, "loss": 0.4111, "step": 1042 }, { "epoch": 0.6574997045033687, "grad_norm": 0.1572764813899994, "learning_rate": 3.443802820286748e-05, "loss": 0.4076, "step": 1043 }, { "epoch": 0.6581300973168905, "grad_norm": 0.1494017392396927, "learning_rate": 3.441894075939144e-05, "loss": 0.3271, "step": 1044 }, { "epoch": 0.6587604901304125, "grad_norm": 0.1699901521205902, "learning_rate": 3.439984138488164e-05, "loss": 0.3869, "step": 1045 }, { "epoch": 0.6593908829439344, "grad_norm": 0.15090258419513702, "learning_rate": 3.438073009845684e-05, "loss": 0.3876, "step": 1046 }, { "epoch": 0.6600212757574564, "grad_norm": 0.14121317863464355, "learning_rate": 3.4361606919247733e-05, "loss": 0.3792, "step": 1047 }, { "epoch": 0.6606516685709783, "grad_norm": 0.19512726366519928, "learning_rate": 3.434247186639688e-05, "loss": 0.4897, "step": 1048 }, { "epoch": 0.6612820613845002, "grad_norm": 0.15658080577850342, "learning_rate": 3.432332495905875e-05, "loss": 0.4556, "step": 1049 }, { "epoch": 0.6619124541980221, "grad_norm": 0.13722775876522064, "learning_rate": 3.4304166216399696e-05, "loss": 0.4073, "step": 1050 }, { "epoch": 0.6625428470115441, "grad_norm": 0.1241544634103775, "learning_rate": 3.428499565759787e-05, "loss": 0.3478, "step": 1051 }, { "epoch": 0.663173239825066, "grad_norm": 0.1600281447172165, "learning_rate": 3.426581330184331e-05, "loss": 0.4351, "step": 1052 }, { "epoch": 0.6638036326385879, "grad_norm": 0.14616727828979492, "learning_rate": 3.4246619168337814e-05, "loss": 0.4346, "step": 1053 }, { "epoch": 0.6644340254521098, "grad_norm": 0.1428317278623581, "learning_rate": 3.4227413276295e-05, "loss": 0.4193, "step": 1054 }, { "epoch": 0.6650644182656318, "grad_norm": 0.15601269900798798, "learning_rate": 3.4208195644940246e-05, "loss": 0.3534, "step": 1055 }, { "epoch": 0.6656948110791537, "grad_norm": 0.12326395511627197, "learning_rate": 3.418896629351067e-05, "loss": 0.3143, "step": 1056 }, { "epoch": 0.6663252038926756, "grad_norm": 0.15271161496639252, "learning_rate": 3.416972524125515e-05, "loss": 0.4322, "step": 1057 }, { "epoch": 0.6669555967061975, "grad_norm": 0.15423282980918884, "learning_rate": 3.4150472507434245e-05, "loss": 0.3973, "step": 1058 }, { "epoch": 0.6675859895197195, "grad_norm": 0.14533115923404694, "learning_rate": 3.413120811132024e-05, "loss": 0.3989, "step": 1059 }, { "epoch": 0.6682163823332414, "grad_norm": 0.13678240776062012, "learning_rate": 3.411193207219705e-05, "loss": 0.4486, "step": 1060 }, { "epoch": 0.6688467751467633, "grad_norm": 0.17480261623859406, "learning_rate": 3.409264440936031e-05, "loss": 0.3908, "step": 1061 }, { "epoch": 0.6694771679602852, "grad_norm": 0.15733547508716583, "learning_rate": 3.407334514211722e-05, "loss": 0.3599, "step": 1062 }, { "epoch": 0.6701075607738072, "grad_norm": 0.14672769606113434, "learning_rate": 3.4054034289786643e-05, "loss": 0.3635, "step": 1063 }, { "epoch": 0.6707379535873291, "grad_norm": 0.1510699838399887, "learning_rate": 3.4034711871699026e-05, "loss": 0.4569, "step": 1064 }, { "epoch": 0.6713683464008511, "grad_norm": 0.13803254067897797, "learning_rate": 3.401537790719637e-05, "loss": 0.3643, "step": 1065 }, { "epoch": 0.6719987392143729, "grad_norm": 0.16187086701393127, "learning_rate": 3.3996032415632296e-05, "loss": 0.436, "step": 1066 }, { "epoch": 0.6726291320278949, "grad_norm": 0.16465528309345245, "learning_rate": 3.3976675416371874e-05, "loss": 0.413, "step": 1067 }, { "epoch": 0.6732595248414168, "grad_norm": 0.15643639862537384, "learning_rate": 3.395730692879178e-05, "loss": 0.469, "step": 1068 }, { "epoch": 0.6738899176549388, "grad_norm": 0.18642200529575348, "learning_rate": 3.393792697228014e-05, "loss": 0.4506, "step": 1069 }, { "epoch": 0.6745203104684606, "grad_norm": 0.1551331728696823, "learning_rate": 3.3918535566236585e-05, "loss": 0.451, "step": 1070 }, { "epoch": 0.6751507032819826, "grad_norm": 0.1358264684677124, "learning_rate": 3.3899132730072174e-05, "loss": 0.4065, "step": 1071 }, { "epoch": 0.6757810960955045, "grad_norm": 0.13260263204574585, "learning_rate": 3.387971848320944e-05, "loss": 0.3795, "step": 1072 }, { "epoch": 0.6764114889090265, "grad_norm": 0.13498187065124512, "learning_rate": 3.3860292845082335e-05, "loss": 0.4265, "step": 1073 }, { "epoch": 0.6770418817225483, "grad_norm": 0.15217430889606476, "learning_rate": 3.38408558351362e-05, "loss": 0.433, "step": 1074 }, { "epoch": 0.6776722745360703, "grad_norm": 0.14895272254943848, "learning_rate": 3.3821407472827764e-05, "loss": 0.3977, "step": 1075 }, { "epoch": 0.6783026673495922, "grad_norm": 0.14306458830833435, "learning_rate": 3.380194777762513e-05, "loss": 0.3816, "step": 1076 }, { "epoch": 0.6789330601631142, "grad_norm": 0.15742669999599457, "learning_rate": 3.378247676900772e-05, "loss": 0.4769, "step": 1077 }, { "epoch": 0.6795634529766361, "grad_norm": 0.1568499207496643, "learning_rate": 3.376299446646631e-05, "loss": 0.438, "step": 1078 }, { "epoch": 0.680193845790158, "grad_norm": 0.1393679827451706, "learning_rate": 3.374350088950297e-05, "loss": 0.4329, "step": 1079 }, { "epoch": 0.6808242386036799, "grad_norm": 0.14324618875980377, "learning_rate": 3.372399605763106e-05, "loss": 0.347, "step": 1080 }, { "epoch": 0.6814546314172019, "grad_norm": 0.14731600880622864, "learning_rate": 3.370447999037518e-05, "loss": 0.3856, "step": 1081 }, { "epoch": 0.6820850242307238, "grad_norm": 0.13627250492572784, "learning_rate": 3.3684952707271226e-05, "loss": 0.3602, "step": 1082 }, { "epoch": 0.6827154170442457, "grad_norm": 0.15852227807044983, "learning_rate": 3.366541422786627e-05, "loss": 0.3785, "step": 1083 }, { "epoch": 0.6833458098577676, "grad_norm": 0.16282619535923004, "learning_rate": 3.3645864571718634e-05, "loss": 0.3979, "step": 1084 }, { "epoch": 0.6839762026712896, "grad_norm": 0.14918841421604156, "learning_rate": 3.3626303758397795e-05, "loss": 0.4108, "step": 1085 }, { "epoch": 0.6846065954848115, "grad_norm": 0.16575288772583008, "learning_rate": 3.3606731807484426e-05, "loss": 0.4184, "step": 1086 }, { "epoch": 0.6852369882983333, "grad_norm": 0.1275065392255783, "learning_rate": 3.358714873857031e-05, "loss": 0.4038, "step": 1087 }, { "epoch": 0.6858673811118553, "grad_norm": 0.14004617929458618, "learning_rate": 3.356755457125843e-05, "loss": 0.4864, "step": 1088 }, { "epoch": 0.6864977739253773, "grad_norm": 0.1653287708759308, "learning_rate": 3.354794932516279e-05, "loss": 0.4619, "step": 1089 }, { "epoch": 0.6871281667388992, "grad_norm": 0.15715204179286957, "learning_rate": 3.3528333019908546e-05, "loss": 0.4256, "step": 1090 }, { "epoch": 0.6877585595524212, "grad_norm": 0.1482582837343216, "learning_rate": 3.350870567513191e-05, "loss": 0.3792, "step": 1091 }, { "epoch": 0.688388952365943, "grad_norm": 0.14185462892055511, "learning_rate": 3.3489067310480124e-05, "loss": 0.3564, "step": 1092 }, { "epoch": 0.689019345179465, "grad_norm": 0.14399947226047516, "learning_rate": 3.34694179456115e-05, "loss": 0.4025, "step": 1093 }, { "epoch": 0.6896497379929869, "grad_norm": 0.17147956788539886, "learning_rate": 3.3449757600195324e-05, "loss": 0.4628, "step": 1094 }, { "epoch": 0.6902801308065089, "grad_norm": 0.18218888342380524, "learning_rate": 3.343008629391189e-05, "loss": 0.4818, "step": 1095 }, { "epoch": 0.6909105236200307, "grad_norm": 0.1327589452266693, "learning_rate": 3.341040404645247e-05, "loss": 0.4052, "step": 1096 }, { "epoch": 0.6915409164335526, "grad_norm": 0.1555880606174469, "learning_rate": 3.339071087751928e-05, "loss": 0.4009, "step": 1097 }, { "epoch": 0.6921713092470746, "grad_norm": 0.14132007956504822, "learning_rate": 3.337100680682546e-05, "loss": 0.3736, "step": 1098 }, { "epoch": 0.6928017020605965, "grad_norm": 0.12532183527946472, "learning_rate": 3.335129185409509e-05, "loss": 0.3825, "step": 1099 }, { "epoch": 0.6934320948741184, "grad_norm": 0.1209065243601799, "learning_rate": 3.3331566039063104e-05, "loss": 0.3834, "step": 1100 }, { "epoch": 0.6940624876876403, "grad_norm": 0.15299884974956512, "learning_rate": 3.331182938147534e-05, "loss": 0.3554, "step": 1101 }, { "epoch": 0.6946928805011623, "grad_norm": 0.15046365559101105, "learning_rate": 3.329208190108849e-05, "loss": 0.358, "step": 1102 }, { "epoch": 0.6953232733146842, "grad_norm": 0.1376636028289795, "learning_rate": 3.327232361767006e-05, "loss": 0.374, "step": 1103 }, { "epoch": 0.6959536661282061, "grad_norm": 0.17495515942573547, "learning_rate": 3.3252554550998376e-05, "loss": 0.4924, "step": 1104 }, { "epoch": 0.696584058941728, "grad_norm": 0.1590355485677719, "learning_rate": 3.323277472086258e-05, "loss": 0.4235, "step": 1105 }, { "epoch": 0.69721445175525, "grad_norm": 0.13074256479740143, "learning_rate": 3.3212984147062556e-05, "loss": 0.3432, "step": 1106 }, { "epoch": 0.6978448445687719, "grad_norm": 0.1273060142993927, "learning_rate": 3.319318284940896e-05, "loss": 0.3498, "step": 1107 }, { "epoch": 0.6984752373822939, "grad_norm": 0.142560213804245, "learning_rate": 3.3173370847723185e-05, "loss": 0.4031, "step": 1108 }, { "epoch": 0.6991056301958157, "grad_norm": 0.16981838643550873, "learning_rate": 3.315354816183732e-05, "loss": 0.4806, "step": 1109 }, { "epoch": 0.6997360230093377, "grad_norm": 0.12655171751976013, "learning_rate": 3.313371481159418e-05, "loss": 0.3589, "step": 1110 }, { "epoch": 0.7003664158228596, "grad_norm": 0.15416717529296875, "learning_rate": 3.3113870816847234e-05, "loss": 0.4265, "step": 1111 }, { "epoch": 0.7009968086363816, "grad_norm": 0.1576196402311325, "learning_rate": 3.309401619746061e-05, "loss": 0.3929, "step": 1112 }, { "epoch": 0.7016272014499034, "grad_norm": 0.142515167593956, "learning_rate": 3.307415097330906e-05, "loss": 0.3744, "step": 1113 }, { "epoch": 0.7022575942634254, "grad_norm": 0.12994253635406494, "learning_rate": 3.305427516427798e-05, "loss": 0.3346, "step": 1114 }, { "epoch": 0.7028879870769473, "grad_norm": 0.1487005203962326, "learning_rate": 3.303438879026334e-05, "loss": 0.3755, "step": 1115 }, { "epoch": 0.7035183798904693, "grad_norm": 0.13746176660060883, "learning_rate": 3.301449187117168e-05, "loss": 0.3748, "step": 1116 }, { "epoch": 0.7041487727039911, "grad_norm": 0.10952261835336685, "learning_rate": 3.299458442692014e-05, "loss": 0.3291, "step": 1117 }, { "epoch": 0.7047791655175131, "grad_norm": 0.14707626402378082, "learning_rate": 3.297466647743631e-05, "loss": 0.376, "step": 1118 }, { "epoch": 0.705409558331035, "grad_norm": 0.15899558365345, "learning_rate": 3.295473804265839e-05, "loss": 0.4085, "step": 1119 }, { "epoch": 0.706039951144557, "grad_norm": 0.1395518183708191, "learning_rate": 3.293479914253502e-05, "loss": 0.3351, "step": 1120 }, { "epoch": 0.7066703439580789, "grad_norm": 0.13215899467468262, "learning_rate": 3.291484979702533e-05, "loss": 0.3844, "step": 1121 }, { "epoch": 0.7073007367716008, "grad_norm": 0.12787476181983948, "learning_rate": 3.28948900260989e-05, "loss": 0.4299, "step": 1122 }, { "epoch": 0.7079311295851227, "grad_norm": 0.1378302276134491, "learning_rate": 3.2874919849735766e-05, "loss": 0.3924, "step": 1123 }, { "epoch": 0.7085615223986447, "grad_norm": 0.18550285696983337, "learning_rate": 3.285493928792636e-05, "loss": 0.5406, "step": 1124 }, { "epoch": 0.7091919152121666, "grad_norm": 0.15087875723838806, "learning_rate": 3.283494836067152e-05, "loss": 0.4411, "step": 1125 }, { "epoch": 0.7098223080256885, "grad_norm": 0.14296133816242218, "learning_rate": 3.281494708798245e-05, "loss": 0.42, "step": 1126 }, { "epoch": 0.7104527008392104, "grad_norm": 0.16558092832565308, "learning_rate": 3.2794935489880715e-05, "loss": 0.4505, "step": 1127 }, { "epoch": 0.7110830936527324, "grad_norm": 0.14892105758190155, "learning_rate": 3.277491358639822e-05, "loss": 0.4671, "step": 1128 }, { "epoch": 0.7117134864662543, "grad_norm": 0.14008575677871704, "learning_rate": 3.275488139757718e-05, "loss": 0.3752, "step": 1129 }, { "epoch": 0.7123438792797762, "grad_norm": 0.12162633240222931, "learning_rate": 3.2734838943470116e-05, "loss": 0.3555, "step": 1130 }, { "epoch": 0.7129742720932981, "grad_norm": 0.17720535397529602, "learning_rate": 3.271478624413981e-05, "loss": 0.4747, "step": 1131 }, { "epoch": 0.7136046649068201, "grad_norm": 0.13535228371620178, "learning_rate": 3.2694723319659304e-05, "loss": 0.4067, "step": 1132 }, { "epoch": 0.714235057720342, "grad_norm": 0.15453457832336426, "learning_rate": 3.267465019011189e-05, "loss": 0.4482, "step": 1133 }, { "epoch": 0.714865450533864, "grad_norm": 0.13449645042419434, "learning_rate": 3.2654566875591054e-05, "loss": 0.3486, "step": 1134 }, { "epoch": 0.7154958433473858, "grad_norm": 0.14120006561279297, "learning_rate": 3.2634473396200495e-05, "loss": 0.4511, "step": 1135 }, { "epoch": 0.7161262361609078, "grad_norm": 0.13961990177631378, "learning_rate": 3.261436977205407e-05, "loss": 0.3933, "step": 1136 }, { "epoch": 0.7167566289744297, "grad_norm": 0.1289156675338745, "learning_rate": 3.259425602327581e-05, "loss": 0.3766, "step": 1137 }, { "epoch": 0.7173870217879517, "grad_norm": 0.15146243572235107, "learning_rate": 3.2574132169999856e-05, "loss": 0.4708, "step": 1138 }, { "epoch": 0.7180174146014735, "grad_norm": 0.1951146125793457, "learning_rate": 3.25539982323705e-05, "loss": 0.4631, "step": 1139 }, { "epoch": 0.7186478074149955, "grad_norm": 0.17794398963451385, "learning_rate": 3.253385423054209e-05, "loss": 0.4093, "step": 1140 }, { "epoch": 0.7192782002285174, "grad_norm": 0.13891717791557312, "learning_rate": 3.251370018467908e-05, "loss": 0.3927, "step": 1141 }, { "epoch": 0.7199085930420394, "grad_norm": 0.15158259868621826, "learning_rate": 3.249353611495595e-05, "loss": 0.4271, "step": 1142 }, { "epoch": 0.7205389858555612, "grad_norm": 0.14888045191764832, "learning_rate": 3.247336204155724e-05, "loss": 0.3671, "step": 1143 }, { "epoch": 0.7211693786690831, "grad_norm": 0.15289460122585297, "learning_rate": 3.245317798467748e-05, "loss": 0.4726, "step": 1144 }, { "epoch": 0.7217997714826051, "grad_norm": 0.15524503588676453, "learning_rate": 3.243298396452122e-05, "loss": 0.4251, "step": 1145 }, { "epoch": 0.722430164296127, "grad_norm": 0.1572655588388443, "learning_rate": 3.241278000130297e-05, "loss": 0.3803, "step": 1146 }, { "epoch": 0.723060557109649, "grad_norm": 0.15959100425243378, "learning_rate": 3.2392566115247166e-05, "loss": 0.3879, "step": 1147 }, { "epoch": 0.7236909499231708, "grad_norm": 0.1515599489212036, "learning_rate": 3.2372342326588234e-05, "loss": 0.3734, "step": 1148 }, { "epoch": 0.7243213427366928, "grad_norm": 0.15384040772914886, "learning_rate": 3.235210865557047e-05, "loss": 0.4155, "step": 1149 }, { "epoch": 0.7249517355502147, "grad_norm": 0.13184258341789246, "learning_rate": 3.233186512244807e-05, "loss": 0.3707, "step": 1150 }, { "epoch": 0.7255821283637367, "grad_norm": 0.1614120900630951, "learning_rate": 3.23116117474851e-05, "loss": 0.3841, "step": 1151 }, { "epoch": 0.7262125211772585, "grad_norm": 0.16377252340316772, "learning_rate": 3.2291348550955506e-05, "loss": 0.4358, "step": 1152 }, { "epoch": 0.7268429139907805, "grad_norm": 0.17715105414390564, "learning_rate": 3.227107555314303e-05, "loss": 0.4311, "step": 1153 }, { "epoch": 0.7274733068043024, "grad_norm": 0.12353774160146713, "learning_rate": 3.2250792774341235e-05, "loss": 0.3998, "step": 1154 }, { "epoch": 0.7281036996178244, "grad_norm": 0.18568025529384613, "learning_rate": 3.223050023485349e-05, "loss": 0.3944, "step": 1155 }, { "epoch": 0.7287340924313462, "grad_norm": 0.14664076268672943, "learning_rate": 3.221019795499291e-05, "loss": 0.4321, "step": 1156 }, { "epoch": 0.7293644852448682, "grad_norm": 0.14460389316082, "learning_rate": 3.218988595508239e-05, "loss": 0.4517, "step": 1157 }, { "epoch": 0.7299948780583901, "grad_norm": 0.13419942557811737, "learning_rate": 3.2169564255454525e-05, "loss": 0.3954, "step": 1158 }, { "epoch": 0.7306252708719121, "grad_norm": 0.12872156500816345, "learning_rate": 3.2149232876451636e-05, "loss": 0.3859, "step": 1159 }, { "epoch": 0.7312556636854339, "grad_norm": 0.17439115047454834, "learning_rate": 3.212889183842573e-05, "loss": 0.4542, "step": 1160 }, { "epoch": 0.7318860564989559, "grad_norm": 0.1659138798713684, "learning_rate": 3.210854116173849e-05, "loss": 0.4502, "step": 1161 }, { "epoch": 0.7325164493124778, "grad_norm": 0.1282382756471634, "learning_rate": 3.208818086676123e-05, "loss": 0.345, "step": 1162 }, { "epoch": 0.7331468421259998, "grad_norm": 0.14103926718235016, "learning_rate": 3.206781097387492e-05, "loss": 0.3867, "step": 1163 }, { "epoch": 0.7337772349395217, "grad_norm": 0.13906313478946686, "learning_rate": 3.2047431503470094e-05, "loss": 0.423, "step": 1164 }, { "epoch": 0.7344076277530436, "grad_norm": 0.18526320159435272, "learning_rate": 3.2027042475946924e-05, "loss": 0.4396, "step": 1165 }, { "epoch": 0.7350380205665655, "grad_norm": 0.12118617445230484, "learning_rate": 3.200664391171511e-05, "loss": 0.3873, "step": 1166 }, { "epoch": 0.7356684133800875, "grad_norm": 0.1466676890850067, "learning_rate": 3.198623583119392e-05, "loss": 0.3754, "step": 1167 }, { "epoch": 0.7362988061936094, "grad_norm": 0.1386253833770752, "learning_rate": 3.196581825481213e-05, "loss": 0.4402, "step": 1168 }, { "epoch": 0.7369291990071313, "grad_norm": 0.14989392459392548, "learning_rate": 3.194539120300804e-05, "loss": 0.3425, "step": 1169 }, { "epoch": 0.7375595918206532, "grad_norm": 0.14861762523651123, "learning_rate": 3.192495469622943e-05, "loss": 0.3832, "step": 1170 }, { "epoch": 0.7381899846341752, "grad_norm": 0.14428891241550446, "learning_rate": 3.190450875493354e-05, "loss": 0.4072, "step": 1171 }, { "epoch": 0.7388203774476971, "grad_norm": 0.12648852169513702, "learning_rate": 3.1884053399587054e-05, "loss": 0.3947, "step": 1172 }, { "epoch": 0.739450770261219, "grad_norm": 0.1434190273284912, "learning_rate": 3.186358865066608e-05, "loss": 0.3826, "step": 1173 }, { "epoch": 0.7400811630747409, "grad_norm": 0.1685863435268402, "learning_rate": 3.184311452865613e-05, "loss": 0.4157, "step": 1174 }, { "epoch": 0.7407115558882629, "grad_norm": 0.15299765765666962, "learning_rate": 3.182263105405211e-05, "loss": 0.3789, "step": 1175 }, { "epoch": 0.7413419487017848, "grad_norm": 0.11780162155628204, "learning_rate": 3.1802138247358255e-05, "loss": 0.3864, "step": 1176 }, { "epoch": 0.7419723415153068, "grad_norm": 0.1341855227947235, "learning_rate": 3.1781636129088186e-05, "loss": 0.3601, "step": 1177 }, { "epoch": 0.7426027343288286, "grad_norm": 0.14695020020008087, "learning_rate": 3.176112471976481e-05, "loss": 0.4271, "step": 1178 }, { "epoch": 0.7432331271423506, "grad_norm": 0.15160372853279114, "learning_rate": 3.1740604039920354e-05, "loss": 0.4123, "step": 1179 }, { "epoch": 0.7438635199558725, "grad_norm": 0.14544130861759186, "learning_rate": 3.172007411009631e-05, "loss": 0.4237, "step": 1180 }, { "epoch": 0.7444939127693945, "grad_norm": 0.1619604080915451, "learning_rate": 3.169953495084345e-05, "loss": 0.4467, "step": 1181 }, { "epoch": 0.7451243055829163, "grad_norm": 0.1389506459236145, "learning_rate": 3.167898658272176e-05, "loss": 0.3846, "step": 1182 }, { "epoch": 0.7457546983964383, "grad_norm": 0.14674296975135803, "learning_rate": 3.165842902630047e-05, "loss": 0.3836, "step": 1183 }, { "epoch": 0.7463850912099602, "grad_norm": 0.1308245211839676, "learning_rate": 3.163786230215798e-05, "loss": 0.3929, "step": 1184 }, { "epoch": 0.7470154840234822, "grad_norm": 0.14882703125476837, "learning_rate": 3.161728643088189e-05, "loss": 0.4624, "step": 1185 }, { "epoch": 0.747645876837004, "grad_norm": 0.14521747827529907, "learning_rate": 3.159670143306893e-05, "loss": 0.3438, "step": 1186 }, { "epoch": 0.748276269650526, "grad_norm": 0.14668028056621552, "learning_rate": 3.157610732932502e-05, "loss": 0.3262, "step": 1187 }, { "epoch": 0.7489066624640479, "grad_norm": 0.13926194608211517, "learning_rate": 3.155550414026512e-05, "loss": 0.3905, "step": 1188 }, { "epoch": 0.7495370552775699, "grad_norm": 0.19401229918003082, "learning_rate": 3.153489188651335e-05, "loss": 0.5161, "step": 1189 }, { "epoch": 0.7501674480910918, "grad_norm": 0.19589434564113617, "learning_rate": 3.151427058870287e-05, "loss": 0.557, "step": 1190 }, { "epoch": 0.7507978409046137, "grad_norm": 0.12772412598133087, "learning_rate": 3.1493640267475883e-05, "loss": 0.393, "step": 1191 }, { "epoch": 0.7514282337181356, "grad_norm": 0.15101544559001923, "learning_rate": 3.147300094348366e-05, "loss": 0.3938, "step": 1192 }, { "epoch": 0.7520586265316576, "grad_norm": 0.12661443650722504, "learning_rate": 3.1452352637386465e-05, "loss": 0.3683, "step": 1193 }, { "epoch": 0.7526890193451795, "grad_norm": 0.14246030151844025, "learning_rate": 3.1431695369853554e-05, "loss": 0.4059, "step": 1194 }, { "epoch": 0.7533194121587014, "grad_norm": 0.14124950766563416, "learning_rate": 3.1411029161563146e-05, "loss": 0.3998, "step": 1195 }, { "epoch": 0.7539498049722233, "grad_norm": 0.14489232003688812, "learning_rate": 3.139035403320242e-05, "loss": 0.3704, "step": 1196 }, { "epoch": 0.7545801977857453, "grad_norm": 0.16528914868831635, "learning_rate": 3.1369670005467486e-05, "loss": 0.3777, "step": 1197 }, { "epoch": 0.7552105905992672, "grad_norm": 0.1324455440044403, "learning_rate": 3.1348977099063354e-05, "loss": 0.3648, "step": 1198 }, { "epoch": 0.755840983412789, "grad_norm": 0.14387087523937225, "learning_rate": 3.132827533470393e-05, "loss": 0.4182, "step": 1199 }, { "epoch": 0.756471376226311, "grad_norm": 0.17798291146755219, "learning_rate": 3.130756473311197e-05, "loss": 0.4266, "step": 1200 }, { "epoch": 0.756471376226311, "eval_loss": 0.42969560623168945, "eval_runtime": 222.3502, "eval_samples_per_second": 4.497, "eval_steps_per_second": 4.497, "step": 1200 }, { "epoch": 0.757101769039833, "grad_norm": 0.13976815342903137, "learning_rate": 3.12868453150191e-05, "loss": 0.4242, "step": 1201 }, { "epoch": 0.7577321618533549, "grad_norm": 0.16970422863960266, "learning_rate": 3.1266117101165755e-05, "loss": 0.3967, "step": 1202 }, { "epoch": 0.7583625546668767, "grad_norm": 0.14190421998500824, "learning_rate": 3.124538011230119e-05, "loss": 0.4174, "step": 1203 }, { "epoch": 0.7589929474803987, "grad_norm": 0.18532924354076385, "learning_rate": 3.1224634369183415e-05, "loss": 0.4296, "step": 1204 }, { "epoch": 0.7596233402939206, "grad_norm": 0.14926187694072723, "learning_rate": 3.120387989257924e-05, "loss": 0.3885, "step": 1205 }, { "epoch": 0.7602537331074426, "grad_norm": 0.15586240589618683, "learning_rate": 3.118311670326418e-05, "loss": 0.4022, "step": 1206 }, { "epoch": 0.7608841259209645, "grad_norm": 0.15296033024787903, "learning_rate": 3.116234482202252e-05, "loss": 0.384, "step": 1207 }, { "epoch": 0.7615145187344864, "grad_norm": 0.13187125325202942, "learning_rate": 3.114156426964719e-05, "loss": 0.399, "step": 1208 }, { "epoch": 0.7621449115480083, "grad_norm": 0.15693902969360352, "learning_rate": 3.1120775066939835e-05, "loss": 0.418, "step": 1209 }, { "epoch": 0.7627753043615303, "grad_norm": 0.13276666402816772, "learning_rate": 3.1099977234710743e-05, "loss": 0.4256, "step": 1210 }, { "epoch": 0.7634056971750522, "grad_norm": 0.16410839557647705, "learning_rate": 3.107917079377886e-05, "loss": 0.4195, "step": 1211 }, { "epoch": 0.7640360899885741, "grad_norm": 0.1418546438217163, "learning_rate": 3.105835576497174e-05, "loss": 0.3913, "step": 1212 }, { "epoch": 0.764666482802096, "grad_norm": 0.1336127668619156, "learning_rate": 3.103753216912552e-05, "loss": 0.4342, "step": 1213 }, { "epoch": 0.765296875615618, "grad_norm": 0.14342859387397766, "learning_rate": 3.101670002708492e-05, "loss": 0.3532, "step": 1214 }, { "epoch": 0.7659272684291399, "grad_norm": 0.13100619614124298, "learning_rate": 3.0995859359703234e-05, "loss": 0.3356, "step": 1215 }, { "epoch": 0.7665576612426618, "grad_norm": 0.18905891478061676, "learning_rate": 3.0975010187842256e-05, "loss": 0.4619, "step": 1216 }, { "epoch": 0.7671880540561837, "grad_norm": 0.16561256349086761, "learning_rate": 3.0954152532372335e-05, "loss": 0.4726, "step": 1217 }, { "epoch": 0.7678184468697057, "grad_norm": 0.14073455333709717, "learning_rate": 3.093328641417226e-05, "loss": 0.3954, "step": 1218 }, { "epoch": 0.7684488396832276, "grad_norm": 0.15636882185935974, "learning_rate": 3.0912411854129344e-05, "loss": 0.4532, "step": 1219 }, { "epoch": 0.7690792324967496, "grad_norm": 0.14515434205532074, "learning_rate": 3.089152887313931e-05, "loss": 0.3979, "step": 1220 }, { "epoch": 0.7697096253102714, "grad_norm": 0.15273843705654144, "learning_rate": 3.0870637492106346e-05, "loss": 0.4146, "step": 1221 }, { "epoch": 0.7703400181237934, "grad_norm": 0.14053373038768768, "learning_rate": 3.084973773194301e-05, "loss": 0.4118, "step": 1222 }, { "epoch": 0.7709704109373153, "grad_norm": 0.16905413568019867, "learning_rate": 3.0828829613570275e-05, "loss": 0.4402, "step": 1223 }, { "epoch": 0.7716008037508373, "grad_norm": 0.1700507253408432, "learning_rate": 3.080791315791747e-05, "loss": 0.3915, "step": 1224 }, { "epoch": 0.7722311965643591, "grad_norm": 0.1725083589553833, "learning_rate": 3.0786988385922274e-05, "loss": 0.4382, "step": 1225 }, { "epoch": 0.7728615893778811, "grad_norm": 0.12850132584571838, "learning_rate": 3.07660553185307e-05, "loss": 0.4087, "step": 1226 }, { "epoch": 0.773491982191403, "grad_norm": 0.14398561418056488, "learning_rate": 3.0745113976697036e-05, "loss": 0.4638, "step": 1227 }, { "epoch": 0.774122375004925, "grad_norm": 0.1383698284626007, "learning_rate": 3.0724164381383884e-05, "loss": 0.3122, "step": 1228 }, { "epoch": 0.7747527678184468, "grad_norm": 0.14155955612659454, "learning_rate": 3.070320655356209e-05, "loss": 0.3587, "step": 1229 }, { "epoch": 0.7753831606319688, "grad_norm": 0.12928208708763123, "learning_rate": 3.068224051421074e-05, "loss": 0.3619, "step": 1230 }, { "epoch": 0.7760135534454907, "grad_norm": 0.16862641274929047, "learning_rate": 3.0661266284317164e-05, "loss": 0.4479, "step": 1231 }, { "epoch": 0.7766439462590127, "grad_norm": 0.14298008382320404, "learning_rate": 3.064028388487685e-05, "loss": 0.349, "step": 1232 }, { "epoch": 0.7772743390725346, "grad_norm": 0.14893986284732819, "learning_rate": 3.06192933368935e-05, "loss": 0.3044, "step": 1233 }, { "epoch": 0.7779047318860565, "grad_norm": 0.16283908486366272, "learning_rate": 3.059829466137895e-05, "loss": 0.506, "step": 1234 }, { "epoch": 0.7785351246995784, "grad_norm": 0.14608779549598694, "learning_rate": 3.057728787935319e-05, "loss": 0.4079, "step": 1235 }, { "epoch": 0.7791655175131004, "grad_norm": 0.14887581765651703, "learning_rate": 3.055627301184431e-05, "loss": 0.3728, "step": 1236 }, { "epoch": 0.7797959103266223, "grad_norm": 0.15198326110839844, "learning_rate": 3.053525007988851e-05, "loss": 0.4171, "step": 1237 }, { "epoch": 0.7804263031401442, "grad_norm": 0.14626415073871613, "learning_rate": 3.0514219104530034e-05, "loss": 0.3743, "step": 1238 }, { "epoch": 0.7810566959536661, "grad_norm": 0.15413986146450043, "learning_rate": 3.049318010682121e-05, "loss": 0.4569, "step": 1239 }, { "epoch": 0.7816870887671881, "grad_norm": 0.16661998629570007, "learning_rate": 3.047213310782237e-05, "loss": 0.3922, "step": 1240 }, { "epoch": 0.78231748158071, "grad_norm": 0.14461380243301392, "learning_rate": 3.0451078128601875e-05, "loss": 0.3893, "step": 1241 }, { "epoch": 0.7829478743942319, "grad_norm": 0.15647661685943604, "learning_rate": 3.0430015190236064e-05, "loss": 0.4276, "step": 1242 }, { "epoch": 0.7835782672077538, "grad_norm": 0.1723053753376007, "learning_rate": 3.0408944313809254e-05, "loss": 0.4251, "step": 1243 }, { "epoch": 0.7842086600212758, "grad_norm": 0.1330244243144989, "learning_rate": 3.0387865520413694e-05, "loss": 0.3697, "step": 1244 }, { "epoch": 0.7848390528347977, "grad_norm": 0.1832405924797058, "learning_rate": 3.0366778831149553e-05, "loss": 0.5055, "step": 1245 }, { "epoch": 0.7854694456483197, "grad_norm": 0.1423068344593048, "learning_rate": 3.0345684267124928e-05, "loss": 0.3877, "step": 1246 }, { "epoch": 0.7860998384618415, "grad_norm": 0.13884073495864868, "learning_rate": 3.0324581849455786e-05, "loss": 0.3952, "step": 1247 }, { "epoch": 0.7867302312753635, "grad_norm": 0.14646825194358826, "learning_rate": 3.030347159926595e-05, "loss": 0.3548, "step": 1248 }, { "epoch": 0.7873606240888854, "grad_norm": 0.13451075553894043, "learning_rate": 3.028235353768709e-05, "loss": 0.4441, "step": 1249 }, { "epoch": 0.7879910169024074, "grad_norm": 0.13031382858753204, "learning_rate": 3.0261227685858693e-05, "loss": 0.415, "step": 1250 }, { "epoch": 0.7886214097159292, "grad_norm": 0.16893799602985382, "learning_rate": 3.024009406492805e-05, "loss": 0.4956, "step": 1251 }, { "epoch": 0.7892518025294512, "grad_norm": 0.14744479954242706, "learning_rate": 3.0218952696050224e-05, "loss": 0.386, "step": 1252 }, { "epoch": 0.7898821953429731, "grad_norm": 0.12958194315433502, "learning_rate": 3.019780360038802e-05, "loss": 0.374, "step": 1253 }, { "epoch": 0.790512588156495, "grad_norm": 0.14195840060710907, "learning_rate": 3.0176646799112015e-05, "loss": 0.3455, "step": 1254 }, { "epoch": 0.7911429809700169, "grad_norm": 0.1527252197265625, "learning_rate": 3.015548231340046e-05, "loss": 0.4011, "step": 1255 }, { "epoch": 0.7917733737835388, "grad_norm": 0.14155863225460052, "learning_rate": 3.0134310164439315e-05, "loss": 0.3637, "step": 1256 }, { "epoch": 0.7924037665970608, "grad_norm": 0.16283807158470154, "learning_rate": 3.0113130373422213e-05, "loss": 0.4651, "step": 1257 }, { "epoch": 0.7930341594105828, "grad_norm": 0.14141714572906494, "learning_rate": 3.0091942961550433e-05, "loss": 0.3961, "step": 1258 }, { "epoch": 0.7936645522241046, "grad_norm": 0.19669808447360992, "learning_rate": 3.0070747950032894e-05, "loss": 0.3923, "step": 1259 }, { "epoch": 0.7942949450376265, "grad_norm": 0.16972172260284424, "learning_rate": 3.0049545360086087e-05, "loss": 0.4162, "step": 1260 }, { "epoch": 0.7949253378511485, "grad_norm": 0.1589929312467575, "learning_rate": 3.0028335212934143e-05, "loss": 0.4726, "step": 1261 }, { "epoch": 0.7955557306646704, "grad_norm": 0.154354989528656, "learning_rate": 3.0007117529808713e-05, "loss": 0.3982, "step": 1262 }, { "epoch": 0.7961861234781924, "grad_norm": 0.13349230587482452, "learning_rate": 2.998589233194901e-05, "loss": 0.3245, "step": 1263 }, { "epoch": 0.7968165162917142, "grad_norm": 0.12602095305919647, "learning_rate": 2.996465964060177e-05, "loss": 0.3663, "step": 1264 }, { "epoch": 0.7974469091052362, "grad_norm": 0.15197496116161346, "learning_rate": 2.994341947702122e-05, "loss": 0.4175, "step": 1265 }, { "epoch": 0.7980773019187581, "grad_norm": 0.14406231045722961, "learning_rate": 2.992217186246908e-05, "loss": 0.3619, "step": 1266 }, { "epoch": 0.7987076947322801, "grad_norm": 0.16416853666305542, "learning_rate": 2.9900916818214522e-05, "loss": 0.4152, "step": 1267 }, { "epoch": 0.7993380875458019, "grad_norm": 0.1644122153520584, "learning_rate": 2.9879654365534157e-05, "loss": 0.4807, "step": 1268 }, { "epoch": 0.7999684803593239, "grad_norm": 0.13329042494297028, "learning_rate": 2.9858384525712007e-05, "loss": 0.3947, "step": 1269 }, { "epoch": 0.8005988731728458, "grad_norm": 0.17329329252243042, "learning_rate": 2.983710732003949e-05, "loss": 0.5129, "step": 1270 }, { "epoch": 0.8012292659863678, "grad_norm": 0.20010189712047577, "learning_rate": 2.9815822769815405e-05, "loss": 0.4258, "step": 1271 }, { "epoch": 0.8018596587998896, "grad_norm": 0.1383606642484665, "learning_rate": 2.97945308963459e-05, "loss": 0.4292, "step": 1272 }, { "epoch": 0.8024900516134116, "grad_norm": 0.18742382526397705, "learning_rate": 2.977323172094444e-05, "loss": 0.4427, "step": 1273 }, { "epoch": 0.8031204444269335, "grad_norm": 0.12892279028892517, "learning_rate": 2.9751925264931825e-05, "loss": 0.3814, "step": 1274 }, { "epoch": 0.8037508372404555, "grad_norm": 0.13529056310653687, "learning_rate": 2.973061154963612e-05, "loss": 0.3902, "step": 1275 }, { "epoch": 0.8043812300539774, "grad_norm": 0.1281760334968567, "learning_rate": 2.9709290596392665e-05, "loss": 0.3686, "step": 1276 }, { "epoch": 0.8050116228674993, "grad_norm": 0.12750031054019928, "learning_rate": 2.9687962426544054e-05, "loss": 0.3661, "step": 1277 }, { "epoch": 0.8056420156810212, "grad_norm": 0.19774696230888367, "learning_rate": 2.966662706144008e-05, "loss": 0.4035, "step": 1278 }, { "epoch": 0.8062724084945432, "grad_norm": 0.1890394538640976, "learning_rate": 2.9645284522437767e-05, "loss": 0.4657, "step": 1279 }, { "epoch": 0.8069028013080651, "grad_norm": 0.1572059839963913, "learning_rate": 2.9623934830901302e-05, "loss": 0.3899, "step": 1280 }, { "epoch": 0.807533194121587, "grad_norm": 0.1299111694097519, "learning_rate": 2.960257800820205e-05, "loss": 0.3763, "step": 1281 }, { "epoch": 0.8081635869351089, "grad_norm": 0.17201797664165497, "learning_rate": 2.958121407571849e-05, "loss": 0.4203, "step": 1282 }, { "epoch": 0.8087939797486309, "grad_norm": 0.1822853982448578, "learning_rate": 2.9559843054836234e-05, "loss": 0.4675, "step": 1283 }, { "epoch": 0.8094243725621528, "grad_norm": 0.14285458624362946, "learning_rate": 2.953846496694799e-05, "loss": 0.381, "step": 1284 }, { "epoch": 0.8100547653756747, "grad_norm": 0.13120800256729126, "learning_rate": 2.951707983345353e-05, "loss": 0.351, "step": 1285 }, { "epoch": 0.8106851581891966, "grad_norm": 0.15498490631580353, "learning_rate": 2.949568767575969e-05, "loss": 0.4421, "step": 1286 }, { "epoch": 0.8113155510027186, "grad_norm": 0.13642612099647522, "learning_rate": 2.9474288515280327e-05, "loss": 0.4148, "step": 1287 }, { "epoch": 0.8119459438162405, "grad_norm": 0.14929735660552979, "learning_rate": 2.945288237343632e-05, "loss": 0.4098, "step": 1288 }, { "epoch": 0.8125763366297625, "grad_norm": 0.15047243237495422, "learning_rate": 2.9431469271655517e-05, "loss": 0.4056, "step": 1289 }, { "epoch": 0.8132067294432843, "grad_norm": 0.17344744503498077, "learning_rate": 2.9410049231372764e-05, "loss": 0.4745, "step": 1290 }, { "epoch": 0.8138371222568063, "grad_norm": 0.13470345735549927, "learning_rate": 2.9388622274029818e-05, "loss": 0.3921, "step": 1291 }, { "epoch": 0.8144675150703282, "grad_norm": 0.18539802730083466, "learning_rate": 2.936718842107539e-05, "loss": 0.4469, "step": 1292 }, { "epoch": 0.8150979078838502, "grad_norm": 0.1868763566017151, "learning_rate": 2.9345747693965062e-05, "loss": 0.4339, "step": 1293 }, { "epoch": 0.815728300697372, "grad_norm": 0.13025294244289398, "learning_rate": 2.9324300114161335e-05, "loss": 0.3505, "step": 1294 }, { "epoch": 0.816358693510894, "grad_norm": 0.14217989146709442, "learning_rate": 2.9302845703133535e-05, "loss": 0.482, "step": 1295 }, { "epoch": 0.8169890863244159, "grad_norm": 0.1494067758321762, "learning_rate": 2.9281384482357847e-05, "loss": 0.4544, "step": 1296 }, { "epoch": 0.8176194791379379, "grad_norm": 0.139066681265831, "learning_rate": 2.9259916473317262e-05, "loss": 0.3703, "step": 1297 }, { "epoch": 0.8182498719514597, "grad_norm": 0.1567106544971466, "learning_rate": 2.9238441697501573e-05, "loss": 0.4444, "step": 1298 }, { "epoch": 0.8188802647649817, "grad_norm": 0.12469413131475449, "learning_rate": 2.9216960176407353e-05, "loss": 0.3793, "step": 1299 }, { "epoch": 0.8195106575785036, "grad_norm": 0.15910683572292328, "learning_rate": 2.91954719315379e-05, "loss": 0.3861, "step": 1300 }, { "epoch": 0.8201410503920256, "grad_norm": 0.12822704017162323, "learning_rate": 2.917397698440328e-05, "loss": 0.337, "step": 1301 }, { "epoch": 0.8207714432055474, "grad_norm": 0.1806858628988266, "learning_rate": 2.915247535652023e-05, "loss": 0.4272, "step": 1302 }, { "epoch": 0.8214018360190694, "grad_norm": 0.1507682353258133, "learning_rate": 2.913096706941221e-05, "loss": 0.4612, "step": 1303 }, { "epoch": 0.8220322288325913, "grad_norm": 0.1620669811964035, "learning_rate": 2.9109452144609315e-05, "loss": 0.4811, "step": 1304 }, { "epoch": 0.8226626216461133, "grad_norm": 0.1458032876253128, "learning_rate": 2.908793060364832e-05, "loss": 0.4287, "step": 1305 }, { "epoch": 0.8232930144596352, "grad_norm": 0.14256137609481812, "learning_rate": 2.906640246807258e-05, "loss": 0.3712, "step": 1306 }, { "epoch": 0.823923407273157, "grad_norm": 0.1815001517534256, "learning_rate": 2.904486775943208e-05, "loss": 0.4335, "step": 1307 }, { "epoch": 0.824553800086679, "grad_norm": 0.18930688500404358, "learning_rate": 2.902332649928339e-05, "loss": 0.4208, "step": 1308 }, { "epoch": 0.825184192900201, "grad_norm": 0.15047843754291534, "learning_rate": 2.90017787091896e-05, "loss": 0.4442, "step": 1309 }, { "epoch": 0.8258145857137229, "grad_norm": 0.1436319202184677, "learning_rate": 2.8980224410720387e-05, "loss": 0.3839, "step": 1310 }, { "epoch": 0.8264449785272447, "grad_norm": 0.16568589210510254, "learning_rate": 2.895866362545191e-05, "loss": 0.4122, "step": 1311 }, { "epoch": 0.8270753713407667, "grad_norm": 0.13160791993141174, "learning_rate": 2.893709637496683e-05, "loss": 0.383, "step": 1312 }, { "epoch": 0.8277057641542886, "grad_norm": 0.12996850907802582, "learning_rate": 2.891552268085427e-05, "loss": 0.3561, "step": 1313 }, { "epoch": 0.8283361569678106, "grad_norm": 0.12997615337371826, "learning_rate": 2.8893942564709828e-05, "loss": 0.367, "step": 1314 }, { "epoch": 0.8289665497813324, "grad_norm": 0.1627953201532364, "learning_rate": 2.8872356048135515e-05, "loss": 0.4475, "step": 1315 }, { "epoch": 0.8295969425948544, "grad_norm": 0.15016132593154907, "learning_rate": 2.8850763152739736e-05, "loss": 0.3783, "step": 1316 }, { "epoch": 0.8302273354083763, "grad_norm": 0.14278893172740936, "learning_rate": 2.8829163900137306e-05, "loss": 0.3903, "step": 1317 }, { "epoch": 0.8308577282218983, "grad_norm": 0.13539397716522217, "learning_rate": 2.8807558311949387e-05, "loss": 0.3694, "step": 1318 }, { "epoch": 0.8314881210354202, "grad_norm": 0.15693554282188416, "learning_rate": 2.8785946409803485e-05, "loss": 0.4324, "step": 1319 }, { "epoch": 0.8321185138489421, "grad_norm": 0.13900567591190338, "learning_rate": 2.876432821533344e-05, "loss": 0.3951, "step": 1320 }, { "epoch": 0.832748906662464, "grad_norm": 0.15492331981658936, "learning_rate": 2.8742703750179375e-05, "loss": 0.4227, "step": 1321 }, { "epoch": 0.833379299475986, "grad_norm": 0.18184980750083923, "learning_rate": 2.8721073035987694e-05, "loss": 0.416, "step": 1322 }, { "epoch": 0.8340096922895079, "grad_norm": 0.16962742805480957, "learning_rate": 2.8699436094411058e-05, "loss": 0.4497, "step": 1323 }, { "epoch": 0.8346400851030298, "grad_norm": 0.1357308179140091, "learning_rate": 2.8677792947108362e-05, "loss": 0.3872, "step": 1324 }, { "epoch": 0.8352704779165517, "grad_norm": 0.13641200959682465, "learning_rate": 2.8656143615744708e-05, "loss": 0.3701, "step": 1325 }, { "epoch": 0.8359008707300737, "grad_norm": 0.15859255194664001, "learning_rate": 2.8634488121991412e-05, "loss": 0.3139, "step": 1326 }, { "epoch": 0.8365312635435956, "grad_norm": 0.1266074925661087, "learning_rate": 2.8612826487525913e-05, "loss": 0.3832, "step": 1327 }, { "epoch": 0.8371616563571175, "grad_norm": 0.16178956627845764, "learning_rate": 2.859115873403184e-05, "loss": 0.4407, "step": 1328 }, { "epoch": 0.8377920491706394, "grad_norm": 0.14203162491321564, "learning_rate": 2.8569484883198924e-05, "loss": 0.4151, "step": 1329 }, { "epoch": 0.8384224419841614, "grad_norm": 0.16555646061897278, "learning_rate": 2.854780495672301e-05, "loss": 0.3897, "step": 1330 }, { "epoch": 0.8390528347976833, "grad_norm": 0.11057502776384354, "learning_rate": 2.852611897630602e-05, "loss": 0.3404, "step": 1331 }, { "epoch": 0.8396832276112053, "grad_norm": 0.12985214591026306, "learning_rate": 2.850442696365594e-05, "loss": 0.3201, "step": 1332 }, { "epoch": 0.8403136204247271, "grad_norm": 0.15618446469306946, "learning_rate": 2.8482728940486774e-05, "loss": 0.4648, "step": 1333 }, { "epoch": 0.8409440132382491, "grad_norm": 0.1684415489435196, "learning_rate": 2.846102492851859e-05, "loss": 0.4042, "step": 1334 }, { "epoch": 0.841574406051771, "grad_norm": 0.16451717913150787, "learning_rate": 2.8439314949477393e-05, "loss": 0.4144, "step": 1335 }, { "epoch": 0.842204798865293, "grad_norm": 0.1372554451227188, "learning_rate": 2.841759902509521e-05, "loss": 0.3653, "step": 1336 }, { "epoch": 0.8428351916788148, "grad_norm": 0.14489951729774475, "learning_rate": 2.8395877177109986e-05, "loss": 0.3859, "step": 1337 }, { "epoch": 0.8434655844923368, "grad_norm": 0.13698627054691315, "learning_rate": 2.837414942726561e-05, "loss": 0.4311, "step": 1338 }, { "epoch": 0.8440959773058587, "grad_norm": 0.12621767818927765, "learning_rate": 2.835241579731188e-05, "loss": 0.3912, "step": 1339 }, { "epoch": 0.8447263701193807, "grad_norm": 0.18279710412025452, "learning_rate": 2.8330676309004474e-05, "loss": 0.4154, "step": 1340 }, { "epoch": 0.8453567629329025, "grad_norm": 0.1633724719285965, "learning_rate": 2.830893098410494e-05, "loss": 0.3936, "step": 1341 }, { "epoch": 0.8459871557464245, "grad_norm": 0.1703907698392868, "learning_rate": 2.8287179844380662e-05, "loss": 0.4769, "step": 1342 }, { "epoch": 0.8466175485599464, "grad_norm": 0.14323486387729645, "learning_rate": 2.826542291160485e-05, "loss": 0.3686, "step": 1343 }, { "epoch": 0.8472479413734684, "grad_norm": 0.1713230162858963, "learning_rate": 2.8243660207556516e-05, "loss": 0.3874, "step": 1344 }, { "epoch": 0.8478783341869903, "grad_norm": 0.1804395169019699, "learning_rate": 2.822189175402044e-05, "loss": 0.4208, "step": 1345 }, { "epoch": 0.8485087270005122, "grad_norm": 0.12683406472206116, "learning_rate": 2.820011757278716e-05, "loss": 0.34, "step": 1346 }, { "epoch": 0.8491391198140341, "grad_norm": 0.119718998670578, "learning_rate": 2.8178337685652956e-05, "loss": 0.4118, "step": 1347 }, { "epoch": 0.8497695126275561, "grad_norm": 0.13840489089488983, "learning_rate": 2.815655211441981e-05, "loss": 0.4252, "step": 1348 }, { "epoch": 0.850399905441078, "grad_norm": 0.13725785911083221, "learning_rate": 2.8134760880895384e-05, "loss": 0.4144, "step": 1349 }, { "epoch": 0.8510302982545999, "grad_norm": 0.147429421544075, "learning_rate": 2.8112964006893044e-05, "loss": 0.4391, "step": 1350 }, { "epoch": 0.8516606910681218, "grad_norm": 0.14347843825817108, "learning_rate": 2.8091161514231764e-05, "loss": 0.3921, "step": 1351 }, { "epoch": 0.8522910838816438, "grad_norm": 0.1464390754699707, "learning_rate": 2.806935342473616e-05, "loss": 0.3983, "step": 1352 }, { "epoch": 0.8529214766951657, "grad_norm": 0.1333981603384018, "learning_rate": 2.8047539760236457e-05, "loss": 0.3999, "step": 1353 }, { "epoch": 0.8535518695086876, "grad_norm": 0.142788827419281, "learning_rate": 2.8025720542568438e-05, "loss": 0.3837, "step": 1354 }, { "epoch": 0.8541822623222095, "grad_norm": 0.1281784325838089, "learning_rate": 2.8003895793573467e-05, "loss": 0.3303, "step": 1355 }, { "epoch": 0.8548126551357315, "grad_norm": 0.14220544695854187, "learning_rate": 2.7982065535098436e-05, "loss": 0.3639, "step": 1356 }, { "epoch": 0.8554430479492534, "grad_norm": 0.13342589139938354, "learning_rate": 2.796022978899575e-05, "loss": 0.357, "step": 1357 }, { "epoch": 0.8560734407627753, "grad_norm": 0.15548692643642426, "learning_rate": 2.7938388577123307e-05, "loss": 0.4021, "step": 1358 }, { "epoch": 0.8567038335762972, "grad_norm": 0.12782569229602814, "learning_rate": 2.791654192134449e-05, "loss": 0.3451, "step": 1359 }, { "epoch": 0.8573342263898192, "grad_norm": 0.15871256589889526, "learning_rate": 2.7894689843528103e-05, "loss": 0.3999, "step": 1360 }, { "epoch": 0.8579646192033411, "grad_norm": 0.14577703177928925, "learning_rate": 2.7872832365548415e-05, "loss": 0.4041, "step": 1361 }, { "epoch": 0.8585950120168631, "grad_norm": 0.1306508630514145, "learning_rate": 2.785096950928506e-05, "loss": 0.4234, "step": 1362 }, { "epoch": 0.8592254048303849, "grad_norm": 0.18583562970161438, "learning_rate": 2.7829101296623095e-05, "loss": 0.4297, "step": 1363 }, { "epoch": 0.8598557976439068, "grad_norm": 0.14241017401218414, "learning_rate": 2.780722774945291e-05, "loss": 0.3435, "step": 1364 }, { "epoch": 0.8604861904574288, "grad_norm": 0.18705232441425323, "learning_rate": 2.7785348889670246e-05, "loss": 0.4676, "step": 1365 }, { "epoch": 0.8611165832709508, "grad_norm": 0.1302051544189453, "learning_rate": 2.7763464739176167e-05, "loss": 0.3381, "step": 1366 }, { "epoch": 0.8617469760844726, "grad_norm": 0.1736343502998352, "learning_rate": 2.7741575319877023e-05, "loss": 0.4425, "step": 1367 }, { "epoch": 0.8623773688979945, "grad_norm": 0.14309068024158478, "learning_rate": 2.771968065368444e-05, "loss": 0.4022, "step": 1368 }, { "epoch": 0.8630077617115165, "grad_norm": 0.1872050166130066, "learning_rate": 2.7697780762515295e-05, "loss": 0.4522, "step": 1369 }, { "epoch": 0.8636381545250384, "grad_norm": 0.1302749216556549, "learning_rate": 2.7675875668291715e-05, "loss": 0.3819, "step": 1370 }, { "epoch": 0.8642685473385603, "grad_norm": 0.14372815191745758, "learning_rate": 2.7653965392940994e-05, "loss": 0.3799, "step": 1371 }, { "epoch": 0.8648989401520822, "grad_norm": 0.13147087395191193, "learning_rate": 2.763204995839567e-05, "loss": 0.3821, "step": 1372 }, { "epoch": 0.8655293329656042, "grad_norm": 0.16376769542694092, "learning_rate": 2.761012938659338e-05, "loss": 0.4577, "step": 1373 }, { "epoch": 0.8661597257791261, "grad_norm": 0.13191644847393036, "learning_rate": 2.758820369947696e-05, "loss": 0.3956, "step": 1374 }, { "epoch": 0.8667901185926481, "grad_norm": 0.14816711843013763, "learning_rate": 2.756627291899433e-05, "loss": 0.3705, "step": 1375 }, { "epoch": 0.8674205114061699, "grad_norm": 0.16423572599887848, "learning_rate": 2.7544337067098536e-05, "loss": 0.4717, "step": 1376 }, { "epoch": 0.8680509042196919, "grad_norm": 0.1640395224094391, "learning_rate": 2.752239616574767e-05, "loss": 0.4146, "step": 1377 }, { "epoch": 0.8686812970332138, "grad_norm": 0.15758387744426727, "learning_rate": 2.75004502369049e-05, "loss": 0.4276, "step": 1378 }, { "epoch": 0.8693116898467358, "grad_norm": 0.16378600895404816, "learning_rate": 2.7478499302538426e-05, "loss": 0.3942, "step": 1379 }, { "epoch": 0.8699420826602576, "grad_norm": 0.15028297901153564, "learning_rate": 2.7456543384621444e-05, "loss": 0.383, "step": 1380 }, { "epoch": 0.8705724754737796, "grad_norm": 0.16802988946437836, "learning_rate": 2.743458250513216e-05, "loss": 0.4308, "step": 1381 }, { "epoch": 0.8712028682873015, "grad_norm": 0.16376203298568726, "learning_rate": 2.7412616686053725e-05, "loss": 0.3934, "step": 1382 }, { "epoch": 0.8718332611008235, "grad_norm": 0.14017964899539948, "learning_rate": 2.739064594937425e-05, "loss": 0.3852, "step": 1383 }, { "epoch": 0.8724636539143453, "grad_norm": 0.1617506444454193, "learning_rate": 2.7368670317086766e-05, "loss": 0.4662, "step": 1384 }, { "epoch": 0.8730940467278673, "grad_norm": 0.14962440729141235, "learning_rate": 2.73466898111892e-05, "loss": 0.3929, "step": 1385 }, { "epoch": 0.8737244395413892, "grad_norm": 0.16706913709640503, "learning_rate": 2.7324704453684352e-05, "loss": 0.4208, "step": 1386 }, { "epoch": 0.8743548323549112, "grad_norm": 0.16016672551631927, "learning_rate": 2.7302714266579897e-05, "loss": 0.3574, "step": 1387 }, { "epoch": 0.8749852251684331, "grad_norm": 0.13596835732460022, "learning_rate": 2.7280719271888326e-05, "loss": 0.4341, "step": 1388 }, { "epoch": 0.875615617981955, "grad_norm": 0.11969844996929169, "learning_rate": 2.725871949162695e-05, "loss": 0.378, "step": 1389 }, { "epoch": 0.8762460107954769, "grad_norm": 0.13828162848949432, "learning_rate": 2.7236714947817875e-05, "loss": 0.3713, "step": 1390 }, { "epoch": 0.8768764036089989, "grad_norm": 0.15005554258823395, "learning_rate": 2.721470566248796e-05, "loss": 0.4491, "step": 1391 }, { "epoch": 0.8775067964225208, "grad_norm": 0.1482902467250824, "learning_rate": 2.7192691657668842e-05, "loss": 0.4512, "step": 1392 }, { "epoch": 0.8781371892360427, "grad_norm": 0.12331166118383408, "learning_rate": 2.7170672955396837e-05, "loss": 0.3675, "step": 1393 }, { "epoch": 0.8787675820495646, "grad_norm": 0.13660334050655365, "learning_rate": 2.7148649577713005e-05, "loss": 0.3699, "step": 1394 }, { "epoch": 0.8793979748630866, "grad_norm": 0.137289896607399, "learning_rate": 2.712662154666306e-05, "loss": 0.3468, "step": 1395 }, { "epoch": 0.8800283676766085, "grad_norm": 0.16336098313331604, "learning_rate": 2.7104588884297388e-05, "loss": 0.4172, "step": 1396 }, { "epoch": 0.8806587604901304, "grad_norm": 0.16273409128189087, "learning_rate": 2.7082551612671002e-05, "loss": 0.4059, "step": 1397 }, { "epoch": 0.8812891533036523, "grad_norm": 0.16388723254203796, "learning_rate": 2.7060509753843532e-05, "loss": 0.4598, "step": 1398 }, { "epoch": 0.8819195461171743, "grad_norm": 0.1399746984243393, "learning_rate": 2.7038463329879205e-05, "loss": 0.4212, "step": 1399 }, { "epoch": 0.8825499389306962, "grad_norm": 0.14388558268547058, "learning_rate": 2.7016412362846804e-05, "loss": 0.4263, "step": 1400 }, { "epoch": 0.8825499389306962, "eval_loss": 0.42592132091522217, "eval_runtime": 222.6108, "eval_samples_per_second": 4.492, "eval_steps_per_second": 4.492, "step": 1400 }, { "epoch": 0.8831803317442181, "grad_norm": 0.14138349890708923, "learning_rate": 2.6994356874819683e-05, "loss": 0.4132, "step": 1401 }, { "epoch": 0.88381072455774, "grad_norm": 0.1343000829219818, "learning_rate": 2.6972296887875695e-05, "loss": 0.3542, "step": 1402 }, { "epoch": 0.884441117371262, "grad_norm": 0.15190397202968597, "learning_rate": 2.6950232424097215e-05, "loss": 0.4265, "step": 1403 }, { "epoch": 0.8850715101847839, "grad_norm": 0.1640225201845169, "learning_rate": 2.6928163505571093e-05, "loss": 0.3829, "step": 1404 }, { "epoch": 0.8857019029983059, "grad_norm": 0.14996233582496643, "learning_rate": 2.690609015438864e-05, "loss": 0.3758, "step": 1405 }, { "epoch": 0.8863322958118277, "grad_norm": 0.1581767052412033, "learning_rate": 2.68840123926456e-05, "loss": 0.4537, "step": 1406 }, { "epoch": 0.8869626886253497, "grad_norm": 0.16030016541481018, "learning_rate": 2.6861930242442137e-05, "loss": 0.3675, "step": 1407 }, { "epoch": 0.8875930814388716, "grad_norm": 0.1415213793516159, "learning_rate": 2.6839843725882805e-05, "loss": 0.3931, "step": 1408 }, { "epoch": 0.8882234742523936, "grad_norm": 0.1286267340183258, "learning_rate": 2.6817752865076526e-05, "loss": 0.3673, "step": 1409 }, { "epoch": 0.8888538670659154, "grad_norm": 0.1294373869895935, "learning_rate": 2.6795657682136582e-05, "loss": 0.3691, "step": 1410 }, { "epoch": 0.8894842598794374, "grad_norm": 0.13593226671218872, "learning_rate": 2.6773558199180566e-05, "loss": 0.32, "step": 1411 }, { "epoch": 0.8901146526929593, "grad_norm": 0.1556108444929123, "learning_rate": 2.675145443833039e-05, "loss": 0.4227, "step": 1412 }, { "epoch": 0.8907450455064813, "grad_norm": 0.13537348806858063, "learning_rate": 2.6729346421712237e-05, "loss": 0.3302, "step": 1413 }, { "epoch": 0.8913754383200031, "grad_norm": 0.14231304824352264, "learning_rate": 2.6707234171456555e-05, "loss": 0.4624, "step": 1414 }, { "epoch": 0.892005831133525, "grad_norm": 0.16324934363365173, "learning_rate": 2.668511770969803e-05, "loss": 0.4407, "step": 1415 }, { "epoch": 0.892636223947047, "grad_norm": 0.146931454539299, "learning_rate": 2.666299705857556e-05, "loss": 0.4546, "step": 1416 }, { "epoch": 0.893266616760569, "grad_norm": 0.1308489888906479, "learning_rate": 2.6640872240232247e-05, "loss": 0.3906, "step": 1417 }, { "epoch": 0.8938970095740909, "grad_norm": 0.1489260494709015, "learning_rate": 2.661874327681536e-05, "loss": 0.3559, "step": 1418 }, { "epoch": 0.8945274023876127, "grad_norm": 0.14801530539989471, "learning_rate": 2.65966101904763e-05, "loss": 0.4261, "step": 1419 }, { "epoch": 0.8951577952011347, "grad_norm": 0.138169065117836, "learning_rate": 2.6574473003370612e-05, "loss": 0.3814, "step": 1420 }, { "epoch": 0.8957881880146567, "grad_norm": 0.15858194231987, "learning_rate": 2.6552331737657956e-05, "loss": 0.4008, "step": 1421 }, { "epoch": 0.8964185808281786, "grad_norm": 0.1418287456035614, "learning_rate": 2.6530186415502055e-05, "loss": 0.3684, "step": 1422 }, { "epoch": 0.8970489736417004, "grad_norm": 0.1921471655368805, "learning_rate": 2.6508037059070698e-05, "loss": 0.4454, "step": 1423 }, { "epoch": 0.8976793664552224, "grad_norm": 0.16956478357315063, "learning_rate": 2.6485883690535716e-05, "loss": 0.4696, "step": 1424 }, { "epoch": 0.8983097592687443, "grad_norm": 0.15770865976810455, "learning_rate": 2.6463726332072958e-05, "loss": 0.3732, "step": 1425 }, { "epoch": 0.8989401520822663, "grad_norm": 0.15551583468914032, "learning_rate": 2.6441565005862253e-05, "loss": 0.3584, "step": 1426 }, { "epoch": 0.8995705448957881, "grad_norm": 0.16392450034618378, "learning_rate": 2.6419399734087426e-05, "loss": 0.3833, "step": 1427 }, { "epoch": 0.9002009377093101, "grad_norm": 0.17206740379333496, "learning_rate": 2.6397230538936233e-05, "loss": 0.4071, "step": 1428 }, { "epoch": 0.900831330522832, "grad_norm": 0.13827435672283173, "learning_rate": 2.6375057442600353e-05, "loss": 0.3501, "step": 1429 }, { "epoch": 0.901461723336354, "grad_norm": 0.20176725089550018, "learning_rate": 2.635288046727539e-05, "loss": 0.4255, "step": 1430 }, { "epoch": 0.9020921161498759, "grad_norm": 0.16234435141086578, "learning_rate": 2.6330699635160814e-05, "loss": 0.4142, "step": 1431 }, { "epoch": 0.9027225089633978, "grad_norm": 0.1656573861837387, "learning_rate": 2.6308514968459968e-05, "loss": 0.4694, "step": 1432 }, { "epoch": 0.9033529017769197, "grad_norm": 0.16659680008888245, "learning_rate": 2.628632648938002e-05, "loss": 0.4834, "step": 1433 }, { "epoch": 0.9039832945904417, "grad_norm": 0.14273500442504883, "learning_rate": 2.626413422013197e-05, "loss": 0.3792, "step": 1434 }, { "epoch": 0.9046136874039636, "grad_norm": 0.15806609392166138, "learning_rate": 2.6241938182930593e-05, "loss": 0.3034, "step": 1435 }, { "epoch": 0.9052440802174855, "grad_norm": 0.14422351121902466, "learning_rate": 2.6219738399994453e-05, "loss": 0.4349, "step": 1436 }, { "epoch": 0.9058744730310074, "grad_norm": 0.1485350877046585, "learning_rate": 2.6197534893545858e-05, "loss": 0.3553, "step": 1437 }, { "epoch": 0.9065048658445294, "grad_norm": 0.14150355756282806, "learning_rate": 2.617532768581084e-05, "loss": 0.3638, "step": 1438 }, { "epoch": 0.9071352586580513, "grad_norm": 0.13242359459400177, "learning_rate": 2.6153116799019138e-05, "loss": 0.3722, "step": 1439 }, { "epoch": 0.9077656514715732, "grad_norm": 0.15447986125946045, "learning_rate": 2.6130902255404177e-05, "loss": 0.3917, "step": 1440 }, { "epoch": 0.9083960442850951, "grad_norm": 0.18113017082214355, "learning_rate": 2.6108684077203037e-05, "loss": 0.3853, "step": 1441 }, { "epoch": 0.9090264370986171, "grad_norm": 0.19897674024105072, "learning_rate": 2.6086462286656432e-05, "loss": 0.4622, "step": 1442 }, { "epoch": 0.909656829912139, "grad_norm": 0.1397695243358612, "learning_rate": 2.606423690600871e-05, "loss": 0.3387, "step": 1443 }, { "epoch": 0.910287222725661, "grad_norm": 0.1728425920009613, "learning_rate": 2.6042007957507797e-05, "loss": 0.4727, "step": 1444 }, { "epoch": 0.9109176155391828, "grad_norm": 0.1335446983575821, "learning_rate": 2.60197754634052e-05, "loss": 0.3824, "step": 1445 }, { "epoch": 0.9115480083527048, "grad_norm": 0.16681966185569763, "learning_rate": 2.5997539445955955e-05, "loss": 0.4989, "step": 1446 }, { "epoch": 0.9121784011662267, "grad_norm": 0.14989428222179413, "learning_rate": 2.597529992741866e-05, "loss": 0.3926, "step": 1447 }, { "epoch": 0.9128087939797487, "grad_norm": 0.14452800154685974, "learning_rate": 2.595305693005538e-05, "loss": 0.3706, "step": 1448 }, { "epoch": 0.9134391867932705, "grad_norm": 0.12137465924024582, "learning_rate": 2.5930810476131697e-05, "loss": 0.3181, "step": 1449 }, { "epoch": 0.9140695796067925, "grad_norm": 0.1557312160730362, "learning_rate": 2.590856058791663e-05, "loss": 0.4069, "step": 1450 }, { "epoch": 0.9146999724203144, "grad_norm": 0.14804257452487946, "learning_rate": 2.588630728768263e-05, "loss": 0.3671, "step": 1451 }, { "epoch": 0.9153303652338364, "grad_norm": 0.17212040722370148, "learning_rate": 2.5864050597705593e-05, "loss": 0.4591, "step": 1452 }, { "epoch": 0.9159607580473582, "grad_norm": 0.119450144469738, "learning_rate": 2.5841790540264776e-05, "loss": 0.3433, "step": 1453 }, { "epoch": 0.9165911508608802, "grad_norm": 0.14319778978824615, "learning_rate": 2.581952713764284e-05, "loss": 0.4054, "step": 1454 }, { "epoch": 0.9172215436744021, "grad_norm": 0.12508279085159302, "learning_rate": 2.5797260412125756e-05, "loss": 0.3926, "step": 1455 }, { "epoch": 0.9178519364879241, "grad_norm": 0.146225243806839, "learning_rate": 2.5774990386002857e-05, "loss": 0.3393, "step": 1456 }, { "epoch": 0.9184823293014459, "grad_norm": 0.14512450993061066, "learning_rate": 2.575271708156675e-05, "loss": 0.363, "step": 1457 }, { "epoch": 0.9191127221149679, "grad_norm": 0.12677498161792755, "learning_rate": 2.5730440521113347e-05, "loss": 0.4141, "step": 1458 }, { "epoch": 0.9197431149284898, "grad_norm": 0.16313442587852478, "learning_rate": 2.570816072694181e-05, "loss": 0.387, "step": 1459 }, { "epoch": 0.9203735077420118, "grad_norm": 0.12096891552209854, "learning_rate": 2.5685877721354537e-05, "loss": 0.3407, "step": 1460 }, { "epoch": 0.9210039005555337, "grad_norm": 0.1445506513118744, "learning_rate": 2.5663591526657146e-05, "loss": 0.429, "step": 1461 }, { "epoch": 0.9216342933690556, "grad_norm": 0.1301969736814499, "learning_rate": 2.5641302165158435e-05, "loss": 0.3671, "step": 1462 }, { "epoch": 0.9222646861825775, "grad_norm": 0.14147797226905823, "learning_rate": 2.5619009659170385e-05, "loss": 0.4343, "step": 1463 }, { "epoch": 0.9228950789960995, "grad_norm": 0.15561676025390625, "learning_rate": 2.559671403100812e-05, "loss": 0.4057, "step": 1464 }, { "epoch": 0.9235254718096214, "grad_norm": 0.16878941655158997, "learning_rate": 2.5574415302989894e-05, "loss": 0.4392, "step": 1465 }, { "epoch": 0.9241558646231433, "grad_norm": 0.13875456154346466, "learning_rate": 2.5552113497437047e-05, "loss": 0.3901, "step": 1466 }, { "epoch": 0.9247862574366652, "grad_norm": 0.1460646241903305, "learning_rate": 2.5529808636674022e-05, "loss": 0.3717, "step": 1467 }, { "epoch": 0.9254166502501872, "grad_norm": 0.15384356677532196, "learning_rate": 2.550750074302831e-05, "loss": 0.3628, "step": 1468 }, { "epoch": 0.9260470430637091, "grad_norm": 0.16636264324188232, "learning_rate": 2.5485189838830437e-05, "loss": 0.3677, "step": 1469 }, { "epoch": 0.926677435877231, "grad_norm": 0.17758719623088837, "learning_rate": 2.5462875946413945e-05, "loss": 0.4612, "step": 1470 }, { "epoch": 0.9273078286907529, "grad_norm": 0.12851199507713318, "learning_rate": 2.544055908811536e-05, "loss": 0.393, "step": 1471 }, { "epoch": 0.9279382215042749, "grad_norm": 0.20510762929916382, "learning_rate": 2.541823928627419e-05, "loss": 0.4241, "step": 1472 }, { "epoch": 0.9285686143177968, "grad_norm": 0.14449749886989594, "learning_rate": 2.5395916563232873e-05, "loss": 0.3731, "step": 1473 }, { "epoch": 0.9291990071313188, "grad_norm": 0.13960114121437073, "learning_rate": 2.5373590941336798e-05, "loss": 0.3782, "step": 1474 }, { "epoch": 0.9298293999448406, "grad_norm": 0.15464681386947632, "learning_rate": 2.5351262442934225e-05, "loss": 0.3528, "step": 1475 }, { "epoch": 0.9304597927583625, "grad_norm": 0.16516605019569397, "learning_rate": 2.5328931090376307e-05, "loss": 0.4447, "step": 1476 }, { "epoch": 0.9310901855718845, "grad_norm": 0.16184210777282715, "learning_rate": 2.5306596906017057e-05, "loss": 0.4308, "step": 1477 }, { "epoch": 0.9317205783854065, "grad_norm": 0.12153437733650208, "learning_rate": 2.5284259912213328e-05, "loss": 0.3333, "step": 1478 }, { "epoch": 0.9323509711989283, "grad_norm": 0.14005841314792633, "learning_rate": 2.526192013132476e-05, "loss": 0.3686, "step": 1479 }, { "epoch": 0.9329813640124502, "grad_norm": 0.14325931668281555, "learning_rate": 2.523957758571381e-05, "loss": 0.3911, "step": 1480 }, { "epoch": 0.9336117568259722, "grad_norm": 0.14206531643867493, "learning_rate": 2.5217232297745695e-05, "loss": 0.339, "step": 1481 }, { "epoch": 0.9342421496394941, "grad_norm": 0.12405907362699509, "learning_rate": 2.519488428978837e-05, "loss": 0.3635, "step": 1482 }, { "epoch": 0.934872542453016, "grad_norm": 0.14162199199199677, "learning_rate": 2.5172533584212515e-05, "loss": 0.3568, "step": 1483 }, { "epoch": 0.9355029352665379, "grad_norm": 0.13539136946201324, "learning_rate": 2.5150180203391514e-05, "loss": 0.3735, "step": 1484 }, { "epoch": 0.9361333280800599, "grad_norm": 0.13682807981967926, "learning_rate": 2.5127824169701437e-05, "loss": 0.3539, "step": 1485 }, { "epoch": 0.9367637208935818, "grad_norm": 0.15052659809589386, "learning_rate": 2.5105465505520986e-05, "loss": 0.4234, "step": 1486 }, { "epoch": 0.9373941137071038, "grad_norm": 0.15359781682491302, "learning_rate": 2.508310423323152e-05, "loss": 0.4297, "step": 1487 }, { "epoch": 0.9380245065206256, "grad_norm": 0.15407221019268036, "learning_rate": 2.506074037521699e-05, "loss": 0.4242, "step": 1488 }, { "epoch": 0.9386548993341476, "grad_norm": 0.13012506067752838, "learning_rate": 2.503837395386396e-05, "loss": 0.4042, "step": 1489 }, { "epoch": 0.9392852921476695, "grad_norm": 0.1508699506521225, "learning_rate": 2.5016004991561532e-05, "loss": 0.4303, "step": 1490 }, { "epoch": 0.9399156849611915, "grad_norm": 0.18919318914413452, "learning_rate": 2.4993633510701366e-05, "loss": 0.4513, "step": 1491 }, { "epoch": 0.9405460777747133, "grad_norm": 0.17468729615211487, "learning_rate": 2.4971259533677642e-05, "loss": 0.4648, "step": 1492 }, { "epoch": 0.9411764705882353, "grad_norm": 0.13577426970005035, "learning_rate": 2.4948883082887034e-05, "loss": 0.3959, "step": 1493 }, { "epoch": 0.9418068634017572, "grad_norm": 0.15910327434539795, "learning_rate": 2.4926504180728703e-05, "loss": 0.4816, "step": 1494 }, { "epoch": 0.9424372562152792, "grad_norm": 0.1459847092628479, "learning_rate": 2.490412284960425e-05, "loss": 0.3895, "step": 1495 }, { "epoch": 0.943067649028801, "grad_norm": 0.15411172807216644, "learning_rate": 2.488173911191772e-05, "loss": 0.4221, "step": 1496 }, { "epoch": 0.943698041842323, "grad_norm": 0.13575363159179688, "learning_rate": 2.4859352990075558e-05, "loss": 0.3708, "step": 1497 }, { "epoch": 0.9443284346558449, "grad_norm": 0.16765348613262177, "learning_rate": 2.48369645064866e-05, "loss": 0.4337, "step": 1498 }, { "epoch": 0.9449588274693669, "grad_norm": 0.1297028809785843, "learning_rate": 2.4814573683562044e-05, "loss": 0.3579, "step": 1499 }, { "epoch": 0.9455892202828887, "grad_norm": 0.1517636775970459, "learning_rate": 2.4792180543715438e-05, "loss": 0.4615, "step": 1500 }, { "epoch": 0.9462196130964107, "grad_norm": 0.17348432540893555, "learning_rate": 2.476978510936263e-05, "loss": 0.4367, "step": 1501 }, { "epoch": 0.9468500059099326, "grad_norm": 0.15146148204803467, "learning_rate": 2.4747387402921776e-05, "loss": 0.336, "step": 1502 }, { "epoch": 0.9474803987234546, "grad_norm": 0.13965347409248352, "learning_rate": 2.4724987446813324e-05, "loss": 0.331, "step": 1503 }, { "epoch": 0.9481107915369765, "grad_norm": 0.1368507295846939, "learning_rate": 2.470258526345994e-05, "loss": 0.4138, "step": 1504 }, { "epoch": 0.9487411843504984, "grad_norm": 0.13618862628936768, "learning_rate": 2.4680180875286548e-05, "loss": 0.354, "step": 1505 }, { "epoch": 0.9493715771640203, "grad_norm": 0.17908114194869995, "learning_rate": 2.465777430472026e-05, "loss": 0.4809, "step": 1506 }, { "epoch": 0.9500019699775423, "grad_norm": 0.13375219702720642, "learning_rate": 2.4635365574190383e-05, "loss": 0.3861, "step": 1507 }, { "epoch": 0.9506323627910642, "grad_norm": 0.1537475287914276, "learning_rate": 2.4612954706128387e-05, "loss": 0.4372, "step": 1508 }, { "epoch": 0.9512627556045861, "grad_norm": 0.15866872668266296, "learning_rate": 2.4590541722967872e-05, "loss": 0.3914, "step": 1509 }, { "epoch": 0.951893148418108, "grad_norm": 0.15313230454921722, "learning_rate": 2.4568126647144573e-05, "loss": 0.3593, "step": 1510 }, { "epoch": 0.95252354123163, "grad_norm": 0.14494845271110535, "learning_rate": 2.4545709501096288e-05, "loss": 0.3509, "step": 1511 }, { "epoch": 0.9531539340451519, "grad_norm": 0.1457335203886032, "learning_rate": 2.4523290307262925e-05, "loss": 0.4383, "step": 1512 }, { "epoch": 0.9537843268586738, "grad_norm": 0.15442660450935364, "learning_rate": 2.4500869088086415e-05, "loss": 0.369, "step": 1513 }, { "epoch": 0.9544147196721957, "grad_norm": 0.12074555456638336, "learning_rate": 2.4478445866010736e-05, "loss": 0.2969, "step": 1514 }, { "epoch": 0.9550451124857177, "grad_norm": 0.1371067464351654, "learning_rate": 2.4456020663481845e-05, "loss": 0.356, "step": 1515 }, { "epoch": 0.9556755052992396, "grad_norm": 0.19766010344028473, "learning_rate": 2.4433593502947717e-05, "loss": 0.4266, "step": 1516 }, { "epoch": 0.9563058981127616, "grad_norm": 0.11943485587835312, "learning_rate": 2.441116440685825e-05, "loss": 0.3104, "step": 1517 }, { "epoch": 0.9569362909262834, "grad_norm": 0.16624711453914642, "learning_rate": 2.438873339766531e-05, "loss": 0.4282, "step": 1518 }, { "epoch": 0.9575666837398054, "grad_norm": 0.12300334125757217, "learning_rate": 2.4366300497822646e-05, "loss": 0.3538, "step": 1519 }, { "epoch": 0.9581970765533273, "grad_norm": 0.17065255343914032, "learning_rate": 2.4343865729785945e-05, "loss": 0.4407, "step": 1520 }, { "epoch": 0.9588274693668493, "grad_norm": 0.16122186183929443, "learning_rate": 2.4321429116012716e-05, "loss": 0.4291, "step": 1521 }, { "epoch": 0.9594578621803711, "grad_norm": 0.12593595683574677, "learning_rate": 2.429899067896234e-05, "loss": 0.3771, "step": 1522 }, { "epoch": 0.960088254993893, "grad_norm": 0.1505429744720459, "learning_rate": 2.427655044109603e-05, "loss": 0.3829, "step": 1523 }, { "epoch": 0.960718647807415, "grad_norm": 0.1560903638601303, "learning_rate": 2.425410842487678e-05, "loss": 0.3833, "step": 1524 }, { "epoch": 0.961349040620937, "grad_norm": 0.14203353226184845, "learning_rate": 2.423166465276939e-05, "loss": 0.3857, "step": 1525 }, { "epoch": 0.9619794334344588, "grad_norm": 0.1508108377456665, "learning_rate": 2.4209219147240385e-05, "loss": 0.4051, "step": 1526 }, { "epoch": 0.9626098262479807, "grad_norm": 0.12329906225204468, "learning_rate": 2.4186771930758066e-05, "loss": 0.3441, "step": 1527 }, { "epoch": 0.9632402190615027, "grad_norm": 0.13662351667881012, "learning_rate": 2.4164323025792402e-05, "loss": 0.3676, "step": 1528 }, { "epoch": 0.9638706118750247, "grad_norm": 0.15648755431175232, "learning_rate": 2.4141872454815093e-05, "loss": 0.4035, "step": 1529 }, { "epoch": 0.9645010046885466, "grad_norm": 0.15462183952331543, "learning_rate": 2.4119420240299476e-05, "loss": 0.3807, "step": 1530 }, { "epoch": 0.9651313975020684, "grad_norm": 0.14754685759544373, "learning_rate": 2.4096966404720555e-05, "loss": 0.3369, "step": 1531 }, { "epoch": 0.9657617903155904, "grad_norm": 0.13474981486797333, "learning_rate": 2.407451097055494e-05, "loss": 0.3715, "step": 1532 }, { "epoch": 0.9663921831291123, "grad_norm": 0.19845913350582123, "learning_rate": 2.4052053960280847e-05, "loss": 0.4587, "step": 1533 }, { "epoch": 0.9670225759426343, "grad_norm": 0.1516077220439911, "learning_rate": 2.402959539637808e-05, "loss": 0.387, "step": 1534 }, { "epoch": 0.9676529687561561, "grad_norm": 0.160874143242836, "learning_rate": 2.4007135301327985e-05, "loss": 0.3907, "step": 1535 }, { "epoch": 0.9682833615696781, "grad_norm": 0.16115260124206543, "learning_rate": 2.3984673697613446e-05, "loss": 0.4068, "step": 1536 }, { "epoch": 0.9689137543832, "grad_norm": 0.11696796864271164, "learning_rate": 2.396221060771885e-05, "loss": 0.3385, "step": 1537 }, { "epoch": 0.969544147196722, "grad_norm": 0.19254839420318604, "learning_rate": 2.393974605413009e-05, "loss": 0.4503, "step": 1538 }, { "epoch": 0.9701745400102438, "grad_norm": 0.13219331204891205, "learning_rate": 2.39172800593345e-05, "loss": 0.3776, "step": 1539 }, { "epoch": 0.9708049328237658, "grad_norm": 0.138895183801651, "learning_rate": 2.3894812645820877e-05, "loss": 0.4329, "step": 1540 }, { "epoch": 0.9714353256372877, "grad_norm": 0.13477686047554016, "learning_rate": 2.387234383607943e-05, "loss": 0.3652, "step": 1541 }, { "epoch": 0.9720657184508097, "grad_norm": 0.15185682475566864, "learning_rate": 2.384987365260176e-05, "loss": 0.4455, "step": 1542 }, { "epoch": 0.9726961112643316, "grad_norm": 0.14270374178886414, "learning_rate": 2.3827402117880854e-05, "loss": 0.4097, "step": 1543 }, { "epoch": 0.9733265040778535, "grad_norm": 0.14587655663490295, "learning_rate": 2.380492925441104e-05, "loss": 0.3952, "step": 1544 }, { "epoch": 0.9739568968913754, "grad_norm": 0.13921095430850983, "learning_rate": 2.378245508468799e-05, "loss": 0.3444, "step": 1545 }, { "epoch": 0.9745872897048974, "grad_norm": 0.16872672736644745, "learning_rate": 2.375997963120867e-05, "loss": 0.4061, "step": 1546 }, { "epoch": 0.9752176825184193, "grad_norm": 0.14401352405548096, "learning_rate": 2.3737502916471346e-05, "loss": 0.4147, "step": 1547 }, { "epoch": 0.9758480753319412, "grad_norm": 0.16273754835128784, "learning_rate": 2.371502496297552e-05, "loss": 0.3681, "step": 1548 }, { "epoch": 0.9764784681454631, "grad_norm": 0.1970919370651245, "learning_rate": 2.3692545793221974e-05, "loss": 0.4568, "step": 1549 }, { "epoch": 0.9771088609589851, "grad_norm": 0.12945570051670074, "learning_rate": 2.3670065429712665e-05, "loss": 0.3762, "step": 1550 }, { "epoch": 0.977739253772507, "grad_norm": 0.1520601063966751, "learning_rate": 2.3647583894950787e-05, "loss": 0.3706, "step": 1551 }, { "epoch": 0.9783696465860289, "grad_norm": 0.14032632112503052, "learning_rate": 2.3625101211440665e-05, "loss": 0.3885, "step": 1552 }, { "epoch": 0.9790000393995508, "grad_norm": 0.1562827080488205, "learning_rate": 2.36026174016878e-05, "loss": 0.4199, "step": 1553 }, { "epoch": 0.9796304322130728, "grad_norm": 0.18005892634391785, "learning_rate": 2.358013248819882e-05, "loss": 0.4708, "step": 1554 }, { "epoch": 0.9802608250265947, "grad_norm": 0.17608360946178436, "learning_rate": 2.355764649348144e-05, "loss": 0.4308, "step": 1555 }, { "epoch": 0.9808912178401166, "grad_norm": 0.19211320579051971, "learning_rate": 2.353515944004448e-05, "loss": 0.5204, "step": 1556 }, { "epoch": 0.9815216106536385, "grad_norm": 0.12346751987934113, "learning_rate": 2.35126713503978e-05, "loss": 0.3678, "step": 1557 }, { "epoch": 0.9821520034671605, "grad_norm": 0.1775476336479187, "learning_rate": 2.3490182247052312e-05, "loss": 0.4032, "step": 1558 }, { "epoch": 0.9827823962806824, "grad_norm": 0.14146657288074493, "learning_rate": 2.3467692152519934e-05, "loss": 0.3351, "step": 1559 }, { "epoch": 0.9834127890942044, "grad_norm": 0.16345363855361938, "learning_rate": 2.3445201089313583e-05, "loss": 0.3726, "step": 1560 }, { "epoch": 0.9840431819077262, "grad_norm": 0.14180757105350494, "learning_rate": 2.342270907994714e-05, "loss": 0.346, "step": 1561 }, { "epoch": 0.9846735747212482, "grad_norm": 0.15017127990722656, "learning_rate": 2.3400216146935433e-05, "loss": 0.4195, "step": 1562 }, { "epoch": 0.9853039675347701, "grad_norm": 0.1315138190984726, "learning_rate": 2.337772231279422e-05, "loss": 0.3626, "step": 1563 }, { "epoch": 0.9859343603482921, "grad_norm": 0.1329270601272583, "learning_rate": 2.3355227600040146e-05, "loss": 0.3395, "step": 1564 }, { "epoch": 0.9865647531618139, "grad_norm": 0.14351126551628113, "learning_rate": 2.333273203119076e-05, "loss": 0.378, "step": 1565 }, { "epoch": 0.9871951459753359, "grad_norm": 0.14194701611995697, "learning_rate": 2.331023562876445e-05, "loss": 0.3862, "step": 1566 }, { "epoch": 0.9878255387888578, "grad_norm": 0.12102647870779037, "learning_rate": 2.328773841528045e-05, "loss": 0.3812, "step": 1567 }, { "epoch": 0.9884559316023798, "grad_norm": 0.18585304915905, "learning_rate": 2.3265240413258784e-05, "loss": 0.4972, "step": 1568 }, { "epoch": 0.9890863244159016, "grad_norm": 0.1484466940164566, "learning_rate": 2.3242741645220293e-05, "loss": 0.3705, "step": 1569 }, { "epoch": 0.9897167172294236, "grad_norm": 0.1592293083667755, "learning_rate": 2.322024213368657e-05, "loss": 0.4305, "step": 1570 }, { "epoch": 0.9903471100429455, "grad_norm": 0.15290561318397522, "learning_rate": 2.3197741901179957e-05, "loss": 0.4293, "step": 1571 }, { "epoch": 0.9909775028564675, "grad_norm": 0.14812232553958893, "learning_rate": 2.3175240970223506e-05, "loss": 0.421, "step": 1572 }, { "epoch": 0.9916078956699894, "grad_norm": 0.1472492218017578, "learning_rate": 2.315273936334099e-05, "loss": 0.4024, "step": 1573 }, { "epoch": 0.9922382884835113, "grad_norm": 0.12985913455486298, "learning_rate": 2.3130237103056834e-05, "loss": 0.3311, "step": 1574 }, { "epoch": 0.9928686812970332, "grad_norm": 0.16290757060050964, "learning_rate": 2.3107734211896132e-05, "loss": 0.4139, "step": 1575 }, { "epoch": 0.9934990741105552, "grad_norm": 0.14641249179840088, "learning_rate": 2.3085230712384618e-05, "loss": 0.4256, "step": 1576 }, { "epoch": 0.9941294669240771, "grad_norm": 0.1735769808292389, "learning_rate": 2.3062726627048607e-05, "loss": 0.4302, "step": 1577 }, { "epoch": 0.994759859737599, "grad_norm": 0.12449291348457336, "learning_rate": 2.3040221978415033e-05, "loss": 0.3884, "step": 1578 }, { "epoch": 0.9953902525511209, "grad_norm": 0.15413033962249756, "learning_rate": 2.301771678901136e-05, "loss": 0.4274, "step": 1579 }, { "epoch": 0.9960206453646429, "grad_norm": 0.14631548523902893, "learning_rate": 2.299521108136563e-05, "loss": 0.372, "step": 1580 }, { "epoch": 0.9966510381781648, "grad_norm": 0.17966602742671967, "learning_rate": 2.2972704878006366e-05, "loss": 0.4, "step": 1581 }, { "epoch": 0.9972814309916866, "grad_norm": 0.13735203444957733, "learning_rate": 2.2950198201462623e-05, "loss": 0.4069, "step": 1582 }, { "epoch": 0.9979118238052086, "grad_norm": 0.15251851081848145, "learning_rate": 2.292769107426391e-05, "loss": 0.4148, "step": 1583 }, { "epoch": 0.9985422166187305, "grad_norm": 0.14987708628177643, "learning_rate": 2.2905183518940175e-05, "loss": 0.4326, "step": 1584 }, { "epoch": 0.9991726094322525, "grad_norm": 0.14043983817100525, "learning_rate": 2.2882675558021835e-05, "loss": 0.3963, "step": 1585 }, { "epoch": 0.9998030022457745, "grad_norm": 0.1419934183359146, "learning_rate": 2.286016721403966e-05, "loss": 0.3756, "step": 1586 }, { "epoch": 1.0, "grad_norm": 0.2177281379699707, "learning_rate": 2.2837658509524855e-05, "loss": 0.4348, "step": 1587 }, { "epoch": 1.0006303928135218, "grad_norm": 0.13241924345493317, "learning_rate": 2.281514946700894e-05, "loss": 0.3234, "step": 1588 }, { "epoch": 1.001260785627044, "grad_norm": 0.14077149331569672, "learning_rate": 2.279264010902382e-05, "loss": 0.3567, "step": 1589 }, { "epoch": 1.0018911784405657, "grad_norm": 0.1576465517282486, "learning_rate": 2.2770130458101662e-05, "loss": 0.3527, "step": 1590 }, { "epoch": 1.0025215712540878, "grad_norm": 0.15964816510677338, "learning_rate": 2.2747620536774984e-05, "loss": 0.4073, "step": 1591 }, { "epoch": 1.0031519640676096, "grad_norm": 0.15599475800991058, "learning_rate": 2.2725110367576528e-05, "loss": 0.343, "step": 1592 }, { "epoch": 1.0037823568811315, "grad_norm": 0.13730469346046448, "learning_rate": 2.2702599973039306e-05, "loss": 0.3627, "step": 1593 }, { "epoch": 1.0044127496946536, "grad_norm": 0.1274457722902298, "learning_rate": 2.2680089375696554e-05, "loss": 0.3106, "step": 1594 }, { "epoch": 1.0050431425081754, "grad_norm": 0.1535639464855194, "learning_rate": 2.26575785980817e-05, "loss": 0.3128, "step": 1595 }, { "epoch": 1.0056735353216972, "grad_norm": 0.13755956292152405, "learning_rate": 2.2635067662728382e-05, "loss": 0.3614, "step": 1596 }, { "epoch": 1.0063039281352193, "grad_norm": 0.14729587733745575, "learning_rate": 2.261255659217035e-05, "loss": 0.3973, "step": 1597 }, { "epoch": 1.0069343209487411, "grad_norm": 0.13851574063301086, "learning_rate": 2.259004540894153e-05, "loss": 0.3385, "step": 1598 }, { "epoch": 1.0075647137622632, "grad_norm": 0.19435077905654907, "learning_rate": 2.256753413557594e-05, "loss": 0.428, "step": 1599 }, { "epoch": 1.008195106575785, "grad_norm": 0.1472771167755127, "learning_rate": 2.254502279460769e-05, "loss": 0.351, "step": 1600 }, { "epoch": 1.008195106575785, "eval_loss": 0.4261360764503479, "eval_runtime": 222.4194, "eval_samples_per_second": 4.496, "eval_steps_per_second": 4.496, "step": 1600 }, { "epoch": 1.0088254993893069, "grad_norm": 0.15507906675338745, "learning_rate": 2.252251140857097e-05, "loss": 0.298, "step": 1601 }, { "epoch": 1.009455892202829, "grad_norm": 0.15746253728866577, "learning_rate": 2.25e-05, "loss": 0.3894, "step": 1602 }, { "epoch": 1.0100862850163508, "grad_norm": 0.14388109743595123, "learning_rate": 2.2477488591429033e-05, "loss": 0.3008, "step": 1603 }, { "epoch": 1.0107166778298728, "grad_norm": 0.1770758479833603, "learning_rate": 2.245497720539231e-05, "loss": 0.3707, "step": 1604 }, { "epoch": 1.0113470706433947, "grad_norm": 0.15947437286376953, "learning_rate": 2.2432465864424062e-05, "loss": 0.3468, "step": 1605 }, { "epoch": 1.0119774634569165, "grad_norm": 0.14842569828033447, "learning_rate": 2.2409954591058474e-05, "loss": 0.2939, "step": 1606 }, { "epoch": 1.0126078562704386, "grad_norm": 0.1525873839855194, "learning_rate": 2.238744340782965e-05, "loss": 0.3332, "step": 1607 }, { "epoch": 1.0132382490839604, "grad_norm": 0.14072124660015106, "learning_rate": 2.2364932337271627e-05, "loss": 0.3691, "step": 1608 }, { "epoch": 1.0138686418974823, "grad_norm": 0.14068391919136047, "learning_rate": 2.2342421401918298e-05, "loss": 0.3615, "step": 1609 }, { "epoch": 1.0144990347110043, "grad_norm": 0.15972381830215454, "learning_rate": 2.2319910624303452e-05, "loss": 0.2993, "step": 1610 }, { "epoch": 1.0151294275245262, "grad_norm": 0.16424645483493805, "learning_rate": 2.2297400026960697e-05, "loss": 0.3269, "step": 1611 }, { "epoch": 1.0157598203380482, "grad_norm": 0.14915169775485992, "learning_rate": 2.2274889632423475e-05, "loss": 0.385, "step": 1612 }, { "epoch": 1.01639021315157, "grad_norm": 0.1401686817407608, "learning_rate": 2.225237946322502e-05, "loss": 0.3614, "step": 1613 }, { "epoch": 1.017020605965092, "grad_norm": 0.16717317700386047, "learning_rate": 2.222986954189834e-05, "loss": 0.3794, "step": 1614 }, { "epoch": 1.017650998778614, "grad_norm": 0.16320808231830597, "learning_rate": 2.2207359890976184e-05, "loss": 0.4097, "step": 1615 }, { "epoch": 1.0182813915921358, "grad_norm": 0.1880963146686554, "learning_rate": 2.2184850532991065e-05, "loss": 0.3861, "step": 1616 }, { "epoch": 1.0189117844056579, "grad_norm": 0.16419583559036255, "learning_rate": 2.2162341490475148e-05, "loss": 0.3844, "step": 1617 }, { "epoch": 1.0195421772191797, "grad_norm": 0.1421871930360794, "learning_rate": 2.2139832785960342e-05, "loss": 0.354, "step": 1618 }, { "epoch": 1.0201725700327016, "grad_norm": 0.1516357809305191, "learning_rate": 2.2117324441978175e-05, "loss": 0.3302, "step": 1619 }, { "epoch": 1.0208029628462236, "grad_norm": 0.1635531634092331, "learning_rate": 2.2094816481059827e-05, "loss": 0.3938, "step": 1620 }, { "epoch": 1.0214333556597455, "grad_norm": 0.1413624882698059, "learning_rate": 2.20723089257361e-05, "loss": 0.3953, "step": 1621 }, { "epoch": 1.0220637484732673, "grad_norm": 0.18992073833942413, "learning_rate": 2.204980179853738e-05, "loss": 0.3696, "step": 1622 }, { "epoch": 1.0226941412867894, "grad_norm": 0.1756129264831543, "learning_rate": 2.2027295121993637e-05, "loss": 0.3644, "step": 1623 }, { "epoch": 1.0233245341003112, "grad_norm": 0.18942801654338837, "learning_rate": 2.200478891863438e-05, "loss": 0.3855, "step": 1624 }, { "epoch": 1.0239549269138333, "grad_norm": 0.15456733107566833, "learning_rate": 2.198228321098864e-05, "loss": 0.3558, "step": 1625 }, { "epoch": 1.0245853197273551, "grad_norm": 0.16767869889736176, "learning_rate": 2.1959778021584977e-05, "loss": 0.3341, "step": 1626 }, { "epoch": 1.025215712540877, "grad_norm": 0.1457701325416565, "learning_rate": 2.1937273372951392e-05, "loss": 0.3264, "step": 1627 }, { "epoch": 1.025846105354399, "grad_norm": 0.2074301540851593, "learning_rate": 2.1914769287615388e-05, "loss": 0.3717, "step": 1628 }, { "epoch": 1.0264764981679209, "grad_norm": 0.17067231237888336, "learning_rate": 2.1892265788103867e-05, "loss": 0.3604, "step": 1629 }, { "epoch": 1.0271068909814427, "grad_norm": 0.14882232248783112, "learning_rate": 2.1869762896943172e-05, "loss": 0.3606, "step": 1630 }, { "epoch": 1.0277372837949648, "grad_norm": 0.20086149871349335, "learning_rate": 2.1847260636659014e-05, "loss": 0.4098, "step": 1631 }, { "epoch": 1.0283676766084866, "grad_norm": 0.16884002089500427, "learning_rate": 2.1824759029776497e-05, "loss": 0.3098, "step": 1632 }, { "epoch": 1.0289980694220087, "grad_norm": 0.13446485996246338, "learning_rate": 2.1802258098820045e-05, "loss": 0.2947, "step": 1633 }, { "epoch": 1.0296284622355305, "grad_norm": 0.18780672550201416, "learning_rate": 2.1779757866313433e-05, "loss": 0.3743, "step": 1634 }, { "epoch": 1.0302588550490523, "grad_norm": 0.16584482789039612, "learning_rate": 2.1757258354779704e-05, "loss": 0.3698, "step": 1635 }, { "epoch": 1.0308892478625744, "grad_norm": 0.1275024265050888, "learning_rate": 2.1734759586741222e-05, "loss": 0.3158, "step": 1636 }, { "epoch": 1.0315196406760962, "grad_norm": 0.15358369052410126, "learning_rate": 2.171226158471955e-05, "loss": 0.3502, "step": 1637 }, { "epoch": 1.0321500334896183, "grad_norm": 0.13834697008132935, "learning_rate": 2.1689764371235556e-05, "loss": 0.3342, "step": 1638 }, { "epoch": 1.0327804263031402, "grad_norm": 0.16860130429267883, "learning_rate": 2.1667267968809242e-05, "loss": 0.3234, "step": 1639 }, { "epoch": 1.033410819116662, "grad_norm": 0.17650333046913147, "learning_rate": 2.164477239995986e-05, "loss": 0.3425, "step": 1640 }, { "epoch": 1.034041211930184, "grad_norm": 0.1395651251077652, "learning_rate": 2.162227768720579e-05, "loss": 0.303, "step": 1641 }, { "epoch": 1.034671604743706, "grad_norm": 0.1621188074350357, "learning_rate": 2.1599783853064574e-05, "loss": 0.3546, "step": 1642 }, { "epoch": 1.035301997557228, "grad_norm": 0.15481868386268616, "learning_rate": 2.1577290920052863e-05, "loss": 0.3367, "step": 1643 }, { "epoch": 1.0359323903707498, "grad_norm": 0.1645256131887436, "learning_rate": 2.155479891068642e-05, "loss": 0.3282, "step": 1644 }, { "epoch": 1.0365627831842716, "grad_norm": 0.1496310532093048, "learning_rate": 2.153230784748007e-05, "loss": 0.284, "step": 1645 }, { "epoch": 1.0371931759977937, "grad_norm": 0.21159502863883972, "learning_rate": 2.150981775294769e-05, "loss": 0.3727, "step": 1646 }, { "epoch": 1.0378235688113155, "grad_norm": 0.16791081428527832, "learning_rate": 2.14873286496022e-05, "loss": 0.3644, "step": 1647 }, { "epoch": 1.0384539616248374, "grad_norm": 0.19133862853050232, "learning_rate": 2.1464840559955525e-05, "loss": 0.3774, "step": 1648 }, { "epoch": 1.0390843544383594, "grad_norm": 0.1551189422607422, "learning_rate": 2.144235350651856e-05, "loss": 0.4179, "step": 1649 }, { "epoch": 1.0397147472518813, "grad_norm": 0.14688484370708466, "learning_rate": 2.1419867511801187e-05, "loss": 0.3788, "step": 1650 }, { "epoch": 1.0403451400654034, "grad_norm": 0.16884516179561615, "learning_rate": 2.13973825983122e-05, "loss": 0.362, "step": 1651 }, { "epoch": 1.0409755328789252, "grad_norm": 0.15721917152404785, "learning_rate": 2.1374898788559338e-05, "loss": 0.3686, "step": 1652 }, { "epoch": 1.041605925692447, "grad_norm": 0.18979185819625854, "learning_rate": 2.1352416105049215e-05, "loss": 0.3916, "step": 1653 }, { "epoch": 1.042236318505969, "grad_norm": 0.13208013772964478, "learning_rate": 2.1329934570287338e-05, "loss": 0.3572, "step": 1654 }, { "epoch": 1.042866711319491, "grad_norm": 0.12008725106716156, "learning_rate": 2.130745420677803e-05, "loss": 0.3206, "step": 1655 }, { "epoch": 1.0434971041330128, "grad_norm": 0.1520049124956131, "learning_rate": 2.1284975037024486e-05, "loss": 0.3464, "step": 1656 }, { "epoch": 1.0441274969465348, "grad_norm": 0.18334481120109558, "learning_rate": 2.1262497083528657e-05, "loss": 0.2915, "step": 1657 }, { "epoch": 1.0447578897600567, "grad_norm": 0.14017915725708008, "learning_rate": 2.1240020368791336e-05, "loss": 0.316, "step": 1658 }, { "epoch": 1.0453882825735787, "grad_norm": 0.16284134984016418, "learning_rate": 2.1217544915312014e-05, "loss": 0.3782, "step": 1659 }, { "epoch": 1.0460186753871006, "grad_norm": 0.13631150126457214, "learning_rate": 2.1195070745588966e-05, "loss": 0.3128, "step": 1660 }, { "epoch": 1.0466490682006224, "grad_norm": 0.17620840668678284, "learning_rate": 2.1172597882119152e-05, "loss": 0.3419, "step": 1661 }, { "epoch": 1.0472794610141445, "grad_norm": 0.15279735624790192, "learning_rate": 2.1150126347398248e-05, "loss": 0.3849, "step": 1662 }, { "epoch": 1.0479098538276663, "grad_norm": 0.1570388674736023, "learning_rate": 2.1127656163920575e-05, "loss": 0.2943, "step": 1663 }, { "epoch": 1.0485402466411884, "grad_norm": 0.16196565330028534, "learning_rate": 2.1105187354179126e-05, "loss": 0.3545, "step": 1664 }, { "epoch": 1.0491706394547102, "grad_norm": 0.16797305643558502, "learning_rate": 2.10827199406655e-05, "loss": 0.3147, "step": 1665 }, { "epoch": 1.049801032268232, "grad_norm": 0.16055622696876526, "learning_rate": 2.1060253945869915e-05, "loss": 0.3912, "step": 1666 }, { "epoch": 1.0504314250817541, "grad_norm": 0.14581353962421417, "learning_rate": 2.103778939228115e-05, "loss": 0.3271, "step": 1667 }, { "epoch": 1.051061817895276, "grad_norm": 0.16159775853157043, "learning_rate": 2.101532630238656e-05, "loss": 0.3505, "step": 1668 }, { "epoch": 1.0516922107087978, "grad_norm": 0.15286363661289215, "learning_rate": 2.0992864698672018e-05, "loss": 0.3272, "step": 1669 }, { "epoch": 1.0523226035223199, "grad_norm": 0.14952370524406433, "learning_rate": 2.0970404603621925e-05, "loss": 0.3497, "step": 1670 }, { "epoch": 1.0529529963358417, "grad_norm": 0.15633340179920197, "learning_rate": 2.0947946039719153e-05, "loss": 0.3394, "step": 1671 }, { "epoch": 1.0535833891493638, "grad_norm": 0.17084918916225433, "learning_rate": 2.0925489029445066e-05, "loss": 0.3367, "step": 1672 }, { "epoch": 1.0542137819628856, "grad_norm": 0.15072505176067352, "learning_rate": 2.0903033595279448e-05, "loss": 0.3312, "step": 1673 }, { "epoch": 1.0548441747764075, "grad_norm": 0.18470540642738342, "learning_rate": 2.0880579759700534e-05, "loss": 0.361, "step": 1674 }, { "epoch": 1.0554745675899295, "grad_norm": 0.1253766268491745, "learning_rate": 2.085812754518491e-05, "loss": 0.2552, "step": 1675 }, { "epoch": 1.0561049604034514, "grad_norm": 0.15152914822101593, "learning_rate": 2.0835676974207604e-05, "loss": 0.3833, "step": 1676 }, { "epoch": 1.0567353532169734, "grad_norm": 0.15731677412986755, "learning_rate": 2.0813228069241937e-05, "loss": 0.331, "step": 1677 }, { "epoch": 1.0573657460304953, "grad_norm": 0.17662854492664337, "learning_rate": 2.0790780852759618e-05, "loss": 0.4095, "step": 1678 }, { "epoch": 1.057996138844017, "grad_norm": 0.142434760928154, "learning_rate": 2.076833534723061e-05, "loss": 0.3207, "step": 1679 }, { "epoch": 1.0586265316575392, "grad_norm": 0.16224971413612366, "learning_rate": 2.0745891575123224e-05, "loss": 0.3565, "step": 1680 }, { "epoch": 1.059256924471061, "grad_norm": 0.16012756526470184, "learning_rate": 2.0723449558903976e-05, "loss": 0.3237, "step": 1681 }, { "epoch": 1.0598873172845829, "grad_norm": 0.15715405344963074, "learning_rate": 2.0701009321037666e-05, "loss": 0.3296, "step": 1682 }, { "epoch": 1.060517710098105, "grad_norm": 0.17331728339195251, "learning_rate": 2.067857088398729e-05, "loss": 0.3466, "step": 1683 }, { "epoch": 1.0611481029116268, "grad_norm": 0.14372803270816803, "learning_rate": 2.0656134270214064e-05, "loss": 0.3718, "step": 1684 }, { "epoch": 1.0617784957251488, "grad_norm": 0.18391430377960205, "learning_rate": 2.0633699502177353e-05, "loss": 0.4057, "step": 1685 }, { "epoch": 1.0624088885386707, "grad_norm": 0.15161548554897308, "learning_rate": 2.0611266602334698e-05, "loss": 0.3942, "step": 1686 }, { "epoch": 1.0630392813521925, "grad_norm": 0.21158887445926666, "learning_rate": 2.058883559314175e-05, "loss": 0.4186, "step": 1687 }, { "epoch": 1.0636696741657146, "grad_norm": 0.14235830307006836, "learning_rate": 2.0566406497052286e-05, "loss": 0.3225, "step": 1688 }, { "epoch": 1.0643000669792364, "grad_norm": 0.14076681435108185, "learning_rate": 2.054397933651815e-05, "loss": 0.3632, "step": 1689 }, { "epoch": 1.0649304597927585, "grad_norm": 0.1497487723827362, "learning_rate": 2.052155413398927e-05, "loss": 0.3167, "step": 1690 }, { "epoch": 1.0655608526062803, "grad_norm": 0.16239683330059052, "learning_rate": 2.0499130911913584e-05, "loss": 0.3964, "step": 1691 }, { "epoch": 1.0661912454198021, "grad_norm": 0.1456744372844696, "learning_rate": 2.0476709692737078e-05, "loss": 0.3236, "step": 1692 }, { "epoch": 1.0668216382333242, "grad_norm": 0.1592414826154709, "learning_rate": 2.0454290498903715e-05, "loss": 0.3432, "step": 1693 }, { "epoch": 1.067452031046846, "grad_norm": 0.13664771616458893, "learning_rate": 2.0431873352855436e-05, "loss": 0.357, "step": 1694 }, { "epoch": 1.068082423860368, "grad_norm": 0.14796030521392822, "learning_rate": 2.0409458277032127e-05, "loss": 0.3883, "step": 1695 }, { "epoch": 1.06871281667389, "grad_norm": 0.14164042472839355, "learning_rate": 2.038704529387162e-05, "loss": 0.3135, "step": 1696 }, { "epoch": 1.0693432094874118, "grad_norm": 0.15346303582191467, "learning_rate": 2.0364634425809613e-05, "loss": 0.3435, "step": 1697 }, { "epoch": 1.0699736023009339, "grad_norm": 0.1760052591562271, "learning_rate": 2.034222569527975e-05, "loss": 0.4371, "step": 1698 }, { "epoch": 1.0706039951144557, "grad_norm": 0.1785290688276291, "learning_rate": 2.0319819124713452e-05, "loss": 0.4007, "step": 1699 }, { "epoch": 1.0712343879279775, "grad_norm": 0.13086946308612823, "learning_rate": 2.0297414736540066e-05, "loss": 0.288, "step": 1700 }, { "epoch": 1.0718647807414996, "grad_norm": 0.18402013182640076, "learning_rate": 2.0275012553186682e-05, "loss": 0.3993, "step": 1701 }, { "epoch": 1.0724951735550214, "grad_norm": 0.14493998885154724, "learning_rate": 2.0252612597078227e-05, "loss": 0.3638, "step": 1702 }, { "epoch": 1.0731255663685433, "grad_norm": 0.17171445488929749, "learning_rate": 2.0230214890637377e-05, "loss": 0.4127, "step": 1703 }, { "epoch": 1.0737559591820653, "grad_norm": 0.16459907591342926, "learning_rate": 2.0207819456284572e-05, "loss": 0.2843, "step": 1704 }, { "epoch": 1.0743863519955872, "grad_norm": 0.16528962552547455, "learning_rate": 2.0185426316437956e-05, "loss": 0.3822, "step": 1705 }, { "epoch": 1.0750167448091092, "grad_norm": 0.14452378451824188, "learning_rate": 2.0163035493513405e-05, "loss": 0.301, "step": 1706 }, { "epoch": 1.075647137622631, "grad_norm": 0.1770389974117279, "learning_rate": 2.0140647009924445e-05, "loss": 0.3514, "step": 1707 }, { "epoch": 1.076277530436153, "grad_norm": 0.15465545654296875, "learning_rate": 2.0118260888082286e-05, "loss": 0.327, "step": 1708 }, { "epoch": 1.076907923249675, "grad_norm": 0.12285317480564117, "learning_rate": 2.0095877150395754e-05, "loss": 0.2968, "step": 1709 }, { "epoch": 1.0775383160631968, "grad_norm": 0.13278360664844513, "learning_rate": 2.0073495819271303e-05, "loss": 0.3318, "step": 1710 }, { "epoch": 1.078168708876719, "grad_norm": 0.16636794805526733, "learning_rate": 2.005111691711297e-05, "loss": 0.4132, "step": 1711 }, { "epoch": 1.0787991016902407, "grad_norm": 0.17786742746829987, "learning_rate": 2.0028740466322367e-05, "loss": 0.3533, "step": 1712 }, { "epoch": 1.0794294945037626, "grad_norm": 0.17079973220825195, "learning_rate": 2.0006366489298637e-05, "loss": 0.3396, "step": 1713 }, { "epoch": 1.0800598873172846, "grad_norm": 0.17479632794857025, "learning_rate": 1.9983995008438474e-05, "loss": 0.3884, "step": 1714 }, { "epoch": 1.0806902801308065, "grad_norm": 0.1269015371799469, "learning_rate": 1.996162604613604e-05, "loss": 0.3791, "step": 1715 }, { "epoch": 1.0813206729443285, "grad_norm": 0.15591496229171753, "learning_rate": 1.9939259624783013e-05, "loss": 0.3828, "step": 1716 }, { "epoch": 1.0819510657578504, "grad_norm": 0.13570600748062134, "learning_rate": 1.991689576676848e-05, "loss": 0.3429, "step": 1717 }, { "epoch": 1.0825814585713722, "grad_norm": 0.1412353813648224, "learning_rate": 1.989453449447902e-05, "loss": 0.3769, "step": 1718 }, { "epoch": 1.0832118513848943, "grad_norm": 0.1421315222978592, "learning_rate": 1.9872175830298565e-05, "loss": 0.3805, "step": 1719 }, { "epoch": 1.0838422441984161, "grad_norm": 0.14278772473335266, "learning_rate": 1.984981979660849e-05, "loss": 0.3063, "step": 1720 }, { "epoch": 1.084472637011938, "grad_norm": 0.16554735600948334, "learning_rate": 1.982746641578749e-05, "loss": 0.3635, "step": 1721 }, { "epoch": 1.08510302982546, "grad_norm": 0.14530974626541138, "learning_rate": 1.980511571021164e-05, "loss": 0.3411, "step": 1722 }, { "epoch": 1.0857334226389819, "grad_norm": 0.12940828502178192, "learning_rate": 1.978276770225431e-05, "loss": 0.3391, "step": 1723 }, { "epoch": 1.086363815452504, "grad_norm": 0.16847068071365356, "learning_rate": 1.9760422414286198e-05, "loss": 0.3779, "step": 1724 }, { "epoch": 1.0869942082660258, "grad_norm": 0.15065135061740875, "learning_rate": 1.9738079868675245e-05, "loss": 0.3144, "step": 1725 }, { "epoch": 1.0876246010795476, "grad_norm": 0.16651591658592224, "learning_rate": 1.9715740087786682e-05, "loss": 0.3138, "step": 1726 }, { "epoch": 1.0882549938930697, "grad_norm": 0.16407909989356995, "learning_rate": 1.9693403093982942e-05, "loss": 0.3587, "step": 1727 }, { "epoch": 1.0888853867065915, "grad_norm": 0.18480424582958221, "learning_rate": 1.96710689096237e-05, "loss": 0.3492, "step": 1728 }, { "epoch": 1.0895157795201134, "grad_norm": 0.12064468115568161, "learning_rate": 1.9648737557065778e-05, "loss": 0.2683, "step": 1729 }, { "epoch": 1.0901461723336354, "grad_norm": 0.1776457130908966, "learning_rate": 1.962640905866321e-05, "loss": 0.3334, "step": 1730 }, { "epoch": 1.0907765651471573, "grad_norm": 0.18231508135795593, "learning_rate": 1.9604083436767123e-05, "loss": 0.3286, "step": 1731 }, { "epoch": 1.0914069579606793, "grad_norm": 0.17567265033721924, "learning_rate": 1.9581760713725814e-05, "loss": 0.3887, "step": 1732 }, { "epoch": 1.0920373507742012, "grad_norm": 0.14002569019794464, "learning_rate": 1.9559440911884643e-05, "loss": 0.3509, "step": 1733 }, { "epoch": 1.092667743587723, "grad_norm": 0.16191992163658142, "learning_rate": 1.953712405358606e-05, "loss": 0.3311, "step": 1734 }, { "epoch": 1.093298136401245, "grad_norm": 0.16020630300045013, "learning_rate": 1.9514810161169562e-05, "loss": 0.3539, "step": 1735 }, { "epoch": 1.093928529214767, "grad_norm": 0.16078157722949982, "learning_rate": 1.9492499256971698e-05, "loss": 0.3259, "step": 1736 }, { "epoch": 1.094558922028289, "grad_norm": 0.1885467916727066, "learning_rate": 1.9470191363325974e-05, "loss": 0.419, "step": 1737 }, { "epoch": 1.0951893148418108, "grad_norm": 0.14935602247714996, "learning_rate": 1.944788650256296e-05, "loss": 0.2964, "step": 1738 }, { "epoch": 1.0958197076553327, "grad_norm": 0.15341497957706451, "learning_rate": 1.942558469701011e-05, "loss": 0.303, "step": 1739 }, { "epoch": 1.0964501004688547, "grad_norm": 0.14188827574253082, "learning_rate": 1.940328596899189e-05, "loss": 0.302, "step": 1740 }, { "epoch": 1.0970804932823766, "grad_norm": 0.14294935762882233, "learning_rate": 1.9380990340829614e-05, "loss": 0.3174, "step": 1741 }, { "epoch": 1.0977108860958986, "grad_norm": 0.17663875222206116, "learning_rate": 1.935869783484157e-05, "loss": 0.3676, "step": 1742 }, { "epoch": 1.0983412789094205, "grad_norm": 0.15765130519866943, "learning_rate": 1.933640847334286e-05, "loss": 0.34, "step": 1743 }, { "epoch": 1.0989716717229423, "grad_norm": 0.15313151478767395, "learning_rate": 1.931412227864547e-05, "loss": 0.3366, "step": 1744 }, { "epoch": 1.0996020645364644, "grad_norm": 0.19496582448482513, "learning_rate": 1.929183927305819e-05, "loss": 0.4336, "step": 1745 }, { "epoch": 1.1002324573499862, "grad_norm": 0.17884379625320435, "learning_rate": 1.926955947888666e-05, "loss": 0.3065, "step": 1746 }, { "epoch": 1.100862850163508, "grad_norm": 0.19162435829639435, "learning_rate": 1.9247282918433253e-05, "loss": 0.3758, "step": 1747 }, { "epoch": 1.10149324297703, "grad_norm": 0.217264324426651, "learning_rate": 1.9225009613997152e-05, "loss": 0.3557, "step": 1748 }, { "epoch": 1.102123635790552, "grad_norm": 0.15280327200889587, "learning_rate": 1.9202739587874247e-05, "loss": 0.3406, "step": 1749 }, { "epoch": 1.102754028604074, "grad_norm": 0.16503563523292542, "learning_rate": 1.9180472862357166e-05, "loss": 0.3315, "step": 1750 }, { "epoch": 1.1033844214175959, "grad_norm": 0.180845245718956, "learning_rate": 1.915820945973522e-05, "loss": 0.3854, "step": 1751 }, { "epoch": 1.1040148142311177, "grad_norm": 0.21107171475887299, "learning_rate": 1.913594940229441e-05, "loss": 0.406, "step": 1752 }, { "epoch": 1.1046452070446398, "grad_norm": 0.1476421058177948, "learning_rate": 1.911369271231737e-05, "loss": 0.3955, "step": 1753 }, { "epoch": 1.1052755998581616, "grad_norm": 0.1788562536239624, "learning_rate": 1.9091439412083374e-05, "loss": 0.3686, "step": 1754 }, { "epoch": 1.1059059926716834, "grad_norm": 0.11860582232475281, "learning_rate": 1.90691895238683e-05, "loss": 0.2905, "step": 1755 }, { "epoch": 1.1065363854852055, "grad_norm": 0.1615850329399109, "learning_rate": 1.9046943069944623e-05, "loss": 0.365, "step": 1756 }, { "epoch": 1.1071667782987273, "grad_norm": 0.1617215871810913, "learning_rate": 1.9024700072581343e-05, "loss": 0.3549, "step": 1757 }, { "epoch": 1.1077971711122494, "grad_norm": 0.1754276603460312, "learning_rate": 1.9002460554044048e-05, "loss": 0.3082, "step": 1758 }, { "epoch": 1.1084275639257712, "grad_norm": 0.17688316106796265, "learning_rate": 1.8980224536594804e-05, "loss": 0.3494, "step": 1759 }, { "epoch": 1.109057956739293, "grad_norm": 0.14567002654075623, "learning_rate": 1.895799204249221e-05, "loss": 0.2989, "step": 1760 }, { "epoch": 1.1096883495528151, "grad_norm": 0.1490413397550583, "learning_rate": 1.893576309399129e-05, "loss": 0.3265, "step": 1761 }, { "epoch": 1.110318742366337, "grad_norm": 0.17052440345287323, "learning_rate": 1.8913537713343575e-05, "loss": 0.3928, "step": 1762 }, { "epoch": 1.110949135179859, "grad_norm": 0.17583999037742615, "learning_rate": 1.8891315922796972e-05, "loss": 0.3459, "step": 1763 }, { "epoch": 1.111579527993381, "grad_norm": 0.19256591796875, "learning_rate": 1.8869097744595832e-05, "loss": 0.3684, "step": 1764 }, { "epoch": 1.1122099208069027, "grad_norm": 0.13533738255500793, "learning_rate": 1.8846883200980865e-05, "loss": 0.3343, "step": 1765 }, { "epoch": 1.1128403136204248, "grad_norm": 0.13766324520111084, "learning_rate": 1.8824672314189165e-05, "loss": 0.324, "step": 1766 }, { "epoch": 1.1134707064339466, "grad_norm": 0.1536354422569275, "learning_rate": 1.8802465106454145e-05, "loss": 0.326, "step": 1767 }, { "epoch": 1.1141010992474687, "grad_norm": 0.17327311635017395, "learning_rate": 1.878026160000555e-05, "loss": 0.3406, "step": 1768 }, { "epoch": 1.1147314920609905, "grad_norm": 0.1594127118587494, "learning_rate": 1.875806181706941e-05, "loss": 0.3698, "step": 1769 }, { "epoch": 1.1153618848745124, "grad_norm": 0.15584924817085266, "learning_rate": 1.8735865779868038e-05, "loss": 0.3275, "step": 1770 }, { "epoch": 1.1159922776880344, "grad_norm": 0.15024584531784058, "learning_rate": 1.871367351061998e-05, "loss": 0.3456, "step": 1771 }, { "epoch": 1.1166226705015563, "grad_norm": 0.14913159608840942, "learning_rate": 1.8691485031540035e-05, "loss": 0.3551, "step": 1772 }, { "epoch": 1.1172530633150781, "grad_norm": 0.18090124428272247, "learning_rate": 1.8669300364839185e-05, "loss": 0.3763, "step": 1773 }, { "epoch": 1.1178834561286002, "grad_norm": 0.14488834142684937, "learning_rate": 1.8647119532724616e-05, "loss": 0.354, "step": 1774 }, { "epoch": 1.118513848942122, "grad_norm": 0.1709679216146469, "learning_rate": 1.862494255739965e-05, "loss": 0.3878, "step": 1775 }, { "epoch": 1.119144241755644, "grad_norm": 0.12193349003791809, "learning_rate": 1.8602769461063773e-05, "loss": 0.3088, "step": 1776 }, { "epoch": 1.119774634569166, "grad_norm": 0.15026722848415375, "learning_rate": 1.8580600265912573e-05, "loss": 0.39, "step": 1777 }, { "epoch": 1.1204050273826878, "grad_norm": 0.14931859076023102, "learning_rate": 1.855843499413775e-05, "loss": 0.3017, "step": 1778 }, { "epoch": 1.1210354201962098, "grad_norm": 0.1615418940782547, "learning_rate": 1.8536273667927045e-05, "loss": 0.3286, "step": 1779 }, { "epoch": 1.1216658130097317, "grad_norm": 0.18689070641994476, "learning_rate": 1.8514116309464294e-05, "loss": 0.3529, "step": 1780 }, { "epoch": 1.1222962058232535, "grad_norm": 0.1535871922969818, "learning_rate": 1.8491962940929302e-05, "loss": 0.3603, "step": 1781 }, { "epoch": 1.1229265986367756, "grad_norm": 0.12987935543060303, "learning_rate": 1.8469813584497954e-05, "loss": 0.303, "step": 1782 }, { "epoch": 1.1235569914502974, "grad_norm": 0.1690494865179062, "learning_rate": 1.8447668262342047e-05, "loss": 0.3499, "step": 1783 }, { "epoch": 1.1241873842638195, "grad_norm": 0.1679484099149704, "learning_rate": 1.8425526996629394e-05, "loss": 0.3893, "step": 1784 }, { "epoch": 1.1248177770773413, "grad_norm": 0.19813470542430878, "learning_rate": 1.8403389809523707e-05, "loss": 0.3479, "step": 1785 }, { "epoch": 1.1254481698908632, "grad_norm": 0.17155030369758606, "learning_rate": 1.838125672318465e-05, "loss": 0.3111, "step": 1786 }, { "epoch": 1.1260785627043852, "grad_norm": 0.16126155853271484, "learning_rate": 1.8359127759767753e-05, "loss": 0.3233, "step": 1787 }, { "epoch": 1.126708955517907, "grad_norm": 0.18906235694885254, "learning_rate": 1.8337002941424442e-05, "loss": 0.4023, "step": 1788 }, { "epoch": 1.1273393483314291, "grad_norm": 0.1417372077703476, "learning_rate": 1.8314882290301973e-05, "loss": 0.2812, "step": 1789 }, { "epoch": 1.127969741144951, "grad_norm": 0.2051158845424652, "learning_rate": 1.829276582854345e-05, "loss": 0.3346, "step": 1790 }, { "epoch": 1.1286001339584728, "grad_norm": 0.1490676999092102, "learning_rate": 1.8270653578287766e-05, "loss": 0.3457, "step": 1791 }, { "epoch": 1.1292305267719949, "grad_norm": 0.1721116602420807, "learning_rate": 1.8248545561669614e-05, "loss": 0.3525, "step": 1792 }, { "epoch": 1.1298609195855167, "grad_norm": 0.15207171440124512, "learning_rate": 1.8226441800819434e-05, "loss": 0.3986, "step": 1793 }, { "epoch": 1.1304913123990388, "grad_norm": 0.15704981982707977, "learning_rate": 1.820434231786342e-05, "loss": 0.3752, "step": 1794 }, { "epoch": 1.1311217052125606, "grad_norm": 0.17935959994792938, "learning_rate": 1.8182247134923474e-05, "loss": 0.4284, "step": 1795 }, { "epoch": 1.1317520980260825, "grad_norm": 0.15869015455245972, "learning_rate": 1.81601562741172e-05, "loss": 0.3213, "step": 1796 }, { "epoch": 1.1323824908396045, "grad_norm": 0.1865074187517166, "learning_rate": 1.8138069757557866e-05, "loss": 0.3612, "step": 1797 }, { "epoch": 1.1330128836531264, "grad_norm": 0.153265580534935, "learning_rate": 1.811598760735441e-05, "loss": 0.3984, "step": 1798 }, { "epoch": 1.1336432764666482, "grad_norm": 0.1499812752008438, "learning_rate": 1.809390984561136e-05, "loss": 0.3472, "step": 1799 }, { "epoch": 1.1342736692801703, "grad_norm": 0.15162378549575806, "learning_rate": 1.807183649442891e-05, "loss": 0.384, "step": 1800 }, { "epoch": 1.1342736692801703, "eval_loss": 0.42443564534187317, "eval_runtime": 222.4133, "eval_samples_per_second": 4.496, "eval_steps_per_second": 4.496, "step": 1800 }, { "epoch": 1.134904062093692, "grad_norm": 0.2255977839231491, "learning_rate": 1.8049767575902784e-05, "loss": 0.4607, "step": 1801 }, { "epoch": 1.135534454907214, "grad_norm": 0.15398207306861877, "learning_rate": 1.802770311212431e-05, "loss": 0.3314, "step": 1802 }, { "epoch": 1.136164847720736, "grad_norm": 0.1576896458864212, "learning_rate": 1.8005643125180323e-05, "loss": 0.3493, "step": 1803 }, { "epoch": 1.1367952405342578, "grad_norm": 0.17448103427886963, "learning_rate": 1.7983587637153202e-05, "loss": 0.3861, "step": 1804 }, { "epoch": 1.13742563334778, "grad_norm": 0.19642406702041626, "learning_rate": 1.79615366701208e-05, "loss": 0.3913, "step": 1805 }, { "epoch": 1.1380560261613017, "grad_norm": 0.1716988980770111, "learning_rate": 1.7939490246156474e-05, "loss": 0.3679, "step": 1806 }, { "epoch": 1.1386864189748236, "grad_norm": 0.1577298492193222, "learning_rate": 1.7917448387329e-05, "loss": 0.3465, "step": 1807 }, { "epoch": 1.1393168117883457, "grad_norm": 0.16155940294265747, "learning_rate": 1.789541111570262e-05, "loss": 0.3487, "step": 1808 }, { "epoch": 1.1399472046018675, "grad_norm": 0.16611319780349731, "learning_rate": 1.787337845333694e-05, "loss": 0.3402, "step": 1809 }, { "epoch": 1.1405775974153896, "grad_norm": 0.17726682126522064, "learning_rate": 1.7851350422286998e-05, "loss": 0.3009, "step": 1810 }, { "epoch": 1.1412079902289114, "grad_norm": 0.17919518053531647, "learning_rate": 1.7829327044603162e-05, "loss": 0.3766, "step": 1811 }, { "epoch": 1.1418383830424332, "grad_norm": 0.15726324915885925, "learning_rate": 1.7807308342331164e-05, "loss": 0.336, "step": 1812 }, { "epoch": 1.1424687758559553, "grad_norm": 0.18717758357524872, "learning_rate": 1.7785294337512036e-05, "loss": 0.3744, "step": 1813 }, { "epoch": 1.1430991686694771, "grad_norm": 0.1685945689678192, "learning_rate": 1.776328505218213e-05, "loss": 0.3486, "step": 1814 }, { "epoch": 1.1437295614829992, "grad_norm": 0.17545783519744873, "learning_rate": 1.7741280508373052e-05, "loss": 0.3274, "step": 1815 }, { "epoch": 1.144359954296521, "grad_norm": 0.1535610854625702, "learning_rate": 1.771928072811168e-05, "loss": 0.3411, "step": 1816 }, { "epoch": 1.1449903471100429, "grad_norm": 0.15853679180145264, "learning_rate": 1.7697285733420106e-05, "loss": 0.3439, "step": 1817 }, { "epoch": 1.145620739923565, "grad_norm": 0.199609637260437, "learning_rate": 1.7675295546315654e-05, "loss": 0.4275, "step": 1818 }, { "epoch": 1.1462511327370868, "grad_norm": 0.1530158966779709, "learning_rate": 1.7653310188810803e-05, "loss": 0.3901, "step": 1819 }, { "epoch": 1.1468815255506086, "grad_norm": 0.1571042537689209, "learning_rate": 1.763132968291324e-05, "loss": 0.3225, "step": 1820 }, { "epoch": 1.1475119183641307, "grad_norm": 0.17838561534881592, "learning_rate": 1.7609354050625745e-05, "loss": 0.3634, "step": 1821 }, { "epoch": 1.1481423111776525, "grad_norm": 0.1688927263021469, "learning_rate": 1.758738331394628e-05, "loss": 0.3285, "step": 1822 }, { "epoch": 1.1487727039911746, "grad_norm": 0.1764656901359558, "learning_rate": 1.756541749486784e-05, "loss": 0.4225, "step": 1823 }, { "epoch": 1.1494030968046964, "grad_norm": 0.172949880361557, "learning_rate": 1.7543456615378562e-05, "loss": 0.3206, "step": 1824 }, { "epoch": 1.1500334896182183, "grad_norm": 0.15606196224689484, "learning_rate": 1.752150069746158e-05, "loss": 0.3027, "step": 1825 }, { "epoch": 1.1506638824317403, "grad_norm": 0.14963605999946594, "learning_rate": 1.7499549763095104e-05, "loss": 0.3614, "step": 1826 }, { "epoch": 1.1512942752452622, "grad_norm": 0.1628952920436859, "learning_rate": 1.7477603834252337e-05, "loss": 0.3728, "step": 1827 }, { "epoch": 1.151924668058784, "grad_norm": 0.15831774473190308, "learning_rate": 1.745566293290147e-05, "loss": 0.3207, "step": 1828 }, { "epoch": 1.152555060872306, "grad_norm": 0.16237981617450714, "learning_rate": 1.7433727081005667e-05, "loss": 0.3094, "step": 1829 }, { "epoch": 1.153185453685828, "grad_norm": 0.16303876042366028, "learning_rate": 1.7411796300523045e-05, "loss": 0.3491, "step": 1830 }, { "epoch": 1.15381584649935, "grad_norm": 0.17776191234588623, "learning_rate": 1.738987061340662e-05, "loss": 0.3718, "step": 1831 }, { "epoch": 1.1544462393128718, "grad_norm": 0.16025635600090027, "learning_rate": 1.7367950041604338e-05, "loss": 0.3671, "step": 1832 }, { "epoch": 1.1550766321263937, "grad_norm": 0.1274735927581787, "learning_rate": 1.7346034607059002e-05, "loss": 0.3221, "step": 1833 }, { "epoch": 1.1557070249399157, "grad_norm": 0.15472093224525452, "learning_rate": 1.7324124331708295e-05, "loss": 0.3289, "step": 1834 }, { "epoch": 1.1563374177534376, "grad_norm": 0.17002683877944946, "learning_rate": 1.7302219237484705e-05, "loss": 0.3171, "step": 1835 }, { "epoch": 1.1569678105669596, "grad_norm": 0.14287510514259338, "learning_rate": 1.7280319346315566e-05, "loss": 0.3065, "step": 1836 }, { "epoch": 1.1575982033804815, "grad_norm": 0.18563435971736908, "learning_rate": 1.725842468012298e-05, "loss": 0.379, "step": 1837 }, { "epoch": 1.1582285961940033, "grad_norm": 0.16103070974349976, "learning_rate": 1.7236535260823843e-05, "loss": 0.3817, "step": 1838 }, { "epoch": 1.1588589890075254, "grad_norm": 0.16440634429454803, "learning_rate": 1.721465111032975e-05, "loss": 0.3782, "step": 1839 }, { "epoch": 1.1594893818210472, "grad_norm": 0.1373133808374405, "learning_rate": 1.7192772250547097e-05, "loss": 0.301, "step": 1840 }, { "epoch": 1.1601197746345693, "grad_norm": 0.17497479915618896, "learning_rate": 1.7170898703376905e-05, "loss": 0.3573, "step": 1841 }, { "epoch": 1.1607501674480911, "grad_norm": 0.13609153032302856, "learning_rate": 1.7149030490714945e-05, "loss": 0.3322, "step": 1842 }, { "epoch": 1.161380560261613, "grad_norm": 0.22188684344291687, "learning_rate": 1.7127167634451588e-05, "loss": 0.4429, "step": 1843 }, { "epoch": 1.162010953075135, "grad_norm": 0.2214953899383545, "learning_rate": 1.7105310156471903e-05, "loss": 0.419, "step": 1844 }, { "epoch": 1.1626413458886569, "grad_norm": 0.15619449317455292, "learning_rate": 1.708345807865552e-05, "loss": 0.4114, "step": 1845 }, { "epoch": 1.1632717387021787, "grad_norm": 0.15597301721572876, "learning_rate": 1.70616114228767e-05, "loss": 0.3476, "step": 1846 }, { "epoch": 1.1639021315157008, "grad_norm": 0.1534939855337143, "learning_rate": 1.7039770211004256e-05, "loss": 0.3128, "step": 1847 }, { "epoch": 1.1645325243292226, "grad_norm": 0.1397959440946579, "learning_rate": 1.701793446490157e-05, "loss": 0.3239, "step": 1848 }, { "epoch": 1.1651629171427447, "grad_norm": 0.1773579716682434, "learning_rate": 1.6996104206426536e-05, "loss": 0.3735, "step": 1849 }, { "epoch": 1.1657933099562665, "grad_norm": 0.14740358293056488, "learning_rate": 1.6974279457431565e-05, "loss": 0.3029, "step": 1850 }, { "epoch": 1.1664237027697884, "grad_norm": 0.15667730569839478, "learning_rate": 1.6952460239763546e-05, "loss": 0.3595, "step": 1851 }, { "epoch": 1.1670540955833104, "grad_norm": 0.21429184079170227, "learning_rate": 1.693064657526384e-05, "loss": 0.4161, "step": 1852 }, { "epoch": 1.1676844883968323, "grad_norm": 0.13290826976299286, "learning_rate": 1.690883848576824e-05, "loss": 0.3327, "step": 1853 }, { "epoch": 1.168314881210354, "grad_norm": 0.1636621654033661, "learning_rate": 1.688703599310696e-05, "loss": 0.3268, "step": 1854 }, { "epoch": 1.1689452740238762, "grad_norm": 0.13847965002059937, "learning_rate": 1.686523911910462e-05, "loss": 0.3375, "step": 1855 }, { "epoch": 1.169575666837398, "grad_norm": 0.1506374478340149, "learning_rate": 1.68434478855802e-05, "loss": 0.3697, "step": 1856 }, { "epoch": 1.17020605965092, "grad_norm": 0.15315689146518707, "learning_rate": 1.6821662314347047e-05, "loss": 0.3167, "step": 1857 }, { "epoch": 1.170836452464442, "grad_norm": 0.16829730570316315, "learning_rate": 1.6799882427212843e-05, "loss": 0.3697, "step": 1858 }, { "epoch": 1.1714668452779637, "grad_norm": 0.18139970302581787, "learning_rate": 1.6778108245979564e-05, "loss": 0.3428, "step": 1859 }, { "epoch": 1.1720972380914858, "grad_norm": 0.17139574885368347, "learning_rate": 1.6756339792443494e-05, "loss": 0.3537, "step": 1860 }, { "epoch": 1.1727276309050076, "grad_norm": 0.17136725783348083, "learning_rate": 1.6734577088395147e-05, "loss": 0.3734, "step": 1861 }, { "epoch": 1.1733580237185297, "grad_norm": 0.1732012778520584, "learning_rate": 1.6712820155619348e-05, "loss": 0.4209, "step": 1862 }, { "epoch": 1.1739884165320515, "grad_norm": 0.17362134158611298, "learning_rate": 1.669106901589506e-05, "loss": 0.3422, "step": 1863 }, { "epoch": 1.1746188093455734, "grad_norm": 0.1846878081560135, "learning_rate": 1.6669323690995532e-05, "loss": 0.4042, "step": 1864 }, { "epoch": 1.1752492021590955, "grad_norm": 0.1782120168209076, "learning_rate": 1.6647584202688126e-05, "loss": 0.3604, "step": 1865 }, { "epoch": 1.1758795949726173, "grad_norm": 0.15381821990013123, "learning_rate": 1.6625850572734398e-05, "loss": 0.3528, "step": 1866 }, { "epoch": 1.1765099877861394, "grad_norm": 0.20381847023963928, "learning_rate": 1.6604122822890017e-05, "loss": 0.3198, "step": 1867 }, { "epoch": 1.1771403805996612, "grad_norm": 0.16701090335845947, "learning_rate": 1.6582400974904797e-05, "loss": 0.3492, "step": 1868 }, { "epoch": 1.177770773413183, "grad_norm": 0.14173492789268494, "learning_rate": 1.656068505052261e-05, "loss": 0.2858, "step": 1869 }, { "epoch": 1.178401166226705, "grad_norm": 0.16685733199119568, "learning_rate": 1.653897507148142e-05, "loss": 0.3855, "step": 1870 }, { "epoch": 1.179031559040227, "grad_norm": 0.15488892793655396, "learning_rate": 1.6517271059513225e-05, "loss": 0.3503, "step": 1871 }, { "epoch": 1.1796619518537488, "grad_norm": 0.15705163776874542, "learning_rate": 1.649557303634407e-05, "loss": 0.3356, "step": 1872 }, { "epoch": 1.1802923446672708, "grad_norm": 0.17007462680339813, "learning_rate": 1.6473881023693984e-05, "loss": 0.3568, "step": 1873 }, { "epoch": 1.1809227374807927, "grad_norm": 0.15087947249412537, "learning_rate": 1.6452195043276995e-05, "loss": 0.3495, "step": 1874 }, { "epoch": 1.1815531302943145, "grad_norm": 0.16111180186271667, "learning_rate": 1.643051511680108e-05, "loss": 0.3755, "step": 1875 }, { "epoch": 1.1821835231078366, "grad_norm": 0.16729417443275452, "learning_rate": 1.6408841265968166e-05, "loss": 0.3515, "step": 1876 }, { "epoch": 1.1828139159213584, "grad_norm": 0.20641054213047028, "learning_rate": 1.638717351247409e-05, "loss": 0.4274, "step": 1877 }, { "epoch": 1.1834443087348805, "grad_norm": 0.15898416936397552, "learning_rate": 1.6365511878008594e-05, "loss": 0.3482, "step": 1878 }, { "epoch": 1.1840747015484023, "grad_norm": 0.15798181295394897, "learning_rate": 1.6343856384255288e-05, "loss": 0.3218, "step": 1879 }, { "epoch": 1.1847050943619242, "grad_norm": 0.1792026311159134, "learning_rate": 1.6322207052891648e-05, "loss": 0.3921, "step": 1880 }, { "epoch": 1.1853354871754462, "grad_norm": 0.1666623055934906, "learning_rate": 1.6300563905588945e-05, "loss": 0.3244, "step": 1881 }, { "epoch": 1.185965879988968, "grad_norm": 0.1739160269498825, "learning_rate": 1.6278926964012316e-05, "loss": 0.3323, "step": 1882 }, { "epoch": 1.1865962728024901, "grad_norm": 0.15621982514858246, "learning_rate": 1.6257296249820628e-05, "loss": 0.3212, "step": 1883 }, { "epoch": 1.187226665616012, "grad_norm": 0.13840575516223907, "learning_rate": 1.6235671784666564e-05, "loss": 0.3351, "step": 1884 }, { "epoch": 1.1878570584295338, "grad_norm": 0.14696210622787476, "learning_rate": 1.6214053590196517e-05, "loss": 0.3669, "step": 1885 }, { "epoch": 1.1884874512430559, "grad_norm": 0.17982372641563416, "learning_rate": 1.6192441688050623e-05, "loss": 0.3206, "step": 1886 }, { "epoch": 1.1891178440565777, "grad_norm": 0.13978564739227295, "learning_rate": 1.6170836099862697e-05, "loss": 0.339, "step": 1887 }, { "epoch": 1.1897482368700998, "grad_norm": 0.12686683237552643, "learning_rate": 1.614923684726027e-05, "loss": 0.2845, "step": 1888 }, { "epoch": 1.1903786296836216, "grad_norm": 0.17271322011947632, "learning_rate": 1.6127643951864488e-05, "loss": 0.3649, "step": 1889 }, { "epoch": 1.1910090224971435, "grad_norm": 0.15250203013420105, "learning_rate": 1.6106057435290175e-05, "loss": 0.3493, "step": 1890 }, { "epoch": 1.1916394153106655, "grad_norm": 0.21040686964988708, "learning_rate": 1.608447731914573e-05, "loss": 0.4127, "step": 1891 }, { "epoch": 1.1922698081241874, "grad_norm": 0.16836979985237122, "learning_rate": 1.6062903625033177e-05, "loss": 0.378, "step": 1892 }, { "epoch": 1.1929002009377094, "grad_norm": 0.16101545095443726, "learning_rate": 1.6041336374548093e-05, "loss": 0.3219, "step": 1893 }, { "epoch": 1.1935305937512313, "grad_norm": 0.1391444206237793, "learning_rate": 1.6019775589279616e-05, "loss": 0.3327, "step": 1894 }, { "epoch": 1.1941609865647531, "grad_norm": 0.16221654415130615, "learning_rate": 1.5998221290810398e-05, "loss": 0.3633, "step": 1895 }, { "epoch": 1.1947913793782752, "grad_norm": 0.15801361203193665, "learning_rate": 1.597667350071662e-05, "loss": 0.3559, "step": 1896 }, { "epoch": 1.195421772191797, "grad_norm": 0.17644530534744263, "learning_rate": 1.5955132240567918e-05, "loss": 0.3376, "step": 1897 }, { "epoch": 1.1960521650053189, "grad_norm": 0.17779093980789185, "learning_rate": 1.5933597531927422e-05, "loss": 0.3468, "step": 1898 }, { "epoch": 1.196682557818841, "grad_norm": 0.17327743768692017, "learning_rate": 1.591206939635168e-05, "loss": 0.3524, "step": 1899 }, { "epoch": 1.1973129506323628, "grad_norm": 0.18291594088077545, "learning_rate": 1.5890547855390687e-05, "loss": 0.3162, "step": 1900 }, { "epoch": 1.1979433434458846, "grad_norm": 0.18050126731395721, "learning_rate": 1.5869032930587788e-05, "loss": 0.3384, "step": 1901 }, { "epoch": 1.1985737362594067, "grad_norm": 0.1861979067325592, "learning_rate": 1.5847524643479776e-05, "loss": 0.3859, "step": 1902 }, { "epoch": 1.1992041290729285, "grad_norm": 0.15310519933700562, "learning_rate": 1.5826023015596723e-05, "loss": 0.2921, "step": 1903 }, { "epoch": 1.1998345218864506, "grad_norm": 0.18371252715587616, "learning_rate": 1.5804528068462103e-05, "loss": 0.3805, "step": 1904 }, { "epoch": 1.2004649146999724, "grad_norm": 0.19243298470973969, "learning_rate": 1.578303982359265e-05, "loss": 0.4009, "step": 1905 }, { "epoch": 1.2010953075134942, "grad_norm": 0.17407071590423584, "learning_rate": 1.576155830249843e-05, "loss": 0.3416, "step": 1906 }, { "epoch": 1.2017257003270163, "grad_norm": 0.12393369525671005, "learning_rate": 1.574008352668274e-05, "loss": 0.2966, "step": 1907 }, { "epoch": 1.2023560931405382, "grad_norm": 0.20292018353939056, "learning_rate": 1.571861551764216e-05, "loss": 0.3988, "step": 1908 }, { "epoch": 1.2029864859540602, "grad_norm": 0.15854638814926147, "learning_rate": 1.5697154296866468e-05, "loss": 0.3335, "step": 1909 }, { "epoch": 1.203616878767582, "grad_norm": 0.1645781546831131, "learning_rate": 1.567569988583867e-05, "loss": 0.3353, "step": 1910 }, { "epoch": 1.204247271581104, "grad_norm": 0.1488713175058365, "learning_rate": 1.5654252306034934e-05, "loss": 0.3427, "step": 1911 }, { "epoch": 1.204877664394626, "grad_norm": 0.14664226770401, "learning_rate": 1.5632811578924614e-05, "loss": 0.3492, "step": 1912 }, { "epoch": 1.2055080572081478, "grad_norm": 0.18963125348091125, "learning_rate": 1.5611377725970182e-05, "loss": 0.3855, "step": 1913 }, { "epoch": 1.2061384500216699, "grad_norm": 0.14855507016181946, "learning_rate": 1.558995076862724e-05, "loss": 0.3235, "step": 1914 }, { "epoch": 1.2067688428351917, "grad_norm": 0.14204628765583038, "learning_rate": 1.556853072834448e-05, "loss": 0.3609, "step": 1915 }, { "epoch": 1.2073992356487135, "grad_norm": 0.13496729731559753, "learning_rate": 1.5547117626563687e-05, "loss": 0.2936, "step": 1916 }, { "epoch": 1.2080296284622356, "grad_norm": 0.16182400286197662, "learning_rate": 1.5525711484719676e-05, "loss": 0.2771, "step": 1917 }, { "epoch": 1.2086600212757574, "grad_norm": 0.14185205101966858, "learning_rate": 1.5504312324240312e-05, "loss": 0.2653, "step": 1918 }, { "epoch": 1.2092904140892793, "grad_norm": 0.13705161213874817, "learning_rate": 1.5482920166546473e-05, "loss": 0.3319, "step": 1919 }, { "epoch": 1.2099208069028013, "grad_norm": 0.15866032242774963, "learning_rate": 1.546153503305202e-05, "loss": 0.3551, "step": 1920 }, { "epoch": 1.2105511997163232, "grad_norm": 0.15233084559440613, "learning_rate": 1.5440156945163762e-05, "loss": 0.3683, "step": 1921 }, { "epoch": 1.2111815925298453, "grad_norm": 0.17361125349998474, "learning_rate": 1.5418785924281516e-05, "loss": 0.3799, "step": 1922 }, { "epoch": 1.211811985343367, "grad_norm": 0.15449875593185425, "learning_rate": 1.5397421991797952e-05, "loss": 0.3449, "step": 1923 }, { "epoch": 1.212442378156889, "grad_norm": 0.1920737475156784, "learning_rate": 1.53760651690987e-05, "loss": 0.3539, "step": 1924 }, { "epoch": 1.213072770970411, "grad_norm": 0.1675095111131668, "learning_rate": 1.5354715477562236e-05, "loss": 0.3222, "step": 1925 }, { "epoch": 1.2137031637839328, "grad_norm": 0.15904296934604645, "learning_rate": 1.533337293855993e-05, "loss": 0.3203, "step": 1926 }, { "epoch": 1.2143335565974547, "grad_norm": 0.1559150665998459, "learning_rate": 1.5312037573455955e-05, "loss": 0.3502, "step": 1927 }, { "epoch": 1.2149639494109767, "grad_norm": 0.14907754957675934, "learning_rate": 1.529070940360734e-05, "loss": 0.3525, "step": 1928 }, { "epoch": 1.2155943422244986, "grad_norm": 0.19221338629722595, "learning_rate": 1.526938845036388e-05, "loss": 0.3653, "step": 1929 }, { "epoch": 1.2162247350380206, "grad_norm": 0.14959098398685455, "learning_rate": 1.5248074735068178e-05, "loss": 0.3614, "step": 1930 }, { "epoch": 1.2168551278515425, "grad_norm": 0.15030638873577118, "learning_rate": 1.5226768279055559e-05, "loss": 0.3303, "step": 1931 }, { "epoch": 1.2174855206650643, "grad_norm": 0.1545165628194809, "learning_rate": 1.5205469103654104e-05, "loss": 0.3569, "step": 1932 }, { "epoch": 1.2181159134785864, "grad_norm": 0.16187258064746857, "learning_rate": 1.5184177230184595e-05, "loss": 0.3707, "step": 1933 }, { "epoch": 1.2187463062921082, "grad_norm": 0.1676671952009201, "learning_rate": 1.5162892679960513e-05, "loss": 0.3382, "step": 1934 }, { "epoch": 1.2193766991056303, "grad_norm": 0.1787586361169815, "learning_rate": 1.5141615474287996e-05, "loss": 0.33, "step": 1935 }, { "epoch": 1.2200070919191521, "grad_norm": 0.18215793371200562, "learning_rate": 1.5120345634465848e-05, "loss": 0.3825, "step": 1936 }, { "epoch": 1.220637484732674, "grad_norm": 0.17851123213768005, "learning_rate": 1.5099083181785479e-05, "loss": 0.4282, "step": 1937 }, { "epoch": 1.221267877546196, "grad_norm": 0.17377333343029022, "learning_rate": 1.5077828137530923e-05, "loss": 0.3273, "step": 1938 }, { "epoch": 1.2218982703597179, "grad_norm": 0.16321879625320435, "learning_rate": 1.5056580522978783e-05, "loss": 0.3081, "step": 1939 }, { "epoch": 1.22252866317324, "grad_norm": 0.1564747542142868, "learning_rate": 1.5035340359398237e-05, "loss": 0.3613, "step": 1940 }, { "epoch": 1.2231590559867618, "grad_norm": 0.1720079630613327, "learning_rate": 1.5014107668050992e-05, "loss": 0.3934, "step": 1941 }, { "epoch": 1.2237894488002836, "grad_norm": 0.15148408710956573, "learning_rate": 1.4992882470191297e-05, "loss": 0.2705, "step": 1942 }, { "epoch": 1.2244198416138057, "grad_norm": 0.1540118306875229, "learning_rate": 1.4971664787065858e-05, "loss": 0.309, "step": 1943 }, { "epoch": 1.2250502344273275, "grad_norm": 0.14092116057872772, "learning_rate": 1.4950454639913916e-05, "loss": 0.3714, "step": 1944 }, { "epoch": 1.2256806272408494, "grad_norm": 0.1863245815038681, "learning_rate": 1.492925204996711e-05, "loss": 0.4278, "step": 1945 }, { "epoch": 1.2263110200543714, "grad_norm": 0.14283518493175507, "learning_rate": 1.4908057038449573e-05, "loss": 0.2725, "step": 1946 }, { "epoch": 1.2269414128678933, "grad_norm": 0.1478254646062851, "learning_rate": 1.4886869626577791e-05, "loss": 0.339, "step": 1947 }, { "epoch": 1.2275718056814153, "grad_norm": 0.1424347460269928, "learning_rate": 1.4865689835560691e-05, "loss": 0.2964, "step": 1948 }, { "epoch": 1.2282021984949372, "grad_norm": 0.1579347401857376, "learning_rate": 1.4844517686599545e-05, "loss": 0.3859, "step": 1949 }, { "epoch": 1.228832591308459, "grad_norm": 0.1495109647512436, "learning_rate": 1.482335320088799e-05, "loss": 0.3267, "step": 1950 }, { "epoch": 1.229462984121981, "grad_norm": 0.14779378473758698, "learning_rate": 1.4802196399611978e-05, "loss": 0.3386, "step": 1951 }, { "epoch": 1.230093376935503, "grad_norm": 0.13087581098079681, "learning_rate": 1.4781047303949784e-05, "loss": 0.3595, "step": 1952 }, { "epoch": 1.2307237697490248, "grad_norm": 0.18340842425823212, "learning_rate": 1.4759905935071952e-05, "loss": 0.3052, "step": 1953 }, { "epoch": 1.2313541625625468, "grad_norm": 0.1894017606973648, "learning_rate": 1.473877231414131e-05, "loss": 0.3803, "step": 1954 }, { "epoch": 1.2319845553760687, "grad_norm": 0.17660388350486755, "learning_rate": 1.4717646462312912e-05, "loss": 0.3708, "step": 1955 }, { "epoch": 1.2326149481895907, "grad_norm": 0.19020117819309235, "learning_rate": 1.4696528400734054e-05, "loss": 0.2913, "step": 1956 }, { "epoch": 1.2332453410031126, "grad_norm": 0.15870848298072815, "learning_rate": 1.4675418150544217e-05, "loss": 0.38, "step": 1957 }, { "epoch": 1.2338757338166344, "grad_norm": 0.1692182570695877, "learning_rate": 1.4654315732875073e-05, "loss": 0.3718, "step": 1958 }, { "epoch": 1.2345061266301565, "grad_norm": 0.19379208981990814, "learning_rate": 1.463322116885045e-05, "loss": 0.4182, "step": 1959 }, { "epoch": 1.2351365194436783, "grad_norm": 0.14317487180233002, "learning_rate": 1.4612134479586316e-05, "loss": 0.338, "step": 1960 }, { "epoch": 1.2357669122572004, "grad_norm": 0.14631275832653046, "learning_rate": 1.4591055686190749e-05, "loss": 0.2957, "step": 1961 }, { "epoch": 1.2363973050707222, "grad_norm": 0.1501506268978119, "learning_rate": 1.4569984809763939e-05, "loss": 0.3617, "step": 1962 }, { "epoch": 1.237027697884244, "grad_norm": 0.14559954404830933, "learning_rate": 1.4548921871398122e-05, "loss": 0.3309, "step": 1963 }, { "epoch": 1.2376580906977661, "grad_norm": 0.16393865644931793, "learning_rate": 1.4527866892177639e-05, "loss": 0.3721, "step": 1964 }, { "epoch": 1.238288483511288, "grad_norm": 0.15368694067001343, "learning_rate": 1.4506819893178792e-05, "loss": 0.3812, "step": 1965 }, { "epoch": 1.23891887632481, "grad_norm": 0.17005257308483124, "learning_rate": 1.4485780895469974e-05, "loss": 0.3725, "step": 1966 }, { "epoch": 1.2395492691383319, "grad_norm": 0.17467091977596283, "learning_rate": 1.4464749920111495e-05, "loss": 0.337, "step": 1967 }, { "epoch": 1.2401796619518537, "grad_norm": 0.14071719348430634, "learning_rate": 1.444372698815569e-05, "loss": 0.3416, "step": 1968 }, { "epoch": 1.2408100547653758, "grad_norm": 0.13080553710460663, "learning_rate": 1.4422712120646809e-05, "loss": 0.288, "step": 1969 }, { "epoch": 1.2414404475788976, "grad_norm": 0.1427469402551651, "learning_rate": 1.4401705338621053e-05, "loss": 0.3098, "step": 1970 }, { "epoch": 1.2420708403924194, "grad_norm": 0.15654031932353973, "learning_rate": 1.4380706663106503e-05, "loss": 0.345, "step": 1971 }, { "epoch": 1.2427012332059415, "grad_norm": 0.18579089641571045, "learning_rate": 1.4359716115123156e-05, "loss": 0.3677, "step": 1972 }, { "epoch": 1.2433316260194633, "grad_norm": 0.15906280279159546, "learning_rate": 1.4338733715682842e-05, "loss": 0.356, "step": 1973 }, { "epoch": 1.2439620188329852, "grad_norm": 0.16996033489704132, "learning_rate": 1.4317759485789262e-05, "loss": 0.3887, "step": 1974 }, { "epoch": 1.2445924116465072, "grad_norm": 0.16264496743679047, "learning_rate": 1.4296793446437915e-05, "loss": 0.3404, "step": 1975 }, { "epoch": 1.245222804460029, "grad_norm": 0.18813878297805786, "learning_rate": 1.427583561861612e-05, "loss": 0.3911, "step": 1976 }, { "epoch": 1.2458531972735511, "grad_norm": 0.17374739050865173, "learning_rate": 1.4254886023302962e-05, "loss": 0.3744, "step": 1977 }, { "epoch": 1.246483590087073, "grad_norm": 0.16600753366947174, "learning_rate": 1.4233944681469304e-05, "loss": 0.3308, "step": 1978 }, { "epoch": 1.2471139829005948, "grad_norm": 0.16969601809978485, "learning_rate": 1.4213011614077722e-05, "loss": 0.3617, "step": 1979 }, { "epoch": 1.247744375714117, "grad_norm": 0.14293773472309113, "learning_rate": 1.4192086842082532e-05, "loss": 0.3456, "step": 1980 }, { "epoch": 1.2483747685276387, "grad_norm": 0.1606130748987198, "learning_rate": 1.4171170386429728e-05, "loss": 0.3757, "step": 1981 }, { "epoch": 1.2490051613411608, "grad_norm": 0.15878310799598694, "learning_rate": 1.4150262268057002e-05, "loss": 0.3353, "step": 1982 }, { "epoch": 1.2496355541546826, "grad_norm": 0.17191563546657562, "learning_rate": 1.4129362507893657e-05, "loss": 0.354, "step": 1983 }, { "epoch": 1.2502659469682045, "grad_norm": 0.16638456284999847, "learning_rate": 1.4108471126860693e-05, "loss": 0.378, "step": 1984 }, { "epoch": 1.2508963397817265, "grad_norm": 0.16778291761875153, "learning_rate": 1.4087588145870657e-05, "loss": 0.3866, "step": 1985 }, { "epoch": 1.2515267325952484, "grad_norm": 0.1632990837097168, "learning_rate": 1.4066713585827747e-05, "loss": 0.3486, "step": 1986 }, { "epoch": 1.2521571254087704, "grad_norm": 0.15390026569366455, "learning_rate": 1.404584746762767e-05, "loss": 0.3778, "step": 1987 }, { "epoch": 1.2527875182222923, "grad_norm": 0.15472039580345154, "learning_rate": 1.4024989812157745e-05, "loss": 0.3031, "step": 1988 }, { "epoch": 1.2534179110358141, "grad_norm": 0.14273808896541595, "learning_rate": 1.400414064029677e-05, "loss": 0.3105, "step": 1989 }, { "epoch": 1.2540483038493362, "grad_norm": 0.13901756703853607, "learning_rate": 1.3983299972915085e-05, "loss": 0.3639, "step": 1990 }, { "epoch": 1.254678696662858, "grad_norm": 0.15769080817699432, "learning_rate": 1.3962467830874486e-05, "loss": 0.3572, "step": 1991 }, { "epoch": 1.25530908947638, "grad_norm": 0.18810272216796875, "learning_rate": 1.3941644235028269e-05, "loss": 0.3733, "step": 1992 }, { "epoch": 1.255939482289902, "grad_norm": 0.13771487772464752, "learning_rate": 1.3920829206221138e-05, "loss": 0.3542, "step": 1993 }, { "epoch": 1.2565698751034238, "grad_norm": 0.18687589466571808, "learning_rate": 1.3900022765289261e-05, "loss": 0.3899, "step": 1994 }, { "epoch": 1.2572002679169456, "grad_norm": 0.15968801081180573, "learning_rate": 1.3879224933060176e-05, "loss": 0.4121, "step": 1995 }, { "epoch": 1.2578306607304677, "grad_norm": 0.1799834817647934, "learning_rate": 1.3858435730352819e-05, "loss": 0.3319, "step": 1996 }, { "epoch": 1.2584610535439895, "grad_norm": 0.20274387300014496, "learning_rate": 1.3837655177977489e-05, "loss": 0.3676, "step": 1997 }, { "epoch": 1.2590914463575116, "grad_norm": 0.19322608411312103, "learning_rate": 1.3816883296735821e-05, "loss": 0.4142, "step": 1998 }, { "epoch": 1.2597218391710334, "grad_norm": 0.18503984808921814, "learning_rate": 1.3796120107420767e-05, "loss": 0.3196, "step": 1999 }, { "epoch": 1.2603522319845553, "grad_norm": 0.17512302100658417, "learning_rate": 1.3775365630816591e-05, "loss": 0.4094, "step": 2000 }, { "epoch": 1.2603522319845553, "eval_loss": 0.4224383533000946, "eval_runtime": 222.5349, "eval_samples_per_second": 4.494, "eval_steps_per_second": 4.494, "step": 2000 }, { "epoch": 1.2609826247980773, "grad_norm": 0.17680183053016663, "learning_rate": 1.375461988769882e-05, "loss": 0.3257, "step": 2001 }, { "epoch": 1.2616130176115992, "grad_norm": 0.16907788813114166, "learning_rate": 1.3733882898834255e-05, "loss": 0.3264, "step": 2002 }, { "epoch": 1.2622434104251212, "grad_norm": 0.19145682454109192, "learning_rate": 1.3713154684980903e-05, "loss": 0.3352, "step": 2003 }, { "epoch": 1.262873803238643, "grad_norm": 0.15243153274059296, "learning_rate": 1.3692435266888044e-05, "loss": 0.3908, "step": 2004 }, { "epoch": 1.263504196052165, "grad_norm": 0.17341181635856628, "learning_rate": 1.367172466529608e-05, "loss": 0.3609, "step": 2005 }, { "epoch": 1.264134588865687, "grad_norm": 0.207267627120018, "learning_rate": 1.3651022900936656e-05, "loss": 0.3579, "step": 2006 }, { "epoch": 1.2647649816792088, "grad_norm": 0.15846067667007446, "learning_rate": 1.3630329994532515e-05, "loss": 0.3012, "step": 2007 }, { "epoch": 1.2653953744927309, "grad_norm": 0.15553094446659088, "learning_rate": 1.3609645966797588e-05, "loss": 0.3624, "step": 2008 }, { "epoch": 1.2660257673062527, "grad_norm": 0.16129595041275024, "learning_rate": 1.3588970838436862e-05, "loss": 0.3522, "step": 2009 }, { "epoch": 1.2666561601197746, "grad_norm": 0.16139130294322968, "learning_rate": 1.3568304630146457e-05, "loss": 0.3419, "step": 2010 }, { "epoch": 1.2672865529332966, "grad_norm": 0.1251128762960434, "learning_rate": 1.3547647362613538e-05, "loss": 0.299, "step": 2011 }, { "epoch": 1.2679169457468185, "grad_norm": 0.17684251070022583, "learning_rate": 1.3526999056516346e-05, "loss": 0.3454, "step": 2012 }, { "epoch": 1.2685473385603405, "grad_norm": 0.16017553210258484, "learning_rate": 1.3506359732524123e-05, "loss": 0.3684, "step": 2013 }, { "epoch": 1.2691777313738624, "grad_norm": 0.14525946974754333, "learning_rate": 1.3485729411297147e-05, "loss": 0.3145, "step": 2014 }, { "epoch": 1.2698081241873842, "grad_norm": 0.1517396867275238, "learning_rate": 1.3465108113486656e-05, "loss": 0.3048, "step": 2015 }, { "epoch": 1.2704385170009063, "grad_norm": 0.12699000537395477, "learning_rate": 1.3444495859734885e-05, "loss": 0.2737, "step": 2016 }, { "epoch": 1.271068909814428, "grad_norm": 0.14624783396720886, "learning_rate": 1.3423892670674989e-05, "loss": 0.3573, "step": 2017 }, { "epoch": 1.2716993026279502, "grad_norm": 0.16006223857402802, "learning_rate": 1.340329856693107e-05, "loss": 0.3779, "step": 2018 }, { "epoch": 1.272329695441472, "grad_norm": 0.17546787858009338, "learning_rate": 1.3382713569118122e-05, "loss": 0.3166, "step": 2019 }, { "epoch": 1.2729600882549938, "grad_norm": 0.1470988541841507, "learning_rate": 1.3362137697842032e-05, "loss": 0.3314, "step": 2020 }, { "epoch": 1.2735904810685157, "grad_norm": 0.16196255385875702, "learning_rate": 1.3341570973699536e-05, "loss": 0.367, "step": 2021 }, { "epoch": 1.2742208738820378, "grad_norm": 0.15870201587677002, "learning_rate": 1.3321013417278243e-05, "loss": 0.3421, "step": 2022 }, { "epoch": 1.2748512666955596, "grad_norm": 0.14421947300434113, "learning_rate": 1.3300465049156553e-05, "loss": 0.2975, "step": 2023 }, { "epoch": 1.2754816595090817, "grad_norm": 0.16223183274269104, "learning_rate": 1.3279925889903695e-05, "loss": 0.3614, "step": 2024 }, { "epoch": 1.2761120523226035, "grad_norm": 0.17637160420417786, "learning_rate": 1.325939596007965e-05, "loss": 0.4007, "step": 2025 }, { "epoch": 1.2767424451361253, "grad_norm": 0.17480182647705078, "learning_rate": 1.32388752802352e-05, "loss": 0.3837, "step": 2026 }, { "epoch": 1.2773728379496474, "grad_norm": 0.15375201404094696, "learning_rate": 1.3218363870911818e-05, "loss": 0.253, "step": 2027 }, { "epoch": 1.2780032307631692, "grad_norm": 0.1700395941734314, "learning_rate": 1.3197861752641752e-05, "loss": 0.3247, "step": 2028 }, { "epoch": 1.2786336235766913, "grad_norm": 0.17732472717761993, "learning_rate": 1.3177368945947902e-05, "loss": 0.3686, "step": 2029 }, { "epoch": 1.2792640163902131, "grad_norm": 0.15792888402938843, "learning_rate": 1.3156885471343877e-05, "loss": 0.3475, "step": 2030 }, { "epoch": 1.279894409203735, "grad_norm": 0.16673409938812256, "learning_rate": 1.3136411349333921e-05, "loss": 0.3013, "step": 2031 }, { "epoch": 1.280524802017257, "grad_norm": 0.14620770514011383, "learning_rate": 1.3115946600412955e-05, "loss": 0.3683, "step": 2032 }, { "epoch": 1.2811551948307789, "grad_norm": 0.17873959243297577, "learning_rate": 1.3095491245066465e-05, "loss": 0.3408, "step": 2033 }, { "epoch": 1.281785587644301, "grad_norm": 0.1617429405450821, "learning_rate": 1.3075045303770581e-05, "loss": 0.359, "step": 2034 }, { "epoch": 1.2824159804578228, "grad_norm": 0.187462717294693, "learning_rate": 1.3054608796991957e-05, "loss": 0.3613, "step": 2035 }, { "epoch": 1.2830463732713446, "grad_norm": 0.14734844863414764, "learning_rate": 1.3034181745187877e-05, "loss": 0.3299, "step": 2036 }, { "epoch": 1.2836767660848667, "grad_norm": 0.18680807948112488, "learning_rate": 1.3013764168806091e-05, "loss": 0.3326, "step": 2037 }, { "epoch": 1.2843071588983885, "grad_norm": 0.1547849178314209, "learning_rate": 1.2993356088284902e-05, "loss": 0.3732, "step": 2038 }, { "epoch": 1.2849375517119106, "grad_norm": 0.15319564938545227, "learning_rate": 1.2972957524053088e-05, "loss": 0.3105, "step": 2039 }, { "epoch": 1.2855679445254324, "grad_norm": 0.16970223188400269, "learning_rate": 1.2952568496529912e-05, "loss": 0.3467, "step": 2040 }, { "epoch": 1.2861983373389543, "grad_norm": 0.18748734891414642, "learning_rate": 1.2932189026125086e-05, "loss": 0.3412, "step": 2041 }, { "epoch": 1.2868287301524763, "grad_norm": 0.15902206301689148, "learning_rate": 1.2911819133238775e-05, "loss": 0.3001, "step": 2042 }, { "epoch": 1.2874591229659982, "grad_norm": 0.20623432099819183, "learning_rate": 1.2891458838261515e-05, "loss": 0.3626, "step": 2043 }, { "epoch": 1.2880895157795202, "grad_norm": 0.18049001693725586, "learning_rate": 1.2871108161574274e-05, "loss": 0.3765, "step": 2044 }, { "epoch": 1.288719908593042, "grad_norm": 0.17701010406017303, "learning_rate": 1.2850767123548368e-05, "loss": 0.3259, "step": 2045 }, { "epoch": 1.289350301406564, "grad_norm": 0.20592482388019562, "learning_rate": 1.2830435744545487e-05, "loss": 0.3574, "step": 2046 }, { "epoch": 1.2899806942200858, "grad_norm": 0.1588132679462433, "learning_rate": 1.2810114044917618e-05, "loss": 0.3072, "step": 2047 }, { "epoch": 1.2906110870336078, "grad_norm": 0.16019609570503235, "learning_rate": 1.2789802045007092e-05, "loss": 0.402, "step": 2048 }, { "epoch": 1.2912414798471297, "grad_norm": 0.14784568548202515, "learning_rate": 1.2769499765146514e-05, "loss": 0.3119, "step": 2049 }, { "epoch": 1.2918718726606517, "grad_norm": 0.20480933785438538, "learning_rate": 1.2749207225658772e-05, "loss": 0.4032, "step": 2050 }, { "epoch": 1.2925022654741736, "grad_norm": 0.19020628929138184, "learning_rate": 1.2728924446856978e-05, "loss": 0.36, "step": 2051 }, { "epoch": 1.2931326582876954, "grad_norm": 0.16703538596630096, "learning_rate": 1.2708651449044498e-05, "loss": 0.3897, "step": 2052 }, { "epoch": 1.2937630511012175, "grad_norm": 0.15481066703796387, "learning_rate": 1.2688388252514897e-05, "loss": 0.3668, "step": 2053 }, { "epoch": 1.2943934439147393, "grad_norm": 0.17388780415058136, "learning_rate": 1.2668134877551945e-05, "loss": 0.3633, "step": 2054 }, { "epoch": 1.2950238367282614, "grad_norm": 0.15646809339523315, "learning_rate": 1.2647891344429541e-05, "loss": 0.3962, "step": 2055 }, { "epoch": 1.2956542295417832, "grad_norm": 0.17562836408615112, "learning_rate": 1.262765767341178e-05, "loss": 0.3227, "step": 2056 }, { "epoch": 1.296284622355305, "grad_norm": 0.1547730267047882, "learning_rate": 1.2607433884752831e-05, "loss": 0.3233, "step": 2057 }, { "epoch": 1.2969150151688271, "grad_norm": 0.17791743576526642, "learning_rate": 1.2587219998697041e-05, "loss": 0.4285, "step": 2058 }, { "epoch": 1.297545407982349, "grad_norm": 0.16143293678760529, "learning_rate": 1.256701603547878e-05, "loss": 0.3373, "step": 2059 }, { "epoch": 1.298175800795871, "grad_norm": 0.16266663372516632, "learning_rate": 1.2546822015322526e-05, "loss": 0.2953, "step": 2060 }, { "epoch": 1.2988061936093929, "grad_norm": 0.17324328422546387, "learning_rate": 1.252663795844277e-05, "loss": 0.3864, "step": 2061 }, { "epoch": 1.2994365864229147, "grad_norm": 0.14631371200084686, "learning_rate": 1.2506463885044056e-05, "loss": 0.3076, "step": 2062 }, { "epoch": 1.3000669792364368, "grad_norm": 0.17927135527133942, "learning_rate": 1.2486299815320925e-05, "loss": 0.3168, "step": 2063 }, { "epoch": 1.3006973720499586, "grad_norm": 0.14138492941856384, "learning_rate": 1.2466145769457918e-05, "loss": 0.2993, "step": 2064 }, { "epoch": 1.3013277648634807, "grad_norm": 0.14290207624435425, "learning_rate": 1.244600176762951e-05, "loss": 0.343, "step": 2065 }, { "epoch": 1.3019581576770025, "grad_norm": 0.1918410360813141, "learning_rate": 1.2425867830000147e-05, "loss": 0.3881, "step": 2066 }, { "epoch": 1.3025885504905244, "grad_norm": 0.14091849327087402, "learning_rate": 1.2405743976724194e-05, "loss": 0.3455, "step": 2067 }, { "epoch": 1.3032189433040462, "grad_norm": 0.1309073269367218, "learning_rate": 1.238563022794594e-05, "loss": 0.2491, "step": 2068 }, { "epoch": 1.3038493361175683, "grad_norm": 0.14994269609451294, "learning_rate": 1.2365526603799513e-05, "loss": 0.3525, "step": 2069 }, { "epoch": 1.3044797289310903, "grad_norm": 0.18416768312454224, "learning_rate": 1.2345433124408949e-05, "loss": 0.3717, "step": 2070 }, { "epoch": 1.3051101217446122, "grad_norm": 0.15417777001857758, "learning_rate": 1.2325349809888112e-05, "loss": 0.3617, "step": 2071 }, { "epoch": 1.305740514558134, "grad_norm": 0.17371056973934174, "learning_rate": 1.2305276680340704e-05, "loss": 0.3664, "step": 2072 }, { "epoch": 1.3063709073716558, "grad_norm": 0.19204451143741608, "learning_rate": 1.2285213755860199e-05, "loss": 0.3769, "step": 2073 }, { "epoch": 1.307001300185178, "grad_norm": 0.16981463134288788, "learning_rate": 1.2265161056529899e-05, "loss": 0.3194, "step": 2074 }, { "epoch": 1.3076316929986997, "grad_norm": 0.16115683317184448, "learning_rate": 1.2245118602422822e-05, "loss": 0.2995, "step": 2075 }, { "epoch": 1.3082620858122218, "grad_norm": 0.16867254674434662, "learning_rate": 1.222508641360179e-05, "loss": 0.362, "step": 2076 }, { "epoch": 1.3088924786257436, "grad_norm": 0.20875564217567444, "learning_rate": 1.2205064510119293e-05, "loss": 0.42, "step": 2077 }, { "epoch": 1.3095228714392655, "grad_norm": 0.14017994701862335, "learning_rate": 1.2185052912017564e-05, "loss": 0.3294, "step": 2078 }, { "epoch": 1.3101532642527876, "grad_norm": 0.17594243586063385, "learning_rate": 1.216505163932848e-05, "loss": 0.3656, "step": 2079 }, { "epoch": 1.3107836570663094, "grad_norm": 0.18787644803524017, "learning_rate": 1.2145060712073642e-05, "loss": 0.3789, "step": 2080 }, { "epoch": 1.3114140498798315, "grad_norm": 0.14373567700386047, "learning_rate": 1.2125080150264233e-05, "loss": 0.3008, "step": 2081 }, { "epoch": 1.3120444426933533, "grad_norm": 0.16203191876411438, "learning_rate": 1.2105109973901107e-05, "loss": 0.3153, "step": 2082 }, { "epoch": 1.3126748355068751, "grad_norm": 0.1497282087802887, "learning_rate": 1.208515020297468e-05, "loss": 0.3249, "step": 2083 }, { "epoch": 1.3133052283203972, "grad_norm": 0.14576971530914307, "learning_rate": 1.2065200857464989e-05, "loss": 0.3079, "step": 2084 }, { "epoch": 1.313935621133919, "grad_norm": 0.19296002388000488, "learning_rate": 1.2045261957341617e-05, "loss": 0.3555, "step": 2085 }, { "epoch": 1.314566013947441, "grad_norm": 0.1784832924604416, "learning_rate": 1.2025333522563699e-05, "loss": 0.3714, "step": 2086 }, { "epoch": 1.315196406760963, "grad_norm": 0.21400387585163116, "learning_rate": 1.200541557307988e-05, "loss": 0.434, "step": 2087 }, { "epoch": 1.3158267995744848, "grad_norm": 0.18918611109256744, "learning_rate": 1.1985508128828318e-05, "loss": 0.3567, "step": 2088 }, { "epoch": 1.3164571923880068, "grad_norm": 0.16789653897285461, "learning_rate": 1.1965611209736663e-05, "loss": 0.3048, "step": 2089 }, { "epoch": 1.3170875852015287, "grad_norm": 0.20766571164131165, "learning_rate": 1.1945724835722029e-05, "loss": 0.3841, "step": 2090 }, { "epoch": 1.3177179780150508, "grad_norm": 0.15403051674365997, "learning_rate": 1.1925849026690944e-05, "loss": 0.3633, "step": 2091 }, { "epoch": 1.3183483708285726, "grad_norm": 0.17515239119529724, "learning_rate": 1.1905983802539399e-05, "loss": 0.4071, "step": 2092 }, { "epoch": 1.3189787636420944, "grad_norm": 0.18618489801883698, "learning_rate": 1.1886129183152768e-05, "loss": 0.3248, "step": 2093 }, { "epoch": 1.3196091564556163, "grad_norm": 0.15004846453666687, "learning_rate": 1.1866285188405826e-05, "loss": 0.3313, "step": 2094 }, { "epoch": 1.3202395492691383, "grad_norm": 0.1696581244468689, "learning_rate": 1.1846451838162683e-05, "loss": 0.3451, "step": 2095 }, { "epoch": 1.3208699420826602, "grad_norm": 0.13555723428726196, "learning_rate": 1.1826629152276831e-05, "loss": 0.3332, "step": 2096 }, { "epoch": 1.3215003348961822, "grad_norm": 0.14103135466575623, "learning_rate": 1.1806817150591042e-05, "loss": 0.3281, "step": 2097 }, { "epoch": 1.322130727709704, "grad_norm": 0.16602687537670135, "learning_rate": 1.1787015852937451e-05, "loss": 0.3702, "step": 2098 }, { "epoch": 1.322761120523226, "grad_norm": 0.18209081888198853, "learning_rate": 1.1767225279137426e-05, "loss": 0.371, "step": 2099 }, { "epoch": 1.323391513336748, "grad_norm": 0.15361028909683228, "learning_rate": 1.1747445449001628e-05, "loss": 0.3664, "step": 2100 }, { "epoch": 1.3240219061502698, "grad_norm": 0.1743720918893814, "learning_rate": 1.1727676382329951e-05, "loss": 0.3428, "step": 2101 }, { "epoch": 1.3246522989637919, "grad_norm": 0.15461906790733337, "learning_rate": 1.1707918098911518e-05, "loss": 0.3322, "step": 2102 }, { "epoch": 1.3252826917773137, "grad_norm": 0.17772087454795837, "learning_rate": 1.1688170618524662e-05, "loss": 0.3166, "step": 2103 }, { "epoch": 1.3259130845908356, "grad_norm": 0.17029738426208496, "learning_rate": 1.1668433960936909e-05, "loss": 0.3881, "step": 2104 }, { "epoch": 1.3265434774043576, "grad_norm": 0.1838579922914505, "learning_rate": 1.1648708145904926e-05, "loss": 0.3728, "step": 2105 }, { "epoch": 1.3271738702178795, "grad_norm": 0.16194473206996918, "learning_rate": 1.1628993193174543e-05, "loss": 0.3705, "step": 2106 }, { "epoch": 1.3278042630314015, "grad_norm": 0.16256876289844513, "learning_rate": 1.1609289122480726e-05, "loss": 0.3453, "step": 2107 }, { "epoch": 1.3284346558449234, "grad_norm": 0.13366344571113586, "learning_rate": 1.158959595354754e-05, "loss": 0.2767, "step": 2108 }, { "epoch": 1.3290650486584452, "grad_norm": 0.17386309802532196, "learning_rate": 1.1569913706088115e-05, "loss": 0.4169, "step": 2109 }, { "epoch": 1.3296954414719673, "grad_norm": 0.16652558743953705, "learning_rate": 1.155024239980468e-05, "loss": 0.3581, "step": 2110 }, { "epoch": 1.3303258342854891, "grad_norm": 0.14673203229904175, "learning_rate": 1.1530582054388504e-05, "loss": 0.3549, "step": 2111 }, { "epoch": 1.3309562270990112, "grad_norm": 0.15078188478946686, "learning_rate": 1.1510932689519882e-05, "loss": 0.348, "step": 2112 }, { "epoch": 1.331586619912533, "grad_norm": 0.19112563133239746, "learning_rate": 1.1491294324868103e-05, "loss": 0.3475, "step": 2113 }, { "epoch": 1.3322170127260549, "grad_norm": 0.1545761227607727, "learning_rate": 1.1471666980091462e-05, "loss": 0.3901, "step": 2114 }, { "epoch": 1.332847405539577, "grad_norm": 0.16541506350040436, "learning_rate": 1.1452050674837217e-05, "loss": 0.4023, "step": 2115 }, { "epoch": 1.3334777983530988, "grad_norm": 0.15039975941181183, "learning_rate": 1.1432445428741585e-05, "loss": 0.378, "step": 2116 }, { "epoch": 1.3341081911666208, "grad_norm": 0.15127238631248474, "learning_rate": 1.1412851261429689e-05, "loss": 0.3087, "step": 2117 }, { "epoch": 1.3347385839801427, "grad_norm": 0.1384199857711792, "learning_rate": 1.1393268192515592e-05, "loss": 0.2421, "step": 2118 }, { "epoch": 1.3353689767936645, "grad_norm": 0.15712042152881622, "learning_rate": 1.1373696241602206e-05, "loss": 0.3087, "step": 2119 }, { "epoch": 1.3359993696071863, "grad_norm": 0.1710873246192932, "learning_rate": 1.1354135428281377e-05, "loss": 0.405, "step": 2120 }, { "epoch": 1.3366297624207084, "grad_norm": 0.16128316521644592, "learning_rate": 1.1334585772133736e-05, "loss": 0.3004, "step": 2121 }, { "epoch": 1.3372601552342303, "grad_norm": 0.1520003229379654, "learning_rate": 1.131504729272879e-05, "loss": 0.3234, "step": 2122 }, { "epoch": 1.3378905480477523, "grad_norm": 0.17195023596286774, "learning_rate": 1.1295520009624825e-05, "loss": 0.3663, "step": 2123 }, { "epoch": 1.3385209408612742, "grad_norm": 0.15014538168907166, "learning_rate": 1.1276003942368952e-05, "loss": 0.3301, "step": 2124 }, { "epoch": 1.339151333674796, "grad_norm": 0.1437128782272339, "learning_rate": 1.1256499110497032e-05, "loss": 0.3344, "step": 2125 }, { "epoch": 1.339781726488318, "grad_norm": 0.15259809792041779, "learning_rate": 1.12370055335337e-05, "loss": 0.3522, "step": 2126 }, { "epoch": 1.34041211930184, "grad_norm": 0.17295295000076294, "learning_rate": 1.1217523230992287e-05, "loss": 0.3374, "step": 2127 }, { "epoch": 1.341042512115362, "grad_norm": 0.12949199974536896, "learning_rate": 1.1198052222374879e-05, "loss": 0.3374, "step": 2128 }, { "epoch": 1.3416729049288838, "grad_norm": 0.17765846848487854, "learning_rate": 1.1178592527172235e-05, "loss": 0.382, "step": 2129 }, { "epoch": 1.3423032977424056, "grad_norm": 0.18448440730571747, "learning_rate": 1.1159144164863806e-05, "loss": 0.3577, "step": 2130 }, { "epoch": 1.3429336905559277, "grad_norm": 0.18258066475391388, "learning_rate": 1.1139707154917666e-05, "loss": 0.3923, "step": 2131 }, { "epoch": 1.3435640833694495, "grad_norm": 0.16213437914848328, "learning_rate": 1.1120281516790557e-05, "loss": 0.4219, "step": 2132 }, { "epoch": 1.3441944761829716, "grad_norm": 0.17956312000751495, "learning_rate": 1.1100867269927825e-05, "loss": 0.3848, "step": 2133 }, { "epoch": 1.3448248689964934, "grad_norm": 0.14712972939014435, "learning_rate": 1.1081464433763425e-05, "loss": 0.3267, "step": 2134 }, { "epoch": 1.3454552618100153, "grad_norm": 0.16251666843891144, "learning_rate": 1.106207302771986e-05, "loss": 0.3602, "step": 2135 }, { "epoch": 1.3460856546235374, "grad_norm": 0.17616884410381317, "learning_rate": 1.1042693071208228e-05, "loss": 0.352, "step": 2136 }, { "epoch": 1.3467160474370592, "grad_norm": 0.13910022377967834, "learning_rate": 1.1023324583628122e-05, "loss": 0.2441, "step": 2137 }, { "epoch": 1.3473464402505813, "grad_norm": 0.1513376086950302, "learning_rate": 1.100396758436772e-05, "loss": 0.3176, "step": 2138 }, { "epoch": 1.347976833064103, "grad_norm": 0.17463596165180206, "learning_rate": 1.098462209280363e-05, "loss": 0.365, "step": 2139 }, { "epoch": 1.348607225877625, "grad_norm": 0.17284253239631653, "learning_rate": 1.096528812830099e-05, "loss": 0.3961, "step": 2140 }, { "epoch": 1.349237618691147, "grad_norm": 0.14869636297225952, "learning_rate": 1.0945965710213364e-05, "loss": 0.3043, "step": 2141 }, { "epoch": 1.3498680115046688, "grad_norm": 0.17893320322036743, "learning_rate": 1.0926654857882784e-05, "loss": 0.3753, "step": 2142 }, { "epoch": 1.350498404318191, "grad_norm": 0.20987479388713837, "learning_rate": 1.0907355590639695e-05, "loss": 0.3507, "step": 2143 }, { "epoch": 1.3511287971317127, "grad_norm": 0.14228267967700958, "learning_rate": 1.0888067927802953e-05, "loss": 0.2932, "step": 2144 }, { "epoch": 1.3517591899452346, "grad_norm": 0.15405458211898804, "learning_rate": 1.0868791888679774e-05, "loss": 0.2966, "step": 2145 }, { "epoch": 1.3523895827587564, "grad_norm": 0.14483889937400818, "learning_rate": 1.0849527492565762e-05, "loss": 0.3889, "step": 2146 }, { "epoch": 1.3530199755722785, "grad_norm": 0.18476107716560364, "learning_rate": 1.0830274758744857e-05, "loss": 0.3637, "step": 2147 }, { "epoch": 1.3536503683858003, "grad_norm": 0.20455743372440338, "learning_rate": 1.0811033706489342e-05, "loss": 0.4167, "step": 2148 }, { "epoch": 1.3542807611993224, "grad_norm": 0.1842804104089737, "learning_rate": 1.0791804355059767e-05, "loss": 0.3541, "step": 2149 }, { "epoch": 1.3549111540128442, "grad_norm": 0.20898349583148956, "learning_rate": 1.0772586723705007e-05, "loss": 0.4065, "step": 2150 }, { "epoch": 1.355541546826366, "grad_norm": 0.16361692547798157, "learning_rate": 1.0753380831662187e-05, "loss": 0.4172, "step": 2151 }, { "epoch": 1.3561719396398881, "grad_norm": 0.19685503840446472, "learning_rate": 1.07341866981567e-05, "loss": 0.3342, "step": 2152 }, { "epoch": 1.35680233245341, "grad_norm": 0.16768209636211395, "learning_rate": 1.0715004342402133e-05, "loss": 0.3338, "step": 2153 }, { "epoch": 1.357432725266932, "grad_norm": 0.16943420469760895, "learning_rate": 1.0695833783600312e-05, "loss": 0.328, "step": 2154 }, { "epoch": 1.3580631180804539, "grad_norm": 0.1994352638721466, "learning_rate": 1.0676675040941247e-05, "loss": 0.376, "step": 2155 }, { "epoch": 1.3586935108939757, "grad_norm": 0.1642264872789383, "learning_rate": 1.0657528133603128e-05, "loss": 0.2913, "step": 2156 }, { "epoch": 1.3593239037074978, "grad_norm": 0.17089161276817322, "learning_rate": 1.0638393080752273e-05, "loss": 0.3303, "step": 2157 }, { "epoch": 1.3599542965210196, "grad_norm": 0.1726408749818802, "learning_rate": 1.0619269901543167e-05, "loss": 0.3242, "step": 2158 }, { "epoch": 1.3605846893345417, "grad_norm": 0.16919533908367157, "learning_rate": 1.0600158615118361e-05, "loss": 0.4038, "step": 2159 }, { "epoch": 1.3612150821480635, "grad_norm": 0.19589287042617798, "learning_rate": 1.0581059240608571e-05, "loss": 0.3371, "step": 2160 }, { "epoch": 1.3618454749615854, "grad_norm": 0.1313522905111313, "learning_rate": 1.0561971797132524e-05, "loss": 0.303, "step": 2161 }, { "epoch": 1.3624758677751074, "grad_norm": 0.144204244017601, "learning_rate": 1.0542896303797043e-05, "loss": 0.3262, "step": 2162 }, { "epoch": 1.3631062605886293, "grad_norm": 0.18240010738372803, "learning_rate": 1.0523832779696962e-05, "loss": 0.3823, "step": 2163 }, { "epoch": 1.3637366534021513, "grad_norm": 0.12797421216964722, "learning_rate": 1.0504781243915153e-05, "loss": 0.3045, "step": 2164 }, { "epoch": 1.3643670462156732, "grad_norm": 0.1489725559949875, "learning_rate": 1.0485741715522484e-05, "loss": 0.3592, "step": 2165 }, { "epoch": 1.364997439029195, "grad_norm": 0.17021243274211884, "learning_rate": 1.0466714213577807e-05, "loss": 0.3543, "step": 2166 }, { "epoch": 1.3656278318427169, "grad_norm": 0.14448320865631104, "learning_rate": 1.0447698757127913e-05, "loss": 0.3197, "step": 2167 }, { "epoch": 1.366258224656239, "grad_norm": 0.17550738155841827, "learning_rate": 1.0428695365207561e-05, "loss": 0.3569, "step": 2168 }, { "epoch": 1.366888617469761, "grad_norm": 0.16940660774707794, "learning_rate": 1.0409704056839423e-05, "loss": 0.3336, "step": 2169 }, { "epoch": 1.3675190102832828, "grad_norm": 0.1698468029499054, "learning_rate": 1.0390724851034088e-05, "loss": 0.3461, "step": 2170 }, { "epoch": 1.3681494030968047, "grad_norm": 0.21772147715091705, "learning_rate": 1.0371757766789994e-05, "loss": 0.3958, "step": 2171 }, { "epoch": 1.3687797959103265, "grad_norm": 0.1430359184741974, "learning_rate": 1.0352802823093483e-05, "loss": 0.2761, "step": 2172 }, { "epoch": 1.3694101887238486, "grad_norm": 0.1858154535293579, "learning_rate": 1.0333860038918728e-05, "loss": 0.2856, "step": 2173 }, { "epoch": 1.3700405815373704, "grad_norm": 0.15812067687511444, "learning_rate": 1.0314929433227745e-05, "loss": 0.3216, "step": 2174 }, { "epoch": 1.3706709743508925, "grad_norm": 0.19749779999256134, "learning_rate": 1.0296011024970326e-05, "loss": 0.4061, "step": 2175 }, { "epoch": 1.3713013671644143, "grad_norm": 0.1705770045518875, "learning_rate": 1.0277104833084083e-05, "loss": 0.3822, "step": 2176 }, { "epoch": 1.3719317599779361, "grad_norm": 0.18999949097633362, "learning_rate": 1.025821087649439e-05, "loss": 0.4095, "step": 2177 }, { "epoch": 1.3725621527914582, "grad_norm": 0.16586129367351532, "learning_rate": 1.0239329174114383e-05, "loss": 0.3592, "step": 2178 }, { "epoch": 1.37319254560498, "grad_norm": 0.15350428223609924, "learning_rate": 1.0220459744844905e-05, "loss": 0.3844, "step": 2179 }, { "epoch": 1.3738229384185021, "grad_norm": 0.16602548956871033, "learning_rate": 1.0201602607574546e-05, "loss": 0.4005, "step": 2180 }, { "epoch": 1.374453331232024, "grad_norm": 0.16380132734775543, "learning_rate": 1.0182757781179552e-05, "loss": 0.3572, "step": 2181 }, { "epoch": 1.3750837240455458, "grad_norm": 0.20154309272766113, "learning_rate": 1.01639252845239e-05, "loss": 0.3688, "step": 2182 }, { "epoch": 1.3757141168590679, "grad_norm": 0.17154088616371155, "learning_rate": 1.0145105136459172e-05, "loss": 0.3749, "step": 2183 }, { "epoch": 1.3763445096725897, "grad_norm": 0.13803677260875702, "learning_rate": 1.0126297355824628e-05, "loss": 0.3064, "step": 2184 }, { "epoch": 1.3769749024861118, "grad_norm": 0.15523208677768707, "learning_rate": 1.0107501961447112e-05, "loss": 0.3564, "step": 2185 }, { "epoch": 1.3776052952996336, "grad_norm": 0.18922999501228333, "learning_rate": 1.0088718972141097e-05, "loss": 0.3339, "step": 2186 }, { "epoch": 1.3782356881131554, "grad_norm": 0.17507292330265045, "learning_rate": 1.0069948406708632e-05, "loss": 0.3967, "step": 2187 }, { "epoch": 1.3788660809266775, "grad_norm": 0.2225351333618164, "learning_rate": 1.0051190283939332e-05, "loss": 0.4534, "step": 2188 }, { "epoch": 1.3794964737401993, "grad_norm": 0.15008077025413513, "learning_rate": 1.0032444622610339e-05, "loss": 0.2755, "step": 2189 }, { "epoch": 1.3801268665537214, "grad_norm": 0.1385604292154312, "learning_rate": 1.0013711441486339e-05, "loss": 0.2841, "step": 2190 }, { "epoch": 1.3807572593672433, "grad_norm": 0.2007773369550705, "learning_rate": 9.994990759319522e-06, "loss": 0.3863, "step": 2191 }, { "epoch": 1.381387652180765, "grad_norm": 0.1429567039012909, "learning_rate": 9.976282594849569e-06, "loss": 0.3438, "step": 2192 }, { "epoch": 1.382018044994287, "grad_norm": 0.14305874705314636, "learning_rate": 9.957586966803615e-06, "loss": 0.2912, "step": 2193 }, { "epoch": 1.382648437807809, "grad_norm": 0.14862282574176788, "learning_rate": 9.93890389389626e-06, "loss": 0.3041, "step": 2194 }, { "epoch": 1.3832788306213308, "grad_norm": 0.16503016650676727, "learning_rate": 9.920233394829533e-06, "loss": 0.2807, "step": 2195 }, { "epoch": 1.383909223434853, "grad_norm": 0.16887208819389343, "learning_rate": 9.901575488292886e-06, "loss": 0.3668, "step": 2196 }, { "epoch": 1.3845396162483747, "grad_norm": 0.1434229612350464, "learning_rate": 9.882930192963142e-06, "loss": 0.2897, "step": 2197 }, { "epoch": 1.3851700090618966, "grad_norm": 0.21803705394268036, "learning_rate": 9.864297527504525e-06, "loss": 0.4214, "step": 2198 }, { "epoch": 1.3858004018754186, "grad_norm": 0.1603592485189438, "learning_rate": 9.845677510568584e-06, "loss": 0.3886, "step": 2199 }, { "epoch": 1.3864307946889405, "grad_norm": 0.161991149187088, "learning_rate": 9.82707016079426e-06, "loss": 0.2961, "step": 2200 }, { "epoch": 1.3864307946889405, "eval_loss": 0.4210342466831207, "eval_runtime": 222.2931, "eval_samples_per_second": 4.499, "eval_steps_per_second": 4.499, "step": 2200 }, { "epoch": 1.3870611875024625, "grad_norm": 0.15873432159423828, "learning_rate": 9.808475496807757e-06, "loss": 0.3394, "step": 2201 }, { "epoch": 1.3876915803159844, "grad_norm": 0.15210144221782684, "learning_rate": 9.789893537222622e-06, "loss": 0.3294, "step": 2202 }, { "epoch": 1.3883219731295062, "grad_norm": 0.15166395902633667, "learning_rate": 9.771324300639652e-06, "loss": 0.4074, "step": 2203 }, { "epoch": 1.3889523659430283, "grad_norm": 0.18236121535301208, "learning_rate": 9.752767805646932e-06, "loss": 0.3576, "step": 2204 }, { "epoch": 1.3895827587565501, "grad_norm": 0.15060827136039734, "learning_rate": 9.734224070819779e-06, "loss": 0.3644, "step": 2205 }, { "epoch": 1.3902131515700722, "grad_norm": 0.15586620569229126, "learning_rate": 9.71569311472076e-06, "loss": 0.2944, "step": 2206 }, { "epoch": 1.390843544383594, "grad_norm": 0.1764363795518875, "learning_rate": 9.697174955899612e-06, "loss": 0.3629, "step": 2207 }, { "epoch": 1.3914739371971159, "grad_norm": 0.13980361819267273, "learning_rate": 9.678669612893287e-06, "loss": 0.3418, "step": 2208 }, { "epoch": 1.392104330010638, "grad_norm": 0.19479088485240936, "learning_rate": 9.660177104225908e-06, "loss": 0.4051, "step": 2209 }, { "epoch": 1.3927347228241598, "grad_norm": 0.16188308596611023, "learning_rate": 9.641697448408756e-06, "loss": 0.3823, "step": 2210 }, { "epoch": 1.3933651156376818, "grad_norm": 0.16864702105522156, "learning_rate": 9.623230663940215e-06, "loss": 0.3365, "step": 2211 }, { "epoch": 1.3939955084512037, "grad_norm": 0.1589280664920807, "learning_rate": 9.604776769305815e-06, "loss": 0.3006, "step": 2212 }, { "epoch": 1.3946259012647255, "grad_norm": 0.16485220193862915, "learning_rate": 9.586335782978175e-06, "loss": 0.2999, "step": 2213 }, { "epoch": 1.3952562940782476, "grad_norm": 0.16778141260147095, "learning_rate": 9.567907723417002e-06, "loss": 0.3433, "step": 2214 }, { "epoch": 1.3958866868917694, "grad_norm": 0.17085351049900055, "learning_rate": 9.549492609069036e-06, "loss": 0.3674, "step": 2215 }, { "epoch": 1.3965170797052915, "grad_norm": 0.1922229677438736, "learning_rate": 9.531090458368078e-06, "loss": 0.3561, "step": 2216 }, { "epoch": 1.3971474725188133, "grad_norm": 0.1994408369064331, "learning_rate": 9.512701289734957e-06, "loss": 0.3808, "step": 2217 }, { "epoch": 1.3977778653323352, "grad_norm": 0.14683650434017181, "learning_rate": 9.494325121577504e-06, "loss": 0.3088, "step": 2218 }, { "epoch": 1.398408258145857, "grad_norm": 0.18013975024223328, "learning_rate": 9.475961972290513e-06, "loss": 0.3011, "step": 2219 }, { "epoch": 1.399038650959379, "grad_norm": 0.17953065037727356, "learning_rate": 9.457611860255786e-06, "loss": 0.4288, "step": 2220 }, { "epoch": 1.399669043772901, "grad_norm": 0.13922902941703796, "learning_rate": 9.439274803842024e-06, "loss": 0.3322, "step": 2221 }, { "epoch": 1.400299436586423, "grad_norm": 0.1882917881011963, "learning_rate": 9.420950821404922e-06, "loss": 0.3984, "step": 2222 }, { "epoch": 1.4009298293999448, "grad_norm": 0.13556498289108276, "learning_rate": 9.402639931287032e-06, "loss": 0.3062, "step": 2223 }, { "epoch": 1.4015602222134667, "grad_norm": 0.16906505823135376, "learning_rate": 9.38434215181784e-06, "loss": 0.3651, "step": 2224 }, { "epoch": 1.4021906150269887, "grad_norm": 0.16540083289146423, "learning_rate": 9.366057501313665e-06, "loss": 0.3759, "step": 2225 }, { "epoch": 1.4028210078405106, "grad_norm": 0.1791892945766449, "learning_rate": 9.347785998077727e-06, "loss": 0.3819, "step": 2226 }, { "epoch": 1.4034514006540326, "grad_norm": 0.15491949021816254, "learning_rate": 9.329527660400064e-06, "loss": 0.3316, "step": 2227 }, { "epoch": 1.4040817934675545, "grad_norm": 0.17092517018318176, "learning_rate": 9.311282506557547e-06, "loss": 0.368, "step": 2228 }, { "epoch": 1.4047121862810763, "grad_norm": 0.15667341649532318, "learning_rate": 9.293050554813825e-06, "loss": 0.2982, "step": 2229 }, { "epoch": 1.4053425790945984, "grad_norm": 0.14160802960395813, "learning_rate": 9.274831823419357e-06, "loss": 0.3122, "step": 2230 }, { "epoch": 1.4059729719081202, "grad_norm": 0.19099406898021698, "learning_rate": 9.256626330611357e-06, "loss": 0.377, "step": 2231 }, { "epoch": 1.4066033647216423, "grad_norm": 0.1867620050907135, "learning_rate": 9.2384340946138e-06, "loss": 0.3396, "step": 2232 }, { "epoch": 1.407233757535164, "grad_norm": 0.17459844052791595, "learning_rate": 9.22025513363736e-06, "loss": 0.3364, "step": 2233 }, { "epoch": 1.407864150348686, "grad_norm": 0.16290386021137238, "learning_rate": 9.202089465879452e-06, "loss": 0.3307, "step": 2234 }, { "epoch": 1.408494543162208, "grad_norm": 0.16637465357780457, "learning_rate": 9.183937109524177e-06, "loss": 0.3042, "step": 2235 }, { "epoch": 1.4091249359757299, "grad_norm": 0.19044050574302673, "learning_rate": 9.165798082742316e-06, "loss": 0.3813, "step": 2236 }, { "epoch": 1.409755328789252, "grad_norm": 0.19611325860023499, "learning_rate": 9.147672403691284e-06, "loss": 0.344, "step": 2237 }, { "epoch": 1.4103857216027738, "grad_norm": 0.16075120866298676, "learning_rate": 9.129560090515161e-06, "loss": 0.3024, "step": 2238 }, { "epoch": 1.4110161144162956, "grad_norm": 0.15303272008895874, "learning_rate": 9.11146116134462e-06, "loss": 0.332, "step": 2239 }, { "epoch": 1.4116465072298177, "grad_norm": 0.1527310013771057, "learning_rate": 9.093375634296984e-06, "loss": 0.3317, "step": 2240 }, { "epoch": 1.4122769000433395, "grad_norm": 0.20200815796852112, "learning_rate": 9.075303527476101e-06, "loss": 0.4097, "step": 2241 }, { "epoch": 1.4129072928568616, "grad_norm": 0.17556019127368927, "learning_rate": 9.057244858972435e-06, "loss": 0.3812, "step": 2242 }, { "epoch": 1.4135376856703834, "grad_norm": 0.17567972838878632, "learning_rate": 9.039199646862952e-06, "loss": 0.4171, "step": 2243 }, { "epoch": 1.4141680784839052, "grad_norm": 0.19437840580940247, "learning_rate": 9.021167909211204e-06, "loss": 0.3437, "step": 2244 }, { "epoch": 1.414798471297427, "grad_norm": 0.1861252784729004, "learning_rate": 9.003149664067198e-06, "loss": 0.412, "step": 2245 }, { "epoch": 1.4154288641109491, "grad_norm": 0.18544334173202515, "learning_rate": 8.985144929467478e-06, "loss": 0.3603, "step": 2246 }, { "epoch": 1.416059256924471, "grad_norm": 0.16569247841835022, "learning_rate": 8.96715372343503e-06, "loss": 0.3239, "step": 2247 }, { "epoch": 1.416689649737993, "grad_norm": 0.20697587728500366, "learning_rate": 8.949176063979324e-06, "loss": 0.4214, "step": 2248 }, { "epoch": 1.417320042551515, "grad_norm": 0.17573077976703644, "learning_rate": 8.931211969096257e-06, "loss": 0.2999, "step": 2249 }, { "epoch": 1.4179504353650367, "grad_norm": 0.16955791413784027, "learning_rate": 8.91326145676816e-06, "loss": 0.342, "step": 2250 }, { "epoch": 1.4185808281785588, "grad_norm": 0.21412505209445953, "learning_rate": 8.89532454496374e-06, "loss": 0.3414, "step": 2251 }, { "epoch": 1.4192112209920806, "grad_norm": 0.17389221489429474, "learning_rate": 8.877401251638119e-06, "loss": 0.3694, "step": 2252 }, { "epoch": 1.4198416138056027, "grad_norm": 0.1807471066713333, "learning_rate": 8.85949159473277e-06, "loss": 0.4271, "step": 2253 }, { "epoch": 1.4204720066191245, "grad_norm": 0.15700294077396393, "learning_rate": 8.841595592175534e-06, "loss": 0.2964, "step": 2254 }, { "epoch": 1.4211023994326464, "grad_norm": 0.19284594058990479, "learning_rate": 8.823713261880553e-06, "loss": 0.4243, "step": 2255 }, { "epoch": 1.4217327922461684, "grad_norm": 0.2265140563249588, "learning_rate": 8.80584462174831e-06, "loss": 0.4123, "step": 2256 }, { "epoch": 1.4223631850596903, "grad_norm": 0.16101014614105225, "learning_rate": 8.787989689665575e-06, "loss": 0.3469, "step": 2257 }, { "epoch": 1.4229935778732123, "grad_norm": 0.14722800254821777, "learning_rate": 8.770148483505399e-06, "loss": 0.3058, "step": 2258 }, { "epoch": 1.4236239706867342, "grad_norm": 0.21112282574176788, "learning_rate": 8.752321021127079e-06, "loss": 0.3872, "step": 2259 }, { "epoch": 1.424254363500256, "grad_norm": 0.23365963995456696, "learning_rate": 8.734507320376179e-06, "loss": 0.4056, "step": 2260 }, { "epoch": 1.424884756313778, "grad_norm": 0.17326068878173828, "learning_rate": 8.716707399084452e-06, "loss": 0.3536, "step": 2261 }, { "epoch": 1.4255151491273, "grad_norm": 0.14267979562282562, "learning_rate": 8.698921275069906e-06, "loss": 0.3063, "step": 2262 }, { "epoch": 1.426145541940822, "grad_norm": 0.22588102519512177, "learning_rate": 8.681148966136691e-06, "loss": 0.3752, "step": 2263 }, { "epoch": 1.4267759347543438, "grad_norm": 0.20228253304958344, "learning_rate": 8.663390490075164e-06, "loss": 0.3556, "step": 2264 }, { "epoch": 1.4274063275678657, "grad_norm": 0.18117617070674896, "learning_rate": 8.6456458646618e-06, "loss": 0.3498, "step": 2265 }, { "epoch": 1.4280367203813875, "grad_norm": 0.14785583317279816, "learning_rate": 8.62791510765924e-06, "loss": 0.2724, "step": 2266 }, { "epoch": 1.4286671131949096, "grad_norm": 0.17025531828403473, "learning_rate": 8.610198236816226e-06, "loss": 0.3058, "step": 2267 }, { "epoch": 1.4292975060084316, "grad_norm": 0.1619190275669098, "learning_rate": 8.592495269867618e-06, "loss": 0.3293, "step": 2268 }, { "epoch": 1.4299278988219535, "grad_norm": 0.14388024806976318, "learning_rate": 8.574806224534324e-06, "loss": 0.3411, "step": 2269 }, { "epoch": 1.4305582916354753, "grad_norm": 0.16051793098449707, "learning_rate": 8.557131118523345e-06, "loss": 0.4002, "step": 2270 }, { "epoch": 1.4311886844489972, "grad_norm": 0.17801295220851898, "learning_rate": 8.539469969527723e-06, "loss": 0.3817, "step": 2271 }, { "epoch": 1.4318190772625192, "grad_norm": 0.18220989406108856, "learning_rate": 8.521822795226532e-06, "loss": 0.333, "step": 2272 }, { "epoch": 1.432449470076041, "grad_norm": 0.16216981410980225, "learning_rate": 8.504189613284834e-06, "loss": 0.3632, "step": 2273 }, { "epoch": 1.4330798628895631, "grad_norm": 0.20460912585258484, "learning_rate": 8.486570441353714e-06, "loss": 0.4964, "step": 2274 }, { "epoch": 1.433710255703085, "grad_norm": 0.20223969221115112, "learning_rate": 8.468965297070215e-06, "loss": 0.3536, "step": 2275 }, { "epoch": 1.4343406485166068, "grad_norm": 0.21065102517604828, "learning_rate": 8.451374198057354e-06, "loss": 0.3481, "step": 2276 }, { "epoch": 1.4349710413301289, "grad_norm": 0.12952826917171478, "learning_rate": 8.433797161924065e-06, "loss": 0.2528, "step": 2277 }, { "epoch": 1.4356014341436507, "grad_norm": 0.15633657574653625, "learning_rate": 8.416234206265222e-06, "loss": 0.3528, "step": 2278 }, { "epoch": 1.4362318269571728, "grad_norm": 0.13650497794151306, "learning_rate": 8.3986853486616e-06, "loss": 0.3002, "step": 2279 }, { "epoch": 1.4368622197706946, "grad_norm": 0.21313124895095825, "learning_rate": 8.381150606679868e-06, "loss": 0.4149, "step": 2280 }, { "epoch": 1.4374926125842165, "grad_norm": 0.16620555520057678, "learning_rate": 8.363629997872549e-06, "loss": 0.3494, "step": 2281 }, { "epoch": 1.4381230053977385, "grad_norm": 0.1616056263446808, "learning_rate": 8.346123539778038e-06, "loss": 0.3469, "step": 2282 }, { "epoch": 1.4387533982112604, "grad_norm": 0.15768451988697052, "learning_rate": 8.328631249920537e-06, "loss": 0.365, "step": 2283 }, { "epoch": 1.4393837910247824, "grad_norm": 0.13965880870819092, "learning_rate": 8.311153145810112e-06, "loss": 0.3409, "step": 2284 }, { "epoch": 1.4400141838383043, "grad_norm": 0.13279767334461212, "learning_rate": 8.293689244942575e-06, "loss": 0.3028, "step": 2285 }, { "epoch": 1.440644576651826, "grad_norm": 0.2043708711862564, "learning_rate": 8.276239564799564e-06, "loss": 0.3897, "step": 2286 }, { "epoch": 1.4412749694653482, "grad_norm": 0.16391275823116302, "learning_rate": 8.258804122848448e-06, "loss": 0.314, "step": 2287 }, { "epoch": 1.44190536227887, "grad_norm": 0.16918890178203583, "learning_rate": 8.241382936542367e-06, "loss": 0.3634, "step": 2288 }, { "epoch": 1.442535755092392, "grad_norm": 0.19429834187030792, "learning_rate": 8.223976023320185e-06, "loss": 0.3305, "step": 2289 }, { "epoch": 1.443166147905914, "grad_norm": 0.15197838842868805, "learning_rate": 8.206583400606478e-06, "loss": 0.2995, "step": 2290 }, { "epoch": 1.4437965407194358, "grad_norm": 0.1665520817041397, "learning_rate": 8.189205085811502e-06, "loss": 0.3707, "step": 2291 }, { "epoch": 1.4444269335329576, "grad_norm": 0.16098156571388245, "learning_rate": 8.171841096331216e-06, "loss": 0.3067, "step": 2292 }, { "epoch": 1.4450573263464797, "grad_norm": 0.1439000517129898, "learning_rate": 8.154491449547222e-06, "loss": 0.3317, "step": 2293 }, { "epoch": 1.4456877191600015, "grad_norm": 0.175605908036232, "learning_rate": 8.13715616282678e-06, "loss": 0.4274, "step": 2294 }, { "epoch": 1.4463181119735236, "grad_norm": 0.17509125173091888, "learning_rate": 8.119835253522749e-06, "loss": 0.4218, "step": 2295 }, { "epoch": 1.4469485047870454, "grad_norm": 0.1781781017780304, "learning_rate": 8.102528738973618e-06, "loss": 0.3294, "step": 2296 }, { "epoch": 1.4475788976005672, "grad_norm": 0.19787555932998657, "learning_rate": 8.085236636503464e-06, "loss": 0.4119, "step": 2297 }, { "epoch": 1.4482092904140893, "grad_norm": 0.1802581250667572, "learning_rate": 8.067958963421942e-06, "loss": 0.4097, "step": 2298 }, { "epoch": 1.4488396832276111, "grad_norm": 0.19002848863601685, "learning_rate": 8.050695737024237e-06, "loss": 0.3125, "step": 2299 }, { "epoch": 1.4494700760411332, "grad_norm": 0.19523286819458008, "learning_rate": 8.033446974591107e-06, "loss": 0.3657, "step": 2300 }, { "epoch": 1.450100468854655, "grad_norm": 0.15178027749061584, "learning_rate": 8.016212693388793e-06, "loss": 0.3377, "step": 2301 }, { "epoch": 1.4507308616681769, "grad_norm": 0.1866513192653656, "learning_rate": 7.99899291066909e-06, "loss": 0.4018, "step": 2302 }, { "epoch": 1.451361254481699, "grad_norm": 0.1409262716770172, "learning_rate": 7.981787643669234e-06, "loss": 0.3302, "step": 2303 }, { "epoch": 1.4519916472952208, "grad_norm": 0.14522545039653778, "learning_rate": 7.964596909611959e-06, "loss": 0.3362, "step": 2304 }, { "epoch": 1.4526220401087429, "grad_norm": 0.14141473174095154, "learning_rate": 7.94742072570543e-06, "loss": 0.3365, "step": 2305 }, { "epoch": 1.4532524329222647, "grad_norm": 0.1505013406276703, "learning_rate": 7.930259109143259e-06, "loss": 0.3251, "step": 2306 }, { "epoch": 1.4538828257357865, "grad_norm": 0.14216575026512146, "learning_rate": 7.91311207710448e-06, "loss": 0.3438, "step": 2307 }, { "epoch": 1.4545132185493086, "grad_norm": 0.148826003074646, "learning_rate": 7.895979646753533e-06, "loss": 0.3608, "step": 2308 }, { "epoch": 1.4551436113628304, "grad_norm": 0.19999346137046814, "learning_rate": 7.878861835240207e-06, "loss": 0.3857, "step": 2309 }, { "epoch": 1.4557740041763525, "grad_norm": 0.19859881699085236, "learning_rate": 7.861758659699695e-06, "loss": 0.3852, "step": 2310 }, { "epoch": 1.4564043969898743, "grad_norm": 0.1679404079914093, "learning_rate": 7.844670137252523e-06, "loss": 0.3958, "step": 2311 }, { "epoch": 1.4570347898033962, "grad_norm": 0.19567959010601044, "learning_rate": 7.827596285004562e-06, "loss": 0.3816, "step": 2312 }, { "epoch": 1.4576651826169182, "grad_norm": 0.16922706365585327, "learning_rate": 7.810537120046968e-06, "loss": 0.3706, "step": 2313 }, { "epoch": 1.45829557543044, "grad_norm": 0.15642282366752625, "learning_rate": 7.793492659456228e-06, "loss": 0.3059, "step": 2314 }, { "epoch": 1.4589259682439621, "grad_norm": 0.15599018335342407, "learning_rate": 7.77646292029409e-06, "loss": 0.3399, "step": 2315 }, { "epoch": 1.459556361057484, "grad_norm": 0.19208070635795593, "learning_rate": 7.75944791960758e-06, "loss": 0.378, "step": 2316 }, { "epoch": 1.4601867538710058, "grad_norm": 0.17060023546218872, "learning_rate": 7.742447674428948e-06, "loss": 0.3805, "step": 2317 }, { "epoch": 1.4608171466845277, "grad_norm": 0.18306058645248413, "learning_rate": 7.725462201775696e-06, "loss": 0.3457, "step": 2318 }, { "epoch": 1.4614475394980497, "grad_norm": 0.17282027006149292, "learning_rate": 7.708491518650531e-06, "loss": 0.3346, "step": 2319 }, { "epoch": 1.4620779323115716, "grad_norm": 0.22159677743911743, "learning_rate": 7.69153564204136e-06, "loss": 0.4076, "step": 2320 }, { "epoch": 1.4627083251250936, "grad_norm": 0.1395348459482193, "learning_rate": 7.674594588921249e-06, "loss": 0.34, "step": 2321 }, { "epoch": 1.4633387179386155, "grad_norm": 0.1594412624835968, "learning_rate": 7.657668376248456e-06, "loss": 0.3198, "step": 2322 }, { "epoch": 1.4639691107521373, "grad_norm": 0.18476234376430511, "learning_rate": 7.64075702096634e-06, "loss": 0.3493, "step": 2323 }, { "epoch": 1.4645995035656594, "grad_norm": 0.14744731783866882, "learning_rate": 7.623860540003451e-06, "loss": 0.4012, "step": 2324 }, { "epoch": 1.4652298963791812, "grad_norm": 0.15878044068813324, "learning_rate": 7.6069789502733835e-06, "loss": 0.3396, "step": 2325 }, { "epoch": 1.4658602891927033, "grad_norm": 0.16711211204528809, "learning_rate": 7.590112268674876e-06, "loss": 0.3348, "step": 2326 }, { "epoch": 1.4664906820062251, "grad_norm": 0.14225564897060394, "learning_rate": 7.5732605120917064e-06, "loss": 0.3227, "step": 2327 }, { "epoch": 1.467121074819747, "grad_norm": 0.14761362969875336, "learning_rate": 7.556423697392737e-06, "loss": 0.372, "step": 2328 }, { "epoch": 1.467751467633269, "grad_norm": 0.1955820918083191, "learning_rate": 7.539601841431863e-06, "loss": 0.3537, "step": 2329 }, { "epoch": 1.4683818604467909, "grad_norm": 0.14523617923259735, "learning_rate": 7.5227949610480175e-06, "loss": 0.2896, "step": 2330 }, { "epoch": 1.469012253260313, "grad_norm": 0.13613255321979523, "learning_rate": 7.506003073065118e-06, "loss": 0.2918, "step": 2331 }, { "epoch": 1.4696426460738348, "grad_norm": 0.1531195491552353, "learning_rate": 7.489226194292096e-06, "loss": 0.3214, "step": 2332 }, { "epoch": 1.4702730388873566, "grad_norm": 0.12154468894004822, "learning_rate": 7.472464341522855e-06, "loss": 0.3022, "step": 2333 }, { "epoch": 1.4709034317008787, "grad_norm": 0.16331082582473755, "learning_rate": 7.455717531536259e-06, "loss": 0.3501, "step": 2334 }, { "epoch": 1.4715338245144005, "grad_norm": 0.15346533060073853, "learning_rate": 7.438985781096092e-06, "loss": 0.2929, "step": 2335 }, { "epoch": 1.4721642173279226, "grad_norm": 0.15735021233558655, "learning_rate": 7.422269106951096e-06, "loss": 0.3296, "step": 2336 }, { "epoch": 1.4727946101414444, "grad_norm": 0.150700643658638, "learning_rate": 7.405567525834898e-06, "loss": 0.323, "step": 2337 }, { "epoch": 1.4734250029549663, "grad_norm": 0.21111014485359192, "learning_rate": 7.3888810544660365e-06, "loss": 0.3881, "step": 2338 }, { "epoch": 1.4740553957684883, "grad_norm": 0.1844063103199005, "learning_rate": 7.372209709547894e-06, "loss": 0.3513, "step": 2339 }, { "epoch": 1.4746857885820102, "grad_norm": 0.1757941097021103, "learning_rate": 7.355553507768742e-06, "loss": 0.3798, "step": 2340 }, { "epoch": 1.4753161813955322, "grad_norm": 0.15453548729419708, "learning_rate": 7.338912465801681e-06, "loss": 0.3721, "step": 2341 }, { "epoch": 1.475946574209054, "grad_norm": 0.1763160228729248, "learning_rate": 7.322286600304643e-06, "loss": 0.4228, "step": 2342 }, { "epoch": 1.476576967022576, "grad_norm": 0.15534138679504395, "learning_rate": 7.305675927920348e-06, "loss": 0.411, "step": 2343 }, { "epoch": 1.4772073598360977, "grad_norm": 0.17668470740318298, "learning_rate": 7.289080465276335e-06, "loss": 0.3669, "step": 2344 }, { "epoch": 1.4778377526496198, "grad_norm": 0.17460039258003235, "learning_rate": 7.272500228984886e-06, "loss": 0.441, "step": 2345 }, { "epoch": 1.4784681454631416, "grad_norm": 0.20688629150390625, "learning_rate": 7.255935235643084e-06, "loss": 0.424, "step": 2346 }, { "epoch": 1.4790985382766637, "grad_norm": 0.15839360654354095, "learning_rate": 7.239385501832711e-06, "loss": 0.3268, "step": 2347 }, { "epoch": 1.4797289310901856, "grad_norm": 0.1791486293077469, "learning_rate": 7.222851044120307e-06, "loss": 0.3273, "step": 2348 }, { "epoch": 1.4803593239037074, "grad_norm": 0.1676637977361679, "learning_rate": 7.2063318790570875e-06, "loss": 0.3515, "step": 2349 }, { "epoch": 1.4809897167172295, "grad_norm": 0.1364106982946396, "learning_rate": 7.189828023178987e-06, "loss": 0.2846, "step": 2350 }, { "epoch": 1.4816201095307513, "grad_norm": 0.15322741866111755, "learning_rate": 7.173339493006609e-06, "loss": 0.3028, "step": 2351 }, { "epoch": 1.4822505023442734, "grad_norm": 0.18440741300582886, "learning_rate": 7.156866305045218e-06, "loss": 0.3662, "step": 2352 }, { "epoch": 1.4828808951577952, "grad_norm": 0.17829926311969757, "learning_rate": 7.1404084757847004e-06, "loss": 0.371, "step": 2353 }, { "epoch": 1.483511287971317, "grad_norm": 0.14179544150829315, "learning_rate": 7.12396602169959e-06, "loss": 0.3318, "step": 2354 }, { "epoch": 1.484141680784839, "grad_norm": 0.16233308613300323, "learning_rate": 7.107538959249027e-06, "loss": 0.3349, "step": 2355 }, { "epoch": 1.484772073598361, "grad_norm": 0.15612120926380157, "learning_rate": 7.091127304876745e-06, "loss": 0.2996, "step": 2356 }, { "epoch": 1.485402466411883, "grad_norm": 0.22952015697956085, "learning_rate": 7.074731075011037e-06, "loss": 0.3868, "step": 2357 }, { "epoch": 1.4860328592254048, "grad_norm": 0.1616676300764084, "learning_rate": 7.058350286064775e-06, "loss": 0.3905, "step": 2358 }, { "epoch": 1.4866632520389267, "grad_norm": 0.15389202535152435, "learning_rate": 7.041984954435369e-06, "loss": 0.3495, "step": 2359 }, { "epoch": 1.4872936448524487, "grad_norm": 0.1607460081577301, "learning_rate": 7.025635096504761e-06, "loss": 0.2944, "step": 2360 }, { "epoch": 1.4879240376659706, "grad_norm": 0.18009735643863678, "learning_rate": 7.00930072863938e-06, "loss": 0.3959, "step": 2361 }, { "epoch": 1.4885544304794927, "grad_norm": 0.189145028591156, "learning_rate": 6.992981867190189e-06, "loss": 0.3319, "step": 2362 }, { "epoch": 1.4891848232930145, "grad_norm": 0.16663429141044617, "learning_rate": 6.976678528492578e-06, "loss": 0.375, "step": 2363 }, { "epoch": 1.4898152161065363, "grad_norm": 0.1959405094385147, "learning_rate": 6.960390728866458e-06, "loss": 0.344, "step": 2364 }, { "epoch": 1.4904456089200582, "grad_norm": 0.12720222771167755, "learning_rate": 6.944118484616134e-06, "loss": 0.3095, "step": 2365 }, { "epoch": 1.4910760017335802, "grad_norm": 0.15662875771522522, "learning_rate": 6.927861812030374e-06, "loss": 0.3688, "step": 2366 }, { "epoch": 1.4917063945471023, "grad_norm": 0.17900823056697845, "learning_rate": 6.911620727382329e-06, "loss": 0.3871, "step": 2367 }, { "epoch": 1.4923367873606241, "grad_norm": 0.17844952642917633, "learning_rate": 6.89539524692957e-06, "loss": 0.3272, "step": 2368 }, { "epoch": 1.492967180174146, "grad_norm": 0.17715677618980408, "learning_rate": 6.8791853869140395e-06, "loss": 0.3838, "step": 2369 }, { "epoch": 1.4935975729876678, "grad_norm": 0.18059316277503967, "learning_rate": 6.862991163562051e-06, "loss": 0.3856, "step": 2370 }, { "epoch": 1.4942279658011899, "grad_norm": 0.15726926922798157, "learning_rate": 6.846812593084246e-06, "loss": 0.4097, "step": 2371 }, { "epoch": 1.4948583586147117, "grad_norm": 0.15653415024280548, "learning_rate": 6.830649691675614e-06, "loss": 0.3041, "step": 2372 }, { "epoch": 1.4954887514282338, "grad_norm": 0.22607919573783875, "learning_rate": 6.81450247551546e-06, "loss": 0.3754, "step": 2373 }, { "epoch": 1.4961191442417556, "grad_norm": 0.1733357161283493, "learning_rate": 6.798370960767385e-06, "loss": 0.3802, "step": 2374 }, { "epoch": 1.4967495370552775, "grad_norm": 0.1510259062051773, "learning_rate": 6.78225516357926e-06, "loss": 0.3057, "step": 2375 }, { "epoch": 1.4973799298687995, "grad_norm": 0.17537620663642883, "learning_rate": 6.766155100083238e-06, "loss": 0.3372, "step": 2376 }, { "epoch": 1.4980103226823214, "grad_norm": 0.19684992730617523, "learning_rate": 6.750070786395722e-06, "loss": 0.4168, "step": 2377 }, { "epoch": 1.4986407154958434, "grad_norm": 0.17185229063034058, "learning_rate": 6.734002238617346e-06, "loss": 0.3734, "step": 2378 }, { "epoch": 1.4992711083093653, "grad_norm": 0.18762269616127014, "learning_rate": 6.7179494728329514e-06, "loss": 0.3612, "step": 2379 }, { "epoch": 1.4999015011228871, "grad_norm": 0.17824314534664154, "learning_rate": 6.701912505111595e-06, "loss": 0.3269, "step": 2380 }, { "epoch": 1.5005318939364092, "grad_norm": 0.17873038351535797, "learning_rate": 6.68589135150652e-06, "loss": 0.3581, "step": 2381 }, { "epoch": 1.501162286749931, "grad_norm": 0.15424737334251404, "learning_rate": 6.669886028055138e-06, "loss": 0.3541, "step": 2382 }, { "epoch": 1.501792679563453, "grad_norm": 0.205153688788414, "learning_rate": 6.653896550778998e-06, "loss": 0.3629, "step": 2383 }, { "epoch": 1.502423072376975, "grad_norm": 0.17122958600521088, "learning_rate": 6.637922935683816e-06, "loss": 0.4008, "step": 2384 }, { "epoch": 1.5030534651904968, "grad_norm": 0.19354306161403656, "learning_rate": 6.621965198759393e-06, "loss": 0.3817, "step": 2385 }, { "epoch": 1.5036838580040186, "grad_norm": 0.18343815207481384, "learning_rate": 6.606023355979683e-06, "loss": 0.4202, "step": 2386 }, { "epoch": 1.5043142508175407, "grad_norm": 0.18962843716144562, "learning_rate": 6.5900974233026824e-06, "loss": 0.3502, "step": 2387 }, { "epoch": 1.5049446436310627, "grad_norm": 0.16619205474853516, "learning_rate": 6.574187416670491e-06, "loss": 0.3029, "step": 2388 }, { "epoch": 1.5055750364445846, "grad_norm": 0.17878541350364685, "learning_rate": 6.558293352009264e-06, "loss": 0.3589, "step": 2389 }, { "epoch": 1.5062054292581064, "grad_norm": 0.15167735517024994, "learning_rate": 6.542415245229172e-06, "loss": 0.3249, "step": 2390 }, { "epoch": 1.5068358220716283, "grad_norm": 0.2101573944091797, "learning_rate": 6.526553112224457e-06, "loss": 0.3233, "step": 2391 }, { "epoch": 1.5074662148851503, "grad_norm": 0.17878898978233337, "learning_rate": 6.510706968873328e-06, "loss": 0.3383, "step": 2392 }, { "epoch": 1.5080966076986724, "grad_norm": 0.1626843959093094, "learning_rate": 6.49487683103802e-06, "loss": 0.3771, "step": 2393 }, { "epoch": 1.5087270005121942, "grad_norm": 0.19067011773586273, "learning_rate": 6.4790627145647164e-06, "loss": 0.3521, "step": 2394 }, { "epoch": 1.509357393325716, "grad_norm": 0.17487511038780212, "learning_rate": 6.463264635283584e-06, "loss": 0.3363, "step": 2395 }, { "epoch": 1.509987786139238, "grad_norm": 0.1691151261329651, "learning_rate": 6.447482609008733e-06, "loss": 0.3659, "step": 2396 }, { "epoch": 1.51061817895276, "grad_norm": 0.1458681970834732, "learning_rate": 6.4317166515382125e-06, "loss": 0.3196, "step": 2397 }, { "epoch": 1.511248571766282, "grad_norm": 0.14862768352031708, "learning_rate": 6.415966778653955e-06, "loss": 0.3243, "step": 2398 }, { "epoch": 1.5118789645798039, "grad_norm": 0.18596191704273224, "learning_rate": 6.400233006121826e-06, "loss": 0.3664, "step": 2399 }, { "epoch": 1.5125093573933257, "grad_norm": 0.15921002626419067, "learning_rate": 6.38451534969156e-06, "loss": 0.3127, "step": 2400 }, { "epoch": 1.5125093573933257, "eval_loss": 0.4196794927120209, "eval_runtime": 222.4679, "eval_samples_per_second": 4.495, "eval_steps_per_second": 4.495, "step": 2400 }, { "epoch": 1.5131397502068475, "grad_norm": 0.1784636527299881, "learning_rate": 6.368813825096767e-06, "loss": 0.3868, "step": 2401 }, { "epoch": 1.5137701430203696, "grad_norm": 0.1569874882698059, "learning_rate": 6.353128448054886e-06, "loss": 0.3194, "step": 2402 }, { "epoch": 1.5144005358338914, "grad_norm": 0.1486312448978424, "learning_rate": 6.33745923426722e-06, "loss": 0.3188, "step": 2403 }, { "epoch": 1.5150309286474135, "grad_norm": 0.18690063059329987, "learning_rate": 6.321806199418879e-06, "loss": 0.4137, "step": 2404 }, { "epoch": 1.5156613214609354, "grad_norm": 0.17441874742507935, "learning_rate": 6.306169359178786e-06, "loss": 0.3199, "step": 2405 }, { "epoch": 1.5162917142744572, "grad_norm": 0.16934332251548767, "learning_rate": 6.290548729199634e-06, "loss": 0.3482, "step": 2406 }, { "epoch": 1.516922107087979, "grad_norm": 0.174238383769989, "learning_rate": 6.274944325117907e-06, "loss": 0.2894, "step": 2407 }, { "epoch": 1.517552499901501, "grad_norm": 0.17666368186473846, "learning_rate": 6.2593561625538416e-06, "loss": 0.393, "step": 2408 }, { "epoch": 1.5181828927150232, "grad_norm": 0.16942931711673737, "learning_rate": 6.243784257111422e-06, "loss": 0.3233, "step": 2409 }, { "epoch": 1.518813285528545, "grad_norm": 0.15660184621810913, "learning_rate": 6.228228624378343e-06, "loss": 0.3481, "step": 2410 }, { "epoch": 1.5194436783420668, "grad_norm": 0.18719768524169922, "learning_rate": 6.212689279926028e-06, "loss": 0.3512, "step": 2411 }, { "epoch": 1.5200740711555887, "grad_norm": 0.15997354686260223, "learning_rate": 6.197166239309573e-06, "loss": 0.371, "step": 2412 }, { "epoch": 1.5207044639691107, "grad_norm": 0.15153126418590546, "learning_rate": 6.181659518067792e-06, "loss": 0.3215, "step": 2413 }, { "epoch": 1.5213348567826328, "grad_norm": 0.17869645357131958, "learning_rate": 6.166169131723122e-06, "loss": 0.3497, "step": 2414 }, { "epoch": 1.5219652495961546, "grad_norm": 0.16944412887096405, "learning_rate": 6.150695095781676e-06, "loss": 0.3523, "step": 2415 }, { "epoch": 1.5225956424096765, "grad_norm": 0.186680868268013, "learning_rate": 6.135237425733178e-06, "loss": 0.3314, "step": 2416 }, { "epoch": 1.5232260352231983, "grad_norm": 0.23459716141223907, "learning_rate": 6.119796137050989e-06, "loss": 0.4228, "step": 2417 }, { "epoch": 1.5238564280367204, "grad_norm": 0.14739306271076202, "learning_rate": 6.1043712451920615e-06, "loss": 0.3521, "step": 2418 }, { "epoch": 1.5244868208502425, "grad_norm": 0.16764120757579803, "learning_rate": 6.088962765596949e-06, "loss": 0.383, "step": 2419 }, { "epoch": 1.5251172136637643, "grad_norm": 0.18274492025375366, "learning_rate": 6.073570713689747e-06, "loss": 0.3351, "step": 2420 }, { "epoch": 1.5257476064772861, "grad_norm": 0.17499703168869019, "learning_rate": 6.0581951048781325e-06, "loss": 0.3788, "step": 2421 }, { "epoch": 1.526377999290808, "grad_norm": 0.20380593836307526, "learning_rate": 6.042835954553315e-06, "loss": 0.3575, "step": 2422 }, { "epoch": 1.52700839210433, "grad_norm": 0.2026897519826889, "learning_rate": 6.027493278090036e-06, "loss": 0.3352, "step": 2423 }, { "epoch": 1.527638784917852, "grad_norm": 0.15653148293495178, "learning_rate": 6.012167090846522e-06, "loss": 0.3743, "step": 2424 }, { "epoch": 1.528269177731374, "grad_norm": 0.19078733026981354, "learning_rate": 5.996857408164517e-06, "loss": 0.3306, "step": 2425 }, { "epoch": 1.5288995705448958, "grad_norm": 0.18270157277584076, "learning_rate": 5.98156424536924e-06, "loss": 0.3914, "step": 2426 }, { "epoch": 1.5295299633584176, "grad_norm": 0.17126402258872986, "learning_rate": 5.966287617769377e-06, "loss": 0.3557, "step": 2427 }, { "epoch": 1.5301603561719397, "grad_norm": 0.17806404829025269, "learning_rate": 5.95102754065704e-06, "loss": 0.3603, "step": 2428 }, { "epoch": 1.5307907489854615, "grad_norm": 0.15630225837230682, "learning_rate": 5.935784029307802e-06, "loss": 0.3428, "step": 2429 }, { "epoch": 1.5314211417989836, "grad_norm": 0.1432841718196869, "learning_rate": 5.920557098980622e-06, "loss": 0.3472, "step": 2430 }, { "epoch": 1.5320515346125054, "grad_norm": 0.1667039692401886, "learning_rate": 5.905346764917904e-06, "loss": 0.3284, "step": 2431 }, { "epoch": 1.5326819274260273, "grad_norm": 0.15188711881637573, "learning_rate": 5.890153042345397e-06, "loss": 0.3359, "step": 2432 }, { "epoch": 1.533312320239549, "grad_norm": 0.18007594347000122, "learning_rate": 5.8749759464722525e-06, "loss": 0.3996, "step": 2433 }, { "epoch": 1.5339427130530712, "grad_norm": 0.17693552374839783, "learning_rate": 5.85981549249095e-06, "loss": 0.3524, "step": 2434 }, { "epoch": 1.5345731058665932, "grad_norm": 0.18030700087547302, "learning_rate": 5.844671695577333e-06, "loss": 0.3913, "step": 2435 }, { "epoch": 1.535203498680115, "grad_norm": 0.1581149399280548, "learning_rate": 5.829544570890562e-06, "loss": 0.3069, "step": 2436 }, { "epoch": 1.535833891493637, "grad_norm": 0.18180450797080994, "learning_rate": 5.8144341335731175e-06, "loss": 0.3135, "step": 2437 }, { "epoch": 1.5364642843071588, "grad_norm": 0.15132294595241547, "learning_rate": 5.799340398750756e-06, "loss": 0.2986, "step": 2438 }, { "epoch": 1.5370946771206808, "grad_norm": 0.16063569486141205, "learning_rate": 5.784263381532531e-06, "loss": 0.3967, "step": 2439 }, { "epoch": 1.5377250699342029, "grad_norm": 0.16005367040634155, "learning_rate": 5.7692030970107585e-06, "loss": 0.2788, "step": 2440 }, { "epoch": 1.5383554627477247, "grad_norm": 0.16369113326072693, "learning_rate": 5.7541595602610096e-06, "loss": 0.3814, "step": 2441 }, { "epoch": 1.5389858555612466, "grad_norm": 0.18679864704608917, "learning_rate": 5.73913278634207e-06, "loss": 0.3903, "step": 2442 }, { "epoch": 1.5396162483747684, "grad_norm": 0.18288299441337585, "learning_rate": 5.724122790295966e-06, "loss": 0.3872, "step": 2443 }, { "epoch": 1.5402466411882905, "grad_norm": 0.15036456286907196, "learning_rate": 5.709129587147923e-06, "loss": 0.3277, "step": 2444 }, { "epoch": 1.5408770340018125, "grad_norm": 0.18015111982822418, "learning_rate": 5.694153191906366e-06, "loss": 0.3685, "step": 2445 }, { "epoch": 1.5415074268153344, "grad_norm": 0.1756429523229599, "learning_rate": 5.67919361956287e-06, "loss": 0.3749, "step": 2446 }, { "epoch": 1.5421378196288562, "grad_norm": 0.20075921714305878, "learning_rate": 5.66425088509219e-06, "loss": 0.401, "step": 2447 }, { "epoch": 1.542768212442378, "grad_norm": 0.19082263112068176, "learning_rate": 5.649325003452224e-06, "loss": 0.3243, "step": 2448 }, { "epoch": 1.5433986052559001, "grad_norm": 0.19560390710830688, "learning_rate": 5.634415989584001e-06, "loss": 0.3659, "step": 2449 }, { "epoch": 1.544028998069422, "grad_norm": 0.15874971449375153, "learning_rate": 5.619523858411653e-06, "loss": 0.3534, "step": 2450 }, { "epoch": 1.544659390882944, "grad_norm": 0.18899878859519958, "learning_rate": 5.604648624842429e-06, "loss": 0.4562, "step": 2451 }, { "epoch": 1.5452897836964659, "grad_norm": 0.151338592171669, "learning_rate": 5.5897903037666405e-06, "loss": 0.3257, "step": 2452 }, { "epoch": 1.5459201765099877, "grad_norm": 0.1475290209054947, "learning_rate": 5.574948910057708e-06, "loss": 0.3249, "step": 2453 }, { "epoch": 1.5465505693235098, "grad_norm": 0.1572064757347107, "learning_rate": 5.56012445857206e-06, "loss": 0.3617, "step": 2454 }, { "epoch": 1.5471809621370316, "grad_norm": 0.13381150364875793, "learning_rate": 5.5453169641492085e-06, "loss": 0.3332, "step": 2455 }, { "epoch": 1.5478113549505537, "grad_norm": 0.2090974599123001, "learning_rate": 5.530526441611654e-06, "loss": 0.3652, "step": 2456 }, { "epoch": 1.5484417477640755, "grad_norm": 0.16606074571609497, "learning_rate": 5.515752905764933e-06, "loss": 0.3139, "step": 2457 }, { "epoch": 1.5490721405775973, "grad_norm": 0.20185938477516174, "learning_rate": 5.500996371397571e-06, "loss": 0.3398, "step": 2458 }, { "epoch": 1.5497025333911192, "grad_norm": 0.1493961215019226, "learning_rate": 5.486256853281081e-06, "loss": 0.3541, "step": 2459 }, { "epoch": 1.5503329262046412, "grad_norm": 0.20136871933937073, "learning_rate": 5.471534366169921e-06, "loss": 0.3318, "step": 2460 }, { "epoch": 1.5509633190181633, "grad_norm": 0.1520240306854248, "learning_rate": 5.456828924801526e-06, "loss": 0.3053, "step": 2461 }, { "epoch": 1.5515937118316852, "grad_norm": 0.14991526305675507, "learning_rate": 5.442140543896254e-06, "loss": 0.375, "step": 2462 }, { "epoch": 1.552224104645207, "grad_norm": 0.16552947461605072, "learning_rate": 5.427469238157397e-06, "loss": 0.3324, "step": 2463 }, { "epoch": 1.5528544974587288, "grad_norm": 0.15674886107444763, "learning_rate": 5.412815022271133e-06, "loss": 0.3353, "step": 2464 }, { "epoch": 1.553484890272251, "grad_norm": 0.19823868572711945, "learning_rate": 5.398177910906554e-06, "loss": 0.4067, "step": 2465 }, { "epoch": 1.554115283085773, "grad_norm": 0.16715441644191742, "learning_rate": 5.383557918715618e-06, "loss": 0.3385, "step": 2466 }, { "epoch": 1.5547456758992948, "grad_norm": 0.1568594127893448, "learning_rate": 5.3689550603331616e-06, "loss": 0.3172, "step": 2467 }, { "epoch": 1.5553760687128166, "grad_norm": 0.13004900515079498, "learning_rate": 5.354369350376844e-06, "loss": 0.3248, "step": 2468 }, { "epoch": 1.5560064615263385, "grad_norm": 0.16289322078227997, "learning_rate": 5.3398008034471815e-06, "loss": 0.3251, "step": 2469 }, { "epoch": 1.5566368543398605, "grad_norm": 0.17474089562892914, "learning_rate": 5.325249434127501e-06, "loss": 0.318, "step": 2470 }, { "epoch": 1.5572672471533826, "grad_norm": 0.15705354511737823, "learning_rate": 5.310715256983943e-06, "loss": 0.3237, "step": 2471 }, { "epoch": 1.5578976399669044, "grad_norm": 0.18768437206745148, "learning_rate": 5.29619828656542e-06, "loss": 0.3563, "step": 2472 }, { "epoch": 1.5585280327804263, "grad_norm": 0.14832916855812073, "learning_rate": 5.281698537403641e-06, "loss": 0.3264, "step": 2473 }, { "epoch": 1.5591584255939481, "grad_norm": 0.19431886076927185, "learning_rate": 5.267216024013048e-06, "loss": 0.3663, "step": 2474 }, { "epoch": 1.5597888184074702, "grad_norm": 0.18100936710834503, "learning_rate": 5.252750760890875e-06, "loss": 0.374, "step": 2475 }, { "epoch": 1.560419211220992, "grad_norm": 0.18944475054740906, "learning_rate": 5.23830276251704e-06, "loss": 0.3784, "step": 2476 }, { "epoch": 1.561049604034514, "grad_norm": 0.20163822174072266, "learning_rate": 5.2238720433542194e-06, "loss": 0.3453, "step": 2477 }, { "epoch": 1.561679996848036, "grad_norm": 0.13501304388046265, "learning_rate": 5.209458617847754e-06, "loss": 0.3022, "step": 2478 }, { "epoch": 1.5623103896615578, "grad_norm": 0.18465079367160797, "learning_rate": 5.195062500425703e-06, "loss": 0.3521, "step": 2479 }, { "epoch": 1.5629407824750798, "grad_norm": 0.17045170068740845, "learning_rate": 5.180683705498789e-06, "loss": 0.3355, "step": 2480 }, { "epoch": 1.5635711752886017, "grad_norm": 0.1863599270582199, "learning_rate": 5.166322247460405e-06, "loss": 0.3979, "step": 2481 }, { "epoch": 1.5642015681021237, "grad_norm": 0.19586798548698425, "learning_rate": 5.151978140686565e-06, "loss": 0.3579, "step": 2482 }, { "epoch": 1.5648319609156456, "grad_norm": 0.14061041176319122, "learning_rate": 5.137651399535935e-06, "loss": 0.3152, "step": 2483 }, { "epoch": 1.5654623537291674, "grad_norm": 0.17973418533802032, "learning_rate": 5.123342038349792e-06, "loss": 0.3428, "step": 2484 }, { "epoch": 1.5660927465426893, "grad_norm": 0.13237828016281128, "learning_rate": 5.1090500714520214e-06, "loss": 0.2866, "step": 2485 }, { "epoch": 1.5667231393562113, "grad_norm": 0.1529563069343567, "learning_rate": 5.094775513149082e-06, "loss": 0.3204, "step": 2486 }, { "epoch": 1.5673535321697334, "grad_norm": 0.18000224232673645, "learning_rate": 5.08051837773002e-06, "loss": 0.3938, "step": 2487 }, { "epoch": 1.5679839249832552, "grad_norm": 0.1832749843597412, "learning_rate": 5.0662786794664335e-06, "loss": 0.383, "step": 2488 }, { "epoch": 1.568614317796777, "grad_norm": 0.14123855531215668, "learning_rate": 5.052056432612477e-06, "loss": 0.3813, "step": 2489 }, { "epoch": 1.569244710610299, "grad_norm": 0.22280830144882202, "learning_rate": 5.037851651404814e-06, "loss": 0.3823, "step": 2490 }, { "epoch": 1.569875103423821, "grad_norm": 0.16046370565891266, "learning_rate": 5.023664350062651e-06, "loss": 0.4011, "step": 2491 }, { "epoch": 1.570505496237343, "grad_norm": 0.15080615878105164, "learning_rate": 5.009494542787663e-06, "loss": 0.3499, "step": 2492 }, { "epoch": 1.5711358890508649, "grad_norm": 0.1666601300239563, "learning_rate": 4.995342243764061e-06, "loss": 0.3163, "step": 2493 }, { "epoch": 1.5717662818643867, "grad_norm": 0.18129931390285492, "learning_rate": 4.981207467158486e-06, "loss": 0.3544, "step": 2494 }, { "epoch": 1.5723966746779086, "grad_norm": 0.1465536504983902, "learning_rate": 4.967090227120064e-06, "loss": 0.2443, "step": 2495 }, { "epoch": 1.5730270674914306, "grad_norm": 0.17997393012046814, "learning_rate": 4.952990537780347e-06, "loss": 0.367, "step": 2496 }, { "epoch": 1.5736574603049527, "grad_norm": 0.18684880435466766, "learning_rate": 4.938908413253338e-06, "loss": 0.4089, "step": 2497 }, { "epoch": 1.5742878531184745, "grad_norm": 0.1652742475271225, "learning_rate": 4.924843867635449e-06, "loss": 0.357, "step": 2498 }, { "epoch": 1.5749182459319964, "grad_norm": 0.1724511831998825, "learning_rate": 4.910796915005503e-06, "loss": 0.3158, "step": 2499 }, { "epoch": 1.5755486387455182, "grad_norm": 0.16976529359817505, "learning_rate": 4.896767569424689e-06, "loss": 0.2956, "step": 2500 }, { "epoch": 1.5761790315590403, "grad_norm": 0.14916852116584778, "learning_rate": 4.8827558449365935e-06, "loss": 0.3085, "step": 2501 }, { "epoch": 1.576809424372562, "grad_norm": 0.16239811480045319, "learning_rate": 4.868761755567159e-06, "loss": 0.353, "step": 2502 }, { "epoch": 1.5774398171860842, "grad_norm": 0.17747247219085693, "learning_rate": 4.854785315324677e-06, "loss": 0.3268, "step": 2503 }, { "epoch": 1.578070209999606, "grad_norm": 0.16685880720615387, "learning_rate": 4.8408265381997615e-06, "loss": 0.341, "step": 2504 }, { "epoch": 1.5787006028131279, "grad_norm": 0.14966802299022675, "learning_rate": 4.826885438165352e-06, "loss": 0.3725, "step": 2505 }, { "epoch": 1.5793309956266497, "grad_norm": 0.18024073541164398, "learning_rate": 4.812962029176696e-06, "loss": 0.3488, "step": 2506 }, { "epoch": 1.5799613884401718, "grad_norm": 0.17087945342063904, "learning_rate": 4.799056325171336e-06, "loss": 0.3603, "step": 2507 }, { "epoch": 1.5805917812536938, "grad_norm": 0.16011197865009308, "learning_rate": 4.7851683400690724e-06, "loss": 0.3461, "step": 2508 }, { "epoch": 1.5812221740672157, "grad_norm": 0.1560867875814438, "learning_rate": 4.771298087771987e-06, "loss": 0.3271, "step": 2509 }, { "epoch": 1.5818525668807375, "grad_norm": 0.14889803528785706, "learning_rate": 4.757445582164402e-06, "loss": 0.3311, "step": 2510 }, { "epoch": 1.5824829596942593, "grad_norm": 0.19574882090091705, "learning_rate": 4.743610837112888e-06, "loss": 0.3039, "step": 2511 }, { "epoch": 1.5831133525077814, "grad_norm": 0.15672369301319122, "learning_rate": 4.7297938664662124e-06, "loss": 0.35, "step": 2512 }, { "epoch": 1.5837437453213035, "grad_norm": 0.17589780688285828, "learning_rate": 4.715994684055378e-06, "loss": 0.3492, "step": 2513 }, { "epoch": 1.5843741381348253, "grad_norm": 0.17307446897029877, "learning_rate": 4.702213303693549e-06, "loss": 0.3479, "step": 2514 }, { "epoch": 1.5850045309483471, "grad_norm": 0.136874258518219, "learning_rate": 4.6884497391761125e-06, "loss": 0.3115, "step": 2515 }, { "epoch": 1.585634923761869, "grad_norm": 0.17769919335842133, "learning_rate": 4.67470400428058e-06, "loss": 0.3022, "step": 2516 }, { "epoch": 1.586265316575391, "grad_norm": 0.14690563082695007, "learning_rate": 4.660976112766643e-06, "loss": 0.3269, "step": 2517 }, { "epoch": 1.5868957093889131, "grad_norm": 0.17333537340164185, "learning_rate": 4.647266078376113e-06, "loss": 0.3409, "step": 2518 }, { "epoch": 1.587526102202435, "grad_norm": 0.1596616804599762, "learning_rate": 4.633573914832937e-06, "loss": 0.3709, "step": 2519 }, { "epoch": 1.5881564950159568, "grad_norm": 0.1438072770833969, "learning_rate": 4.619899635843173e-06, "loss": 0.291, "step": 2520 }, { "epoch": 1.5887868878294786, "grad_norm": 0.1735801249742508, "learning_rate": 4.606243255094979e-06, "loss": 0.3477, "step": 2521 }, { "epoch": 1.5894172806430007, "grad_norm": 0.1408032774925232, "learning_rate": 4.592604786258578e-06, "loss": 0.3318, "step": 2522 }, { "epoch": 1.5900476734565228, "grad_norm": 0.16737176477909088, "learning_rate": 4.578984242986286e-06, "loss": 0.3742, "step": 2523 }, { "epoch": 1.5906780662700446, "grad_norm": 0.16467176377773285, "learning_rate": 4.565381638912463e-06, "loss": 0.3153, "step": 2524 }, { "epoch": 1.5913084590835664, "grad_norm": 0.1506030112504959, "learning_rate": 4.551796987653522e-06, "loss": 0.2828, "step": 2525 }, { "epoch": 1.5919388518970883, "grad_norm": 0.16982591152191162, "learning_rate": 4.538230302807883e-06, "loss": 0.3579, "step": 2526 }, { "epoch": 1.5925692447106103, "grad_norm": 0.1681368499994278, "learning_rate": 4.524681597956003e-06, "loss": 0.3045, "step": 2527 }, { "epoch": 1.5931996375241322, "grad_norm": 0.15242716670036316, "learning_rate": 4.511150886660335e-06, "loss": 0.3063, "step": 2528 }, { "epoch": 1.5938300303376542, "grad_norm": 0.1426815241575241, "learning_rate": 4.497638182465323e-06, "loss": 0.3325, "step": 2529 }, { "epoch": 1.594460423151176, "grad_norm": 0.15260830521583557, "learning_rate": 4.484143498897367e-06, "loss": 0.2992, "step": 2530 }, { "epoch": 1.595090815964698, "grad_norm": 0.16513516008853912, "learning_rate": 4.470666849464857e-06, "loss": 0.3382, "step": 2531 }, { "epoch": 1.5957212087782198, "grad_norm": 0.16228486597537994, "learning_rate": 4.4572082476581e-06, "loss": 0.3705, "step": 2532 }, { "epoch": 1.5963516015917418, "grad_norm": 0.15340639650821686, "learning_rate": 4.443767706949372e-06, "loss": 0.3295, "step": 2533 }, { "epoch": 1.596981994405264, "grad_norm": 0.1555001437664032, "learning_rate": 4.430345240792837e-06, "loss": 0.349, "step": 2534 }, { "epoch": 1.5976123872187857, "grad_norm": 0.16534945368766785, "learning_rate": 4.416940862624589e-06, "loss": 0.3621, "step": 2535 }, { "epoch": 1.5982427800323076, "grad_norm": 0.18221977353096008, "learning_rate": 4.403554585862589e-06, "loss": 0.3357, "step": 2536 }, { "epoch": 1.5988731728458294, "grad_norm": 0.14937010407447815, "learning_rate": 4.390186423906717e-06, "loss": 0.3208, "step": 2537 }, { "epoch": 1.5995035656593515, "grad_norm": 0.24392864108085632, "learning_rate": 4.376836390138687e-06, "loss": 0.3775, "step": 2538 }, { "epoch": 1.6001339584728735, "grad_norm": 0.14190733432769775, "learning_rate": 4.363504497922081e-06, "loss": 0.3168, "step": 2539 }, { "epoch": 1.6007643512863954, "grad_norm": 0.19658371806144714, "learning_rate": 4.350190760602313e-06, "loss": 0.3723, "step": 2540 }, { "epoch": 1.6013947440999172, "grad_norm": 0.18854087591171265, "learning_rate": 4.336895191506629e-06, "loss": 0.3595, "step": 2541 }, { "epoch": 1.602025136913439, "grad_norm": 0.16696226596832275, "learning_rate": 4.323617803944094e-06, "loss": 0.3631, "step": 2542 }, { "epoch": 1.6026555297269611, "grad_norm": 0.13967862725257874, "learning_rate": 4.310358611205567e-06, "loss": 0.2913, "step": 2543 }, { "epoch": 1.6032859225404832, "grad_norm": 0.17204873263835907, "learning_rate": 4.297117626563685e-06, "loss": 0.3681, "step": 2544 }, { "epoch": 1.603916315354005, "grad_norm": 0.1861438751220703, "learning_rate": 4.283894863272871e-06, "loss": 0.3924, "step": 2545 }, { "epoch": 1.6045467081675269, "grad_norm": 0.1662445217370987, "learning_rate": 4.270690334569305e-06, "loss": 0.3553, "step": 2546 }, { "epoch": 1.6051771009810487, "grad_norm": 0.16762596368789673, "learning_rate": 4.257504053670921e-06, "loss": 0.3248, "step": 2547 }, { "epoch": 1.6058074937945708, "grad_norm": 0.16183894872665405, "learning_rate": 4.244336033777362e-06, "loss": 0.3225, "step": 2548 }, { "epoch": 1.6064378866080926, "grad_norm": 0.1885928511619568, "learning_rate": 4.2311862880700205e-06, "loss": 0.4148, "step": 2549 }, { "epoch": 1.6070682794216147, "grad_norm": 0.20267227292060852, "learning_rate": 4.2180548297119785e-06, "loss": 0.3956, "step": 2550 }, { "epoch": 1.6076986722351365, "grad_norm": 0.1709766834974289, "learning_rate": 4.204941671848024e-06, "loss": 0.3204, "step": 2551 }, { "epoch": 1.6083290650486584, "grad_norm": 0.1575269252061844, "learning_rate": 4.19184682760461e-06, "loss": 0.3385, "step": 2552 }, { "epoch": 1.6089594578621804, "grad_norm": 0.1714513599872589, "learning_rate": 4.17877031008988e-06, "loss": 0.2944, "step": 2553 }, { "epoch": 1.6095898506757023, "grad_norm": 0.16309994459152222, "learning_rate": 4.165712132393597e-06, "loss": 0.3186, "step": 2554 }, { "epoch": 1.6102202434892243, "grad_norm": 0.13920311629772186, "learning_rate": 4.1526723075872146e-06, "loss": 0.2782, "step": 2555 }, { "epoch": 1.6108506363027462, "grad_norm": 0.17000916600227356, "learning_rate": 4.1396508487237676e-06, "loss": 0.3481, "step": 2556 }, { "epoch": 1.611481029116268, "grad_norm": 0.1467917263507843, "learning_rate": 4.1266477688379384e-06, "loss": 0.3042, "step": 2557 }, { "epoch": 1.6121114219297898, "grad_norm": 0.18692618608474731, "learning_rate": 4.113663080945993e-06, "loss": 0.3545, "step": 2558 }, { "epoch": 1.612741814743312, "grad_norm": 0.1667277216911316, "learning_rate": 4.100696798045792e-06, "loss": 0.3248, "step": 2559 }, { "epoch": 1.613372207556834, "grad_norm": 0.20179854333400726, "learning_rate": 4.08774893311678e-06, "loss": 0.3872, "step": 2560 }, { "epoch": 1.6140026003703558, "grad_norm": 0.17441004514694214, "learning_rate": 4.07481949911996e-06, "loss": 0.3422, "step": 2561 }, { "epoch": 1.6146329931838777, "grad_norm": 0.16981568932533264, "learning_rate": 4.061908508997876e-06, "loss": 0.3244, "step": 2562 }, { "epoch": 1.6152633859973995, "grad_norm": 0.1381535828113556, "learning_rate": 4.049015975674623e-06, "loss": 0.2821, "step": 2563 }, { "epoch": 1.6158937788109216, "grad_norm": 0.22367283701896667, "learning_rate": 4.036141912055813e-06, "loss": 0.4001, "step": 2564 }, { "epoch": 1.6165241716244436, "grad_norm": 0.1576678603887558, "learning_rate": 4.0232863310285795e-06, "loss": 0.3758, "step": 2565 }, { "epoch": 1.6171545644379655, "grad_norm": 0.17275619506835938, "learning_rate": 4.010449245461535e-06, "loss": 0.3641, "step": 2566 }, { "epoch": 1.6177849572514873, "grad_norm": 0.15438169240951538, "learning_rate": 3.997630668204796e-06, "loss": 0.3748, "step": 2567 }, { "epoch": 1.6184153500650091, "grad_norm": 0.1662559062242508, "learning_rate": 3.984830612089941e-06, "loss": 0.2801, "step": 2568 }, { "epoch": 1.6190457428785312, "grad_norm": 0.1477411836385727, "learning_rate": 3.972049089930026e-06, "loss": 0.324, "step": 2569 }, { "epoch": 1.6196761356920533, "grad_norm": 0.17186160385608673, "learning_rate": 3.959286114519524e-06, "loss": 0.3551, "step": 2570 }, { "epoch": 1.620306528505575, "grad_norm": 0.1736433207988739, "learning_rate": 3.946541698634368e-06, "loss": 0.324, "step": 2571 }, { "epoch": 1.620936921319097, "grad_norm": 0.20666708052158356, "learning_rate": 3.933815855031907e-06, "loss": 0.3898, "step": 2572 }, { "epoch": 1.6215673141326188, "grad_norm": 0.16306789219379425, "learning_rate": 3.921108596450895e-06, "loss": 0.3686, "step": 2573 }, { "epoch": 1.6221977069461408, "grad_norm": 0.17801955342292786, "learning_rate": 3.9084199356114796e-06, "loss": 0.3579, "step": 2574 }, { "epoch": 1.6228280997596627, "grad_norm": 0.15669108927249908, "learning_rate": 3.895749885215201e-06, "loss": 0.3413, "step": 2575 }, { "epoch": 1.6234584925731848, "grad_norm": 0.1938735842704773, "learning_rate": 3.883098457944951e-06, "loss": 0.3592, "step": 2576 }, { "epoch": 1.6240888853867066, "grad_norm": 0.15771815180778503, "learning_rate": 3.8704656664650145e-06, "loss": 0.3474, "step": 2577 }, { "epoch": 1.6247192782002284, "grad_norm": 0.14929990470409393, "learning_rate": 3.857851523420986e-06, "loss": 0.2952, "step": 2578 }, { "epoch": 1.6253496710137505, "grad_norm": 0.20388369262218475, "learning_rate": 3.845256041439812e-06, "loss": 0.3406, "step": 2579 }, { "epoch": 1.6259800638272723, "grad_norm": 0.1572895348072052, "learning_rate": 3.832679233129748e-06, "loss": 0.3672, "step": 2580 }, { "epoch": 1.6266104566407944, "grad_norm": 0.1904786080121994, "learning_rate": 3.820121111080368e-06, "loss": 0.3725, "step": 2581 }, { "epoch": 1.6272408494543162, "grad_norm": 0.1755596548318863, "learning_rate": 3.807581687862533e-06, "loss": 0.3525, "step": 2582 }, { "epoch": 1.627871242267838, "grad_norm": 0.17767442762851715, "learning_rate": 3.7950609760283983e-06, "loss": 0.3402, "step": 2583 }, { "epoch": 1.62850163508136, "grad_norm": 0.1524641066789627, "learning_rate": 3.782558988111363e-06, "loss": 0.3572, "step": 2584 }, { "epoch": 1.629132027894882, "grad_norm": 0.15821711719036102, "learning_rate": 3.7700757366261087e-06, "loss": 0.3378, "step": 2585 }, { "epoch": 1.629762420708404, "grad_norm": 0.12943841516971588, "learning_rate": 3.7576112340685497e-06, "loss": 0.2988, "step": 2586 }, { "epoch": 1.6303928135219259, "grad_norm": 0.18882150948047638, "learning_rate": 3.745165492915843e-06, "loss": 0.3968, "step": 2587 }, { "epoch": 1.6310232063354477, "grad_norm": 0.1451679915189743, "learning_rate": 3.7327385256263476e-06, "loss": 0.3529, "step": 2588 }, { "epoch": 1.6316535991489696, "grad_norm": 0.16361628472805023, "learning_rate": 3.720330344639641e-06, "loss": 0.3535, "step": 2589 }, { "epoch": 1.6322839919624916, "grad_norm": 0.13375185430049896, "learning_rate": 3.707940962376494e-06, "loss": 0.3064, "step": 2590 }, { "epoch": 1.6329143847760137, "grad_norm": 0.15740998089313507, "learning_rate": 3.695570391238867e-06, "loss": 0.3922, "step": 2591 }, { "epoch": 1.6335447775895355, "grad_norm": 0.21207581460475922, "learning_rate": 3.6832186436098723e-06, "loss": 0.3297, "step": 2592 }, { "epoch": 1.6341751704030574, "grad_norm": 0.15415050089359283, "learning_rate": 3.6708857318537964e-06, "loss": 0.3319, "step": 2593 }, { "epoch": 1.6348055632165792, "grad_norm": 0.17594383656978607, "learning_rate": 3.658571668316049e-06, "loss": 0.2924, "step": 2594 }, { "epoch": 1.6354359560301013, "grad_norm": 0.1760425716638565, "learning_rate": 3.646276465323212e-06, "loss": 0.386, "step": 2595 }, { "epoch": 1.6360663488436233, "grad_norm": 0.16595324873924255, "learning_rate": 3.634000135182944e-06, "loss": 0.2909, "step": 2596 }, { "epoch": 1.6366967416571452, "grad_norm": 0.20991137623786926, "learning_rate": 3.6217426901840434e-06, "loss": 0.4425, "step": 2597 }, { "epoch": 1.637327134470667, "grad_norm": 0.1507931649684906, "learning_rate": 3.6095041425963793e-06, "loss": 0.3358, "step": 2598 }, { "epoch": 1.6379575272841889, "grad_norm": 0.17338827252388, "learning_rate": 3.5972845046709208e-06, "loss": 0.298, "step": 2599 }, { "epoch": 1.638587920097711, "grad_norm": 0.14650113880634308, "learning_rate": 3.585083788639709e-06, "loss": 0.3326, "step": 2600 }, { "epoch": 1.638587920097711, "eval_loss": 0.4189898371696472, "eval_runtime": 222.1834, "eval_samples_per_second": 4.501, "eval_steps_per_second": 4.501, "step": 2600 }, { "epoch": 1.6392183129112328, "grad_norm": 0.18736357986927032, "learning_rate": 3.5729020067158394e-06, "loss": 0.3513, "step": 2601 }, { "epoch": 1.6398487057247548, "grad_norm": 0.17417657375335693, "learning_rate": 3.5607391710934446e-06, "loss": 0.2972, "step": 2602 }, { "epoch": 1.6404790985382767, "grad_norm": 0.16662700474262238, "learning_rate": 3.548595293947706e-06, "loss": 0.3203, "step": 2603 }, { "epoch": 1.6411094913517985, "grad_norm": 0.15887589752674103, "learning_rate": 3.536470387434822e-06, "loss": 0.338, "step": 2604 }, { "epoch": 1.6417398841653204, "grad_norm": 0.21726834774017334, "learning_rate": 3.524364463692004e-06, "loss": 0.4007, "step": 2605 }, { "epoch": 1.6423702769788424, "grad_norm": 0.20520001649856567, "learning_rate": 3.5122775348374533e-06, "loss": 0.3685, "step": 2606 }, { "epoch": 1.6430006697923645, "grad_norm": 0.1584928184747696, "learning_rate": 3.5002096129703615e-06, "loss": 0.3241, "step": 2607 }, { "epoch": 1.6436310626058863, "grad_norm": 0.17537303268909454, "learning_rate": 3.4881607101708977e-06, "loss": 0.3486, "step": 2608 }, { "epoch": 1.6442614554194082, "grad_norm": 0.1952475756406784, "learning_rate": 3.476130838500197e-06, "loss": 0.3844, "step": 2609 }, { "epoch": 1.64489184823293, "grad_norm": 0.1399722397327423, "learning_rate": 3.4641200100003214e-06, "loss": 0.3307, "step": 2610 }, { "epoch": 1.645522241046452, "grad_norm": 0.17013977468013763, "learning_rate": 3.4521282366942916e-06, "loss": 0.3501, "step": 2611 }, { "epoch": 1.6461526338599741, "grad_norm": 0.17964443564414978, "learning_rate": 3.440155530586052e-06, "loss": 0.3822, "step": 2612 }, { "epoch": 1.646783026673496, "grad_norm": 0.17054222524166107, "learning_rate": 3.4282019036604564e-06, "loss": 0.3256, "step": 2613 }, { "epoch": 1.6474134194870178, "grad_norm": 0.1672014743089676, "learning_rate": 3.4162673678832526e-06, "loss": 0.3639, "step": 2614 }, { "epoch": 1.6480438123005396, "grad_norm": 0.1616799235343933, "learning_rate": 3.404351935201095e-06, "loss": 0.2757, "step": 2615 }, { "epoch": 1.6486742051140617, "grad_norm": 0.16986674070358276, "learning_rate": 3.3924556175414895e-06, "loss": 0.3926, "step": 2616 }, { "epoch": 1.6493045979275838, "grad_norm": 0.20208294689655304, "learning_rate": 3.3805784268128456e-06, "loss": 0.3595, "step": 2617 }, { "epoch": 1.6499349907411056, "grad_norm": 0.15416187047958374, "learning_rate": 3.368720374904388e-06, "loss": 0.2523, "step": 2618 }, { "epoch": 1.6505653835546275, "grad_norm": 0.148421049118042, "learning_rate": 3.3568814736862115e-06, "loss": 0.3518, "step": 2619 }, { "epoch": 1.6511957763681493, "grad_norm": 0.16845634579658508, "learning_rate": 3.3450617350092163e-06, "loss": 0.3474, "step": 2620 }, { "epoch": 1.6518261691816714, "grad_norm": 0.20402076840400696, "learning_rate": 3.3332611707051404e-06, "loss": 0.3908, "step": 2621 }, { "epoch": 1.6524565619951934, "grad_norm": 0.16622604429721832, "learning_rate": 3.3214797925865198e-06, "loss": 0.3393, "step": 2622 }, { "epoch": 1.6530869548087153, "grad_norm": 0.18637624382972717, "learning_rate": 3.3097176124466936e-06, "loss": 0.3607, "step": 2623 }, { "epoch": 1.653717347622237, "grad_norm": 0.18721283972263336, "learning_rate": 3.2979746420597634e-06, "loss": 0.3442, "step": 2624 }, { "epoch": 1.654347740435759, "grad_norm": 0.18155747652053833, "learning_rate": 3.2862508931806207e-06, "loss": 0.3225, "step": 2625 }, { "epoch": 1.654978133249281, "grad_norm": 0.16414645314216614, "learning_rate": 3.2745463775449087e-06, "loss": 0.3125, "step": 2626 }, { "epoch": 1.6556085260628028, "grad_norm": 0.1629631072282791, "learning_rate": 3.262861106869022e-06, "loss": 0.3336, "step": 2627 }, { "epoch": 1.656238918876325, "grad_norm": 0.20036570727825165, "learning_rate": 3.25119509285008e-06, "loss": 0.4085, "step": 2628 }, { "epoch": 1.6568693116898467, "grad_norm": 0.16448689997196198, "learning_rate": 3.2395483471659357e-06, "loss": 0.294, "step": 2629 }, { "epoch": 1.6574997045033686, "grad_norm": 0.18873000144958496, "learning_rate": 3.2279208814751527e-06, "loss": 0.3816, "step": 2630 }, { "epoch": 1.6581300973168904, "grad_norm": 0.18073073029518127, "learning_rate": 3.216312707416999e-06, "loss": 0.3841, "step": 2631 }, { "epoch": 1.6587604901304125, "grad_norm": 0.19359445571899414, "learning_rate": 3.204723836611414e-06, "loss": 0.3783, "step": 2632 }, { "epoch": 1.6593908829439346, "grad_norm": 0.19595137238502502, "learning_rate": 3.1931542806590356e-06, "loss": 0.4084, "step": 2633 }, { "epoch": 1.6600212757574564, "grad_norm": 0.19038718938827515, "learning_rate": 3.1816040511411552e-06, "loss": 0.429, "step": 2634 }, { "epoch": 1.6606516685709782, "grad_norm": 0.18803559243679047, "learning_rate": 3.1700731596197288e-06, "loss": 0.3572, "step": 2635 }, { "epoch": 1.6612820613845, "grad_norm": 0.1585325002670288, "learning_rate": 3.1585616176373336e-06, "loss": 0.3065, "step": 2636 }, { "epoch": 1.6619124541980221, "grad_norm": 0.16231995820999146, "learning_rate": 3.1470694367172044e-06, "loss": 0.3307, "step": 2637 }, { "epoch": 1.6625428470115442, "grad_norm": 0.1627531200647354, "learning_rate": 3.1355966283631664e-06, "loss": 0.3506, "step": 2638 }, { "epoch": 1.663173239825066, "grad_norm": 0.14652608335018158, "learning_rate": 3.124143204059688e-06, "loss": 0.3366, "step": 2639 }, { "epoch": 1.6638036326385879, "grad_norm": 0.13706839084625244, "learning_rate": 3.1127091752717983e-06, "loss": 0.3214, "step": 2640 }, { "epoch": 1.6644340254521097, "grad_norm": 0.15212666988372803, "learning_rate": 3.101294553445139e-06, "loss": 0.3175, "step": 2641 }, { "epoch": 1.6650644182656318, "grad_norm": 0.16928543150424957, "learning_rate": 3.0898993500059028e-06, "loss": 0.3812, "step": 2642 }, { "epoch": 1.6656948110791538, "grad_norm": 0.15869659185409546, "learning_rate": 3.078523576360858e-06, "loss": 0.3082, "step": 2643 }, { "epoch": 1.6663252038926757, "grad_norm": 0.16265471279621124, "learning_rate": 3.06716724389732e-06, "loss": 0.2794, "step": 2644 }, { "epoch": 1.6669555967061975, "grad_norm": 0.18983694911003113, "learning_rate": 3.055830363983153e-06, "loss": 0.3445, "step": 2645 }, { "epoch": 1.6675859895197194, "grad_norm": 0.18641945719718933, "learning_rate": 3.0445129479667224e-06, "loss": 0.3768, "step": 2646 }, { "epoch": 1.6682163823332414, "grad_norm": 0.1502719521522522, "learning_rate": 3.0332150071769383e-06, "loss": 0.2855, "step": 2647 }, { "epoch": 1.6688467751467633, "grad_norm": 0.1414182037115097, "learning_rate": 3.021936552923199e-06, "loss": 0.3237, "step": 2648 }, { "epoch": 1.6694771679602853, "grad_norm": 0.21098460257053375, "learning_rate": 3.010677596495411e-06, "loss": 0.426, "step": 2649 }, { "epoch": 1.6701075607738072, "grad_norm": 0.19079872965812683, "learning_rate": 2.9994381491639435e-06, "loss": 0.349, "step": 2650 }, { "epoch": 1.670737953587329, "grad_norm": 0.16061921417713165, "learning_rate": 2.98821822217965e-06, "loss": 0.3617, "step": 2651 }, { "epoch": 1.671368346400851, "grad_norm": 0.20180317759513855, "learning_rate": 2.9770178267738435e-06, "loss": 0.3442, "step": 2652 }, { "epoch": 1.671998739214373, "grad_norm": 0.15677542984485626, "learning_rate": 2.9658369741582877e-06, "loss": 0.3209, "step": 2653 }, { "epoch": 1.672629132027895, "grad_norm": 0.14801929891109467, "learning_rate": 2.9546756755251683e-06, "loss": 0.2879, "step": 2654 }, { "epoch": 1.6732595248414168, "grad_norm": 0.16312134265899658, "learning_rate": 2.9435339420471195e-06, "loss": 0.3165, "step": 2655 }, { "epoch": 1.6738899176549387, "grad_norm": 0.16553540527820587, "learning_rate": 2.9324117848771653e-06, "loss": 0.2867, "step": 2656 }, { "epoch": 1.6745203104684605, "grad_norm": 0.16059130430221558, "learning_rate": 2.921309215148765e-06, "loss": 0.3347, "step": 2657 }, { "epoch": 1.6751507032819826, "grad_norm": 0.19318628311157227, "learning_rate": 2.910226243975741e-06, "loss": 0.3434, "step": 2658 }, { "epoch": 1.6757810960955046, "grad_norm": 0.17356213927268982, "learning_rate": 2.899162882452313e-06, "loss": 0.3595, "step": 2659 }, { "epoch": 1.6764114889090265, "grad_norm": 0.13051897287368774, "learning_rate": 2.888119141653066e-06, "loss": 0.3069, "step": 2660 }, { "epoch": 1.6770418817225483, "grad_norm": 0.203187957406044, "learning_rate": 2.877095032632946e-06, "loss": 0.3804, "step": 2661 }, { "epoch": 1.6776722745360702, "grad_norm": 0.1899317353963852, "learning_rate": 2.8660905664272462e-06, "loss": 0.3632, "step": 2662 }, { "epoch": 1.6783026673495922, "grad_norm": 0.1583564132452011, "learning_rate": 2.8551057540516045e-06, "loss": 0.3131, "step": 2663 }, { "epoch": 1.6789330601631143, "grad_norm": 0.1547822654247284, "learning_rate": 2.8441406065019684e-06, "loss": 0.2862, "step": 2664 }, { "epoch": 1.6795634529766361, "grad_norm": 0.1812221109867096, "learning_rate": 2.8331951347546167e-06, "loss": 0.3665, "step": 2665 }, { "epoch": 1.680193845790158, "grad_norm": 0.18095368146896362, "learning_rate": 2.822269349766126e-06, "loss": 0.3622, "step": 2666 }, { "epoch": 1.6808242386036798, "grad_norm": 0.15226151049137115, "learning_rate": 2.8113632624733697e-06, "loss": 0.348, "step": 2667 }, { "epoch": 1.6814546314172019, "grad_norm": 0.1840267926454544, "learning_rate": 2.8004768837934926e-06, "loss": 0.3193, "step": 2668 }, { "epoch": 1.682085024230724, "grad_norm": 0.18752466142177582, "learning_rate": 2.789610224623925e-06, "loss": 0.4, "step": 2669 }, { "epoch": 1.6827154170442458, "grad_norm": 0.22188887000083923, "learning_rate": 2.778763295842346e-06, "loss": 0.3561, "step": 2670 }, { "epoch": 1.6833458098577676, "grad_norm": 0.14395049214363098, "learning_rate": 2.7679361083067e-06, "loss": 0.3108, "step": 2671 }, { "epoch": 1.6839762026712894, "grad_norm": 0.16563533246517181, "learning_rate": 2.7571286728551497e-06, "loss": 0.3403, "step": 2672 }, { "epoch": 1.6846065954848115, "grad_norm": 0.17470870912075043, "learning_rate": 2.7463410003060965e-06, "loss": 0.3393, "step": 2673 }, { "epoch": 1.6852369882983333, "grad_norm": 0.1701815128326416, "learning_rate": 2.735573101458162e-06, "loss": 0.3538, "step": 2674 }, { "epoch": 1.6858673811118554, "grad_norm": 0.2105788141489029, "learning_rate": 2.724824987090172e-06, "loss": 0.4069, "step": 2675 }, { "epoch": 1.6864977739253773, "grad_norm": 0.1775672733783722, "learning_rate": 2.71409666796114e-06, "loss": 0.3541, "step": 2676 }, { "epoch": 1.687128166738899, "grad_norm": 0.16868631541728973, "learning_rate": 2.7033881548102783e-06, "loss": 0.3065, "step": 2677 }, { "epoch": 1.6877585595524212, "grad_norm": 0.16328738629817963, "learning_rate": 2.6926994583569494e-06, "loss": 0.3419, "step": 2678 }, { "epoch": 1.688388952365943, "grad_norm": 0.16287265717983246, "learning_rate": 2.6820305893007143e-06, "loss": 0.3393, "step": 2679 }, { "epoch": 1.689019345179465, "grad_norm": 0.19886572659015656, "learning_rate": 2.671381558321251e-06, "loss": 0.4079, "step": 2680 }, { "epoch": 1.689649737992987, "grad_norm": 0.19469165802001953, "learning_rate": 2.660752376078407e-06, "loss": 0.3149, "step": 2681 }, { "epoch": 1.6902801308065087, "grad_norm": 0.15739621222019196, "learning_rate": 2.650143053212138e-06, "loss": 0.3475, "step": 2682 }, { "epoch": 1.6909105236200306, "grad_norm": 0.13259677588939667, "learning_rate": 2.639553600342538e-06, "loss": 0.3312, "step": 2683 }, { "epoch": 1.6915409164335526, "grad_norm": 0.16224607825279236, "learning_rate": 2.6289840280698013e-06, "loss": 0.3378, "step": 2684 }, { "epoch": 1.6921713092470747, "grad_norm": 0.15360943973064423, "learning_rate": 2.61843434697423e-06, "loss": 0.2946, "step": 2685 }, { "epoch": 1.6928017020605965, "grad_norm": 0.1765449494123459, "learning_rate": 2.6079045676161963e-06, "loss": 0.3056, "step": 2686 }, { "epoch": 1.6934320948741184, "grad_norm": 0.17461608350276947, "learning_rate": 2.5973947005361746e-06, "loss": 0.3048, "step": 2687 }, { "epoch": 1.6940624876876402, "grad_norm": 0.16787251830101013, "learning_rate": 2.5869047562546885e-06, "loss": 0.3737, "step": 2688 }, { "epoch": 1.6946928805011623, "grad_norm": 0.1929512768983841, "learning_rate": 2.5764347452723365e-06, "loss": 0.3367, "step": 2689 }, { "epoch": 1.6953232733146844, "grad_norm": 0.16267262399196625, "learning_rate": 2.565984678069737e-06, "loss": 0.3386, "step": 2690 }, { "epoch": 1.6959536661282062, "grad_norm": 0.19082540273666382, "learning_rate": 2.5555545651075677e-06, "loss": 0.3568, "step": 2691 }, { "epoch": 1.696584058941728, "grad_norm": 0.17961396276950836, "learning_rate": 2.545144416826524e-06, "loss": 0.3649, "step": 2692 }, { "epoch": 1.6972144517552499, "grad_norm": 0.17428696155548096, "learning_rate": 2.5347542436473204e-06, "loss": 0.3468, "step": 2693 }, { "epoch": 1.697844844568772, "grad_norm": 0.17337742447853088, "learning_rate": 2.524384055970662e-06, "loss": 0.3284, "step": 2694 }, { "epoch": 1.698475237382294, "grad_norm": 0.20851732790470123, "learning_rate": 2.5140338641772705e-06, "loss": 0.3598, "step": 2695 }, { "epoch": 1.6991056301958158, "grad_norm": 0.16625341773033142, "learning_rate": 2.5037036786278202e-06, "loss": 0.3261, "step": 2696 }, { "epoch": 1.6997360230093377, "grad_norm": 0.22402916848659515, "learning_rate": 2.493393509663e-06, "loss": 0.4392, "step": 2697 }, { "epoch": 1.7003664158228595, "grad_norm": 0.16897734999656677, "learning_rate": 2.483103367603424e-06, "loss": 0.3401, "step": 2698 }, { "epoch": 1.7009968086363816, "grad_norm": 0.1630307286977768, "learning_rate": 2.472833262749685e-06, "loss": 0.3425, "step": 2699 }, { "epoch": 1.7016272014499034, "grad_norm": 0.19418928027153015, "learning_rate": 2.462583205382292e-06, "loss": 0.3906, "step": 2700 }, { "epoch": 1.7022575942634255, "grad_norm": 0.16761890053749084, "learning_rate": 2.452353205761725e-06, "loss": 0.3793, "step": 2701 }, { "epoch": 1.7028879870769473, "grad_norm": 0.18117816746234894, "learning_rate": 2.442143274128345e-06, "loss": 0.3681, "step": 2702 }, { "epoch": 1.7035183798904692, "grad_norm": 0.14692558348178864, "learning_rate": 2.4319534207024514e-06, "loss": 0.3868, "step": 2703 }, { "epoch": 1.704148772703991, "grad_norm": 0.19921517372131348, "learning_rate": 2.42178365568423e-06, "loss": 0.3649, "step": 2704 }, { "epoch": 1.704779165517513, "grad_norm": 0.16821995377540588, "learning_rate": 2.4116339892537684e-06, "loss": 0.3404, "step": 2705 }, { "epoch": 1.7054095583310351, "grad_norm": 0.19933214783668518, "learning_rate": 2.401504431571028e-06, "loss": 0.3387, "step": 2706 }, { "epoch": 1.706039951144557, "grad_norm": 0.18399402499198914, "learning_rate": 2.3913949927758514e-06, "loss": 0.3846, "step": 2707 }, { "epoch": 1.7066703439580788, "grad_norm": 0.15911422669887543, "learning_rate": 2.381305682987926e-06, "loss": 0.3916, "step": 2708 }, { "epoch": 1.7073007367716007, "grad_norm": 0.1464301496744156, "learning_rate": 2.3712365123067997e-06, "loss": 0.3244, "step": 2709 }, { "epoch": 1.7079311295851227, "grad_norm": 0.20365366339683533, "learning_rate": 2.361187490811861e-06, "loss": 0.3821, "step": 2710 }, { "epoch": 1.7085615223986448, "grad_norm": 0.16726025938987732, "learning_rate": 2.3511586285623324e-06, "loss": 0.3617, "step": 2711 }, { "epoch": 1.7091919152121666, "grad_norm": 0.17762507498264313, "learning_rate": 2.341149935597239e-06, "loss": 0.3814, "step": 2712 }, { "epoch": 1.7098223080256885, "grad_norm": 0.13304100930690765, "learning_rate": 2.331161421935439e-06, "loss": 0.2826, "step": 2713 }, { "epoch": 1.7104527008392103, "grad_norm": 0.1622227430343628, "learning_rate": 2.321193097575577e-06, "loss": 0.3359, "step": 2714 }, { "epoch": 1.7110830936527324, "grad_norm": 0.1425076723098755, "learning_rate": 2.311244972496093e-06, "loss": 0.3394, "step": 2715 }, { "epoch": 1.7117134864662544, "grad_norm": 0.16510890424251556, "learning_rate": 2.301317056655202e-06, "loss": 0.2648, "step": 2716 }, { "epoch": 1.7123438792797763, "grad_norm": 0.1962651163339615, "learning_rate": 2.2914093599908985e-06, "loss": 0.3774, "step": 2717 }, { "epoch": 1.7129742720932981, "grad_norm": 0.18967847526073456, "learning_rate": 2.2815218924209198e-06, "loss": 0.3603, "step": 2718 }, { "epoch": 1.71360466490682, "grad_norm": 0.16082076728343964, "learning_rate": 2.271654663842781e-06, "loss": 0.3075, "step": 2719 }, { "epoch": 1.714235057720342, "grad_norm": 0.15744170546531677, "learning_rate": 2.261807684133709e-06, "loss": 0.3584, "step": 2720 }, { "epoch": 1.714865450533864, "grad_norm": 0.215196430683136, "learning_rate": 2.2519809631506858e-06, "loss": 0.3892, "step": 2721 }, { "epoch": 1.715495843347386, "grad_norm": 0.16610518097877502, "learning_rate": 2.242174510730391e-06, "loss": 0.3303, "step": 2722 }, { "epoch": 1.7161262361609078, "grad_norm": 0.1572638601064682, "learning_rate": 2.232388336689232e-06, "loss": 0.362, "step": 2723 }, { "epoch": 1.7167566289744296, "grad_norm": 0.17560486495494843, "learning_rate": 2.2226224508233105e-06, "loss": 0.3483, "step": 2724 }, { "epoch": 1.7173870217879517, "grad_norm": 0.18823911249637604, "learning_rate": 2.2128768629084286e-06, "loss": 0.3773, "step": 2725 }, { "epoch": 1.7180174146014735, "grad_norm": 0.12789592146873474, "learning_rate": 2.203151582700051e-06, "loss": 0.3035, "step": 2726 }, { "epoch": 1.7186478074149956, "grad_norm": 0.19134822487831116, "learning_rate": 2.1934466199333298e-06, "loss": 0.391, "step": 2727 }, { "epoch": 1.7192782002285174, "grad_norm": 0.14222867786884308, "learning_rate": 2.1837619843230744e-06, "loss": 0.2925, "step": 2728 }, { "epoch": 1.7199085930420392, "grad_norm": 0.1417618989944458, "learning_rate": 2.1740976855637517e-06, "loss": 0.3337, "step": 2729 }, { "epoch": 1.720538985855561, "grad_norm": 0.18344461917877197, "learning_rate": 2.1644537333294543e-06, "loss": 0.3417, "step": 2730 }, { "epoch": 1.7211693786690831, "grad_norm": 0.16136455535888672, "learning_rate": 2.1548301372739292e-06, "loss": 0.3542, "step": 2731 }, { "epoch": 1.7217997714826052, "grad_norm": 0.18665267527103424, "learning_rate": 2.145226907030531e-06, "loss": 0.4172, "step": 2732 }, { "epoch": 1.722430164296127, "grad_norm": 0.18608105182647705, "learning_rate": 2.1356440522122403e-06, "loss": 0.3145, "step": 2733 }, { "epoch": 1.723060557109649, "grad_norm": 0.1521705538034439, "learning_rate": 2.1260815824116256e-06, "loss": 0.2793, "step": 2734 }, { "epoch": 1.7236909499231707, "grad_norm": 0.21205557882785797, "learning_rate": 2.1165395072008627e-06, "loss": 0.3642, "step": 2735 }, { "epoch": 1.7243213427366928, "grad_norm": 0.1696559190750122, "learning_rate": 2.1070178361317113e-06, "loss": 0.3955, "step": 2736 }, { "epoch": 1.7249517355502149, "grad_norm": 0.1602529138326645, "learning_rate": 2.0975165787355016e-06, "loss": 0.3284, "step": 2737 }, { "epoch": 1.7255821283637367, "grad_norm": 0.19696998596191406, "learning_rate": 2.0880357445231265e-06, "loss": 0.4423, "step": 2738 }, { "epoch": 1.7262125211772585, "grad_norm": 0.14066126942634583, "learning_rate": 2.0785753429850485e-06, "loss": 0.3609, "step": 2739 }, { "epoch": 1.7268429139907804, "grad_norm": 0.18645574152469635, "learning_rate": 2.069135383591253e-06, "loss": 0.3536, "step": 2740 }, { "epoch": 1.7274733068043024, "grad_norm": 0.1929475963115692, "learning_rate": 2.059715875791296e-06, "loss": 0.4631, "step": 2741 }, { "epoch": 1.7281036996178245, "grad_norm": 0.16178543865680695, "learning_rate": 2.0503168290142277e-06, "loss": 0.3382, "step": 2742 }, { "epoch": 1.7287340924313463, "grad_norm": 0.1649865061044693, "learning_rate": 2.040938252668639e-06, "loss": 0.3416, "step": 2743 }, { "epoch": 1.7293644852448682, "grad_norm": 0.1723279356956482, "learning_rate": 2.031580156142618e-06, "loss": 0.3325, "step": 2744 }, { "epoch": 1.72999487805839, "grad_norm": 0.20794516801834106, "learning_rate": 2.022242548803754e-06, "loss": 0.414, "step": 2745 }, { "epoch": 1.730625270871912, "grad_norm": 0.17778415977954865, "learning_rate": 2.0129254399991324e-06, "loss": 0.3701, "step": 2746 }, { "epoch": 1.731255663685434, "grad_norm": 0.14724238216876984, "learning_rate": 2.003628839055318e-06, "loss": 0.2701, "step": 2747 }, { "epoch": 1.731886056498956, "grad_norm": 0.2008165717124939, "learning_rate": 1.9943527552783365e-06, "loss": 0.3588, "step": 2748 }, { "epoch": 1.7325164493124778, "grad_norm": 0.15005536377429962, "learning_rate": 1.985097197953685e-06, "loss": 0.397, "step": 2749 }, { "epoch": 1.7331468421259997, "grad_norm": 0.1658148318529129, "learning_rate": 1.975862176346316e-06, "loss": 0.3324, "step": 2750 }, { "epoch": 1.7337772349395217, "grad_norm": 0.16382184624671936, "learning_rate": 1.966647699700621e-06, "loss": 0.3103, "step": 2751 }, { "epoch": 1.7344076277530436, "grad_norm": 0.1729460507631302, "learning_rate": 1.9574537772404195e-06, "loss": 0.3479, "step": 2752 }, { "epoch": 1.7350380205665656, "grad_norm": 0.17231756448745728, "learning_rate": 1.9482804181689674e-06, "loss": 0.3712, "step": 2753 }, { "epoch": 1.7356684133800875, "grad_norm": 0.16201011836528778, "learning_rate": 1.9391276316689267e-06, "loss": 0.3607, "step": 2754 }, { "epoch": 1.7362988061936093, "grad_norm": 0.15200699865818024, "learning_rate": 1.9299954269023784e-06, "loss": 0.3093, "step": 2755 }, { "epoch": 1.7369291990071312, "grad_norm": 0.1864054799079895, "learning_rate": 1.920883813010783e-06, "loss": 0.3405, "step": 2756 }, { "epoch": 1.7375595918206532, "grad_norm": 0.17108328640460968, "learning_rate": 1.9117927991150066e-06, "loss": 0.345, "step": 2757 }, { "epoch": 1.7381899846341753, "grad_norm": 0.1786588728427887, "learning_rate": 1.9027223943152772e-06, "loss": 0.3149, "step": 2758 }, { "epoch": 1.7388203774476971, "grad_norm": 0.17140208184719086, "learning_rate": 1.8936726076912172e-06, "loss": 0.308, "step": 2759 }, { "epoch": 1.739450770261219, "grad_norm": 0.18755091726779938, "learning_rate": 1.8846434483017828e-06, "loss": 0.3674, "step": 2760 }, { "epoch": 1.7400811630747408, "grad_norm": 0.1827147752046585, "learning_rate": 1.8756349251853012e-06, "loss": 0.3668, "step": 2761 }, { "epoch": 1.7407115558882629, "grad_norm": 0.1812572330236435, "learning_rate": 1.8666470473594316e-06, "loss": 0.3738, "step": 2762 }, { "epoch": 1.741341948701785, "grad_norm": 0.18684326112270355, "learning_rate": 1.8576798238211698e-06, "loss": 0.3566, "step": 2763 }, { "epoch": 1.7419723415153068, "grad_norm": 0.1563086211681366, "learning_rate": 1.848733263546843e-06, "loss": 0.3371, "step": 2764 }, { "epoch": 1.7426027343288286, "grad_norm": 0.23187503218650818, "learning_rate": 1.8398073754920873e-06, "loss": 0.3441, "step": 2765 }, { "epoch": 1.7432331271423505, "grad_norm": 0.16062569618225098, "learning_rate": 1.8309021685918428e-06, "loss": 0.4071, "step": 2766 }, { "epoch": 1.7438635199558725, "grad_norm": 0.16154485940933228, "learning_rate": 1.8220176517603519e-06, "loss": 0.3936, "step": 2767 }, { "epoch": 1.7444939127693946, "grad_norm": 0.17705531418323517, "learning_rate": 1.81315383389115e-06, "loss": 0.3389, "step": 2768 }, { "epoch": 1.7451243055829164, "grad_norm": 0.16158095002174377, "learning_rate": 1.8043107238570498e-06, "loss": 0.3602, "step": 2769 }, { "epoch": 1.7457546983964383, "grad_norm": 0.18866123259067535, "learning_rate": 1.7954883305101277e-06, "loss": 0.3823, "step": 2770 }, { "epoch": 1.74638509120996, "grad_norm": 0.14326314628124237, "learning_rate": 1.7866866626817297e-06, "loss": 0.3356, "step": 2771 }, { "epoch": 1.7470154840234822, "grad_norm": 0.1846877783536911, "learning_rate": 1.7779057291824527e-06, "loss": 0.3421, "step": 2772 }, { "epoch": 1.747645876837004, "grad_norm": 0.15310382843017578, "learning_rate": 1.7691455388021484e-06, "loss": 0.2835, "step": 2773 }, { "epoch": 1.748276269650526, "grad_norm": 0.14761358499526978, "learning_rate": 1.7604061003098817e-06, "loss": 0.3675, "step": 2774 }, { "epoch": 1.748906662464048, "grad_norm": 0.19517937302589417, "learning_rate": 1.751687422453968e-06, "loss": 0.3041, "step": 2775 }, { "epoch": 1.7495370552775698, "grad_norm": 0.1950213611125946, "learning_rate": 1.7429895139619252e-06, "loss": 0.3819, "step": 2776 }, { "epoch": 1.7501674480910918, "grad_norm": 0.15891462564468384, "learning_rate": 1.734312383540494e-06, "loss": 0.3552, "step": 2777 }, { "epoch": 1.7507978409046137, "grad_norm": 0.16390115022659302, "learning_rate": 1.7256560398756014e-06, "loss": 0.4012, "step": 2778 }, { "epoch": 1.7514282337181357, "grad_norm": 0.1448567807674408, "learning_rate": 1.7170204916323784e-06, "loss": 0.3193, "step": 2779 }, { "epoch": 1.7520586265316576, "grad_norm": 0.1577872782945633, "learning_rate": 1.7084057474551254e-06, "loss": 0.3678, "step": 2780 }, { "epoch": 1.7526890193451794, "grad_norm": 0.161997988820076, "learning_rate": 1.6998118159673442e-06, "loss": 0.3477, "step": 2781 }, { "epoch": 1.7533194121587012, "grad_norm": 0.176879420876503, "learning_rate": 1.691238705771671e-06, "loss": 0.331, "step": 2782 }, { "epoch": 1.7539498049722233, "grad_norm": 0.16836316883563995, "learning_rate": 1.6826864254499236e-06, "loss": 0.3691, "step": 2783 }, { "epoch": 1.7545801977857454, "grad_norm": 0.20967160165309906, "learning_rate": 1.6741549835630538e-06, "loss": 0.3648, "step": 2784 }, { "epoch": 1.7552105905992672, "grad_norm": 0.13090410828590393, "learning_rate": 1.6656443886511577e-06, "loss": 0.2312, "step": 2785 }, { "epoch": 1.755840983412789, "grad_norm": 0.15169045329093933, "learning_rate": 1.6571546492334708e-06, "loss": 0.2898, "step": 2786 }, { "epoch": 1.7564713762263109, "grad_norm": 0.1762060821056366, "learning_rate": 1.6486857738083474e-06, "loss": 0.3035, "step": 2787 }, { "epoch": 1.757101769039833, "grad_norm": 0.17813783884048462, "learning_rate": 1.6402377708532466e-06, "loss": 0.3459, "step": 2788 }, { "epoch": 1.757732161853355, "grad_norm": 0.12713901698589325, "learning_rate": 1.631810648824751e-06, "loss": 0.2806, "step": 2789 }, { "epoch": 1.7583625546668769, "grad_norm": 0.16314174234867096, "learning_rate": 1.6234044161585276e-06, "loss": 0.3242, "step": 2790 }, { "epoch": 1.7589929474803987, "grad_norm": 0.18997551500797272, "learning_rate": 1.6150190812693458e-06, "loss": 0.4186, "step": 2791 }, { "epoch": 1.7596233402939205, "grad_norm": 0.14270219206809998, "learning_rate": 1.6066546525510384e-06, "loss": 0.2792, "step": 2792 }, { "epoch": 1.7602537331074426, "grad_norm": 0.18702076375484467, "learning_rate": 1.5983111383765254e-06, "loss": 0.3878, "step": 2793 }, { "epoch": 1.7608841259209647, "grad_norm": 0.17163795232772827, "learning_rate": 1.5899885470977858e-06, "loss": 0.3613, "step": 2794 }, { "epoch": 1.7615145187344865, "grad_norm": 0.14220699667930603, "learning_rate": 1.581686887045858e-06, "loss": 0.3274, "step": 2795 }, { "epoch": 1.7621449115480083, "grad_norm": 0.1297381967306137, "learning_rate": 1.5734061665308194e-06, "loss": 0.3298, "step": 2796 }, { "epoch": 1.7627753043615302, "grad_norm": 0.14467160403728485, "learning_rate": 1.5651463938417948e-06, "loss": 0.3203, "step": 2797 }, { "epoch": 1.7634056971750522, "grad_norm": 0.1549520492553711, "learning_rate": 1.5569075772469378e-06, "loss": 0.3047, "step": 2798 }, { "epoch": 1.764036089988574, "grad_norm": 0.1654859334230423, "learning_rate": 1.5486897249934285e-06, "loss": 0.3584, "step": 2799 }, { "epoch": 1.7646664828020961, "grad_norm": 0.18609672784805298, "learning_rate": 1.5404928453074468e-06, "loss": 0.3791, "step": 2800 }, { "epoch": 1.7646664828020961, "eval_loss": 0.41850462555885315, "eval_runtime": 222.3838, "eval_samples_per_second": 4.497, "eval_steps_per_second": 4.497, "step": 2800 }, { "epoch": 1.765296875615618, "grad_norm": 0.2150953710079193, "learning_rate": 1.5323169463941994e-06, "loss": 0.4229, "step": 2801 }, { "epoch": 1.7659272684291398, "grad_norm": 0.18508629500865936, "learning_rate": 1.5241620364378695e-06, "loss": 0.3557, "step": 2802 }, { "epoch": 1.7665576612426617, "grad_norm": 0.15115678310394287, "learning_rate": 1.5160281236016522e-06, "loss": 0.3079, "step": 2803 }, { "epoch": 1.7671880540561837, "grad_norm": 0.2158491164445877, "learning_rate": 1.5079152160277045e-06, "loss": 0.3583, "step": 2804 }, { "epoch": 1.7678184468697058, "grad_norm": 0.17843793332576752, "learning_rate": 1.4998233218371724e-06, "loss": 0.4217, "step": 2805 }, { "epoch": 1.7684488396832276, "grad_norm": 0.1688946932554245, "learning_rate": 1.4917524491301543e-06, "loss": 0.3485, "step": 2806 }, { "epoch": 1.7690792324967495, "grad_norm": 0.1464250385761261, "learning_rate": 1.4837026059857127e-06, "loss": 0.3557, "step": 2807 }, { "epoch": 1.7697096253102713, "grad_norm": 0.16816261410713196, "learning_rate": 1.4756738004618614e-06, "loss": 0.2934, "step": 2808 }, { "epoch": 1.7703400181237934, "grad_norm": 0.15366758406162262, "learning_rate": 1.467666040595552e-06, "loss": 0.3111, "step": 2809 }, { "epoch": 1.7709704109373154, "grad_norm": 0.15366709232330322, "learning_rate": 1.459679334402662e-06, "loss": 0.336, "step": 2810 }, { "epoch": 1.7716008037508373, "grad_norm": 0.18162818253040314, "learning_rate": 1.4517136898780068e-06, "loss": 0.3466, "step": 2811 }, { "epoch": 1.7722311965643591, "grad_norm": 0.16830721497535706, "learning_rate": 1.4437691149953124e-06, "loss": 0.313, "step": 2812 }, { "epoch": 1.772861589377881, "grad_norm": 0.20500795543193817, "learning_rate": 1.435845617707218e-06, "loss": 0.4075, "step": 2813 }, { "epoch": 1.773491982191403, "grad_norm": 0.16557179391384125, "learning_rate": 1.4279432059452511e-06, "loss": 0.3837, "step": 2814 }, { "epoch": 1.774122375004925, "grad_norm": 0.20282112061977386, "learning_rate": 1.4200618876198465e-06, "loss": 0.3548, "step": 2815 }, { "epoch": 1.774752767818447, "grad_norm": 0.17897149920463562, "learning_rate": 1.4122016706203188e-06, "loss": 0.4087, "step": 2816 }, { "epoch": 1.7753831606319688, "grad_norm": 0.16336628794670105, "learning_rate": 1.4043625628148635e-06, "loss": 0.3451, "step": 2817 }, { "epoch": 1.7760135534454906, "grad_norm": 0.1643591672182083, "learning_rate": 1.3965445720505365e-06, "loss": 0.3211, "step": 2818 }, { "epoch": 1.7766439462590127, "grad_norm": 0.17228056490421295, "learning_rate": 1.3887477061532628e-06, "loss": 0.2829, "step": 2819 }, { "epoch": 1.7772743390725347, "grad_norm": 0.14726178348064423, "learning_rate": 1.380971972927814e-06, "loss": 0.3419, "step": 2820 }, { "epoch": 1.7779047318860566, "grad_norm": 0.16053885221481323, "learning_rate": 1.3732173801578213e-06, "loss": 0.3403, "step": 2821 }, { "epoch": 1.7785351246995784, "grad_norm": 0.15379776060581207, "learning_rate": 1.3654839356057405e-06, "loss": 0.3309, "step": 2822 }, { "epoch": 1.7791655175131003, "grad_norm": 0.14749456942081451, "learning_rate": 1.3577716470128668e-06, "loss": 0.3119, "step": 2823 }, { "epoch": 1.7797959103266223, "grad_norm": 0.17150136828422546, "learning_rate": 1.3500805220993045e-06, "loss": 0.3999, "step": 2824 }, { "epoch": 1.7804263031401442, "grad_norm": 0.20718194544315338, "learning_rate": 1.3424105685639904e-06, "loss": 0.3495, "step": 2825 }, { "epoch": 1.7810566959536662, "grad_norm": 0.18642765283584595, "learning_rate": 1.3347617940846552e-06, "loss": 0.3682, "step": 2826 }, { "epoch": 1.781687088767188, "grad_norm": 0.18365077674388885, "learning_rate": 1.3271342063178393e-06, "loss": 0.3234, "step": 2827 }, { "epoch": 1.78231748158071, "grad_norm": 0.15567843616008759, "learning_rate": 1.3195278128988653e-06, "loss": 0.3824, "step": 2828 }, { "epoch": 1.7829478743942317, "grad_norm": 0.18231748044490814, "learning_rate": 1.3119426214418423e-06, "loss": 0.3105, "step": 2829 }, { "epoch": 1.7835782672077538, "grad_norm": 0.21656112372875214, "learning_rate": 1.3043786395396617e-06, "loss": 0.4615, "step": 2830 }, { "epoch": 1.7842086600212759, "grad_norm": 0.16485925018787384, "learning_rate": 1.2968358747639794e-06, "loss": 0.2991, "step": 2831 }, { "epoch": 1.7848390528347977, "grad_norm": 0.17655867338180542, "learning_rate": 1.2893143346652105e-06, "loss": 0.3318, "step": 2832 }, { "epoch": 1.7854694456483196, "grad_norm": 0.14479409158229828, "learning_rate": 1.2818140267725275e-06, "loss": 0.3241, "step": 2833 }, { "epoch": 1.7860998384618414, "grad_norm": 0.18551374971866608, "learning_rate": 1.2743349585938497e-06, "loss": 0.3531, "step": 2834 }, { "epoch": 1.7867302312753635, "grad_norm": 0.15372885763645172, "learning_rate": 1.2668771376158356e-06, "loss": 0.3643, "step": 2835 }, { "epoch": 1.7873606240888855, "grad_norm": 0.17137745022773743, "learning_rate": 1.2594405713038661e-06, "loss": 0.3494, "step": 2836 }, { "epoch": 1.7879910169024074, "grad_norm": 0.17164146900177002, "learning_rate": 1.2520252671020568e-06, "loss": 0.2951, "step": 2837 }, { "epoch": 1.7886214097159292, "grad_norm": 0.17525000870227814, "learning_rate": 1.2446312324332348e-06, "loss": 0.3379, "step": 2838 }, { "epoch": 1.789251802529451, "grad_norm": 0.17154620587825775, "learning_rate": 1.2372584746989397e-06, "loss": 0.3236, "step": 2839 }, { "epoch": 1.789882195342973, "grad_norm": 0.17524504661560059, "learning_rate": 1.229907001279405e-06, "loss": 0.386, "step": 2840 }, { "epoch": 1.7905125881564952, "grad_norm": 0.17549632489681244, "learning_rate": 1.222576819533567e-06, "loss": 0.381, "step": 2841 }, { "epoch": 1.791142980970017, "grad_norm": 0.1916823834180832, "learning_rate": 1.2152679367990364e-06, "loss": 0.3531, "step": 2842 }, { "epoch": 1.7917733737835388, "grad_norm": 0.21655133366584778, "learning_rate": 1.2079803603921253e-06, "loss": 0.3728, "step": 2843 }, { "epoch": 1.7924037665970607, "grad_norm": 0.16216030716896057, "learning_rate": 1.200714097607791e-06, "loss": 0.3782, "step": 2844 }, { "epoch": 1.7930341594105828, "grad_norm": 0.17528338730335236, "learning_rate": 1.1934691557196804e-06, "loss": 0.3318, "step": 2845 }, { "epoch": 1.7936645522241046, "grad_norm": 0.20596998929977417, "learning_rate": 1.1862455419800744e-06, "loss": 0.3781, "step": 2846 }, { "epoch": 1.7942949450376267, "grad_norm": 0.17023244500160217, "learning_rate": 1.1790432636199241e-06, "loss": 0.4052, "step": 2847 }, { "epoch": 1.7949253378511485, "grad_norm": 0.14573776721954346, "learning_rate": 1.17186232784881e-06, "loss": 0.2995, "step": 2848 }, { "epoch": 1.7955557306646703, "grad_norm": 0.178813636302948, "learning_rate": 1.1647027418549594e-06, "loss": 0.3702, "step": 2849 }, { "epoch": 1.7961861234781924, "grad_norm": 0.17194363474845886, "learning_rate": 1.1575645128052195e-06, "loss": 0.3203, "step": 2850 }, { "epoch": 1.7968165162917142, "grad_norm": 0.16584280133247375, "learning_rate": 1.1504476478450595e-06, "loss": 0.321, "step": 2851 }, { "epoch": 1.7974469091052363, "grad_norm": 0.1823454350233078, "learning_rate": 1.1433521540985658e-06, "loss": 0.3391, "step": 2852 }, { "epoch": 1.7980773019187581, "grad_norm": 0.17875845730304718, "learning_rate": 1.1362780386684369e-06, "loss": 0.3315, "step": 2853 }, { "epoch": 1.79870769473228, "grad_norm": 0.18415741622447968, "learning_rate": 1.1292253086359606e-06, "loss": 0.3427, "step": 2854 }, { "epoch": 1.7993380875458018, "grad_norm": 0.14331680536270142, "learning_rate": 1.1221939710610197e-06, "loss": 0.3506, "step": 2855 }, { "epoch": 1.7999684803593239, "grad_norm": 0.16605164110660553, "learning_rate": 1.1151840329820915e-06, "loss": 0.3184, "step": 2856 }, { "epoch": 1.800598873172846, "grad_norm": 0.16677996516227722, "learning_rate": 1.1081955014162279e-06, "loss": 0.287, "step": 2857 }, { "epoch": 1.8012292659863678, "grad_norm": 0.17730239033699036, "learning_rate": 1.1012283833590457e-06, "loss": 0.3395, "step": 2858 }, { "epoch": 1.8018596587998896, "grad_norm": 0.20990370213985443, "learning_rate": 1.0942826857847358e-06, "loss": 0.3315, "step": 2859 }, { "epoch": 1.8024900516134115, "grad_norm": 0.19944962859153748, "learning_rate": 1.0873584156460343e-06, "loss": 0.3541, "step": 2860 }, { "epoch": 1.8031204444269335, "grad_norm": 0.19780515134334564, "learning_rate": 1.0804555798742492e-06, "loss": 0.432, "step": 2861 }, { "epoch": 1.8037508372404556, "grad_norm": 0.1663893759250641, "learning_rate": 1.0735741853792105e-06, "loss": 0.3963, "step": 2862 }, { "epoch": 1.8043812300539774, "grad_norm": 0.15331469476222992, "learning_rate": 1.0667142390493034e-06, "loss": 0.3588, "step": 2863 }, { "epoch": 1.8050116228674993, "grad_norm": 0.16489370167255402, "learning_rate": 1.0598757477514224e-06, "loss": 0.3506, "step": 2864 }, { "epoch": 1.8056420156810211, "grad_norm": 0.2095303237438202, "learning_rate": 1.053058718331009e-06, "loss": 0.3441, "step": 2865 }, { "epoch": 1.8062724084945432, "grad_norm": 0.15023407340049744, "learning_rate": 1.0462631576119996e-06, "loss": 0.3304, "step": 2866 }, { "epoch": 1.8069028013080652, "grad_norm": 0.15889757871627808, "learning_rate": 1.0394890723968607e-06, "loss": 0.3887, "step": 2867 }, { "epoch": 1.807533194121587, "grad_norm": 0.16765765845775604, "learning_rate": 1.0327364694665376e-06, "loss": 0.3276, "step": 2868 }, { "epoch": 1.808163586935109, "grad_norm": 0.14473390579223633, "learning_rate": 1.026005355580491e-06, "loss": 0.3083, "step": 2869 }, { "epoch": 1.8087939797486308, "grad_norm": 0.20213527977466583, "learning_rate": 1.0192957374766663e-06, "loss": 0.3985, "step": 2870 }, { "epoch": 1.8094243725621528, "grad_norm": 0.14654994010925293, "learning_rate": 1.0126076218714881e-06, "loss": 0.3177, "step": 2871 }, { "epoch": 1.8100547653756747, "grad_norm": 0.17342987656593323, "learning_rate": 1.0059410154598538e-06, "loss": 0.3344, "step": 2872 }, { "epoch": 1.8106851581891967, "grad_norm": 0.1641921103000641, "learning_rate": 9.992959249151373e-07, "loss": 0.3203, "step": 2873 }, { "epoch": 1.8113155510027186, "grad_norm": 0.14185187220573425, "learning_rate": 9.926723568891683e-07, "loss": 0.3284, "step": 2874 }, { "epoch": 1.8119459438162404, "grad_norm": 0.1535300761461258, "learning_rate": 9.860703180122427e-07, "loss": 0.3436, "step": 2875 }, { "epoch": 1.8125763366297625, "grad_norm": 0.16254208981990814, "learning_rate": 9.794898148930893e-07, "loss": 0.3308, "step": 2876 }, { "epoch": 1.8132067294432843, "grad_norm": 0.17688316106796265, "learning_rate": 9.729308541188886e-07, "loss": 0.3402, "step": 2877 }, { "epoch": 1.8138371222568064, "grad_norm": 0.1664654016494751, "learning_rate": 9.663934422552615e-07, "loss": 0.3684, "step": 2878 }, { "epoch": 1.8144675150703282, "grad_norm": 0.17551520466804504, "learning_rate": 9.598775858462505e-07, "loss": 0.3491, "step": 2879 }, { "epoch": 1.81509790788385, "grad_norm": 0.19198037683963776, "learning_rate": 9.53383291414321e-07, "loss": 0.392, "step": 2880 }, { "epoch": 1.815728300697372, "grad_norm": 0.1434807926416397, "learning_rate": 9.4691056546036e-07, "loss": 0.33, "step": 2881 }, { "epoch": 1.816358693510894, "grad_norm": 0.1824355274438858, "learning_rate": 9.404594144636523e-07, "loss": 0.434, "step": 2882 }, { "epoch": 1.816989086324416, "grad_norm": 0.18050681054592133, "learning_rate": 9.340298448819046e-07, "loss": 0.3432, "step": 2883 }, { "epoch": 1.8176194791379379, "grad_norm": 0.16399838030338287, "learning_rate": 9.276218631512063e-07, "loss": 0.3352, "step": 2884 }, { "epoch": 1.8182498719514597, "grad_norm": 0.1734437495470047, "learning_rate": 9.212354756860411e-07, "loss": 0.3327, "step": 2885 }, { "epoch": 1.8188802647649815, "grad_norm": 0.14438305795192719, "learning_rate": 9.14870688879273e-07, "loss": 0.3377, "step": 2886 }, { "epoch": 1.8195106575785036, "grad_norm": 0.14935597777366638, "learning_rate": 9.085275091021478e-07, "loss": 0.3107, "step": 2887 }, { "epoch": 1.8201410503920257, "grad_norm": 0.18067923188209534, "learning_rate": 9.022059427042814e-07, "loss": 0.333, "step": 2888 }, { "epoch": 1.8207714432055475, "grad_norm": 0.1463068276643753, "learning_rate": 8.959059960136596e-07, "loss": 0.3535, "step": 2889 }, { "epoch": 1.8214018360190694, "grad_norm": 0.18298909068107605, "learning_rate": 8.896276753366126e-07, "loss": 0.3211, "step": 2890 }, { "epoch": 1.8220322288325912, "grad_norm": 0.184007465839386, "learning_rate": 8.833709869578333e-07, "loss": 0.3909, "step": 2891 }, { "epoch": 1.8226626216461133, "grad_norm": 0.14271940290927887, "learning_rate": 8.771359371403592e-07, "loss": 0.3538, "step": 2892 }, { "epoch": 1.8232930144596353, "grad_norm": 0.18465162813663483, "learning_rate": 8.709225321255704e-07, "loss": 0.3873, "step": 2893 }, { "epoch": 1.8239234072731572, "grad_norm": 0.14754216372966766, "learning_rate": 8.64730778133169e-07, "loss": 0.3445, "step": 2894 }, { "epoch": 1.824553800086679, "grad_norm": 0.15001064538955688, "learning_rate": 8.585606813611947e-07, "loss": 0.3217, "step": 2895 }, { "epoch": 1.8251841929002008, "grad_norm": 0.19190865755081177, "learning_rate": 8.524122479860042e-07, "loss": 0.365, "step": 2896 }, { "epoch": 1.825814585713723, "grad_norm": 0.1557355672121048, "learning_rate": 8.462854841622719e-07, "loss": 0.3154, "step": 2897 }, { "epoch": 1.8264449785272447, "grad_norm": 0.16147379577159882, "learning_rate": 8.401803960229696e-07, "loss": 0.3415, "step": 2898 }, { "epoch": 1.8270753713407668, "grad_norm": 0.1741449236869812, "learning_rate": 8.340969896793885e-07, "loss": 0.3075, "step": 2899 }, { "epoch": 1.8277057641542886, "grad_norm": 0.22572091221809387, "learning_rate": 8.280352712211e-07, "loss": 0.3748, "step": 2900 }, { "epoch": 1.8283361569678105, "grad_norm": 0.18792511522769928, "learning_rate": 8.21995246715983e-07, "loss": 0.3281, "step": 2901 }, { "epoch": 1.8289665497813323, "grad_norm": 0.19930507242679596, "learning_rate": 8.159769222101785e-07, "loss": 0.3866, "step": 2902 }, { "epoch": 1.8295969425948544, "grad_norm": 0.15603318810462952, "learning_rate": 8.099803037281248e-07, "loss": 0.3551, "step": 2903 }, { "epoch": 1.8302273354083765, "grad_norm": 0.17856952548027039, "learning_rate": 8.040053972725179e-07, "loss": 0.343, "step": 2904 }, { "epoch": 1.8308577282218983, "grad_norm": 0.18663626909255981, "learning_rate": 7.980522088243358e-07, "loss": 0.3805, "step": 2905 }, { "epoch": 1.8314881210354201, "grad_norm": 0.14066052436828613, "learning_rate": 7.921207443427994e-07, "loss": 0.3002, "step": 2906 }, { "epoch": 1.832118513848942, "grad_norm": 0.1478758603334427, "learning_rate": 7.862110097653966e-07, "loss": 0.274, "step": 2907 }, { "epoch": 1.832748906662464, "grad_norm": 0.1529366672039032, "learning_rate": 7.803230110078502e-07, "loss": 0.3466, "step": 2908 }, { "epoch": 1.833379299475986, "grad_norm": 0.16557784378528595, "learning_rate": 7.744567539641406e-07, "loss": 0.3323, "step": 2909 }, { "epoch": 1.834009692289508, "grad_norm": 0.2191508263349533, "learning_rate": 7.686122445064731e-07, "loss": 0.4284, "step": 2910 }, { "epoch": 1.8346400851030298, "grad_norm": 0.21287937462329865, "learning_rate": 7.627894884852902e-07, "loss": 0.3492, "step": 2911 }, { "epoch": 1.8352704779165516, "grad_norm": 0.1814602017402649, "learning_rate": 7.569884917292519e-07, "loss": 0.3924, "step": 2912 }, { "epoch": 1.8359008707300737, "grad_norm": 0.18028287589550018, "learning_rate": 7.512092600452408e-07, "loss": 0.386, "step": 2913 }, { "epoch": 1.8365312635435957, "grad_norm": 0.15933865308761597, "learning_rate": 7.454517992183569e-07, "loss": 0.3295, "step": 2914 }, { "epoch": 1.8371616563571176, "grad_norm": 0.14482073485851288, "learning_rate": 7.397161150118998e-07, "loss": 0.3374, "step": 2915 }, { "epoch": 1.8377920491706394, "grad_norm": 0.17769312858581543, "learning_rate": 7.340022131673693e-07, "loss": 0.2856, "step": 2916 }, { "epoch": 1.8384224419841613, "grad_norm": 0.18236099183559418, "learning_rate": 7.283100994044678e-07, "loss": 0.3551, "step": 2917 }, { "epoch": 1.8390528347976833, "grad_norm": 0.138231098651886, "learning_rate": 7.226397794210848e-07, "loss": 0.3117, "step": 2918 }, { "epoch": 1.8396832276112054, "grad_norm": 0.18965476751327515, "learning_rate": 7.169912588932976e-07, "loss": 0.3806, "step": 2919 }, { "epoch": 1.8403136204247272, "grad_norm": 0.17229288816452026, "learning_rate": 7.11364543475348e-07, "loss": 0.2963, "step": 2920 }, { "epoch": 1.840944013238249, "grad_norm": 0.1469162553548813, "learning_rate": 7.057596387996707e-07, "loss": 0.2906, "step": 2921 }, { "epoch": 1.841574406051771, "grad_norm": 0.16973532736301422, "learning_rate": 7.001765504768451e-07, "loss": 0.3742, "step": 2922 }, { "epoch": 1.842204798865293, "grad_norm": 0.1712886393070221, "learning_rate": 6.946152840956358e-07, "loss": 0.397, "step": 2923 }, { "epoch": 1.8428351916788148, "grad_norm": 0.18453745543956757, "learning_rate": 6.890758452229471e-07, "loss": 0.3905, "step": 2924 }, { "epoch": 1.8434655844923369, "grad_norm": 0.16405226290225983, "learning_rate": 6.835582394038359e-07, "loss": 0.2815, "step": 2925 }, { "epoch": 1.8440959773058587, "grad_norm": 0.16712190210819244, "learning_rate": 6.780624721615092e-07, "loss": 0.3236, "step": 2926 }, { "epoch": 1.8447263701193806, "grad_norm": 0.19651249051094055, "learning_rate": 6.725885489973089e-07, "loss": 0.4105, "step": 2927 }, { "epoch": 1.8453567629329024, "grad_norm": 0.18198879063129425, "learning_rate": 6.671364753907116e-07, "loss": 0.3217, "step": 2928 }, { "epoch": 1.8459871557464245, "grad_norm": 0.1818927526473999, "learning_rate": 6.617062567993245e-07, "loss": 0.309, "step": 2929 }, { "epoch": 1.8466175485599465, "grad_norm": 0.17822176218032837, "learning_rate": 6.562978986588744e-07, "loss": 0.3661, "step": 2930 }, { "epoch": 1.8472479413734684, "grad_norm": 0.1938237100839615, "learning_rate": 6.50911406383208e-07, "loss": 0.3514, "step": 2931 }, { "epoch": 1.8478783341869902, "grad_norm": 0.1587604582309723, "learning_rate": 6.4554678536428e-07, "loss": 0.3102, "step": 2932 }, { "epoch": 1.848508727000512, "grad_norm": 0.17696255445480347, "learning_rate": 6.402040409721568e-07, "loss": 0.3909, "step": 2933 }, { "epoch": 1.8491391198140341, "grad_norm": 0.13390642404556274, "learning_rate": 6.348831785550033e-07, "loss": 0.2746, "step": 2934 }, { "epoch": 1.8497695126275562, "grad_norm": 0.13030406832695007, "learning_rate": 6.295842034390786e-07, "loss": 0.3389, "step": 2935 }, { "epoch": 1.850399905441078, "grad_norm": 0.12831594049930573, "learning_rate": 6.243071209287374e-07, "loss": 0.2808, "step": 2936 }, { "epoch": 1.8510302982545999, "grad_norm": 0.19339363276958466, "learning_rate": 6.19051936306414e-07, "loss": 0.4201, "step": 2937 }, { "epoch": 1.8516606910681217, "grad_norm": 0.1678164303302765, "learning_rate": 6.138186548326255e-07, "loss": 0.369, "step": 2938 }, { "epoch": 1.8522910838816438, "grad_norm": 0.17880061268806458, "learning_rate": 6.086072817459615e-07, "loss": 0.3824, "step": 2939 }, { "epoch": 1.8529214766951658, "grad_norm": 0.22390973567962646, "learning_rate": 6.034178222630865e-07, "loss": 0.4395, "step": 2940 }, { "epoch": 1.8535518695086877, "grad_norm": 0.15214017033576965, "learning_rate": 5.982502815787228e-07, "loss": 0.3766, "step": 2941 }, { "epoch": 1.8541822623222095, "grad_norm": 0.17880332469940186, "learning_rate": 5.931046648656524e-07, "loss": 0.4008, "step": 2942 }, { "epoch": 1.8548126551357313, "grad_norm": 0.1607338786125183, "learning_rate": 5.879809772747177e-07, "loss": 0.3282, "step": 2943 }, { "epoch": 1.8554430479492534, "grad_norm": 0.15719808638095856, "learning_rate": 5.82879223934796e-07, "loss": 0.355, "step": 2944 }, { "epoch": 1.8560734407627753, "grad_norm": 0.15147340297698975, "learning_rate": 5.777994099528272e-07, "loss": 0.3112, "step": 2945 }, { "epoch": 1.8567038335762973, "grad_norm": 0.15864863991737366, "learning_rate": 5.727415404137737e-07, "loss": 0.3386, "step": 2946 }, { "epoch": 1.8573342263898192, "grad_norm": 0.1567879170179367, "learning_rate": 5.67705620380638e-07, "loss": 0.3387, "step": 2947 }, { "epoch": 1.857964619203341, "grad_norm": 0.14126014709472656, "learning_rate": 5.626916548944478e-07, "loss": 0.3969, "step": 2948 }, { "epoch": 1.858595012016863, "grad_norm": 0.1492016315460205, "learning_rate": 5.576996489742581e-07, "loss": 0.3638, "step": 2949 }, { "epoch": 1.859225404830385, "grad_norm": 0.21211956441402435, "learning_rate": 5.527296076171368e-07, "loss": 0.382, "step": 2950 }, { "epoch": 1.859855797643907, "grad_norm": 0.17853829264640808, "learning_rate": 5.477815357981742e-07, "loss": 0.3108, "step": 2951 }, { "epoch": 1.8604861904574288, "grad_norm": 0.18469850718975067, "learning_rate": 5.428554384704557e-07, "loss": 0.365, "step": 2952 }, { "epoch": 1.8611165832709506, "grad_norm": 0.15979118645191193, "learning_rate": 5.379513205650768e-07, "loss": 0.2674, "step": 2953 }, { "epoch": 1.8617469760844725, "grad_norm": 0.21204063296318054, "learning_rate": 5.330691869911359e-07, "loss": 0.3816, "step": 2954 }, { "epoch": 1.8623773688979945, "grad_norm": 0.19262298941612244, "learning_rate": 5.282090426357208e-07, "loss": 0.3409, "step": 2955 }, { "epoch": 1.8630077617115166, "grad_norm": 0.16887032985687256, "learning_rate": 5.233708923638978e-07, "loss": 0.3895, "step": 2956 }, { "epoch": 1.8636381545250384, "grad_norm": 0.15567231178283691, "learning_rate": 5.185547410187349e-07, "loss": 0.3607, "step": 2957 }, { "epoch": 1.8642685473385603, "grad_norm": 0.17275381088256836, "learning_rate": 5.137605934212661e-07, "loss": 0.2954, "step": 2958 }, { "epoch": 1.8648989401520821, "grad_norm": 0.18041570484638214, "learning_rate": 5.089884543705073e-07, "loss": 0.4095, "step": 2959 }, { "epoch": 1.8655293329656042, "grad_norm": 0.16471639275550842, "learning_rate": 5.042383286434324e-07, "loss": 0.3647, "step": 2960 }, { "epoch": 1.8661597257791263, "grad_norm": 0.15917225182056427, "learning_rate": 4.995102209949904e-07, "loss": 0.3822, "step": 2961 }, { "epoch": 1.866790118592648, "grad_norm": 0.174025759100914, "learning_rate": 4.948041361580852e-07, "loss": 0.3648, "step": 2962 }, { "epoch": 1.86742051140617, "grad_norm": 0.151805579662323, "learning_rate": 4.901200788435811e-07, "loss": 0.3146, "step": 2963 }, { "epoch": 1.8680509042196918, "grad_norm": 0.19342181086540222, "learning_rate": 4.8545805374028e-07, "loss": 0.3677, "step": 2964 }, { "epoch": 1.8686812970332138, "grad_norm": 0.14720351994037628, "learning_rate": 4.808180655149438e-07, "loss": 0.3263, "step": 2965 }, { "epoch": 1.869311689846736, "grad_norm": 0.1777946650981903, "learning_rate": 4.7620011881226224e-07, "loss": 0.3635, "step": 2966 }, { "epoch": 1.8699420826602577, "grad_norm": 0.16213665902614594, "learning_rate": 4.7160421825487493e-07, "loss": 0.3436, "step": 2967 }, { "epoch": 1.8705724754737796, "grad_norm": 0.18879404664039612, "learning_rate": 4.670303684433394e-07, "loss": 0.3891, "step": 2968 }, { "epoch": 1.8712028682873014, "grad_norm": 0.13427011668682098, "learning_rate": 4.6247857395615076e-07, "loss": 0.3509, "step": 2969 }, { "epoch": 1.8718332611008235, "grad_norm": 0.18270568549633026, "learning_rate": 4.579488393497167e-07, "loss": 0.3303, "step": 2970 }, { "epoch": 1.8724636539143453, "grad_norm": 0.16502298414707184, "learning_rate": 4.534411691583701e-07, "loss": 0.3537, "step": 2971 }, { "epoch": 1.8730940467278674, "grad_norm": 0.2042229175567627, "learning_rate": 4.489555678943541e-07, "loss": 0.386, "step": 2972 }, { "epoch": 1.8737244395413892, "grad_norm": 0.1804782897233963, "learning_rate": 4.4449204004782445e-07, "loss": 0.3777, "step": 2973 }, { "epoch": 1.874354832354911, "grad_norm": 0.200051411986351, "learning_rate": 4.400505900868321e-07, "loss": 0.34, "step": 2974 }, { "epoch": 1.8749852251684331, "grad_norm": 0.1387588083744049, "learning_rate": 4.3563122245733805e-07, "loss": 0.2821, "step": 2975 }, { "epoch": 1.875615617981955, "grad_norm": 0.16911667585372925, "learning_rate": 4.312339415831912e-07, "loss": 0.3272, "step": 2976 }, { "epoch": 1.876246010795477, "grad_norm": 0.18176445364952087, "learning_rate": 4.2685875186613547e-07, "loss": 0.3403, "step": 2977 }, { "epoch": 1.8768764036089989, "grad_norm": 0.15195323526859283, "learning_rate": 4.225056576857975e-07, "loss": 0.337, "step": 2978 }, { "epoch": 1.8775067964225207, "grad_norm": 0.17995168268680573, "learning_rate": 4.1817466339968914e-07, "loss": 0.3082, "step": 2979 }, { "epoch": 1.8781371892360426, "grad_norm": 0.18412546813488007, "learning_rate": 4.1386577334319997e-07, "loss": 0.3054, "step": 2980 }, { "epoch": 1.8787675820495646, "grad_norm": 0.18003132939338684, "learning_rate": 4.0957899182959226e-07, "loss": 0.3801, "step": 2981 }, { "epoch": 1.8793979748630867, "grad_norm": 0.17081038653850555, "learning_rate": 4.053143231499934e-07, "loss": 0.3192, "step": 2982 }, { "epoch": 1.8800283676766085, "grad_norm": 0.1643000692129135, "learning_rate": 4.010717715734036e-07, "loss": 0.3418, "step": 2983 }, { "epoch": 1.8806587604901304, "grad_norm": 0.12995314598083496, "learning_rate": 3.9685134134667317e-07, "loss": 0.2888, "step": 2984 }, { "epoch": 1.8812891533036522, "grad_norm": 0.1708272397518158, "learning_rate": 3.9265303669452277e-07, "loss": 0.3322, "step": 2985 }, { "epoch": 1.8819195461171743, "grad_norm": 0.16013602912425995, "learning_rate": 3.884768618195106e-07, "loss": 0.3303, "step": 2986 }, { "epoch": 1.8825499389306963, "grad_norm": 0.1538516879081726, "learning_rate": 3.8432282090205023e-07, "loss": 0.3555, "step": 2987 }, { "epoch": 1.8831803317442182, "grad_norm": 0.19950860738754272, "learning_rate": 3.8019091810039544e-07, "loss": 0.4115, "step": 2988 }, { "epoch": 1.88381072455774, "grad_norm": 0.20162293314933777, "learning_rate": 3.760811575506426e-07, "loss": 0.3634, "step": 2989 }, { "epoch": 1.8844411173712619, "grad_norm": 0.17130547761917114, "learning_rate": 3.71993543366721e-07, "loss": 0.3337, "step": 2990 }, { "epoch": 1.885071510184784, "grad_norm": 0.14211808145046234, "learning_rate": 3.679280796403975e-07, "loss": 0.3429, "step": 2991 }, { "epoch": 1.885701902998306, "grad_norm": 0.16880738735198975, "learning_rate": 3.638847704412543e-07, "loss": 0.3424, "step": 2992 }, { "epoch": 1.8863322958118278, "grad_norm": 0.18482467532157898, "learning_rate": 3.598636198167038e-07, "loss": 0.3143, "step": 2993 }, { "epoch": 1.8869626886253497, "grad_norm": 0.15687884390354156, "learning_rate": 3.5586463179197864e-07, "loss": 0.2924, "step": 2994 }, { "epoch": 1.8875930814388715, "grad_norm": 0.19636908173561096, "learning_rate": 3.518878103701268e-07, "loss": 0.3539, "step": 2995 }, { "epoch": 1.8882234742523936, "grad_norm": 0.15740831196308136, "learning_rate": 3.4793315953199624e-07, "loss": 0.3046, "step": 2996 }, { "epoch": 1.8888538670659154, "grad_norm": 0.13536562025547028, "learning_rate": 3.4400068323625803e-07, "loss": 0.3097, "step": 2997 }, { "epoch": 1.8894842598794375, "grad_norm": 0.1597716510295868, "learning_rate": 3.4009038541937574e-07, "loss": 0.3146, "step": 2998 }, { "epoch": 1.8901146526929593, "grad_norm": 0.1682119220495224, "learning_rate": 3.3620226999561325e-07, "loss": 0.4298, "step": 2999 }, { "epoch": 1.8907450455064811, "grad_norm": 0.20387521386146545, "learning_rate": 3.3233634085703236e-07, "loss": 0.4097, "step": 3000 }, { "epoch": 1.8907450455064811, "eval_loss": 0.41817376017570496, "eval_runtime": 222.7094, "eval_samples_per_second": 4.49, "eval_steps_per_second": 4.49, "step": 3000 }, { "epoch": 1.891375438320003, "grad_norm": 0.19256849586963654, "learning_rate": 3.284926018734849e-07, "loss": 0.309, "step": 3001 }, { "epoch": 1.892005831133525, "grad_norm": 0.20908156037330627, "learning_rate": 3.246710568926081e-07, "loss": 0.396, "step": 3002 }, { "epoch": 1.8926362239470471, "grad_norm": 0.14932669699192047, "learning_rate": 3.20871709739827e-07, "loss": 0.3048, "step": 3003 }, { "epoch": 1.893266616760569, "grad_norm": 0.1737649291753769, "learning_rate": 3.170945642183393e-07, "loss": 0.3218, "step": 3004 }, { "epoch": 1.8938970095740908, "grad_norm": 0.1602880358695984, "learning_rate": 3.133396241091305e-07, "loss": 0.3153, "step": 3005 }, { "epoch": 1.8945274023876126, "grad_norm": 0.15219104290008545, "learning_rate": 3.0960689317093896e-07, "loss": 0.3298, "step": 3006 }, { "epoch": 1.8951577952011347, "grad_norm": 0.16922716796398163, "learning_rate": 3.0589637514029326e-07, "loss": 0.3161, "step": 3007 }, { "epoch": 1.8957881880146568, "grad_norm": 0.18126600980758667, "learning_rate": 3.0220807373146976e-07, "loss": 0.3512, "step": 3008 }, { "epoch": 1.8964185808281786, "grad_norm": 0.17149627208709717, "learning_rate": 2.985419926365177e-07, "loss": 0.3605, "step": 3009 }, { "epoch": 1.8970489736417004, "grad_norm": 0.18697969615459442, "learning_rate": 2.9489813552523146e-07, "loss": 0.3286, "step": 3010 }, { "epoch": 1.8976793664552223, "grad_norm": 0.17346805334091187, "learning_rate": 2.912765060451683e-07, "loss": 0.3158, "step": 3011 }, { "epoch": 1.8983097592687443, "grad_norm": 0.16184809803962708, "learning_rate": 2.876771078216334e-07, "loss": 0.2843, "step": 3012 }, { "epoch": 1.8989401520822664, "grad_norm": 0.16137875616550446, "learning_rate": 2.8409994445767696e-07, "loss": 0.3306, "step": 3013 }, { "epoch": 1.8995705448957882, "grad_norm": 0.17828133702278137, "learning_rate": 2.8054501953408725e-07, "loss": 0.3278, "step": 3014 }, { "epoch": 1.90020093770931, "grad_norm": 0.17460158467292786, "learning_rate": 2.770123366094027e-07, "loss": 0.3224, "step": 3015 }, { "epoch": 1.900831330522832, "grad_norm": 0.18349096179008484, "learning_rate": 2.735018992198846e-07, "loss": 0.3714, "step": 3016 }, { "epoch": 1.901461723336354, "grad_norm": 0.17420694231987, "learning_rate": 2.7001371087953697e-07, "loss": 0.3074, "step": 3017 }, { "epoch": 1.902092116149876, "grad_norm": 0.17939402163028717, "learning_rate": 2.665477750800818e-07, "loss": 0.3582, "step": 3018 }, { "epoch": 1.902722508963398, "grad_norm": 0.18356801569461823, "learning_rate": 2.6310409529097636e-07, "loss": 0.3741, "step": 3019 }, { "epoch": 1.9033529017769197, "grad_norm": 0.16717529296875, "learning_rate": 2.5968267495939056e-07, "loss": 0.3853, "step": 3020 }, { "epoch": 1.9039832945904416, "grad_norm": 0.15661168098449707, "learning_rate": 2.562835175102199e-07, "loss": 0.3774, "step": 3021 }, { "epoch": 1.9046136874039636, "grad_norm": 0.15401984751224518, "learning_rate": 2.5290662634606997e-07, "loss": 0.3381, "step": 3022 }, { "epoch": 1.9052440802174855, "grad_norm": 0.20636636018753052, "learning_rate": 2.495520048472569e-07, "loss": 0.3507, "step": 3023 }, { "epoch": 1.9058744730310075, "grad_norm": 0.1823091357946396, "learning_rate": 2.4621965637180195e-07, "loss": 0.283, "step": 3024 }, { "epoch": 1.9065048658445294, "grad_norm": 0.14718377590179443, "learning_rate": 2.4290958425544187e-07, "loss": 0.4077, "step": 3025 }, { "epoch": 1.9071352586580512, "grad_norm": 0.15838302671909332, "learning_rate": 2.396217918116037e-07, "loss": 0.3339, "step": 3026 }, { "epoch": 1.907765651471573, "grad_norm": 0.18296922743320465, "learning_rate": 2.363562823314147e-07, "loss": 0.3178, "step": 3027 }, { "epoch": 1.9083960442850951, "grad_norm": 0.19694828987121582, "learning_rate": 2.3311305908369774e-07, "loss": 0.3308, "step": 3028 }, { "epoch": 1.9090264370986172, "grad_norm": 0.166620671749115, "learning_rate": 2.298921253149708e-07, "loss": 0.2882, "step": 3029 }, { "epoch": 1.909656829912139, "grad_norm": 0.18683145940303802, "learning_rate": 2.2669348424942977e-07, "loss": 0.3599, "step": 3030 }, { "epoch": 1.9102872227256609, "grad_norm": 0.18327577412128448, "learning_rate": 2.2351713908896594e-07, "loss": 0.2976, "step": 3031 }, { "epoch": 1.9109176155391827, "grad_norm": 0.1980234980583191, "learning_rate": 2.203630930131434e-07, "loss": 0.3111, "step": 3032 }, { "epoch": 1.9115480083527048, "grad_norm": 0.18122927844524384, "learning_rate": 2.1723134917920913e-07, "loss": 0.4303, "step": 3033 }, { "epoch": 1.9121784011662268, "grad_norm": 0.17239895462989807, "learning_rate": 2.1412191072208543e-07, "loss": 0.3429, "step": 3034 }, { "epoch": 1.9128087939797487, "grad_norm": 0.15282370150089264, "learning_rate": 2.110347807543675e-07, "loss": 0.3253, "step": 3035 }, { "epoch": 1.9134391867932705, "grad_norm": 0.18070636689662933, "learning_rate": 2.079699623663134e-07, "loss": 0.3788, "step": 3036 }, { "epoch": 1.9140695796067924, "grad_norm": 0.20107102394104004, "learning_rate": 2.049274586258515e-07, "loss": 0.3869, "step": 3037 }, { "epoch": 1.9146999724203144, "grad_norm": 0.18471233546733856, "learning_rate": 2.019072725785756e-07, "loss": 0.4217, "step": 3038 }, { "epoch": 1.9153303652338365, "grad_norm": 0.15934079885482788, "learning_rate": 1.9890940724773738e-07, "loss": 0.3242, "step": 3039 }, { "epoch": 1.9159607580473583, "grad_norm": 0.16995728015899658, "learning_rate": 1.9593386563423882e-07, "loss": 0.3333, "step": 3040 }, { "epoch": 1.9165911508608802, "grad_norm": 0.2071707844734192, "learning_rate": 1.9298065071663985e-07, "loss": 0.3952, "step": 3041 }, { "epoch": 1.917221543674402, "grad_norm": 0.17585161328315735, "learning_rate": 1.900497654511582e-07, "loss": 0.3582, "step": 3042 }, { "epoch": 1.917851936487924, "grad_norm": 0.18772096931934357, "learning_rate": 1.8714121277164958e-07, "loss": 0.326, "step": 3043 }, { "epoch": 1.918482329301446, "grad_norm": 0.15703828632831573, "learning_rate": 1.8425499558961502e-07, "loss": 0.345, "step": 3044 }, { "epoch": 1.919112722114968, "grad_norm": 0.1644994169473648, "learning_rate": 1.813911167942034e-07, "loss": 0.3261, "step": 3045 }, { "epoch": 1.9197431149284898, "grad_norm": 0.15160581469535828, "learning_rate": 1.7854957925219657e-07, "loss": 0.3345, "step": 3046 }, { "epoch": 1.9203735077420117, "grad_norm": 0.16602809727191925, "learning_rate": 1.757303858080167e-07, "loss": 0.328, "step": 3047 }, { "epoch": 1.9210039005555337, "grad_norm": 0.16758157312870026, "learning_rate": 1.7293353928371893e-07, "loss": 0.3538, "step": 3048 }, { "epoch": 1.9216342933690556, "grad_norm": 0.18970537185668945, "learning_rate": 1.7015904247898618e-07, "loss": 0.3269, "step": 3049 }, { "epoch": 1.9222646861825776, "grad_norm": 0.15211471915245056, "learning_rate": 1.6740689817112936e-07, "loss": 0.35, "step": 3050 }, { "epoch": 1.9228950789960995, "grad_norm": 0.14648528397083282, "learning_rate": 1.6467710911508476e-07, "loss": 0.2858, "step": 3051 }, { "epoch": 1.9235254718096213, "grad_norm": 0.22460030019283295, "learning_rate": 1.6196967804341408e-07, "loss": 0.4544, "step": 3052 }, { "epoch": 1.9241558646231431, "grad_norm": 0.14347076416015625, "learning_rate": 1.5928460766629432e-07, "loss": 0.3336, "step": 3053 }, { "epoch": 1.9247862574366652, "grad_norm": 0.2079204022884369, "learning_rate": 1.5662190067151805e-07, "loss": 0.3914, "step": 3054 }, { "epoch": 1.9254166502501873, "grad_norm": 0.1357291042804718, "learning_rate": 1.5398155972449314e-07, "loss": 0.2695, "step": 3055 }, { "epoch": 1.926047043063709, "grad_norm": 0.1850150227546692, "learning_rate": 1.5136358746824287e-07, "loss": 0.3429, "step": 3056 }, { "epoch": 1.926677435877231, "grad_norm": 0.15783745050430298, "learning_rate": 1.4876798652339595e-07, "loss": 0.39, "step": 3057 }, { "epoch": 1.9273078286907528, "grad_norm": 0.17066796123981476, "learning_rate": 1.4619475948818407e-07, "loss": 0.3225, "step": 3058 }, { "epoch": 1.9279382215042749, "grad_norm": 0.15452978014945984, "learning_rate": 1.436439089384467e-07, "loss": 0.3813, "step": 3059 }, { "epoch": 1.928568614317797, "grad_norm": 0.1883036345243454, "learning_rate": 1.4111543742762135e-07, "loss": 0.3995, "step": 3060 }, { "epoch": 1.9291990071313188, "grad_norm": 0.14374081790447235, "learning_rate": 1.3860934748674588e-07, "loss": 0.3013, "step": 3061 }, { "epoch": 1.9298293999448406, "grad_norm": 0.21677044034004211, "learning_rate": 1.3612564162445106e-07, "loss": 0.4131, "step": 3062 }, { "epoch": 1.9304597927583624, "grad_norm": 0.16398075222969055, "learning_rate": 1.3366432232696568e-07, "loss": 0.3285, "step": 3063 }, { "epoch": 1.9310901855718845, "grad_norm": 0.15119622647762299, "learning_rate": 1.3122539205810387e-07, "loss": 0.2927, "step": 3064 }, { "epoch": 1.9317205783854066, "grad_norm": 0.1611369550228119, "learning_rate": 1.2880885325927268e-07, "loss": 0.3363, "step": 3065 }, { "epoch": 1.9323509711989284, "grad_norm": 0.1843705177307129, "learning_rate": 1.2641470834945963e-07, "loss": 0.3361, "step": 3066 }, { "epoch": 1.9329813640124502, "grad_norm": 0.21802350878715515, "learning_rate": 1.2404295972524266e-07, "loss": 0.4132, "step": 3067 }, { "epoch": 1.933611756825972, "grad_norm": 0.1921495646238327, "learning_rate": 1.2169360976077266e-07, "loss": 0.3443, "step": 3068 }, { "epoch": 1.9342421496394941, "grad_norm": 0.1394999772310257, "learning_rate": 1.193666608077859e-07, "loss": 0.2871, "step": 3069 }, { "epoch": 1.934872542453016, "grad_norm": 0.1905948966741562, "learning_rate": 1.1706211519559169e-07, "loss": 0.4448, "step": 3070 }, { "epoch": 1.935502935266538, "grad_norm": 0.18064671754837036, "learning_rate": 1.1477997523107464e-07, "loss": 0.3654, "step": 3071 }, { "epoch": 1.93613332808006, "grad_norm": 0.16649015247821808, "learning_rate": 1.1252024319868992e-07, "loss": 0.3733, "step": 3072 }, { "epoch": 1.9367637208935817, "grad_norm": 0.19247853755950928, "learning_rate": 1.1028292136046059e-07, "loss": 0.4663, "step": 3073 }, { "epoch": 1.9373941137071038, "grad_norm": 0.18122366070747375, "learning_rate": 1.0806801195597766e-07, "loss": 0.3317, "step": 3074 }, { "epoch": 1.9380245065206256, "grad_norm": 0.15887488424777985, "learning_rate": 1.0587551720240258e-07, "loss": 0.3566, "step": 3075 }, { "epoch": 1.9386548993341477, "grad_norm": 0.19291207194328308, "learning_rate": 1.0370543929444975e-07, "loss": 0.3351, "step": 3076 }, { "epoch": 1.9392852921476695, "grad_norm": 0.1809178739786148, "learning_rate": 1.0155778040440156e-07, "loss": 0.3887, "step": 3077 }, { "epoch": 1.9399156849611914, "grad_norm": 0.1738986223936081, "learning_rate": 9.943254268209579e-08, "loss": 0.3293, "step": 3078 }, { "epoch": 1.9405460777747132, "grad_norm": 0.16989794373512268, "learning_rate": 9.73297282549257e-08, "loss": 0.3517, "step": 3079 }, { "epoch": 1.9411764705882353, "grad_norm": 0.17106318473815918, "learning_rate": 9.524933922784002e-08, "loss": 0.3602, "step": 3080 }, { "epoch": 1.9418068634017573, "grad_norm": 0.18975426256656647, "learning_rate": 9.319137768333541e-08, "loss": 0.3415, "step": 3081 }, { "epoch": 1.9424372562152792, "grad_norm": 0.17912082374095917, "learning_rate": 9.115584568146651e-08, "loss": 0.3994, "step": 3082 }, { "epoch": 1.943067649028801, "grad_norm": 0.16902948915958405, "learning_rate": 8.914274525982841e-08, "loss": 0.3515, "step": 3083 }, { "epoch": 1.9436980418423229, "grad_norm": 0.17020246386528015, "learning_rate": 8.715207843356165e-08, "loss": 0.2807, "step": 3084 }, { "epoch": 1.944328434655845, "grad_norm": 0.1424531638622284, "learning_rate": 8.518384719535477e-08, "loss": 0.3058, "step": 3085 }, { "epoch": 1.944958827469367, "grad_norm": 0.17307209968566895, "learning_rate": 8.323805351543673e-08, "loss": 0.3989, "step": 3086 }, { "epoch": 1.9455892202828888, "grad_norm": 0.197873055934906, "learning_rate": 8.131469934157448e-08, "loss": 0.365, "step": 3087 }, { "epoch": 1.9462196130964107, "grad_norm": 0.14806973934173584, "learning_rate": 7.941378659907295e-08, "loss": 0.3045, "step": 3088 }, { "epoch": 1.9468500059099325, "grad_norm": 0.14717578887939453, "learning_rate": 7.75353171907775e-08, "loss": 0.3438, "step": 3089 }, { "epoch": 1.9474803987234546, "grad_norm": 0.15752723813056946, "learning_rate": 7.567929299705651e-08, "loss": 0.3682, "step": 3090 }, { "epoch": 1.9481107915369766, "grad_norm": 0.1642657220363617, "learning_rate": 7.384571587582378e-08, "loss": 0.3269, "step": 3091 }, { "epoch": 1.9487411843504985, "grad_norm": 0.17168262600898743, "learning_rate": 7.203458766251364e-08, "loss": 0.3386, "step": 3092 }, { "epoch": 1.9493715771640203, "grad_norm": 0.18933595716953278, "learning_rate": 7.024591017009086e-08, "loss": 0.4002, "step": 3093 }, { "epoch": 1.9500019699775422, "grad_norm": 0.13463959097862244, "learning_rate": 6.847968518905068e-08, "loss": 0.2955, "step": 3094 }, { "epoch": 1.9506323627910642, "grad_norm": 0.19657647609710693, "learning_rate": 6.673591448740885e-08, "loss": 0.3551, "step": 3095 }, { "epoch": 1.951262755604586, "grad_norm": 0.21734939515590668, "learning_rate": 6.501459981070407e-08, "loss": 0.4166, "step": 3096 }, { "epoch": 1.9518931484181081, "grad_norm": 0.16016525030136108, "learning_rate": 6.331574288200054e-08, "loss": 0.3323, "step": 3097 }, { "epoch": 1.95252354123163, "grad_norm": 0.16205845773220062, "learning_rate": 6.163934540187544e-08, "loss": 0.351, "step": 3098 }, { "epoch": 1.9531539340451518, "grad_norm": 0.1572217047214508, "learning_rate": 5.998540904842643e-08, "loss": 0.3103, "step": 3099 }, { "epoch": 1.9537843268586736, "grad_norm": 0.17913149297237396, "learning_rate": 5.835393547727414e-08, "loss": 0.3093, "step": 3100 }, { "epoch": 1.9544147196721957, "grad_norm": 0.16652511060237885, "learning_rate": 5.6744926321539715e-08, "loss": 0.3165, "step": 3101 }, { "epoch": 1.9550451124857178, "grad_norm": 0.1753828376531601, "learning_rate": 5.5158383191872275e-08, "loss": 0.3602, "step": 3102 }, { "epoch": 1.9556755052992396, "grad_norm": 0.1615012139081955, "learning_rate": 5.3594307676416436e-08, "loss": 0.2906, "step": 3103 }, { "epoch": 1.9563058981127615, "grad_norm": 0.16111844778060913, "learning_rate": 5.205270134084228e-08, "loss": 0.3132, "step": 3104 }, { "epoch": 1.9569362909262833, "grad_norm": 0.21132858097553253, "learning_rate": 5.0533565728315415e-08, "loss": 0.3911, "step": 3105 }, { "epoch": 1.9575666837398054, "grad_norm": 0.16164764761924744, "learning_rate": 4.903690235951441e-08, "loss": 0.2846, "step": 3106 }, { "epoch": 1.9581970765533274, "grad_norm": 0.14004522562026978, "learning_rate": 4.756271273262336e-08, "loss": 0.3344, "step": 3107 }, { "epoch": 1.9588274693668493, "grad_norm": 0.13915061950683594, "learning_rate": 4.611099832332183e-08, "loss": 0.3273, "step": 3108 }, { "epoch": 1.959457862180371, "grad_norm": 0.16548193991184235, "learning_rate": 4.4681760584804887e-08, "loss": 0.3241, "step": 3109 }, { "epoch": 1.960088254993893, "grad_norm": 0.16104203462600708, "learning_rate": 4.327500094775311e-08, "loss": 0.3391, "step": 3110 }, { "epoch": 1.960718647807415, "grad_norm": 0.15194769203662872, "learning_rate": 4.1890720820357555e-08, "loss": 0.2724, "step": 3111 }, { "epoch": 1.961349040620937, "grad_norm": 0.18014349043369293, "learning_rate": 4.0528921588299795e-08, "loss": 0.3043, "step": 3112 }, { "epoch": 1.961979433434459, "grad_norm": 0.14656884968280792, "learning_rate": 3.91896046147644e-08, "loss": 0.3392, "step": 3113 }, { "epoch": 1.9626098262479807, "grad_norm": 0.15012793242931366, "learning_rate": 3.787277124042643e-08, "loss": 0.3175, "step": 3114 }, { "epoch": 1.9632402190615026, "grad_norm": 0.14461544156074524, "learning_rate": 3.657842278345397e-08, "loss": 0.2879, "step": 3115 }, { "epoch": 1.9638706118750247, "grad_norm": 0.13914228975772858, "learning_rate": 3.530656053950809e-08, "loss": 0.321, "step": 3116 }, { "epoch": 1.9645010046885467, "grad_norm": 0.18909741938114166, "learning_rate": 3.405718578174289e-08, "loss": 0.3458, "step": 3117 }, { "epoch": 1.9651313975020686, "grad_norm": 0.1531994789838791, "learning_rate": 3.283029976080043e-08, "loss": 0.337, "step": 3118 }, { "epoch": 1.9657617903155904, "grad_norm": 0.1318083256483078, "learning_rate": 3.162590370481333e-08, "loss": 0.277, "step": 3119 }, { "epoch": 1.9663921831291122, "grad_norm": 0.18918323516845703, "learning_rate": 3.044399881939469e-08, "loss": 0.3567, "step": 3120 }, { "epoch": 1.9670225759426343, "grad_norm": 0.1714440882205963, "learning_rate": 2.9284586287655626e-08, "loss": 0.3354, "step": 3121 }, { "epoch": 1.9676529687561561, "grad_norm": 0.17648588120937347, "learning_rate": 2.814766727017776e-08, "loss": 0.3654, "step": 3122 }, { "epoch": 1.9682833615696782, "grad_norm": 0.16460299491882324, "learning_rate": 2.7033242905038226e-08, "loss": 0.3516, "step": 3123 }, { "epoch": 1.9689137543832, "grad_norm": 0.18309004604816437, "learning_rate": 2.594131430778968e-08, "loss": 0.3671, "step": 3124 }, { "epoch": 1.9695441471967219, "grad_norm": 0.16385690867900848, "learning_rate": 2.4871882571472772e-08, "loss": 0.3237, "step": 3125 }, { "epoch": 1.9701745400102437, "grad_norm": 0.15146617591381073, "learning_rate": 2.382494876659869e-08, "loss": 0.3169, "step": 3126 }, { "epoch": 1.9708049328237658, "grad_norm": 0.14356575906276703, "learning_rate": 2.2800513941166614e-08, "loss": 0.3171, "step": 3127 }, { "epoch": 1.9714353256372878, "grad_norm": 0.1531408727169037, "learning_rate": 2.1798579120648756e-08, "loss": 0.351, "step": 3128 }, { "epoch": 1.9720657184508097, "grad_norm": 0.16242480278015137, "learning_rate": 2.0819145308000333e-08, "loss": 0.34, "step": 3129 }, { "epoch": 1.9726961112643315, "grad_norm": 0.2029055804014206, "learning_rate": 1.9862213483644588e-08, "loss": 0.3804, "step": 3130 }, { "epoch": 1.9733265040778534, "grad_norm": 0.1916704624891281, "learning_rate": 1.8927784605487774e-08, "loss": 0.3823, "step": 3131 }, { "epoch": 1.9739568968913754, "grad_norm": 0.2057744860649109, "learning_rate": 1.8015859608904175e-08, "loss": 0.3805, "step": 3132 }, { "epoch": 1.9745872897048975, "grad_norm": 0.18788807094097137, "learning_rate": 1.7126439406746077e-08, "loss": 0.3535, "step": 3133 }, { "epoch": 1.9752176825184193, "grad_norm": 0.19672442972660065, "learning_rate": 1.62595248893338e-08, "loss": 0.3592, "step": 3134 }, { "epoch": 1.9758480753319412, "grad_norm": 0.20926621556282043, "learning_rate": 1.541511692446318e-08, "loss": 0.4156, "step": 3135 }, { "epoch": 1.976478468145463, "grad_norm": 0.15115287899971008, "learning_rate": 1.4593216357400574e-08, "loss": 0.2962, "step": 3136 }, { "epoch": 1.977108860958985, "grad_norm": 0.182939812541008, "learning_rate": 1.3793824010875366e-08, "loss": 0.3647, "step": 3137 }, { "epoch": 1.9777392537725071, "grad_norm": 0.16283048689365387, "learning_rate": 1.3016940685094959e-08, "loss": 0.3349, "step": 3138 }, { "epoch": 1.978369646586029, "grad_norm": 0.15308210253715515, "learning_rate": 1.2262567157729777e-08, "loss": 0.3077, "step": 3139 }, { "epoch": 1.9790000393995508, "grad_norm": 0.1855187863111496, "learning_rate": 1.1530704183918273e-08, "loss": 0.3904, "step": 3140 }, { "epoch": 1.9796304322130727, "grad_norm": 0.2238065004348755, "learning_rate": 1.0821352496269422e-08, "loss": 0.3726, "step": 3141 }, { "epoch": 1.9802608250265947, "grad_norm": 0.15885622799396515, "learning_rate": 1.013451280484773e-08, "loss": 0.3599, "step": 3142 }, { "epoch": 1.9808912178401166, "grad_norm": 0.1552300751209259, "learning_rate": 9.470185797195718e-09, "loss": 0.3267, "step": 3143 }, { "epoch": 1.9815216106536386, "grad_norm": 0.14502142369747162, "learning_rate": 8.828372138313945e-09, "loss": 0.3253, "step": 3144 }, { "epoch": 1.9821520034671605, "grad_norm": 0.15471288561820984, "learning_rate": 8.209072470663493e-09, "loss": 0.3686, "step": 3145 }, { "epoch": 1.9827823962806823, "grad_norm": 0.20301209390163422, "learning_rate": 7.612287414175968e-09, "loss": 0.3457, "step": 3146 }, { "epoch": 1.9834127890942044, "grad_norm": 0.18437117338180542, "learning_rate": 7.038017566238509e-09, "loss": 0.3005, "step": 3147 }, { "epoch": 1.9840431819077262, "grad_norm": 0.16809889674186707, "learning_rate": 6.486263501708778e-09, "loss": 0.3795, "step": 3148 }, { "epoch": 1.9846735747212483, "grad_norm": 0.17451220750808716, "learning_rate": 5.9570257728949754e-09, "loss": 0.398, "step": 3149 }, { "epoch": 1.9853039675347701, "grad_norm": 0.17353513836860657, "learning_rate": 5.450304909575821e-09, "loss": 0.3307, "step": 3150 }, { "epoch": 1.985934360348292, "grad_norm": 0.2020755410194397, "learning_rate": 4.966101418985569e-09, "loss": 0.3615, "step": 3151 }, { "epoch": 1.9865647531618138, "grad_norm": 0.18863660097122192, "learning_rate": 4.504415785816507e-09, "loss": 0.3231, "step": 3152 }, { "epoch": 1.9871951459753359, "grad_norm": 0.20244933664798737, "learning_rate": 4.0652484722264466e-09, "loss": 0.3875, "step": 3153 }, { "epoch": 1.987825538788858, "grad_norm": 0.14496862888336182, "learning_rate": 3.6485999178237363e-09, "loss": 0.2825, "step": 3154 }, { "epoch": 1.9884559316023798, "grad_norm": 0.17724725604057312, "learning_rate": 3.254470539684751e-09, "loss": 0.3615, "step": 3155 }, { "epoch": 1.9890863244159016, "grad_norm": 0.16778138279914856, "learning_rate": 2.882860732336401e-09, "loss": 0.3507, "step": 3156 }, { "epoch": 1.9897167172294234, "grad_norm": 0.18544743955135345, "learning_rate": 2.5337708677636295e-09, "loss": 0.3382, "step": 3157 }, { "epoch": 1.9903471100429455, "grad_norm": 0.15907777845859528, "learning_rate": 2.207201295411909e-09, "loss": 0.3352, "step": 3158 }, { "epoch": 1.9909775028564676, "grad_norm": 0.1705615222454071, "learning_rate": 1.903152342184744e-09, "loss": 0.3351, "step": 3159 }, { "epoch": 1.9916078956699894, "grad_norm": 0.15151086449623108, "learning_rate": 1.6216243124361764e-09, "loss": 0.3626, "step": 3160 }, { "epoch": 1.9922382884835113, "grad_norm": 0.15104210376739502, "learning_rate": 1.3626174879807775e-09, "loss": 0.3037, "step": 3161 }, { "epoch": 1.992868681297033, "grad_norm": 0.17009414732456207, "learning_rate": 1.1261321280886527e-09, "loss": 0.3687, "step": 3162 }, { "epoch": 1.9934990741105552, "grad_norm": 0.18878397345542908, "learning_rate": 9.121684694854415e-10, "loss": 0.4226, "step": 3163 }, { "epoch": 1.9941294669240772, "grad_norm": 0.1798999160528183, "learning_rate": 7.207267263498185e-10, "loss": 0.3507, "step": 3164 }, { "epoch": 1.994759859737599, "grad_norm": 0.15835590660572052, "learning_rate": 5.518070903209882e-10, "loss": 0.3359, "step": 3165 }, { "epoch": 1.995390252551121, "grad_norm": 0.1508297324180603, "learning_rate": 4.054097304886928e-10, "loss": 0.3145, "step": 3166 }, { "epoch": 1.9960206453646427, "grad_norm": 0.17179661989212036, "learning_rate": 2.815347933957102e-10, "loss": 0.3899, "step": 3167 }, { "epoch": 1.9966510381781648, "grad_norm": 0.16289018094539642, "learning_rate": 1.8018240304784606e-10, "loss": 0.3098, "step": 3168 }, { "epoch": 1.9972814309916866, "grad_norm": 0.1442747563123703, "learning_rate": 1.0135266089644768e-10, "loss": 0.3264, "step": 3169 }, { "epoch": 1.9979118238052087, "grad_norm": 0.17480942606925964, "learning_rate": 4.504564585339211e-11, "loss": 0.3562, "step": 3170 }, { "epoch": 1.9985422166187305, "grad_norm": 0.1754692643880844, "learning_rate": 1.1261414281094062e-11, "loss": 0.3207, "step": 3171 }, { "epoch": 1.9991726094322524, "grad_norm": 0.13669772446155548, "learning_rate": 0.0, "loss": 0.2699, "step": 3172 } ], "logging_steps": 1, "max_steps": 3172, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.1449881977460634e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }