{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0006379585326954, "eval_steps": 294, "global_step": 1176, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008506113769271664, "grad_norm": 0.7357208728790283, "learning_rate": 2e-05, "loss": 2.8145, "step": 1 }, { "epoch": 0.0017012227538543328, "grad_norm": 0.771299421787262, "learning_rate": 4e-05, "loss": 3.195, "step": 2 }, { "epoch": 0.002551834130781499, "grad_norm": 0.7344720363616943, "learning_rate": 6e-05, "loss": 2.8861, "step": 3 }, { "epoch": 0.0034024455077086655, "grad_norm": 0.7500324845314026, "learning_rate": 8e-05, "loss": 2.7421, "step": 4 }, { "epoch": 0.004253056884635832, "grad_norm": 0.9078495502471924, "learning_rate": 0.0001, "loss": 2.9622, "step": 5 }, { "epoch": 0.005103668261562998, "grad_norm": 1.0794708728790283, "learning_rate": 0.00012, "loss": 3.1124, "step": 6 }, { "epoch": 0.005954279638490165, "grad_norm": 1.0218361616134644, "learning_rate": 0.00014, "loss": 2.7233, "step": 7 }, { "epoch": 0.006804891015417331, "grad_norm": 1.059141755104065, "learning_rate": 0.00016, "loss": 2.8784, "step": 8 }, { "epoch": 0.007655502392344498, "grad_norm": 0.4901650547981262, "learning_rate": 0.00018, "loss": 2.6192, "step": 9 }, { "epoch": 0.008506113769271665, "grad_norm": 0.8344933390617371, "learning_rate": 0.0002, "loss": 2.6448, "step": 10 }, { "epoch": 0.00935672514619883, "grad_norm": 1.5278894901275635, "learning_rate": 0.00019999963702861705, "loss": 2.7457, "step": 11 }, { "epoch": 0.010207336523125997, "grad_norm": 1.2650033235549927, "learning_rate": 0.00019999854811710317, "loss": 2.7532, "step": 12 }, { "epoch": 0.011057947900053162, "grad_norm": 0.740222156047821, "learning_rate": 0.0001999967332733632, "loss": 2.6836, "step": 13 }, { "epoch": 0.01190855927698033, "grad_norm": 0.49257639050483704, "learning_rate": 0.0001999941925105719, "loss": 2.6658, "step": 14 }, { "epoch": 0.012759170653907496, "grad_norm": 0.3310573399066925, "learning_rate": 0.00019999092584717374, "loss": 2.5043, "step": 15 }, { "epoch": 0.013609782030834662, "grad_norm": 0.33361560106277466, "learning_rate": 0.00019998693330688282, "loss": 2.6252, "step": 16 }, { "epoch": 0.014460393407761828, "grad_norm": 0.4449865221977234, "learning_rate": 0.00019998221491868273, "loss": 2.648, "step": 17 }, { "epoch": 0.015311004784688996, "grad_norm": 0.4820970892906189, "learning_rate": 0.0001999767707168262, "loss": 2.7337, "step": 18 }, { "epoch": 0.01616161616161616, "grad_norm": 0.5144203901290894, "learning_rate": 0.0001999706007408351, "loss": 2.6967, "step": 19 }, { "epoch": 0.01701222753854333, "grad_norm": 0.501557469367981, "learning_rate": 0.0001999637050354999, "loss": 2.7318, "step": 20 }, { "epoch": 0.017862838915470493, "grad_norm": 0.4480394423007965, "learning_rate": 0.00019995608365087946, "loss": 2.4126, "step": 21 }, { "epoch": 0.01871345029239766, "grad_norm": 0.4459284842014313, "learning_rate": 0.00019994773664230064, "loss": 2.7072, "step": 22 }, { "epoch": 0.01956406166932483, "grad_norm": 0.39909827709198, "learning_rate": 0.00019993866407035798, "loss": 2.6358, "step": 23 }, { "epoch": 0.020414673046251993, "grad_norm": 0.36802783608436584, "learning_rate": 0.0001999288660009132, "loss": 2.6751, "step": 24 }, { "epoch": 0.02126528442317916, "grad_norm": 0.43287962675094604, "learning_rate": 0.0001999183425050946, "loss": 2.7518, "step": 25 }, { "epoch": 0.022115895800106325, "grad_norm": 0.4289425313472748, "learning_rate": 0.00019990709365929677, "loss": 2.7535, "step": 26 }, { "epoch": 0.022966507177033493, "grad_norm": 0.4627043604850769, "learning_rate": 0.00019989511954517992, "loss": 2.8111, "step": 27 }, { "epoch": 0.02381711855396066, "grad_norm": 0.4823961853981018, "learning_rate": 0.00019988242024966923, "loss": 2.9493, "step": 28 }, { "epoch": 0.024667729930887825, "grad_norm": 0.4622437059879303, "learning_rate": 0.00019986899586495432, "loss": 2.788, "step": 29 }, { "epoch": 0.025518341307814992, "grad_norm": 0.4963669776916504, "learning_rate": 0.00019985484648848853, "loss": 2.8304, "step": 30 }, { "epoch": 0.02636895268474216, "grad_norm": 0.47957557439804077, "learning_rate": 0.00019983997222298828, "loss": 2.7323, "step": 31 }, { "epoch": 0.027219564061669324, "grad_norm": 0.445528507232666, "learning_rate": 0.00019982437317643217, "loss": 3.015, "step": 32 }, { "epoch": 0.028070175438596492, "grad_norm": 0.46085312962532043, "learning_rate": 0.00019980804946206036, "loss": 2.8556, "step": 33 }, { "epoch": 0.028920786815523656, "grad_norm": 0.5078282356262207, "learning_rate": 0.0001997910011983737, "loss": 2.8472, "step": 34 }, { "epoch": 0.029771398192450824, "grad_norm": 0.4612430930137634, "learning_rate": 0.00019977322850913283, "loss": 2.6399, "step": 35 }, { "epoch": 0.03062200956937799, "grad_norm": 0.499965101480484, "learning_rate": 0.00019975473152335726, "loss": 2.9121, "step": 36 }, { "epoch": 0.03147262094630516, "grad_norm": 0.5101069808006287, "learning_rate": 0.0001997355103753246, "loss": 2.8488, "step": 37 }, { "epoch": 0.03232323232323232, "grad_norm": 0.5065872669219971, "learning_rate": 0.00019971556520456929, "loss": 2.8311, "step": 38 }, { "epoch": 0.03317384370015949, "grad_norm": 0.5324426889419556, "learning_rate": 0.00019969489615588189, "loss": 2.7454, "step": 39 }, { "epoch": 0.03402445507708666, "grad_norm": 0.5128815770149231, "learning_rate": 0.0001996735033793079, "loss": 2.8116, "step": 40 }, { "epoch": 0.03487506645401382, "grad_norm": 0.5330538153648376, "learning_rate": 0.00019965138703014655, "loss": 2.7584, "step": 41 }, { "epoch": 0.03572567783094099, "grad_norm": 0.556816577911377, "learning_rate": 0.00019962854726894997, "loss": 2.8902, "step": 42 }, { "epoch": 0.03657628920786816, "grad_norm": 0.5452866554260254, "learning_rate": 0.0001996049842615217, "loss": 2.7984, "step": 43 }, { "epoch": 0.03742690058479532, "grad_norm": 0.5836021304130554, "learning_rate": 0.0001995806981789157, "loss": 2.803, "step": 44 }, { "epoch": 0.03827751196172249, "grad_norm": 0.5968561172485352, "learning_rate": 0.00019955568919743507, "loss": 2.8592, "step": 45 }, { "epoch": 0.03912812333864966, "grad_norm": 0.6416970491409302, "learning_rate": 0.0001995299574986306, "loss": 2.7488, "step": 46 }, { "epoch": 0.03997873471557682, "grad_norm": 0.704325795173645, "learning_rate": 0.0001995035032692998, "loss": 2.6983, "step": 47 }, { "epoch": 0.040829346092503986, "grad_norm": 0.7766572833061218, "learning_rate": 0.00019947632670148517, "loss": 2.9677, "step": 48 }, { "epoch": 0.04167995746943115, "grad_norm": 0.7186003923416138, "learning_rate": 0.00019944842799247308, "loss": 3.0728, "step": 49 }, { "epoch": 0.04253056884635832, "grad_norm": 0.7572959065437317, "learning_rate": 0.00019941980734479214, "loss": 3.0345, "step": 50 }, { "epoch": 0.043381180223285486, "grad_norm": 0.48461732268333435, "learning_rate": 0.00019939046496621194, "loss": 2.6307, "step": 51 }, { "epoch": 0.04423179160021265, "grad_norm": 0.468675434589386, "learning_rate": 0.0001993604010697413, "loss": 2.4616, "step": 52 }, { "epoch": 0.04508240297713982, "grad_norm": 0.3815957009792328, "learning_rate": 0.0001993296158736269, "loss": 2.7479, "step": 53 }, { "epoch": 0.045933014354066985, "grad_norm": 0.3313361704349518, "learning_rate": 0.00019929810960135172, "loss": 2.4983, "step": 54 }, { "epoch": 0.04678362573099415, "grad_norm": 0.32521429657936096, "learning_rate": 0.00019926588248163316, "loss": 2.5446, "step": 55 }, { "epoch": 0.04763423710792132, "grad_norm": 0.2972453236579895, "learning_rate": 0.00019923293474842174, "loss": 2.5472, "step": 56 }, { "epoch": 0.048484848484848485, "grad_norm": 0.2972238063812256, "learning_rate": 0.00019919926664089909, "loss": 2.5389, "step": 57 }, { "epoch": 0.04933545986177565, "grad_norm": 0.27498453855514526, "learning_rate": 0.00019916487840347644, "loss": 2.571, "step": 58 }, { "epoch": 0.05018607123870282, "grad_norm": 0.2938655614852905, "learning_rate": 0.00019912977028579268, "loss": 2.7134, "step": 59 }, { "epoch": 0.051036682615629984, "grad_norm": 0.26742392778396606, "learning_rate": 0.0001990939425427127, "loss": 2.5632, "step": 60 }, { "epoch": 0.05188729399255715, "grad_norm": 0.28117692470550537, "learning_rate": 0.00019905739543432536, "loss": 2.5297, "step": 61 }, { "epoch": 0.05273790536948432, "grad_norm": 0.28916725516319275, "learning_rate": 0.00019902012922594177, "loss": 2.7096, "step": 62 }, { "epoch": 0.053588516746411484, "grad_norm": 0.32468459010124207, "learning_rate": 0.0001989821441880933, "loss": 2.6192, "step": 63 }, { "epoch": 0.05443912812333865, "grad_norm": 0.2806537449359894, "learning_rate": 0.0001989434405965295, "loss": 2.6747, "step": 64 }, { "epoch": 0.05528973950026582, "grad_norm": 0.2876998782157898, "learning_rate": 0.0001989040187322164, "loss": 2.7443, "step": 65 }, { "epoch": 0.056140350877192984, "grad_norm": 0.27619123458862305, "learning_rate": 0.00019886387888133413, "loss": 2.7379, "step": 66 }, { "epoch": 0.05699096225412015, "grad_norm": 0.31479549407958984, "learning_rate": 0.000198823021335275, "loss": 2.4039, "step": 67 }, { "epoch": 0.05784157363104731, "grad_norm": 0.300857812166214, "learning_rate": 0.00019878144639064144, "loss": 2.5705, "step": 68 }, { "epoch": 0.05869218500797448, "grad_norm": 0.3776433765888214, "learning_rate": 0.00019873915434924375, "loss": 2.863, "step": 69 }, { "epoch": 0.05954279638490165, "grad_norm": 0.30585938692092896, "learning_rate": 0.00019869614551809795, "loss": 2.5312, "step": 70 }, { "epoch": 0.06039340776182881, "grad_norm": 0.3163856267929077, "learning_rate": 0.00019865242020942353, "loss": 2.8491, "step": 71 }, { "epoch": 0.06124401913875598, "grad_norm": 0.30077147483825684, "learning_rate": 0.00019860797874064122, "loss": 2.7777, "step": 72 }, { "epoch": 0.06209463051568315, "grad_norm": 0.4153176248073578, "learning_rate": 0.0001985628214343706, "loss": 2.7499, "step": 73 }, { "epoch": 0.06294524189261032, "grad_norm": 0.35611122846603394, "learning_rate": 0.00019851694861842793, "loss": 2.7089, "step": 74 }, { "epoch": 0.06379585326953748, "grad_norm": 0.3143812417984009, "learning_rate": 0.00019847036062582357, "loss": 2.758, "step": 75 }, { "epoch": 0.06464646464646465, "grad_norm": 0.32024794816970825, "learning_rate": 0.00019842305779475968, "loss": 2.4616, "step": 76 }, { "epoch": 0.06549707602339182, "grad_norm": 0.3146126866340637, "learning_rate": 0.00019837504046862775, "loss": 2.6104, "step": 77 }, { "epoch": 0.06634768740031897, "grad_norm": 0.32578444480895996, "learning_rate": 0.00019832630899600608, "loss": 2.6297, "step": 78 }, { "epoch": 0.06719829877724615, "grad_norm": 0.36873045563697815, "learning_rate": 0.00019827686373065728, "loss": 2.6358, "step": 79 }, { "epoch": 0.06804891015417332, "grad_norm": 0.3558378517627716, "learning_rate": 0.00019822670503152567, "loss": 2.6308, "step": 80 }, { "epoch": 0.06889952153110047, "grad_norm": 0.37967684864997864, "learning_rate": 0.00019817583326273467, "loss": 2.7577, "step": 81 }, { "epoch": 0.06975013290802765, "grad_norm": 0.3737669885158539, "learning_rate": 0.00019812424879358425, "loss": 2.9207, "step": 82 }, { "epoch": 0.07060074428495482, "grad_norm": 0.39410829544067383, "learning_rate": 0.0001980719519985481, "loss": 2.9544, "step": 83 }, { "epoch": 0.07145135566188197, "grad_norm": 0.3863750696182251, "learning_rate": 0.00019801894325727104, "loss": 2.7794, "step": 84 }, { "epoch": 0.07230196703880915, "grad_norm": 0.4226458966732025, "learning_rate": 0.0001979652229545662, "loss": 2.7491, "step": 85 }, { "epoch": 0.07315257841573632, "grad_norm": 0.42758506536483765, "learning_rate": 0.0001979107914804122, "loss": 2.8524, "step": 86 }, { "epoch": 0.07400318979266347, "grad_norm": 0.4379200041294098, "learning_rate": 0.0001978556492299504, "loss": 2.6526, "step": 87 }, { "epoch": 0.07485380116959064, "grad_norm": 0.44331902265548706, "learning_rate": 0.000197799796603482, "loss": 2.8028, "step": 88 }, { "epoch": 0.07570441254651782, "grad_norm": 0.4358711540699005, "learning_rate": 0.0001977432340064651, "loss": 2.5426, "step": 89 }, { "epoch": 0.07655502392344497, "grad_norm": 0.45511335134506226, "learning_rate": 0.00019768596184951173, "loss": 2.7067, "step": 90 }, { "epoch": 0.07740563530037214, "grad_norm": 0.5394377112388611, "learning_rate": 0.00019762798054838502, "loss": 2.8189, "step": 91 }, { "epoch": 0.07825624667729932, "grad_norm": 0.5124706625938416, "learning_rate": 0.00019756929052399603, "loss": 2.7702, "step": 92 }, { "epoch": 0.07910685805422647, "grad_norm": 0.5025349855422974, "learning_rate": 0.00019750989220240073, "loss": 2.6872, "step": 93 }, { "epoch": 0.07995746943115364, "grad_norm": 0.5144663453102112, "learning_rate": 0.00019744978601479694, "loss": 2.6366, "step": 94 }, { "epoch": 0.08080808080808081, "grad_norm": 0.5908443927764893, "learning_rate": 0.00019738897239752118, "loss": 2.7918, "step": 95 }, { "epoch": 0.08165869218500797, "grad_norm": 0.6398508548736572, "learning_rate": 0.00019732745179204552, "loss": 2.9972, "step": 96 }, { "epoch": 0.08250930356193514, "grad_norm": 0.6032273173332214, "learning_rate": 0.00019726522464497435, "loss": 2.7638, "step": 97 }, { "epoch": 0.0833599149388623, "grad_norm": 0.6310097575187683, "learning_rate": 0.0001972022914080411, "loss": 2.9328, "step": 98 }, { "epoch": 0.08421052631578947, "grad_norm": 0.7050711512565613, "learning_rate": 0.00019713865253810506, "loss": 2.8143, "step": 99 }, { "epoch": 0.08506113769271664, "grad_norm": 0.755136251449585, "learning_rate": 0.00019707430849714807, "loss": 3.036, "step": 100 }, { "epoch": 0.0859117490696438, "grad_norm": 0.35153907537460327, "learning_rate": 0.00019700925975227096, "loss": 2.4444, "step": 101 }, { "epoch": 0.08676236044657097, "grad_norm": 0.40153488516807556, "learning_rate": 0.0001969435067756904, "loss": 2.6068, "step": 102 }, { "epoch": 0.08761297182349814, "grad_norm": 0.3474213480949402, "learning_rate": 0.00019687705004473545, "loss": 2.4261, "step": 103 }, { "epoch": 0.0884635832004253, "grad_norm": 0.3283519744873047, "learning_rate": 0.00019680989004184382, "loss": 2.6736, "step": 104 }, { "epoch": 0.08931419457735247, "grad_norm": 0.29034170508384705, "learning_rate": 0.00019674202725455877, "loss": 2.5551, "step": 105 }, { "epoch": 0.09016480595427964, "grad_norm": 0.2918970584869385, "learning_rate": 0.00019667346217552527, "loss": 2.6039, "step": 106 }, { "epoch": 0.0910154173312068, "grad_norm": 0.2852106988430023, "learning_rate": 0.00019660419530248655, "loss": 2.5432, "step": 107 }, { "epoch": 0.09186602870813397, "grad_norm": 0.30997323989868164, "learning_rate": 0.0001965342271382805, "loss": 2.7324, "step": 108 }, { "epoch": 0.09271664008506114, "grad_norm": 0.34156399965286255, "learning_rate": 0.00019646355819083589, "loss": 2.6548, "step": 109 }, { "epoch": 0.0935672514619883, "grad_norm": 0.2763843238353729, "learning_rate": 0.00019639218897316883, "loss": 2.5254, "step": 110 }, { "epoch": 0.09441786283891547, "grad_norm": 0.2835611402988434, "learning_rate": 0.00019632012000337908, "loss": 2.5677, "step": 111 }, { "epoch": 0.09526847421584264, "grad_norm": 0.2940271198749542, "learning_rate": 0.00019624735180464602, "loss": 2.5976, "step": 112 }, { "epoch": 0.0961190855927698, "grad_norm": 0.2714485824108124, "learning_rate": 0.00019617388490522517, "loss": 2.6087, "step": 113 }, { "epoch": 0.09696969696969697, "grad_norm": 0.30371204018592834, "learning_rate": 0.00019609971983844412, "loss": 2.6129, "step": 114 }, { "epoch": 0.09782030834662414, "grad_norm": 0.2762625813484192, "learning_rate": 0.0001960248571426989, "loss": 2.5759, "step": 115 }, { "epoch": 0.0986709197235513, "grad_norm": 0.2702981233596802, "learning_rate": 0.00019594929736144976, "loss": 2.5443, "step": 116 }, { "epoch": 0.09952153110047847, "grad_norm": 0.29210978746414185, "learning_rate": 0.00019587304104321746, "loss": 2.6425, "step": 117 }, { "epoch": 0.10037214247740564, "grad_norm": 0.31620749831199646, "learning_rate": 0.00019579608874157928, "loss": 2.703, "step": 118 }, { "epoch": 0.1012227538543328, "grad_norm": 0.2803102433681488, "learning_rate": 0.00019571844101516484, "loss": 2.6886, "step": 119 }, { "epoch": 0.10207336523125997, "grad_norm": 0.30169349908828735, "learning_rate": 0.00019564009842765225, "loss": 2.8221, "step": 120 }, { "epoch": 0.10292397660818714, "grad_norm": 0.297553151845932, "learning_rate": 0.00019556106154776379, "loss": 2.6897, "step": 121 }, { "epoch": 0.1037745879851143, "grad_norm": 0.30721086263656616, "learning_rate": 0.000195481330949262, "loss": 2.6551, "step": 122 }, { "epoch": 0.10462519936204147, "grad_norm": 0.29124605655670166, "learning_rate": 0.00019540090721094542, "loss": 2.6292, "step": 123 }, { "epoch": 0.10547581073896864, "grad_norm": 0.31037285923957825, "learning_rate": 0.0001953197909166443, "loss": 2.5459, "step": 124 }, { "epoch": 0.1063264221158958, "grad_norm": 0.3543750047683716, "learning_rate": 0.00019523798265521654, "loss": 2.5622, "step": 125 }, { "epoch": 0.10717703349282297, "grad_norm": 0.3356544077396393, "learning_rate": 0.00019515548302054335, "loss": 2.7272, "step": 126 }, { "epoch": 0.10802764486975014, "grad_norm": 0.34296396374702454, "learning_rate": 0.00019507229261152476, "loss": 2.6629, "step": 127 }, { "epoch": 0.1088782562466773, "grad_norm": 0.34629112482070923, "learning_rate": 0.0001949884120320756, "loss": 2.6371, "step": 128 }, { "epoch": 0.10972886762360447, "grad_norm": 0.34170377254486084, "learning_rate": 0.00019490384189112082, "loss": 2.7218, "step": 129 }, { "epoch": 0.11057947900053164, "grad_norm": 0.38438230752944946, "learning_rate": 0.0001948185828025913, "loss": 2.7096, "step": 130 }, { "epoch": 0.1114300903774588, "grad_norm": 0.40347060561180115, "learning_rate": 0.00019473263538541914, "loss": 2.8129, "step": 131 }, { "epoch": 0.11228070175438597, "grad_norm": 0.3742891848087311, "learning_rate": 0.00019464600026353348, "loss": 2.7916, "step": 132 }, { "epoch": 0.11313131313131314, "grad_norm": 0.4015231430530548, "learning_rate": 0.0001945586780658557, "loss": 2.6099, "step": 133 }, { "epoch": 0.1139819245082403, "grad_norm": 0.40618133544921875, "learning_rate": 0.00019447066942629491, "loss": 2.6669, "step": 134 }, { "epoch": 0.11483253588516747, "grad_norm": 0.4171842932701111, "learning_rate": 0.00019438197498374357, "loss": 2.6272, "step": 135 }, { "epoch": 0.11568314726209462, "grad_norm": 0.443013995885849, "learning_rate": 0.0001942925953820725, "loss": 2.5722, "step": 136 }, { "epoch": 0.1165337586390218, "grad_norm": 0.4636158347129822, "learning_rate": 0.00019420253127012645, "loss": 2.8075, "step": 137 }, { "epoch": 0.11738437001594897, "grad_norm": 0.4271916151046753, "learning_rate": 0.00019411178330171937, "loss": 2.6875, "step": 138 }, { "epoch": 0.11823498139287612, "grad_norm": 0.47826603055000305, "learning_rate": 0.00019402035213562954, "loss": 2.7042, "step": 139 }, { "epoch": 0.1190855927698033, "grad_norm": 0.46729791164398193, "learning_rate": 0.0001939282384355949, "loss": 2.6663, "step": 140 }, { "epoch": 0.11993620414673047, "grad_norm": 0.4689824879169464, "learning_rate": 0.0001938354428703082, "loss": 2.6138, "step": 141 }, { "epoch": 0.12078681552365762, "grad_norm": 0.526096522808075, "learning_rate": 0.0001937419661134121, "loss": 2.9258, "step": 142 }, { "epoch": 0.1216374269005848, "grad_norm": 0.5075511932373047, "learning_rate": 0.0001936478088434944, "loss": 2.8021, "step": 143 }, { "epoch": 0.12248803827751197, "grad_norm": 0.5048439502716064, "learning_rate": 0.00019355297174408298, "loss": 2.6274, "step": 144 }, { "epoch": 0.12333864965443912, "grad_norm": 0.5787357687950134, "learning_rate": 0.00019345745550364087, "loss": 2.851, "step": 145 }, { "epoch": 0.1241892610313663, "grad_norm": 0.5641311407089233, "learning_rate": 0.00019336126081556134, "loss": 2.7681, "step": 146 }, { "epoch": 0.12503987240829345, "grad_norm": 0.5504147410392761, "learning_rate": 0.00019326438837816276, "loss": 2.6905, "step": 147 }, { "epoch": 0.12589048378522064, "grad_norm": 0.6101283431053162, "learning_rate": 0.00019316683889468358, "loss": 2.589, "step": 148 }, { "epoch": 0.1267410951621478, "grad_norm": 0.7153661847114563, "learning_rate": 0.00019306861307327725, "loss": 2.9563, "step": 149 }, { "epoch": 0.12759170653907495, "grad_norm": 0.7049738168716431, "learning_rate": 0.00019296971162700694, "loss": 2.8023, "step": 150 }, { "epoch": 0.12844231791600214, "grad_norm": 0.3282754421234131, "learning_rate": 0.00019287013527384062, "loss": 2.4278, "step": 151 }, { "epoch": 0.1292929292929293, "grad_norm": 0.350577712059021, "learning_rate": 0.00019276988473664557, "loss": 2.5845, "step": 152 }, { "epoch": 0.13014354066985645, "grad_norm": 0.32433176040649414, "learning_rate": 0.00019266896074318334, "loss": 2.6126, "step": 153 }, { "epoch": 0.13099415204678364, "grad_norm": 0.31844663619995117, "learning_rate": 0.00019256736402610436, "loss": 2.527, "step": 154 }, { "epoch": 0.1318447634237108, "grad_norm": 0.2559802830219269, "learning_rate": 0.00019246509532294266, "loss": 2.2437, "step": 155 }, { "epoch": 0.13269537480063795, "grad_norm": 0.28512275218963623, "learning_rate": 0.00019236215537611046, "loss": 2.5739, "step": 156 }, { "epoch": 0.13354598617756513, "grad_norm": 0.26634740829467773, "learning_rate": 0.00019225854493289286, "loss": 2.4485, "step": 157 }, { "epoch": 0.1343965975544923, "grad_norm": 0.2785400450229645, "learning_rate": 0.0001921542647454424, "loss": 2.7944, "step": 158 }, { "epoch": 0.13524720893141945, "grad_norm": 0.27485981583595276, "learning_rate": 0.00019204931557077355, "loss": 2.6518, "step": 159 }, { "epoch": 0.13609782030834663, "grad_norm": 0.2687318027019501, "learning_rate": 0.00019194369817075724, "loss": 2.6595, "step": 160 }, { "epoch": 0.1369484316852738, "grad_norm": 0.26418977975845337, "learning_rate": 0.00019183741331211537, "loss": 2.7045, "step": 161 }, { "epoch": 0.13779904306220095, "grad_norm": 0.28258347511291504, "learning_rate": 0.00019173046176641513, "loss": 2.5896, "step": 162 }, { "epoch": 0.13864965443912813, "grad_norm": 0.27390146255493164, "learning_rate": 0.00019162284431006358, "loss": 2.5566, "step": 163 }, { "epoch": 0.1395002658160553, "grad_norm": 0.2916048765182495, "learning_rate": 0.00019151456172430183, "loss": 2.609, "step": 164 }, { "epoch": 0.14035087719298245, "grad_norm": 0.30684247612953186, "learning_rate": 0.00019140561479519955, "loss": 2.5222, "step": 165 }, { "epoch": 0.14120148856990963, "grad_norm": 0.26836761832237244, "learning_rate": 0.00019129600431364897, "loss": 2.5891, "step": 166 }, { "epoch": 0.1420520999468368, "grad_norm": 0.2658300995826721, "learning_rate": 0.00019118573107535953, "loss": 2.644, "step": 167 }, { "epoch": 0.14290271132376395, "grad_norm": 0.2789425551891327, "learning_rate": 0.00019107479588085182, "loss": 2.5641, "step": 168 }, { "epoch": 0.14375332270069113, "grad_norm": 0.2909972071647644, "learning_rate": 0.00019096319953545185, "loss": 2.5982, "step": 169 }, { "epoch": 0.1446039340776183, "grad_norm": 0.3741363286972046, "learning_rate": 0.0001908509428492852, "loss": 2.6293, "step": 170 }, { "epoch": 0.14545454545454545, "grad_norm": 0.2989426851272583, "learning_rate": 0.0001907380266372712, "loss": 2.7364, "step": 171 }, { "epoch": 0.14630515683147263, "grad_norm": 0.28862622380256653, "learning_rate": 0.00019062445171911686, "loss": 2.5656, "step": 172 }, { "epoch": 0.1471557682083998, "grad_norm": 0.3215920329093933, "learning_rate": 0.0001905102189193112, "loss": 2.8443, "step": 173 }, { "epoch": 0.14800637958532695, "grad_norm": 0.2994636595249176, "learning_rate": 0.00019039532906711882, "loss": 2.7014, "step": 174 }, { "epoch": 0.14885699096225413, "grad_norm": 0.32109183073043823, "learning_rate": 0.00019027978299657436, "loss": 2.8364, "step": 175 }, { "epoch": 0.1497076023391813, "grad_norm": 0.30813783407211304, "learning_rate": 0.00019016358154647618, "loss": 2.5102, "step": 176 }, { "epoch": 0.15055821371610845, "grad_norm": 0.32674533128738403, "learning_rate": 0.00019004672556038028, "loss": 2.757, "step": 177 }, { "epoch": 0.15140882509303563, "grad_norm": 0.34680357575416565, "learning_rate": 0.00018992921588659422, "loss": 2.5228, "step": 178 }, { "epoch": 0.1522594364699628, "grad_norm": 0.35170817375183105, "learning_rate": 0.00018981105337817104, "loss": 2.6148, "step": 179 }, { "epoch": 0.15311004784688995, "grad_norm": 0.3741483986377716, "learning_rate": 0.00018969223889290284, "loss": 2.8025, "step": 180 }, { "epoch": 0.15396065922381713, "grad_norm": 0.4156269431114197, "learning_rate": 0.00018957277329331485, "loss": 2.72, "step": 181 }, { "epoch": 0.1548112706007443, "grad_norm": 0.3726477324962616, "learning_rate": 0.00018945265744665886, "loss": 2.6197, "step": 182 }, { "epoch": 0.15566188197767145, "grad_norm": 0.4135706424713135, "learning_rate": 0.00018933189222490726, "loss": 2.7176, "step": 183 }, { "epoch": 0.15651249335459863, "grad_norm": 0.38799911737442017, "learning_rate": 0.00018921047850474642, "loss": 2.5641, "step": 184 }, { "epoch": 0.1573631047315258, "grad_norm": 0.4622843265533447, "learning_rate": 0.00018908841716757042, "loss": 2.7626, "step": 185 }, { "epoch": 0.15821371610845295, "grad_norm": 0.4251146912574768, "learning_rate": 0.00018896570909947475, "loss": 2.6842, "step": 186 }, { "epoch": 0.15906432748538013, "grad_norm": 0.4628697335720062, "learning_rate": 0.00018884235519124972, "loss": 2.9476, "step": 187 }, { "epoch": 0.1599149388623073, "grad_norm": 0.5052159428596497, "learning_rate": 0.0001887183563383741, "loss": 2.769, "step": 188 }, { "epoch": 0.16076555023923444, "grad_norm": 0.4817435145378113, "learning_rate": 0.00018859371344100864, "loss": 2.6266, "step": 189 }, { "epoch": 0.16161616161616163, "grad_norm": 0.4751468598842621, "learning_rate": 0.0001884684274039894, "loss": 2.877, "step": 190 }, { "epoch": 0.1624667729930888, "grad_norm": 0.5826165676116943, "learning_rate": 0.00018834249913682132, "loss": 2.7308, "step": 191 }, { "epoch": 0.16331738437001594, "grad_norm": 0.5441760420799255, "learning_rate": 0.00018821592955367154, "loss": 2.6764, "step": 192 }, { "epoch": 0.1641679957469431, "grad_norm": 0.5005947947502136, "learning_rate": 0.00018808871957336275, "loss": 2.664, "step": 193 }, { "epoch": 0.1650186071238703, "grad_norm": 0.5205551981925964, "learning_rate": 0.00018796087011936665, "loss": 2.6192, "step": 194 }, { "epoch": 0.16586921850079744, "grad_norm": 0.5489931106567383, "learning_rate": 0.0001878323821197971, "loss": 2.5061, "step": 195 }, { "epoch": 0.1667198298777246, "grad_norm": 0.5525840520858765, "learning_rate": 0.00018770325650740345, "loss": 2.7474, "step": 196 }, { "epoch": 0.1675704412546518, "grad_norm": 0.5978725552558899, "learning_rate": 0.0001875734942195637, "loss": 2.6055, "step": 197 }, { "epoch": 0.16842105263157894, "grad_norm": 0.6148700714111328, "learning_rate": 0.0001874430961982778, "loss": 2.8352, "step": 198 }, { "epoch": 0.1692716640085061, "grad_norm": 0.5956620573997498, "learning_rate": 0.0001873120633901608, "loss": 2.7367, "step": 199 }, { "epoch": 0.17012227538543329, "grad_norm": 0.7082740664482117, "learning_rate": 0.0001871803967464358, "loss": 2.9437, "step": 200 }, { "epoch": 0.17097288676236044, "grad_norm": 0.32244405150413513, "learning_rate": 0.00018704809722292737, "loss": 2.3835, "step": 201 }, { "epoch": 0.1718234981392876, "grad_norm": 0.3367772102355957, "learning_rate": 0.00018691516578005427, "loss": 2.601, "step": 202 }, { "epoch": 0.17267410951621479, "grad_norm": 0.31732872128486633, "learning_rate": 0.00018678160338282272, "loss": 2.5894, "step": 203 }, { "epoch": 0.17352472089314194, "grad_norm": 0.27467650175094604, "learning_rate": 0.0001866474110008193, "loss": 2.4369, "step": 204 }, { "epoch": 0.1743753322700691, "grad_norm": 0.29726937413215637, "learning_rate": 0.00018651258960820385, "loss": 2.6123, "step": 205 }, { "epoch": 0.17522594364699628, "grad_norm": 0.27499106526374817, "learning_rate": 0.00018637714018370253, "loss": 2.5141, "step": 206 }, { "epoch": 0.17607655502392344, "grad_norm": 0.27535390853881836, "learning_rate": 0.00018624106371060067, "loss": 2.5148, "step": 207 }, { "epoch": 0.1769271664008506, "grad_norm": 0.2687024176120758, "learning_rate": 0.00018610436117673555, "loss": 2.6057, "step": 208 }, { "epoch": 0.17777777777777778, "grad_norm": 0.31320950388908386, "learning_rate": 0.00018596703357448934, "loss": 2.6813, "step": 209 }, { "epoch": 0.17862838915470494, "grad_norm": 0.25832033157348633, "learning_rate": 0.00018582908190078185, "loss": 2.4898, "step": 210 }, { "epoch": 0.1794790005316321, "grad_norm": 0.2806166410446167, "learning_rate": 0.00018569050715706325, "loss": 2.5762, "step": 211 }, { "epoch": 0.18032961190855928, "grad_norm": 0.26099708676338196, "learning_rate": 0.00018555131034930685, "loss": 2.5386, "step": 212 }, { "epoch": 0.18118022328548644, "grad_norm": 0.26140880584716797, "learning_rate": 0.00018541149248800184, "loss": 2.7159, "step": 213 }, { "epoch": 0.1820308346624136, "grad_norm": 0.2698177695274353, "learning_rate": 0.0001852710545881459, "loss": 2.5942, "step": 214 }, { "epoch": 0.18288144603934078, "grad_norm": 0.27240726351737976, "learning_rate": 0.00018512999766923772, "loss": 2.5377, "step": 215 }, { "epoch": 0.18373205741626794, "grad_norm": 0.2780822813510895, "learning_rate": 0.00018498832275526988, "loss": 2.6185, "step": 216 }, { "epoch": 0.1845826687931951, "grad_norm": 0.2713901400566101, "learning_rate": 0.00018484603087472109, "loss": 2.5802, "step": 217 }, { "epoch": 0.18543328017012228, "grad_norm": 0.2843954265117645, "learning_rate": 0.000184703123060549, "loss": 2.6404, "step": 218 }, { "epoch": 0.18628389154704944, "grad_norm": 0.2679051160812378, "learning_rate": 0.0001845596003501826, "loss": 2.6688, "step": 219 }, { "epoch": 0.1871345029239766, "grad_norm": 0.292568176984787, "learning_rate": 0.00018441546378551458, "loss": 2.6505, "step": 220 }, { "epoch": 0.18798511430090378, "grad_norm": 0.282326877117157, "learning_rate": 0.00018427071441289388, "loss": 2.6299, "step": 221 }, { "epoch": 0.18883572567783094, "grad_norm": 0.2853985130786896, "learning_rate": 0.00018412535328311814, "loss": 2.8143, "step": 222 }, { "epoch": 0.1896863370547581, "grad_norm": 0.2786814868450165, "learning_rate": 0.00018397938145142591, "loss": 2.6007, "step": 223 }, { "epoch": 0.19053694843168528, "grad_norm": 0.42460358142852783, "learning_rate": 0.0001838327999774892, "loss": 2.7891, "step": 224 }, { "epoch": 0.19138755980861244, "grad_norm": 0.30478838086128235, "learning_rate": 0.00018368560992540562, "loss": 2.4551, "step": 225 }, { "epoch": 0.1922381711855396, "grad_norm": 0.3402044177055359, "learning_rate": 0.00018353781236369064, "loss": 2.9191, "step": 226 }, { "epoch": 0.19308878256246678, "grad_norm": 0.33662521839141846, "learning_rate": 0.00018338940836527004, "loss": 2.5606, "step": 227 }, { "epoch": 0.19393939393939394, "grad_norm": 0.34461426734924316, "learning_rate": 0.0001832403990074719, "loss": 2.714, "step": 228 }, { "epoch": 0.1947900053163211, "grad_norm": 0.342184454202652, "learning_rate": 0.0001830907853720188, "loss": 2.6936, "step": 229 }, { "epoch": 0.19564061669324828, "grad_norm": 0.3557281494140625, "learning_rate": 0.0001829405685450202, "loss": 2.6663, "step": 230 }, { "epoch": 0.19649122807017544, "grad_norm": 0.38674700260162354, "learning_rate": 0.0001827897496169642, "loss": 2.7257, "step": 231 }, { "epoch": 0.1973418394471026, "grad_norm": 0.3849089741706848, "learning_rate": 0.00018263832968271, "loss": 2.7178, "step": 232 }, { "epoch": 0.19819245082402978, "grad_norm": 0.4508901834487915, "learning_rate": 0.00018248630984147955, "loss": 2.7947, "step": 233 }, { "epoch": 0.19904306220095694, "grad_norm": 0.39502936601638794, "learning_rate": 0.00018233369119684996, "loss": 2.5885, "step": 234 }, { "epoch": 0.1998936735778841, "grad_norm": 0.4287837743759155, "learning_rate": 0.00018218047485674523, "loss": 2.6911, "step": 235 }, { "epoch": 0.20074428495481128, "grad_norm": 0.4257849454879761, "learning_rate": 0.00018202666193342833, "loss": 2.8803, "step": 236 }, { "epoch": 0.20159489633173844, "grad_norm": 0.4459477961063385, "learning_rate": 0.00018187225354349295, "loss": 2.8352, "step": 237 }, { "epoch": 0.2024455077086656, "grad_norm": 0.4430312514305115, "learning_rate": 0.0001817172508078557, "loss": 2.7517, "step": 238 }, { "epoch": 0.20329611908559278, "grad_norm": 0.4465429484844208, "learning_rate": 0.00018156165485174773, "loss": 2.7119, "step": 239 }, { "epoch": 0.20414673046251994, "grad_norm": 0.4532601833343506, "learning_rate": 0.00018140546680470659, "loss": 2.7346, "step": 240 }, { "epoch": 0.2049973418394471, "grad_norm": 0.4750036299228668, "learning_rate": 0.00018124868780056814, "loss": 2.6113, "step": 241 }, { "epoch": 0.20584795321637428, "grad_norm": 0.5072234272956848, "learning_rate": 0.00018109131897745822, "loss": 2.844, "step": 242 }, { "epoch": 0.20669856459330144, "grad_norm": 0.5094662308692932, "learning_rate": 0.00018093336147778438, "loss": 2.7737, "step": 243 }, { "epoch": 0.2075491759702286, "grad_norm": 0.606842577457428, "learning_rate": 0.00018077481644822768, "loss": 2.6153, "step": 244 }, { "epoch": 0.20839978734715578, "grad_norm": 0.5311163067817688, "learning_rate": 0.00018061568503973435, "loss": 2.6038, "step": 245 }, { "epoch": 0.20925039872408294, "grad_norm": 0.5758761167526245, "learning_rate": 0.00018045596840750723, "loss": 2.6446, "step": 246 }, { "epoch": 0.2101010101010101, "grad_norm": 0.598297119140625, "learning_rate": 0.00018029566771099776, "loss": 2.7002, "step": 247 }, { "epoch": 0.21095162147793728, "grad_norm": 0.6635774970054626, "learning_rate": 0.00018013478411389716, "loss": 2.8011, "step": 248 }, { "epoch": 0.21180223285486444, "grad_norm": 0.6850919723510742, "learning_rate": 0.00017997331878412835, "loss": 2.8903, "step": 249 }, { "epoch": 0.2126528442317916, "grad_norm": 0.7298348546028137, "learning_rate": 0.00017981127289383716, "loss": 2.9483, "step": 250 }, { "epoch": 0.21350345560871878, "grad_norm": 0.33354559540748596, "learning_rate": 0.00017964864761938404, "loss": 2.4727, "step": 251 }, { "epoch": 0.21435406698564594, "grad_norm": 0.3557465374469757, "learning_rate": 0.00017948544414133534, "loss": 2.5058, "step": 252 }, { "epoch": 0.2152046783625731, "grad_norm": 0.3230442702770233, "learning_rate": 0.00017932166364445498, "loss": 2.5422, "step": 253 }, { "epoch": 0.21605528973950028, "grad_norm": 0.28668278455734253, "learning_rate": 0.0001791573073176956, "loss": 2.3173, "step": 254 }, { "epoch": 0.21690590111642744, "grad_norm": 0.30019721388816833, "learning_rate": 0.00017899237635419002, "loss": 2.6444, "step": 255 }, { "epoch": 0.2177565124933546, "grad_norm": 0.285314679145813, "learning_rate": 0.0001788268719512427, "loss": 2.5319, "step": 256 }, { "epoch": 0.21860712387028178, "grad_norm": 0.27584996819496155, "learning_rate": 0.00017866079531032088, "loss": 2.6496, "step": 257 }, { "epoch": 0.21945773524720893, "grad_norm": 0.2874069809913635, "learning_rate": 0.0001784941476370459, "loss": 2.5156, "step": 258 }, { "epoch": 0.2203083466241361, "grad_norm": 0.26786255836486816, "learning_rate": 0.00017832693014118448, "loss": 2.6211, "step": 259 }, { "epoch": 0.22115895800106328, "grad_norm": 0.2633914351463318, "learning_rate": 0.0001781591440366399, "loss": 2.5811, "step": 260 }, { "epoch": 0.22200956937799043, "grad_norm": 0.2724866569042206, "learning_rate": 0.00017799079054144334, "loss": 2.5904, "step": 261 }, { "epoch": 0.2228601807549176, "grad_norm": 0.29333001375198364, "learning_rate": 0.00017782187087774477, "loss": 2.7581, "step": 262 }, { "epoch": 0.22371079213184478, "grad_norm": 0.2735550105571747, "learning_rate": 0.00017765238627180424, "loss": 2.7114, "step": 263 }, { "epoch": 0.22456140350877193, "grad_norm": 0.2721397280693054, "learning_rate": 0.00017748233795398307, "loss": 2.5991, "step": 264 }, { "epoch": 0.2254120148856991, "grad_norm": 0.25755858421325684, "learning_rate": 0.0001773117271587346, "loss": 2.5786, "step": 265 }, { "epoch": 0.22626262626262628, "grad_norm": 0.25772804021835327, "learning_rate": 0.00017714055512459565, "loss": 2.488, "step": 266 }, { "epoch": 0.22711323763955343, "grad_norm": 0.2766227424144745, "learning_rate": 0.0001769688230941772, "loss": 2.8924, "step": 267 }, { "epoch": 0.2279638490164806, "grad_norm": 0.26846593618392944, "learning_rate": 0.00017679653231415552, "loss": 2.5783, "step": 268 }, { "epoch": 0.22881446039340775, "grad_norm": 0.26374372839927673, "learning_rate": 0.00017662368403526302, "loss": 2.4675, "step": 269 }, { "epoch": 0.22966507177033493, "grad_norm": 0.28237268328666687, "learning_rate": 0.0001764502795122793, "loss": 2.5994, "step": 270 }, { "epoch": 0.2305156831472621, "grad_norm": 0.2786102890968323, "learning_rate": 0.00017627632000402193, "loss": 2.514, "step": 271 }, { "epoch": 0.23136629452418925, "grad_norm": 0.27646180987358093, "learning_rate": 0.00017610180677333739, "loss": 2.5673, "step": 272 }, { "epoch": 0.23221690590111643, "grad_norm": 0.3052549660205841, "learning_rate": 0.00017592674108709186, "loss": 2.5345, "step": 273 }, { "epoch": 0.2330675172780436, "grad_norm": 0.30554690957069397, "learning_rate": 0.00017575112421616202, "loss": 2.709, "step": 274 }, { "epoch": 0.23391812865497075, "grad_norm": 0.3219161331653595, "learning_rate": 0.00017557495743542585, "loss": 2.6825, "step": 275 }, { "epoch": 0.23476874003189793, "grad_norm": 0.31834957003593445, "learning_rate": 0.0001753982420237533, "loss": 2.7017, "step": 276 }, { "epoch": 0.2356193514088251, "grad_norm": 0.30264872312545776, "learning_rate": 0.00017522097926399722, "loss": 2.3725, "step": 277 }, { "epoch": 0.23646996278575225, "grad_norm": 0.3283548951148987, "learning_rate": 0.00017504317044298367, "loss": 2.6217, "step": 278 }, { "epoch": 0.23732057416267943, "grad_norm": 0.33564746379852295, "learning_rate": 0.00017486481685150302, "loss": 2.5738, "step": 279 }, { "epoch": 0.2381711855396066, "grad_norm": 0.37258434295654297, "learning_rate": 0.0001746859197843002, "loss": 2.783, "step": 280 }, { "epoch": 0.23902179691653375, "grad_norm": 0.3897363245487213, "learning_rate": 0.0001745064805400656, "loss": 2.7908, "step": 281 }, { "epoch": 0.23987240829346093, "grad_norm": 0.3756699562072754, "learning_rate": 0.00017432650042142536, "loss": 2.5944, "step": 282 }, { "epoch": 0.2407230196703881, "grad_norm": 0.3787755072116852, "learning_rate": 0.00017414598073493216, "loss": 2.7574, "step": 283 }, { "epoch": 0.24157363104731525, "grad_norm": 0.38891106843948364, "learning_rate": 0.0001739649227910556, "loss": 2.8635, "step": 284 }, { "epoch": 0.24242424242424243, "grad_norm": 0.40293633937835693, "learning_rate": 0.00017378332790417273, "loss": 2.729, "step": 285 }, { "epoch": 0.2432748538011696, "grad_norm": 0.414109468460083, "learning_rate": 0.00017360119739255852, "loss": 2.6077, "step": 286 }, { "epoch": 0.24412546517809675, "grad_norm": 0.42549028992652893, "learning_rate": 0.0001734185325783762, "loss": 2.7812, "step": 287 }, { "epoch": 0.24497607655502393, "grad_norm": 0.42882055044174194, "learning_rate": 0.00017323533478766777, "loss": 2.7653, "step": 288 }, { "epoch": 0.2458266879319511, "grad_norm": 0.42119139432907104, "learning_rate": 0.00017305160535034436, "loss": 2.5355, "step": 289 }, { "epoch": 0.24667729930887825, "grad_norm": 0.4749990999698639, "learning_rate": 0.0001728673456001766, "loss": 2.7885, "step": 290 }, { "epoch": 0.24752791068580543, "grad_norm": 0.4682268500328064, "learning_rate": 0.00017268255687478469, "loss": 2.6402, "step": 291 }, { "epoch": 0.2483785220627326, "grad_norm": 0.4854019284248352, "learning_rate": 0.00017249724051562906, "loss": 2.7255, "step": 292 }, { "epoch": 0.24922913343965974, "grad_norm": 0.5112527012825012, "learning_rate": 0.00017231139786800042, "loss": 2.8374, "step": 293 }, { "epoch": 0.2500797448165869, "grad_norm": 0.5242344737052917, "learning_rate": 0.0001721250302810101, "loss": 2.9178, "step": 294 }, { "epoch": 0.2500797448165869, "eval_loss": 2.688343048095703, "eval_runtime": 80.6326, "eval_samples_per_second": 12.278, "eval_steps_per_second": 6.139, "step": 294 }, { "epoch": 0.2509303561935141, "grad_norm": 0.6918848156929016, "learning_rate": 0.00017193813910758, "loss": 2.6556, "step": 295 }, { "epoch": 0.25178096757044127, "grad_norm": 0.5682982802391052, "learning_rate": 0.00017175072570443312, "loss": 2.6581, "step": 296 }, { "epoch": 0.25263157894736843, "grad_norm": 0.6087559461593628, "learning_rate": 0.00017156279143208352, "loss": 2.5665, "step": 297 }, { "epoch": 0.2534821903242956, "grad_norm": 0.6545628309249878, "learning_rate": 0.00017137433765482642, "loss": 2.8215, "step": 298 }, { "epoch": 0.25433280170122274, "grad_norm": 0.6754540801048279, "learning_rate": 0.00017118536574072842, "loss": 2.7991, "step": 299 }, { "epoch": 0.2551834130781499, "grad_norm": 0.6926846504211426, "learning_rate": 0.0001709958770616174, "loss": 2.7371, "step": 300 }, { "epoch": 0.2560340244550771, "grad_norm": 0.32997071743011475, "learning_rate": 0.00017080587299307283, "loss": 2.7739, "step": 301 }, { "epoch": 0.25688463583200427, "grad_norm": 0.3260290324687958, "learning_rate": 0.0001706153549144154, "loss": 2.5971, "step": 302 }, { "epoch": 0.25773524720893143, "grad_norm": 0.3378421366214752, "learning_rate": 0.00017042432420869732, "loss": 2.7588, "step": 303 }, { "epoch": 0.2585858585858586, "grad_norm": 0.27844691276550293, "learning_rate": 0.0001702327822626922, "loss": 2.6493, "step": 304 }, { "epoch": 0.25943646996278574, "grad_norm": 0.29100462794303894, "learning_rate": 0.00017004073046688497, "loss": 2.6397, "step": 305 }, { "epoch": 0.2602870813397129, "grad_norm": 0.2764577865600586, "learning_rate": 0.00016984817021546177, "loss": 2.4199, "step": 306 }, { "epoch": 0.26113769271664006, "grad_norm": 0.26870644092559814, "learning_rate": 0.00016965510290629972, "loss": 2.2552, "step": 307 }, { "epoch": 0.26198830409356727, "grad_norm": 0.2770349979400635, "learning_rate": 0.00016946152994095704, "loss": 2.6118, "step": 308 }, { "epoch": 0.2628389154704944, "grad_norm": 0.27041003108024597, "learning_rate": 0.00016926745272466268, "loss": 2.5329, "step": 309 }, { "epoch": 0.2636895268474216, "grad_norm": 0.25608015060424805, "learning_rate": 0.00016907287266630614, "loss": 2.3411, "step": 310 }, { "epoch": 0.26454013822434874, "grad_norm": 0.2750420570373535, "learning_rate": 0.00016887779117842725, "loss": 2.6393, "step": 311 }, { "epoch": 0.2653907496012759, "grad_norm": 0.26276537775993347, "learning_rate": 0.00016868220967720604, "loss": 2.3616, "step": 312 }, { "epoch": 0.26624136097820306, "grad_norm": 0.2735307514667511, "learning_rate": 0.00016848612958245216, "loss": 2.5156, "step": 313 }, { "epoch": 0.26709197235513027, "grad_norm": 0.32951095700263977, "learning_rate": 0.00016828955231759497, "loss": 2.5329, "step": 314 }, { "epoch": 0.2679425837320574, "grad_norm": 0.2762184143066406, "learning_rate": 0.00016809247930967282, "loss": 2.6873, "step": 315 }, { "epoch": 0.2687931951089846, "grad_norm": 0.2802570164203644, "learning_rate": 0.000167894911989323, "loss": 2.5532, "step": 316 }, { "epoch": 0.26964380648591174, "grad_norm": 0.26968276500701904, "learning_rate": 0.0001676968517907712, "loss": 2.602, "step": 317 }, { "epoch": 0.2704944178628389, "grad_norm": 0.27560874819755554, "learning_rate": 0.00016749830015182107, "loss": 2.5003, "step": 318 }, { "epoch": 0.27134502923976606, "grad_norm": 0.288411021232605, "learning_rate": 0.00016729925851384386, "loss": 2.6859, "step": 319 }, { "epoch": 0.27219564061669327, "grad_norm": 0.2999224364757538, "learning_rate": 0.00016709972832176797, "loss": 2.8356, "step": 320 }, { "epoch": 0.2730462519936204, "grad_norm": 0.2956329882144928, "learning_rate": 0.0001668997110240684, "loss": 2.6157, "step": 321 }, { "epoch": 0.2738968633705476, "grad_norm": 0.30924192070961, "learning_rate": 0.00016669920807275623, "loss": 2.8421, "step": 322 }, { "epoch": 0.27474747474747474, "grad_norm": 0.31185418367385864, "learning_rate": 0.00016649822092336812, "loss": 2.758, "step": 323 }, { "epoch": 0.2755980861244019, "grad_norm": 0.3129124045372009, "learning_rate": 0.0001662967510349558, "loss": 2.6861, "step": 324 }, { "epoch": 0.27644869750132905, "grad_norm": 0.3300238251686096, "learning_rate": 0.00016609479987007527, "loss": 2.8284, "step": 325 }, { "epoch": 0.27729930887825627, "grad_norm": 0.3459511399269104, "learning_rate": 0.00016589236889477646, "loss": 2.6454, "step": 326 }, { "epoch": 0.2781499202551834, "grad_norm": 0.3566714823246002, "learning_rate": 0.00016568945957859236, "loss": 2.399, "step": 327 }, { "epoch": 0.2790005316321106, "grad_norm": 0.3650771379470825, "learning_rate": 0.00016548607339452853, "loss": 2.7529, "step": 328 }, { "epoch": 0.27985114300903774, "grad_norm": 0.35940608382225037, "learning_rate": 0.00016528221181905217, "loss": 2.5894, "step": 329 }, { "epoch": 0.2807017543859649, "grad_norm": 0.4018422067165375, "learning_rate": 0.0001650778763320817, "loss": 2.7001, "step": 330 }, { "epoch": 0.28155236576289205, "grad_norm": 0.3867095708847046, "learning_rate": 0.00016487306841697578, "loss": 2.9919, "step": 331 }, { "epoch": 0.28240297713981927, "grad_norm": 0.5714160799980164, "learning_rate": 0.0001646677895605227, "loss": 2.9366, "step": 332 }, { "epoch": 0.2832535885167464, "grad_norm": 0.3697023391723633, "learning_rate": 0.00016446204125292942, "loss": 2.653, "step": 333 }, { "epoch": 0.2841041998936736, "grad_norm": 0.4222877323627472, "learning_rate": 0.00016425582498781087, "loss": 2.8557, "step": 334 }, { "epoch": 0.28495481127060074, "grad_norm": 0.40899163484573364, "learning_rate": 0.0001640491422621792, "loss": 2.7875, "step": 335 }, { "epoch": 0.2858054226475279, "grad_norm": 0.44694027304649353, "learning_rate": 0.00016384199457643262, "loss": 2.8616, "step": 336 }, { "epoch": 0.28665603402445505, "grad_norm": 0.45921215415000916, "learning_rate": 0.00016363438343434483, "loss": 2.5821, "step": 337 }, { "epoch": 0.28750664540138227, "grad_norm": 0.4156991243362427, "learning_rate": 0.00016342631034305384, "loss": 2.7228, "step": 338 }, { "epoch": 0.2883572567783094, "grad_norm": 0.43912479281425476, "learning_rate": 0.00016321777681305125, "loss": 2.7119, "step": 339 }, { "epoch": 0.2892078681552366, "grad_norm": 0.4638505280017853, "learning_rate": 0.00016300878435817113, "loss": 2.6832, "step": 340 }, { "epoch": 0.29005847953216374, "grad_norm": 0.4912424087524414, "learning_rate": 0.00016279933449557906, "loss": 2.6583, "step": 341 }, { "epoch": 0.2909090909090909, "grad_norm": 0.5067815780639648, "learning_rate": 0.00016258942874576118, "loss": 2.7723, "step": 342 }, { "epoch": 0.29175970228601805, "grad_norm": 0.49635544419288635, "learning_rate": 0.0001623790686325131, "loss": 2.7351, "step": 343 }, { "epoch": 0.29261031366294527, "grad_norm": 0.49465370178222656, "learning_rate": 0.00016216825568292885, "loss": 2.7251, "step": 344 }, { "epoch": 0.2934609250398724, "grad_norm": 0.558665931224823, "learning_rate": 0.00016195699142738975, "loss": 2.5475, "step": 345 }, { "epoch": 0.2943115364167996, "grad_norm": 0.6477727890014648, "learning_rate": 0.00016174527739955342, "loss": 2.7069, "step": 346 }, { "epoch": 0.29516214779372674, "grad_norm": 0.6656542420387268, "learning_rate": 0.00016153311513634257, "loss": 3.0185, "step": 347 }, { "epoch": 0.2960127591706539, "grad_norm": 0.7209298610687256, "learning_rate": 0.0001613205061779337, "loss": 2.8225, "step": 348 }, { "epoch": 0.29686337054758105, "grad_norm": 0.6285322308540344, "learning_rate": 0.0001611074520677462, "loss": 2.9088, "step": 349 }, { "epoch": 0.29771398192450826, "grad_norm": 0.7612189054489136, "learning_rate": 0.00016089395435243105, "loss": 2.9744, "step": 350 }, { "epoch": 0.2985645933014354, "grad_norm": 0.3265992999076843, "learning_rate": 0.00016068001458185936, "loss": 2.3931, "step": 351 }, { "epoch": 0.2994152046783626, "grad_norm": 0.3179806172847748, "learning_rate": 0.00016046563430911146, "loss": 2.6212, "step": 352 }, { "epoch": 0.30026581605528974, "grad_norm": 0.31296634674072266, "learning_rate": 0.00016025081509046544, "loss": 2.5008, "step": 353 }, { "epoch": 0.3011164274322169, "grad_norm": 0.283408522605896, "learning_rate": 0.00016003555848538586, "loss": 2.3946, "step": 354 }, { "epoch": 0.30196703880914405, "grad_norm": 0.2742927074432373, "learning_rate": 0.00015981986605651248, "loss": 2.5154, "step": 355 }, { "epoch": 0.30281765018607126, "grad_norm": 0.29107552766799927, "learning_rate": 0.00015960373936964892, "loss": 2.505, "step": 356 }, { "epoch": 0.3036682615629984, "grad_norm": 0.3244473338127136, "learning_rate": 0.0001593871799937512, "loss": 2.5244, "step": 357 }, { "epoch": 0.3045188729399256, "grad_norm": 0.27494120597839355, "learning_rate": 0.0001591701895009164, "loss": 2.4161, "step": 358 }, { "epoch": 0.30536948431685274, "grad_norm": 0.28784480690956116, "learning_rate": 0.00015895276946637136, "loss": 2.6434, "step": 359 }, { "epoch": 0.3062200956937799, "grad_norm": 0.28096505999565125, "learning_rate": 0.00015873492146846108, "loss": 2.6784, "step": 360 }, { "epoch": 0.30707070707070705, "grad_norm": 0.2602795362472534, "learning_rate": 0.00015851664708863735, "loss": 2.2712, "step": 361 }, { "epoch": 0.30792131844763426, "grad_norm": 0.28796225786209106, "learning_rate": 0.0001582979479114472, "loss": 2.8673, "step": 362 }, { "epoch": 0.3087719298245614, "grad_norm": 0.273456871509552, "learning_rate": 0.00015807882552452154, "loss": 2.6931, "step": 363 }, { "epoch": 0.3096225412014886, "grad_norm": 0.29190319776535034, "learning_rate": 0.00015785928151856347, "loss": 2.5932, "step": 364 }, { "epoch": 0.31047315257841573, "grad_norm": 0.290829598903656, "learning_rate": 0.0001576393174873368, "loss": 2.6889, "step": 365 }, { "epoch": 0.3113237639553429, "grad_norm": 0.27631402015686035, "learning_rate": 0.0001574189350276545, "loss": 2.6508, "step": 366 }, { "epoch": 0.31217437533227005, "grad_norm": 0.30464449524879456, "learning_rate": 0.00015719813573936712, "loss": 2.4918, "step": 367 }, { "epoch": 0.31302498670919726, "grad_norm": 0.2622525095939636, "learning_rate": 0.00015697692122535107, "loss": 2.4657, "step": 368 }, { "epoch": 0.3138755980861244, "grad_norm": 0.2790607810020447, "learning_rate": 0.0001567552930914972, "loss": 2.78, "step": 369 }, { "epoch": 0.3147262094630516, "grad_norm": 0.27601122856140137, "learning_rate": 0.00015653325294669884, "loss": 2.6908, "step": 370 }, { "epoch": 0.31557682083997873, "grad_norm": 0.3054703176021576, "learning_rate": 0.0001563108024028404, "loss": 2.6602, "step": 371 }, { "epoch": 0.3164274322169059, "grad_norm": 0.2979956567287445, "learning_rate": 0.00015608794307478546, "loss": 2.8034, "step": 372 }, { "epoch": 0.31727804359383305, "grad_norm": 0.3022500276565552, "learning_rate": 0.00015586467658036524, "loss": 2.6491, "step": 373 }, { "epoch": 0.31812865497076026, "grad_norm": 0.3278852701187134, "learning_rate": 0.0001556410045403667, "loss": 2.5928, "step": 374 }, { "epoch": 0.3189792663476874, "grad_norm": 0.3023948073387146, "learning_rate": 0.0001554169285785208, "loss": 2.6165, "step": 375 }, { "epoch": 0.3198298777246146, "grad_norm": 0.33025047183036804, "learning_rate": 0.00015519245032149083, "loss": 2.7669, "step": 376 }, { "epoch": 0.32068048910154173, "grad_norm": 0.3081379532814026, "learning_rate": 0.0001549675713988604, "loss": 2.5991, "step": 377 }, { "epoch": 0.3215311004784689, "grad_norm": 0.35036811232566833, "learning_rate": 0.0001547422934431218, "loss": 2.6891, "step": 378 }, { "epoch": 0.32238171185539605, "grad_norm": 0.399915486574173, "learning_rate": 0.00015451661808966405, "loss": 2.8271, "step": 379 }, { "epoch": 0.32323232323232326, "grad_norm": 0.3355950713157654, "learning_rate": 0.00015429054697676107, "loss": 2.3574, "step": 380 }, { "epoch": 0.3240829346092504, "grad_norm": 0.35686787962913513, "learning_rate": 0.00015406408174555976, "loss": 2.6926, "step": 381 }, { "epoch": 0.3249335459861776, "grad_norm": 0.3730961084365845, "learning_rate": 0.00015383722404006806, "loss": 2.7418, "step": 382 }, { "epoch": 0.32578415736310473, "grad_norm": 0.36210712790489197, "learning_rate": 0.00015360997550714305, "loss": 2.7188, "step": 383 }, { "epoch": 0.3266347687400319, "grad_norm": 0.4164154827594757, "learning_rate": 0.0001533823377964791, "loss": 2.8182, "step": 384 }, { "epoch": 0.32748538011695905, "grad_norm": 0.3890111446380615, "learning_rate": 0.0001531543125605956, "loss": 2.6873, "step": 385 }, { "epoch": 0.3283359914938862, "grad_norm": 0.3929746747016907, "learning_rate": 0.0001529259014548253, "loss": 2.6169, "step": 386 }, { "epoch": 0.3291866028708134, "grad_norm": 0.4354581832885742, "learning_rate": 0.0001526971061373021, "loss": 2.681, "step": 387 }, { "epoch": 0.3300372142477406, "grad_norm": 0.4014440178871155, "learning_rate": 0.00015246792826894906, "loss": 2.6601, "step": 388 }, { "epoch": 0.33088782562466773, "grad_norm": 0.41521206498146057, "learning_rate": 0.00015223836951346634, "loss": 2.7763, "step": 389 }, { "epoch": 0.3317384370015949, "grad_norm": 0.43811067938804626, "learning_rate": 0.00015200843153731906, "loss": 2.7373, "step": 390 }, { "epoch": 0.33258904837852205, "grad_norm": 0.4687197208404541, "learning_rate": 0.0001517781160097254, "loss": 2.6432, "step": 391 }, { "epoch": 0.3334396597554492, "grad_norm": 0.4998420178890228, "learning_rate": 0.00015154742460264425, "loss": 2.6434, "step": 392 }, { "epoch": 0.3342902711323764, "grad_norm": 0.4983424246311188, "learning_rate": 0.0001513163589907632, "loss": 2.7619, "step": 393 }, { "epoch": 0.3351408825093036, "grad_norm": 0.5066902041435242, "learning_rate": 0.00015108492085148632, "loss": 2.6515, "step": 394 }, { "epoch": 0.33599149388623073, "grad_norm": 0.5106616020202637, "learning_rate": 0.00015085311186492206, "loss": 2.6288, "step": 395 }, { "epoch": 0.3368421052631579, "grad_norm": 0.6036600470542908, "learning_rate": 0.00015062093371387097, "loss": 2.6391, "step": 396 }, { "epoch": 0.33769271664008504, "grad_norm": 0.6260906457901001, "learning_rate": 0.00015038838808381354, "loss": 2.8623, "step": 397 }, { "epoch": 0.3385433280170122, "grad_norm": 0.6070663332939148, "learning_rate": 0.00015015547666289797, "loss": 2.615, "step": 398 }, { "epoch": 0.3393939393939394, "grad_norm": 0.66013503074646, "learning_rate": 0.00014992220114192785, "loss": 2.6865, "step": 399 }, { "epoch": 0.34024455077086657, "grad_norm": 0.7371327877044678, "learning_rate": 0.00014968856321434998, "loss": 2.6959, "step": 400 }, { "epoch": 0.34109516214779373, "grad_norm": 0.3081108629703522, "learning_rate": 0.00014945456457624197, "loss": 2.321, "step": 401 }, { "epoch": 0.3419457735247209, "grad_norm": 0.37835538387298584, "learning_rate": 0.0001492202069263, "loss": 2.5144, "step": 402 }, { "epoch": 0.34279638490164804, "grad_norm": 0.3184291422367096, "learning_rate": 0.00014898549196582645, "loss": 2.5783, "step": 403 }, { "epoch": 0.3436469962785752, "grad_norm": 0.3378361165523529, "learning_rate": 0.00014875042139871766, "loss": 2.667, "step": 404 }, { "epoch": 0.3444976076555024, "grad_norm": 0.285756379365921, "learning_rate": 0.00014851499693145135, "loss": 2.5316, "step": 405 }, { "epoch": 0.34534821903242957, "grad_norm": 0.28837427496910095, "learning_rate": 0.00014827922027307451, "loss": 2.5012, "step": 406 }, { "epoch": 0.34619883040935673, "grad_norm": 0.30004727840423584, "learning_rate": 0.0001480430931351906, "loss": 2.5237, "step": 407 }, { "epoch": 0.3470494417862839, "grad_norm": 0.28527382016181946, "learning_rate": 0.00014780661723194757, "loss": 2.5067, "step": 408 }, { "epoch": 0.34790005316321104, "grad_norm": 0.26983842253685, "learning_rate": 0.00014756979428002514, "loss": 2.5577, "step": 409 }, { "epoch": 0.3487506645401382, "grad_norm": 0.2686617076396942, "learning_rate": 0.00014733262599862234, "loss": 2.5267, "step": 410 }, { "epoch": 0.3496012759170654, "grad_norm": 0.26147857308387756, "learning_rate": 0.00014709511410944523, "loss": 2.4459, "step": 411 }, { "epoch": 0.35045188729399257, "grad_norm": 0.29142996668815613, "learning_rate": 0.00014685726033669412, "loss": 2.7246, "step": 412 }, { "epoch": 0.3513024986709197, "grad_norm": 0.29214030504226685, "learning_rate": 0.00014661906640705129, "loss": 2.6422, "step": 413 }, { "epoch": 0.3521531100478469, "grad_norm": 0.2727803885936737, "learning_rate": 0.00014638053404966836, "loss": 2.6416, "step": 414 }, { "epoch": 0.35300372142477404, "grad_norm": 0.2637098431587219, "learning_rate": 0.0001461416649961537, "loss": 2.5555, "step": 415 }, { "epoch": 0.3538543328017012, "grad_norm": 0.2736997902393341, "learning_rate": 0.00014590246098055996, "loss": 2.7423, "step": 416 }, { "epoch": 0.3547049441786284, "grad_norm": 0.2691054940223694, "learning_rate": 0.0001456629237393713, "loss": 2.7818, "step": 417 }, { "epoch": 0.35555555555555557, "grad_norm": 0.28304579854011536, "learning_rate": 0.0001454230550114911, "loss": 2.554, "step": 418 }, { "epoch": 0.3564061669324827, "grad_norm": 0.2698013484477997, "learning_rate": 0.00014518285653822898, "loss": 2.5155, "step": 419 }, { "epoch": 0.3572567783094099, "grad_norm": 0.25874024629592896, "learning_rate": 0.00014494233006328837, "loss": 2.4841, "step": 420 }, { "epoch": 0.35810738968633704, "grad_norm": 0.2918216586112976, "learning_rate": 0.00014470147733275387, "loss": 2.7489, "step": 421 }, { "epoch": 0.3589580010632642, "grad_norm": 0.28201210498809814, "learning_rate": 0.0001444603000950784, "loss": 2.5709, "step": 422 }, { "epoch": 0.3598086124401914, "grad_norm": 0.3077748119831085, "learning_rate": 0.0001442188001010707, "loss": 2.6089, "step": 423 }, { "epoch": 0.36065922381711857, "grad_norm": 0.2970117926597595, "learning_rate": 0.00014397697910388248, "loss": 2.6171, "step": 424 }, { "epoch": 0.3615098351940457, "grad_norm": 0.302947074174881, "learning_rate": 0.00014373483885899582, "loss": 2.4033, "step": 425 }, { "epoch": 0.3623604465709729, "grad_norm": 0.31938815116882324, "learning_rate": 0.00014349238112421024, "loss": 2.7042, "step": 426 }, { "epoch": 0.36321105794790004, "grad_norm": 0.3481922447681427, "learning_rate": 0.00014324960765963018, "loss": 2.6479, "step": 427 }, { "epoch": 0.3640616693248272, "grad_norm": 0.3142179846763611, "learning_rate": 0.00014300652022765207, "loss": 2.5285, "step": 428 }, { "epoch": 0.3649122807017544, "grad_norm": 0.3226439952850342, "learning_rate": 0.00014276312059295147, "loss": 2.5389, "step": 429 }, { "epoch": 0.36576289207868157, "grad_norm": 0.401301771402359, "learning_rate": 0.00014251941052247045, "loss": 2.8749, "step": 430 }, { "epoch": 0.3666135034556087, "grad_norm": 0.3587849736213684, "learning_rate": 0.00014227539178540463, "loss": 2.7549, "step": 431 }, { "epoch": 0.3674641148325359, "grad_norm": 0.38098907470703125, "learning_rate": 0.00014203106615319038, "loss": 2.6159, "step": 432 }, { "epoch": 0.36831472620946304, "grad_norm": 0.402971088886261, "learning_rate": 0.00014178643539949196, "loss": 2.6626, "step": 433 }, { "epoch": 0.3691653375863902, "grad_norm": 0.3960564136505127, "learning_rate": 0.00014154150130018866, "loss": 2.8099, "step": 434 }, { "epoch": 0.3700159489633174, "grad_norm": 0.41555696725845337, "learning_rate": 0.00014129626563336178, "loss": 2.7282, "step": 435 }, { "epoch": 0.37086656034024457, "grad_norm": 0.3991285264492035, "learning_rate": 0.000141050730179282, "loss": 2.5909, "step": 436 }, { "epoch": 0.3717171717171717, "grad_norm": 0.442220002412796, "learning_rate": 0.00014080489672039606, "loss": 2.8671, "step": 437 }, { "epoch": 0.3725677830940989, "grad_norm": 0.4263823628425598, "learning_rate": 0.0001405587670413143, "loss": 2.5901, "step": 438 }, { "epoch": 0.37341839447102604, "grad_norm": 0.45683711767196655, "learning_rate": 0.00014031234292879725, "loss": 2.7801, "step": 439 }, { "epoch": 0.3742690058479532, "grad_norm": 0.4672732353210449, "learning_rate": 0.00014006562617174294, "loss": 2.696, "step": 440 }, { "epoch": 0.3751196172248804, "grad_norm": 0.45676231384277344, "learning_rate": 0.0001398186185611738, "loss": 2.6197, "step": 441 }, { "epoch": 0.37597022860180757, "grad_norm": 0.4717809855937958, "learning_rate": 0.00013957132189022374, "loss": 2.7676, "step": 442 }, { "epoch": 0.3768208399787347, "grad_norm": 0.4931057393550873, "learning_rate": 0.00013932373795412503, "loss": 2.7968, "step": 443 }, { "epoch": 0.3776714513556619, "grad_norm": 0.5112429857254028, "learning_rate": 0.0001390758685501954, "loss": 2.5947, "step": 444 }, { "epoch": 0.37852206273258904, "grad_norm": 0.5512686371803284, "learning_rate": 0.00013882771547782475, "loss": 2.6041, "step": 445 }, { "epoch": 0.3793726741095162, "grad_norm": 0.5506839752197266, "learning_rate": 0.0001385792805384625, "loss": 2.7389, "step": 446 }, { "epoch": 0.3802232854864434, "grad_norm": 0.5831094980239868, "learning_rate": 0.00013833056553560398, "loss": 2.8867, "step": 447 }, { "epoch": 0.38107389686337056, "grad_norm": 0.6127706170082092, "learning_rate": 0.00013808157227477788, "loss": 2.689, "step": 448 }, { "epoch": 0.3819245082402977, "grad_norm": 0.6911614537239075, "learning_rate": 0.00013783230256353266, "loss": 2.9826, "step": 449 }, { "epoch": 0.3827751196172249, "grad_norm": 0.7095205187797546, "learning_rate": 0.00013758275821142382, "loss": 2.7725, "step": 450 }, { "epoch": 0.38362573099415204, "grad_norm": 0.31148967146873474, "learning_rate": 0.00013733294103000055, "loss": 2.5203, "step": 451 }, { "epoch": 0.3844763423710792, "grad_norm": 0.3002786338329315, "learning_rate": 0.00013708285283279252, "loss": 2.5911, "step": 452 }, { "epoch": 0.3853269537480064, "grad_norm": 0.29820379614830017, "learning_rate": 0.00013683249543529696, "loss": 2.3441, "step": 453 }, { "epoch": 0.38617756512493356, "grad_norm": 0.29162371158599854, "learning_rate": 0.00013658187065496532, "loss": 2.4791, "step": 454 }, { "epoch": 0.3870281765018607, "grad_norm": 0.2905353307723999, "learning_rate": 0.00013633098031119002, "loss": 2.4208, "step": 455 }, { "epoch": 0.3878787878787879, "grad_norm": 0.28394779562950134, "learning_rate": 0.00013607982622529133, "loss": 2.6468, "step": 456 }, { "epoch": 0.38872939925571504, "grad_norm": 0.2683579921722412, "learning_rate": 0.00013582841022050424, "loss": 2.5199, "step": 457 }, { "epoch": 0.3895800106326422, "grad_norm": 0.2719745934009552, "learning_rate": 0.00013557673412196503, "loss": 2.4813, "step": 458 }, { "epoch": 0.39043062200956935, "grad_norm": 0.26781052350997925, "learning_rate": 0.00013532479975669808, "loss": 2.5296, "step": 459 }, { "epoch": 0.39128123338649656, "grad_norm": 0.26786506175994873, "learning_rate": 0.00013507260895360274, "loss": 2.5254, "step": 460 }, { "epoch": 0.3921318447634237, "grad_norm": 0.27148234844207764, "learning_rate": 0.0001348201635434399, "loss": 2.6415, "step": 461 }, { "epoch": 0.3929824561403509, "grad_norm": 0.29129862785339355, "learning_rate": 0.00013456746535881871, "loss": 2.6098, "step": 462 }, { "epoch": 0.39383306751727803, "grad_norm": 0.2636375427246094, "learning_rate": 0.00013431451623418343, "loss": 2.4875, "step": 463 }, { "epoch": 0.3946836788942052, "grad_norm": 0.2701190114021301, "learning_rate": 0.00013406131800579985, "loss": 2.7151, "step": 464 }, { "epoch": 0.39553429027113235, "grad_norm": 0.2739466428756714, "learning_rate": 0.00013380787251174225, "loss": 2.6465, "step": 465 }, { "epoch": 0.39638490164805956, "grad_norm": 0.2672434151172638, "learning_rate": 0.00013355418159187985, "loss": 2.6823, "step": 466 }, { "epoch": 0.3972355130249867, "grad_norm": 0.26863133907318115, "learning_rate": 0.00013330024708786353, "loss": 2.5357, "step": 467 }, { "epoch": 0.3980861244019139, "grad_norm": 0.2704436779022217, "learning_rate": 0.00013304607084311244, "loss": 2.6887, "step": 468 }, { "epoch": 0.39893673577884103, "grad_norm": 0.2772809863090515, "learning_rate": 0.00013279165470280065, "loss": 2.6992, "step": 469 }, { "epoch": 0.3997873471557682, "grad_norm": 0.3024834394454956, "learning_rate": 0.0001325370005138437, "loss": 2.753, "step": 470 }, { "epoch": 0.40063795853269535, "grad_norm": 0.2810865342617035, "learning_rate": 0.00013228211012488532, "loss": 2.7921, "step": 471 }, { "epoch": 0.40148856990962256, "grad_norm": 0.28023043274879456, "learning_rate": 0.00013202698538628376, "loss": 2.2935, "step": 472 }, { "epoch": 0.4023391812865497, "grad_norm": 0.30379030108451843, "learning_rate": 0.0001317716281500987, "loss": 2.5575, "step": 473 }, { "epoch": 0.4031897926634769, "grad_norm": 0.29648759961128235, "learning_rate": 0.00013151604027007745, "loss": 2.4384, "step": 474 }, { "epoch": 0.40404040404040403, "grad_norm": 0.32276931405067444, "learning_rate": 0.00013126022360164172, "loss": 2.7529, "step": 475 }, { "epoch": 0.4048910154173312, "grad_norm": 0.3146274983882904, "learning_rate": 0.00013100418000187419, "loss": 2.473, "step": 476 }, { "epoch": 0.40574162679425835, "grad_norm": 0.33162757754325867, "learning_rate": 0.00013074791132950485, "loss": 2.5832, "step": 477 }, { "epoch": 0.40659223817118556, "grad_norm": 0.33521875739097595, "learning_rate": 0.00013049141944489748, "loss": 2.4807, "step": 478 }, { "epoch": 0.4074428495481127, "grad_norm": 0.3724415898323059, "learning_rate": 0.00013023470621003643, "loss": 2.8245, "step": 479 }, { "epoch": 0.4082934609250399, "grad_norm": 0.3634830117225647, "learning_rate": 0.00012997777348851288, "loss": 2.5921, "step": 480 }, { "epoch": 0.40914407230196703, "grad_norm": 0.38656124472618103, "learning_rate": 0.0001297206231455113, "loss": 2.619, "step": 481 }, { "epoch": 0.4099946836788942, "grad_norm": 0.3939076066017151, "learning_rate": 0.00012946325704779602, "loss": 2.7466, "step": 482 }, { "epoch": 0.41084529505582135, "grad_norm": 0.37405261397361755, "learning_rate": 0.00012920567706369758, "loss": 2.764, "step": 483 }, { "epoch": 0.41169590643274856, "grad_norm": 0.3894766569137573, "learning_rate": 0.0001289478850630993, "loss": 2.7373, "step": 484 }, { "epoch": 0.4125465178096757, "grad_norm": 0.44012099504470825, "learning_rate": 0.00012868988291742347, "loss": 2.6475, "step": 485 }, { "epoch": 0.4133971291866029, "grad_norm": 0.41175583004951477, "learning_rate": 0.0001284316724996181, "loss": 2.7573, "step": 486 }, { "epoch": 0.41424774056353003, "grad_norm": 0.4406805634498596, "learning_rate": 0.00012817325568414297, "loss": 2.5429, "step": 487 }, { "epoch": 0.4150983519404572, "grad_norm": 0.4783489406108856, "learning_rate": 0.0001279146343469563, "loss": 2.8275, "step": 488 }, { "epoch": 0.41594896331738435, "grad_norm": 0.44115763902664185, "learning_rate": 0.00012765581036550095, "loss": 2.6858, "step": 489 }, { "epoch": 0.41679957469431156, "grad_norm": 0.5237467885017395, "learning_rate": 0.0001273967856186909, "loss": 2.8638, "step": 490 }, { "epoch": 0.4176501860712387, "grad_norm": 0.5071147680282593, "learning_rate": 0.00012713756198689757, "loss": 2.8603, "step": 491 }, { "epoch": 0.4185007974481659, "grad_norm": 0.5125464797019958, "learning_rate": 0.00012687814135193612, "loss": 3.0048, "step": 492 }, { "epoch": 0.41935140882509303, "grad_norm": 0.5373572707176208, "learning_rate": 0.0001266185255970519, "loss": 2.5727, "step": 493 }, { "epoch": 0.4202020202020202, "grad_norm": 0.4913314878940582, "learning_rate": 0.00012635871660690676, "loss": 2.6501, "step": 494 }, { "epoch": 0.42105263157894735, "grad_norm": 0.5953575968742371, "learning_rate": 0.00012609871626756522, "loss": 2.8674, "step": 495 }, { "epoch": 0.42190324295587456, "grad_norm": 0.5852685570716858, "learning_rate": 0.00012583852646648095, "loss": 2.6855, "step": 496 }, { "epoch": 0.4227538543328017, "grad_norm": 0.6449065804481506, "learning_rate": 0.00012557814909248296, "loss": 2.7688, "step": 497 }, { "epoch": 0.4236044657097289, "grad_norm": 0.6615833640098572, "learning_rate": 0.000125317586035762, "loss": 2.7446, "step": 498 }, { "epoch": 0.42445507708665603, "grad_norm": 0.6561222076416016, "learning_rate": 0.0001250568391878567, "loss": 2.801, "step": 499 }, { "epoch": 0.4253056884635832, "grad_norm": 0.8370924592018127, "learning_rate": 0.00012479591044163997, "loss": 2.9991, "step": 500 }, { "epoch": 0.42615629984051034, "grad_norm": 0.3273813724517822, "learning_rate": 0.0001245348016913051, "loss": 2.5914, "step": 501 }, { "epoch": 0.42700691121743756, "grad_norm": 0.3103164732456207, "learning_rate": 0.00012427351483235223, "loss": 2.6331, "step": 502 }, { "epoch": 0.4278575225943647, "grad_norm": 0.3061564862728119, "learning_rate": 0.00012401205176157447, "loss": 2.5927, "step": 503 }, { "epoch": 0.42870813397129187, "grad_norm": 0.2943616509437561, "learning_rate": 0.00012375041437704393, "loss": 2.4734, "step": 504 }, { "epoch": 0.42955874534821903, "grad_norm": 0.29883497953414917, "learning_rate": 0.00012348860457809838, "loss": 2.4734, "step": 505 }, { "epoch": 0.4304093567251462, "grad_norm": 0.295578271150589, "learning_rate": 0.00012322662426532708, "loss": 2.3561, "step": 506 }, { "epoch": 0.43125996810207334, "grad_norm": 0.3073442578315735, "learning_rate": 0.00012296447534055716, "loss": 2.5489, "step": 507 }, { "epoch": 0.43211057947900056, "grad_norm": 0.3207886517047882, "learning_rate": 0.00012270215970683977, "loss": 2.6984, "step": 508 }, { "epoch": 0.4329611908559277, "grad_norm": 0.26681843400001526, "learning_rate": 0.00012243967926843627, "loss": 2.4313, "step": 509 }, { "epoch": 0.43381180223285487, "grad_norm": 0.29009976983070374, "learning_rate": 0.00012217703593080445, "loss": 2.6128, "step": 510 }, { "epoch": 0.43466241360978203, "grad_norm": 0.27712225914001465, "learning_rate": 0.00012191423160058462, "loss": 2.4976, "step": 511 }, { "epoch": 0.4355130249867092, "grad_norm": 0.30184635519981384, "learning_rate": 0.00012165126818558572, "loss": 2.6912, "step": 512 }, { "epoch": 0.43636363636363634, "grad_norm": 0.28122004866600037, "learning_rate": 0.00012138814759477176, "loss": 2.6935, "step": 513 }, { "epoch": 0.43721424774056356, "grad_norm": 0.2680952250957489, "learning_rate": 0.00012112487173824753, "loss": 2.5607, "step": 514 }, { "epoch": 0.4380648591174907, "grad_norm": 0.26283374428749084, "learning_rate": 0.00012086144252724513, "loss": 2.5001, "step": 515 }, { "epoch": 0.43891547049441787, "grad_norm": 0.26960310339927673, "learning_rate": 0.00012059786187410984, "loss": 2.609, "step": 516 }, { "epoch": 0.439766081871345, "grad_norm": 0.2755014896392822, "learning_rate": 0.00012033413169228635, "loss": 2.6356, "step": 517 }, { "epoch": 0.4406166932482722, "grad_norm": 0.2706344723701477, "learning_rate": 0.00012007025389630484, "loss": 2.6909, "step": 518 }, { "epoch": 0.44146730462519934, "grad_norm": 0.2639751434326172, "learning_rate": 0.00011980623040176704, "loss": 2.542, "step": 519 }, { "epoch": 0.44231791600212655, "grad_norm": 0.2747778594493866, "learning_rate": 0.00011954206312533245, "loss": 2.4773, "step": 520 }, { "epoch": 0.4431685273790537, "grad_norm": 0.2990424931049347, "learning_rate": 0.0001192777539847043, "loss": 2.6881, "step": 521 }, { "epoch": 0.44401913875598087, "grad_norm": 0.27671095728874207, "learning_rate": 0.00011901330489861564, "loss": 2.5495, "step": 522 }, { "epoch": 0.444869750132908, "grad_norm": 0.3051941394805908, "learning_rate": 0.00011874871778681555, "loss": 2.7591, "step": 523 }, { "epoch": 0.4457203615098352, "grad_norm": 0.2990604043006897, "learning_rate": 0.00011848399457005495, "loss": 2.5765, "step": 524 }, { "epoch": 0.44657097288676234, "grad_norm": 0.3488616943359375, "learning_rate": 0.00011821913717007298, "loss": 2.634, "step": 525 }, { "epoch": 0.44742158426368955, "grad_norm": 0.3207804262638092, "learning_rate": 0.00011795414750958265, "loss": 2.5632, "step": 526 }, { "epoch": 0.4482721956406167, "grad_norm": 0.3244103491306305, "learning_rate": 0.0001176890275122573, "loss": 2.5784, "step": 527 }, { "epoch": 0.44912280701754387, "grad_norm": 0.33527323603630066, "learning_rate": 0.00011742377910271639, "loss": 2.7016, "step": 528 }, { "epoch": 0.449973418394471, "grad_norm": 0.3452233672142029, "learning_rate": 0.00011715840420651152, "loss": 2.5495, "step": 529 }, { "epoch": 0.4508240297713982, "grad_norm": 0.36391714215278625, "learning_rate": 0.00011689290475011259, "loss": 2.662, "step": 530 }, { "epoch": 0.45167464114832534, "grad_norm": 0.41914063692092896, "learning_rate": 0.00011662728266089364, "loss": 2.8876, "step": 531 }, { "epoch": 0.45252525252525255, "grad_norm": 0.4124680459499359, "learning_rate": 0.00011636153986711906, "loss": 2.7271, "step": 532 }, { "epoch": 0.4533758639021797, "grad_norm": 0.3957962393760681, "learning_rate": 0.00011609567829792944, "loss": 2.5991, "step": 533 }, { "epoch": 0.45422647527910687, "grad_norm": 0.401443213224411, "learning_rate": 0.00011582969988332757, "loss": 2.7777, "step": 534 }, { "epoch": 0.455077086656034, "grad_norm": 0.4349088966846466, "learning_rate": 0.00011556360655416457, "loss": 2.524, "step": 535 }, { "epoch": 0.4559276980329612, "grad_norm": 0.44450250267982483, "learning_rate": 0.00011529740024212565, "loss": 2.8296, "step": 536 }, { "epoch": 0.45677830940988834, "grad_norm": 0.44510000944137573, "learning_rate": 0.00011503108287971626, "loss": 2.8425, "step": 537 }, { "epoch": 0.4576289207868155, "grad_norm": 0.4766497313976288, "learning_rate": 0.00011476465640024814, "loss": 2.7471, "step": 538 }, { "epoch": 0.4584795321637427, "grad_norm": 0.458278089761734, "learning_rate": 0.00011449812273782492, "loss": 2.7274, "step": 539 }, { "epoch": 0.45933014354066987, "grad_norm": 0.46523377299308777, "learning_rate": 0.00011423148382732853, "loss": 2.4967, "step": 540 }, { "epoch": 0.460180754917597, "grad_norm": 0.4741576611995697, "learning_rate": 0.00011396474160440478, "loss": 2.6591, "step": 541 }, { "epoch": 0.4610313662945242, "grad_norm": 0.5007473230361938, "learning_rate": 0.00011369789800544959, "loss": 2.8139, "step": 542 }, { "epoch": 0.46188197767145134, "grad_norm": 0.45440518856048584, "learning_rate": 0.00011343095496759476, "loss": 2.6233, "step": 543 }, { "epoch": 0.4627325890483785, "grad_norm": 0.5588196516036987, "learning_rate": 0.00011316391442869394, "loss": 2.6369, "step": 544 }, { "epoch": 0.4635832004253057, "grad_norm": 0.5833231806755066, "learning_rate": 0.00011289677832730862, "loss": 2.8079, "step": 545 }, { "epoch": 0.46443381180223287, "grad_norm": 0.6029292345046997, "learning_rate": 0.00011262954860269399, "loss": 2.8007, "step": 546 }, { "epoch": 0.46528442317916, "grad_norm": 0.5825842022895813, "learning_rate": 0.00011236222719478491, "loss": 2.7767, "step": 547 }, { "epoch": 0.4661350345560872, "grad_norm": 0.6685728430747986, "learning_rate": 0.00011209481604418181, "loss": 2.6621, "step": 548 }, { "epoch": 0.46698564593301434, "grad_norm": 0.6266542077064514, "learning_rate": 0.00011182731709213659, "loss": 2.5234, "step": 549 }, { "epoch": 0.4678362573099415, "grad_norm": 0.7756956815719604, "learning_rate": 0.00011155973228053853, "loss": 2.9975, "step": 550 }, { "epoch": 0.4686868686868687, "grad_norm": 0.27434810996055603, "learning_rate": 0.00011129206355190025, "loss": 2.3867, "step": 551 }, { "epoch": 0.46953748006379586, "grad_norm": 0.28944891691207886, "learning_rate": 0.00011102431284934345, "loss": 2.6283, "step": 552 }, { "epoch": 0.470388091440723, "grad_norm": 0.2992161810398102, "learning_rate": 0.00011075648211658505, "loss": 2.5818, "step": 553 }, { "epoch": 0.4712387028176502, "grad_norm": 0.2830989956855774, "learning_rate": 0.00011048857329792284, "loss": 2.5738, "step": 554 }, { "epoch": 0.47208931419457734, "grad_norm": 0.2766299545764923, "learning_rate": 0.00011022058833822158, "loss": 2.5138, "step": 555 }, { "epoch": 0.4729399255715045, "grad_norm": 0.27325817942619324, "learning_rate": 0.0001099525291828986, "loss": 2.5166, "step": 556 }, { "epoch": 0.4737905369484317, "grad_norm": 0.27071428298950195, "learning_rate": 0.00010968439777790999, "loss": 2.4935, "step": 557 }, { "epoch": 0.47464114832535886, "grad_norm": 0.28781965374946594, "learning_rate": 0.00010941619606973632, "loss": 2.5851, "step": 558 }, { "epoch": 0.475491759702286, "grad_norm": 0.2613832354545593, "learning_rate": 0.00010914792600536843, "loss": 2.5289, "step": 559 }, { "epoch": 0.4763423710792132, "grad_norm": 0.2806464433670044, "learning_rate": 0.00010887958953229349, "loss": 2.659, "step": 560 }, { "epoch": 0.47719298245614034, "grad_norm": 0.2767505347728729, "learning_rate": 0.00010861118859848067, "loss": 2.6562, "step": 561 }, { "epoch": 0.4780435938330675, "grad_norm": 0.26740705966949463, "learning_rate": 0.0001083427251523672, "loss": 2.5689, "step": 562 }, { "epoch": 0.4788942052099947, "grad_norm": 0.2635597884654999, "learning_rate": 0.000108074201142844, "loss": 2.3973, "step": 563 }, { "epoch": 0.47974481658692186, "grad_norm": 0.2559509575366974, "learning_rate": 0.00010780561851924167, "loss": 2.4662, "step": 564 }, { "epoch": 0.480595427963849, "grad_norm": 0.2819572687149048, "learning_rate": 0.0001075369792313164, "loss": 2.7678, "step": 565 }, { "epoch": 0.4814460393407762, "grad_norm": 0.2618950605392456, "learning_rate": 0.00010726828522923562, "loss": 2.6463, "step": 566 }, { "epoch": 0.48229665071770333, "grad_norm": 0.2766074240207672, "learning_rate": 0.000106999538463564, "loss": 2.7133, "step": 567 }, { "epoch": 0.4831472620946305, "grad_norm": 0.280367910861969, "learning_rate": 0.00010673074088524926, "loss": 2.61, "step": 568 }, { "epoch": 0.4839978734715577, "grad_norm": 0.3028632402420044, "learning_rate": 0.00010646189444560799, "loss": 2.5465, "step": 569 }, { "epoch": 0.48484848484848486, "grad_norm": 0.2950790822505951, "learning_rate": 0.00010619300109631145, "loss": 2.6517, "step": 570 }, { "epoch": 0.485699096225412, "grad_norm": 0.2803073227405548, "learning_rate": 0.00010592406278937144, "loss": 2.6062, "step": 571 }, { "epoch": 0.4865497076023392, "grad_norm": 0.29916471242904663, "learning_rate": 0.00010565508147712617, "loss": 2.5532, "step": 572 }, { "epoch": 0.48740031897926633, "grad_norm": 0.32185062766075134, "learning_rate": 0.00010538605911222603, "loss": 2.722, "step": 573 }, { "epoch": 0.4882509303561935, "grad_norm": 0.3155268728733063, "learning_rate": 0.00010511699764761936, "loss": 2.6655, "step": 574 }, { "epoch": 0.4891015417331207, "grad_norm": 0.3241024315357208, "learning_rate": 0.00010484789903653846, "loss": 2.7093, "step": 575 }, { "epoch": 0.48995215311004786, "grad_norm": 0.3311263620853424, "learning_rate": 0.00010457876523248518, "loss": 2.6485, "step": 576 }, { "epoch": 0.490802764486975, "grad_norm": 0.34630128741264343, "learning_rate": 0.00010430959818921694, "loss": 2.8315, "step": 577 }, { "epoch": 0.4916533758639022, "grad_norm": 0.34841713309288025, "learning_rate": 0.00010404039986073244, "loss": 2.7484, "step": 578 }, { "epoch": 0.49250398724082933, "grad_norm": 0.35180601477622986, "learning_rate": 0.00010377117220125741, "loss": 2.6745, "step": 579 }, { "epoch": 0.4933545986177565, "grad_norm": 0.38369500637054443, "learning_rate": 0.00010350191716523059, "loss": 2.6623, "step": 580 }, { "epoch": 0.4942052099946837, "grad_norm": 0.3735206425189972, "learning_rate": 0.00010323263670728946, "loss": 2.5805, "step": 581 }, { "epoch": 0.49505582137161086, "grad_norm": 0.3994956314563751, "learning_rate": 0.00010296333278225599, "loss": 2.5622, "step": 582 }, { "epoch": 0.495906432748538, "grad_norm": 0.39151209592819214, "learning_rate": 0.00010269400734512256, "loss": 2.805, "step": 583 }, { "epoch": 0.4967570441254652, "grad_norm": 0.40469613671302795, "learning_rate": 0.0001024246623510377, "loss": 2.7588, "step": 584 }, { "epoch": 0.49760765550239233, "grad_norm": 0.4307393431663513, "learning_rate": 0.0001021552997552919, "loss": 2.8039, "step": 585 }, { "epoch": 0.4984582668793195, "grad_norm": 0.41100749373435974, "learning_rate": 0.00010188592151330343, "loss": 2.6842, "step": 586 }, { "epoch": 0.4993088782562467, "grad_norm": 0.45486176013946533, "learning_rate": 0.00010161652958060417, "loss": 2.8073, "step": 587 }, { "epoch": 0.5001594896331738, "grad_norm": 0.43648669123649597, "learning_rate": 0.00010134712591282538, "loss": 2.638, "step": 588 }, { "epoch": 0.5001594896331738, "eval_loss": 2.66949725151062, "eval_runtime": 80.5199, "eval_samples_per_second": 12.295, "eval_steps_per_second": 6.148, "step": 588 }, { "epoch": 0.501010101010101, "grad_norm": 0.44425809383392334, "learning_rate": 0.00010107771246568345, "loss": 2.5832, "step": 589 }, { "epoch": 0.5018607123870282, "grad_norm": 0.4519883990287781, "learning_rate": 0.00010080829119496586, "loss": 2.8142, "step": 590 }, { "epoch": 0.5027113237639553, "grad_norm": 0.4547143280506134, "learning_rate": 0.0001005388640565168, "loss": 2.4661, "step": 591 }, { "epoch": 0.5035619351408825, "grad_norm": 0.5073342323303223, "learning_rate": 0.00010026943300622313, "loss": 2.7849, "step": 592 }, { "epoch": 0.5044125465178096, "grad_norm": 0.5203079581260681, "learning_rate": 0.0001, "loss": 2.6077, "step": 593 }, { "epoch": 0.5052631578947369, "grad_norm": 0.4956808090209961, "learning_rate": 9.97305669937769e-05, "loss": 2.7116, "step": 594 }, { "epoch": 0.506113769271664, "grad_norm": 0.5807839035987854, "learning_rate": 9.946113594348321e-05, "loss": 2.8024, "step": 595 }, { "epoch": 0.5069643806485912, "grad_norm": 0.5834396481513977, "learning_rate": 9.919170880503415e-05, "loss": 2.6428, "step": 596 }, { "epoch": 0.5078149920255184, "grad_norm": 0.5668088793754578, "learning_rate": 9.892228753431657e-05, "loss": 2.6711, "step": 597 }, { "epoch": 0.5086656034024455, "grad_norm": 0.6034618616104126, "learning_rate": 9.865287408717465e-05, "loss": 2.9129, "step": 598 }, { "epoch": 0.5095162147793727, "grad_norm": 0.5953810811042786, "learning_rate": 9.838347041939584e-05, "loss": 2.7463, "step": 599 }, { "epoch": 0.5103668261562998, "grad_norm": 0.6790388822555542, "learning_rate": 9.811407848669657e-05, "loss": 2.8535, "step": 600 }, { "epoch": 0.511217437533227, "grad_norm": 0.2997681200504303, "learning_rate": 9.784470024470812e-05, "loss": 2.4338, "step": 601 }, { "epoch": 0.5120680489101542, "grad_norm": 0.28502747416496277, "learning_rate": 9.757533764896235e-05, "loss": 2.2975, "step": 602 }, { "epoch": 0.5129186602870813, "grad_norm": 0.29402679204940796, "learning_rate": 9.730599265487745e-05, "loss": 2.6287, "step": 603 }, { "epoch": 0.5137692716640085, "grad_norm": 0.28246256709098816, "learning_rate": 9.703666721774402e-05, "loss": 2.4197, "step": 604 }, { "epoch": 0.5146198830409356, "grad_norm": 0.29511624574661255, "learning_rate": 9.676736329271059e-05, "loss": 2.6028, "step": 605 }, { "epoch": 0.5154704944178629, "grad_norm": 0.2746260166168213, "learning_rate": 9.649808283476941e-05, "loss": 2.5791, "step": 606 }, { "epoch": 0.51632110579479, "grad_norm": 0.2634756863117218, "learning_rate": 9.622882779874263e-05, "loss": 2.4199, "step": 607 }, { "epoch": 0.5171717171717172, "grad_norm": 0.2797803282737732, "learning_rate": 9.595960013926761e-05, "loss": 2.5637, "step": 608 }, { "epoch": 0.5180223285486444, "grad_norm": 0.2671017646789551, "learning_rate": 9.569040181078306e-05, "loss": 2.6811, "step": 609 }, { "epoch": 0.5188729399255715, "grad_norm": 0.26546764373779297, "learning_rate": 9.542123476751483e-05, "loss": 2.5613, "step": 610 }, { "epoch": 0.5197235513024987, "grad_norm": 0.2660638988018036, "learning_rate": 9.515210096346155e-05, "loss": 2.4644, "step": 611 }, { "epoch": 0.5205741626794258, "grad_norm": 0.2697810232639313, "learning_rate": 9.488300235238067e-05, "loss": 2.5643, "step": 612 }, { "epoch": 0.521424774056353, "grad_norm": 0.26712700724601746, "learning_rate": 9.461394088777402e-05, "loss": 2.6993, "step": 613 }, { "epoch": 0.5222753854332801, "grad_norm": 0.2802102565765381, "learning_rate": 9.434491852287385e-05, "loss": 2.5723, "step": 614 }, { "epoch": 0.5231259968102073, "grad_norm": 0.2690255641937256, "learning_rate": 9.407593721062859e-05, "loss": 2.5136, "step": 615 }, { "epoch": 0.5239766081871345, "grad_norm": 0.2789754271507263, "learning_rate": 9.38069989036886e-05, "loss": 2.5901, "step": 616 }, { "epoch": 0.5248272195640616, "grad_norm": 0.2947288751602173, "learning_rate": 9.353810555439203e-05, "loss": 2.5661, "step": 617 }, { "epoch": 0.5256778309409889, "grad_norm": 0.268081396818161, "learning_rate": 9.326925911475075e-05, "loss": 2.603, "step": 618 }, { "epoch": 0.526528442317916, "grad_norm": 0.2749037444591522, "learning_rate": 9.300046153643602e-05, "loss": 2.7176, "step": 619 }, { "epoch": 0.5273790536948432, "grad_norm": 0.267333984375, "learning_rate": 9.27317147707644e-05, "loss": 2.5775, "step": 620 }, { "epoch": 0.5282296650717704, "grad_norm": 0.2688322961330414, "learning_rate": 9.246302076868363e-05, "loss": 2.4796, "step": 621 }, { "epoch": 0.5290802764486975, "grad_norm": 0.2832041382789612, "learning_rate": 9.219438148075832e-05, "loss": 2.5764, "step": 622 }, { "epoch": 0.5299308878256247, "grad_norm": 0.3032709062099457, "learning_rate": 9.192579885715602e-05, "loss": 2.7559, "step": 623 }, { "epoch": 0.5307814992025518, "grad_norm": 0.3065055012702942, "learning_rate": 9.165727484763282e-05, "loss": 2.7058, "step": 624 }, { "epoch": 0.531632110579479, "grad_norm": 0.314698189496994, "learning_rate": 9.138881140151931e-05, "loss": 2.6227, "step": 625 }, { "epoch": 0.5324827219564061, "grad_norm": 0.31485408544540405, "learning_rate": 9.112041046770653e-05, "loss": 2.5687, "step": 626 }, { "epoch": 0.5333333333333333, "grad_norm": 0.3281068801879883, "learning_rate": 9.085207399463162e-05, "loss": 2.5957, "step": 627 }, { "epoch": 0.5341839447102605, "grad_norm": 0.3461722731590271, "learning_rate": 9.058380393026369e-05, "loss": 2.8056, "step": 628 }, { "epoch": 0.5350345560871876, "grad_norm": 0.3617733418941498, "learning_rate": 9.031560222209002e-05, "loss": 2.8171, "step": 629 }, { "epoch": 0.5358851674641149, "grad_norm": 0.34867244958877563, "learning_rate": 9.00474708171014e-05, "loss": 2.6481, "step": 630 }, { "epoch": 0.536735778841042, "grad_norm": 0.3704431653022766, "learning_rate": 8.977941166177845e-05, "loss": 2.7771, "step": 631 }, { "epoch": 0.5375863902179692, "grad_norm": 0.3661860525608063, "learning_rate": 8.951142670207717e-05, "loss": 2.5948, "step": 632 }, { "epoch": 0.5384370015948964, "grad_norm": 0.3825220763683319, "learning_rate": 8.924351788341496e-05, "loss": 2.5843, "step": 633 }, { "epoch": 0.5392876129718235, "grad_norm": 0.39650052785873413, "learning_rate": 8.897568715065657e-05, "loss": 2.7541, "step": 634 }, { "epoch": 0.5401382243487507, "grad_norm": 0.4185117483139038, "learning_rate": 8.87079364480998e-05, "loss": 2.791, "step": 635 }, { "epoch": 0.5409888357256778, "grad_norm": 0.41095203161239624, "learning_rate": 8.844026771946147e-05, "loss": 2.5707, "step": 636 }, { "epoch": 0.541839447102605, "grad_norm": 0.4548766016960144, "learning_rate": 8.817268290786343e-05, "loss": 2.8622, "step": 637 }, { "epoch": 0.5426900584795321, "grad_norm": 0.4231926202774048, "learning_rate": 8.790518395581822e-05, "loss": 2.5893, "step": 638 }, { "epoch": 0.5435406698564593, "grad_norm": 0.441785603761673, "learning_rate": 8.763777280521511e-05, "loss": 2.7675, "step": 639 }, { "epoch": 0.5443912812333865, "grad_norm": 0.4583280682563782, "learning_rate": 8.737045139730605e-05, "loss": 2.7461, "step": 640 }, { "epoch": 0.5452418926103136, "grad_norm": 0.5172987580299377, "learning_rate": 8.71032216726914e-05, "loss": 2.9216, "step": 641 }, { "epoch": 0.5460925039872409, "grad_norm": 0.49341312050819397, "learning_rate": 8.683608557130608e-05, "loss": 2.803, "step": 642 }, { "epoch": 0.546943115364168, "grad_norm": 0.5035589933395386, "learning_rate": 8.656904503240527e-05, "loss": 2.869, "step": 643 }, { "epoch": 0.5477937267410952, "grad_norm": 0.5143235921859741, "learning_rate": 8.630210199455041e-05, "loss": 2.7889, "step": 644 }, { "epoch": 0.5486443381180224, "grad_norm": 0.5110295414924622, "learning_rate": 8.603525839559523e-05, "loss": 2.8229, "step": 645 }, { "epoch": 0.5494949494949495, "grad_norm": 0.555565595626831, "learning_rate": 8.57685161726715e-05, "loss": 2.6461, "step": 646 }, { "epoch": 0.5503455608718767, "grad_norm": 0.56583172082901, "learning_rate": 8.550187726217507e-05, "loss": 2.7647, "step": 647 }, { "epoch": 0.5511961722488038, "grad_norm": 0.6618173718452454, "learning_rate": 8.523534359975189e-05, "loss": 2.6992, "step": 648 }, { "epoch": 0.552046783625731, "grad_norm": 0.6295120120048523, "learning_rate": 8.496891712028375e-05, "loss": 2.6303, "step": 649 }, { "epoch": 0.5528973950026581, "grad_norm": 0.6757770776748657, "learning_rate": 8.470259975787438e-05, "loss": 2.8003, "step": 650 }, { "epoch": 0.5537480063795853, "grad_norm": 0.26357659697532654, "learning_rate": 8.443639344583547e-05, "loss": 2.4452, "step": 651 }, { "epoch": 0.5545986177565125, "grad_norm": 0.31750619411468506, "learning_rate": 8.417030011667241e-05, "loss": 2.374, "step": 652 }, { "epoch": 0.5554492291334396, "grad_norm": 0.27070245146751404, "learning_rate": 8.390432170207057e-05, "loss": 2.3384, "step": 653 }, { "epoch": 0.5562998405103668, "grad_norm": 0.28004199266433716, "learning_rate": 8.363846013288095e-05, "loss": 2.4357, "step": 654 }, { "epoch": 0.557150451887294, "grad_norm": 0.323215126991272, "learning_rate": 8.337271733910637e-05, "loss": 2.3801, "step": 655 }, { "epoch": 0.5580010632642212, "grad_norm": 0.292837530374527, "learning_rate": 8.310709524988743e-05, "loss": 2.3924, "step": 656 }, { "epoch": 0.5588516746411484, "grad_norm": 0.2810145914554596, "learning_rate": 8.284159579348851e-05, "loss": 2.5257, "step": 657 }, { "epoch": 0.5597022860180755, "grad_norm": 0.2848433554172516, "learning_rate": 8.257622089728362e-05, "loss": 2.5743, "step": 658 }, { "epoch": 0.5605528973950027, "grad_norm": 0.28531527519226074, "learning_rate": 8.231097248774274e-05, "loss": 2.5492, "step": 659 }, { "epoch": 0.5614035087719298, "grad_norm": 0.2882923185825348, "learning_rate": 8.20458524904174e-05, "loss": 2.4982, "step": 660 }, { "epoch": 0.562254120148857, "grad_norm": 0.2807391583919525, "learning_rate": 8.178086282992705e-05, "loss": 2.5222, "step": 661 }, { "epoch": 0.5631047315257841, "grad_norm": 0.29856112599372864, "learning_rate": 8.151600542994506e-05, "loss": 2.723, "step": 662 }, { "epoch": 0.5639553429027113, "grad_norm": 0.2890356481075287, "learning_rate": 8.125128221318446e-05, "loss": 2.6931, "step": 663 }, { "epoch": 0.5648059542796385, "grad_norm": 0.28091031312942505, "learning_rate": 8.098669510138437e-05, "loss": 2.6273, "step": 664 }, { "epoch": 0.5656565656565656, "grad_norm": 0.2678775489330292, "learning_rate": 8.072224601529574e-05, "loss": 2.5571, "step": 665 }, { "epoch": 0.5665071770334928, "grad_norm": 0.2804068624973297, "learning_rate": 8.045793687466757e-05, "loss": 2.4443, "step": 666 }, { "epoch": 0.56735778841042, "grad_norm": 0.279041051864624, "learning_rate": 8.0193769598233e-05, "loss": 2.6947, "step": 667 }, { "epoch": 0.5682083997873472, "grad_norm": 0.27611520886421204, "learning_rate": 7.992974610369521e-05, "loss": 2.782, "step": 668 }, { "epoch": 0.5690590111642744, "grad_norm": 0.28445136547088623, "learning_rate": 7.966586830771367e-05, "loss": 2.5741, "step": 669 }, { "epoch": 0.5699096225412015, "grad_norm": 0.2607346773147583, "learning_rate": 7.940213812589018e-05, "loss": 2.4522, "step": 670 }, { "epoch": 0.5707602339181287, "grad_norm": 0.2881016433238983, "learning_rate": 7.913855747275489e-05, "loss": 2.6714, "step": 671 }, { "epoch": 0.5716108452950558, "grad_norm": 0.3019566237926483, "learning_rate": 7.887512826175248e-05, "loss": 2.7117, "step": 672 }, { "epoch": 0.572461456671983, "grad_norm": 0.3051791489124298, "learning_rate": 7.861185240522827e-05, "loss": 2.6867, "step": 673 }, { "epoch": 0.5733120680489101, "grad_norm": 0.32132992148399353, "learning_rate": 7.834873181441427e-05, "loss": 2.5278, "step": 674 }, { "epoch": 0.5741626794258373, "grad_norm": 0.32348886132240295, "learning_rate": 7.808576839941542e-05, "loss": 2.7507, "step": 675 }, { "epoch": 0.5750132908027645, "grad_norm": 0.32297685742378235, "learning_rate": 7.782296406919557e-05, "loss": 2.6857, "step": 676 }, { "epoch": 0.5758639021796916, "grad_norm": 0.34016191959381104, "learning_rate": 7.756032073156373e-05, "loss": 2.57, "step": 677 }, { "epoch": 0.5767145135566188, "grad_norm": 0.349483847618103, "learning_rate": 7.729784029316025e-05, "loss": 2.6255, "step": 678 }, { "epoch": 0.577565124933546, "grad_norm": 0.3523537218570709, "learning_rate": 7.703552465944287e-05, "loss": 2.8317, "step": 679 }, { "epoch": 0.5784157363104732, "grad_norm": 0.38119909167289734, "learning_rate": 7.677337573467294e-05, "loss": 2.7303, "step": 680 }, { "epoch": 0.5792663476874004, "grad_norm": 0.4053308367729187, "learning_rate": 7.651139542190164e-05, "loss": 2.6776, "step": 681 }, { "epoch": 0.5801169590643275, "grad_norm": 0.38038370013237, "learning_rate": 7.624958562295606e-05, "loss": 2.6411, "step": 682 }, { "epoch": 0.5809675704412547, "grad_norm": 0.414034903049469, "learning_rate": 7.598794823842557e-05, "loss": 2.8368, "step": 683 }, { "epoch": 0.5818181818181818, "grad_norm": 0.41362208127975464, "learning_rate": 7.572648516764777e-05, "loss": 2.7608, "step": 684 }, { "epoch": 0.582668793195109, "grad_norm": 0.41102275252342224, "learning_rate": 7.54651983086949e-05, "loss": 2.7317, "step": 685 }, { "epoch": 0.5835194045720361, "grad_norm": 0.45451274514198303, "learning_rate": 7.520408955836007e-05, "loss": 2.7164, "step": 686 }, { "epoch": 0.5843700159489633, "grad_norm": 0.42562657594680786, "learning_rate": 7.494316081214334e-05, "loss": 2.6641, "step": 687 }, { "epoch": 0.5852206273258905, "grad_norm": 0.46389469504356384, "learning_rate": 7.468241396423801e-05, "loss": 2.6883, "step": 688 }, { "epoch": 0.5860712387028176, "grad_norm": 0.42721521854400635, "learning_rate": 7.442185090751705e-05, "loss": 2.5853, "step": 689 }, { "epoch": 0.5869218500797448, "grad_norm": 0.5020928978919983, "learning_rate": 7.416147353351909e-05, "loss": 2.6468, "step": 690 }, { "epoch": 0.587772461456672, "grad_norm": 0.4806350767612457, "learning_rate": 7.390128373243479e-05, "loss": 2.5275, "step": 691 }, { "epoch": 0.5886230728335992, "grad_norm": 0.5264183282852173, "learning_rate": 7.364128339309326e-05, "loss": 2.6082, "step": 692 }, { "epoch": 0.5894736842105263, "grad_norm": 0.5251814126968384, "learning_rate": 7.338147440294809e-05, "loss": 2.6319, "step": 693 }, { "epoch": 0.5903242955874535, "grad_norm": 0.5503838658332825, "learning_rate": 7.312185864806391e-05, "loss": 2.6875, "step": 694 }, { "epoch": 0.5911749069643807, "grad_norm": 0.5717040300369263, "learning_rate": 7.286243801310248e-05, "loss": 2.6859, "step": 695 }, { "epoch": 0.5920255183413078, "grad_norm": 0.5679376125335693, "learning_rate": 7.260321438130913e-05, "loss": 2.6425, "step": 696 }, { "epoch": 0.592876129718235, "grad_norm": 0.6636802554130554, "learning_rate": 7.234418963449907e-05, "loss": 2.7617, "step": 697 }, { "epoch": 0.5937267410951621, "grad_norm": 0.6101743578910828, "learning_rate": 7.208536565304373e-05, "loss": 2.6909, "step": 698 }, { "epoch": 0.5945773524720893, "grad_norm": 0.6630806922912598, "learning_rate": 7.182674431585704e-05, "loss": 2.7461, "step": 699 }, { "epoch": 0.5954279638490165, "grad_norm": 0.738452136516571, "learning_rate": 7.156832750038192e-05, "loss": 2.8512, "step": 700 }, { "epoch": 0.5962785752259436, "grad_norm": 0.2764449715614319, "learning_rate": 7.131011708257654e-05, "loss": 2.4965, "step": 701 }, { "epoch": 0.5971291866028708, "grad_norm": 0.27585679292678833, "learning_rate": 7.105211493690073e-05, "loss": 2.5628, "step": 702 }, { "epoch": 0.597979797979798, "grad_norm": 0.2890099287033081, "learning_rate": 7.079432293630244e-05, "loss": 2.6482, "step": 703 }, { "epoch": 0.5988304093567252, "grad_norm": 0.27924832701683044, "learning_rate": 7.0536742952204e-05, "loss": 2.4101, "step": 704 }, { "epoch": 0.5996810207336523, "grad_norm": 0.30849024653434753, "learning_rate": 7.02793768544887e-05, "loss": 2.4998, "step": 705 }, { "epoch": 0.6005316321105795, "grad_norm": 0.286748468875885, "learning_rate": 7.002222651148714e-05, "loss": 2.3868, "step": 706 }, { "epoch": 0.6013822434875067, "grad_norm": 0.2908494472503662, "learning_rate": 6.976529378996357e-05, "loss": 2.4456, "step": 707 }, { "epoch": 0.6022328548644338, "grad_norm": 0.26911187171936035, "learning_rate": 6.950858055510254e-05, "loss": 2.633, "step": 708 }, { "epoch": 0.603083466241361, "grad_norm": 0.33310920000076294, "learning_rate": 6.925208867049522e-05, "loss": 2.8533, "step": 709 }, { "epoch": 0.6039340776182881, "grad_norm": 0.2783631682395935, "learning_rate": 6.89958199981258e-05, "loss": 2.4214, "step": 710 }, { "epoch": 0.6047846889952153, "grad_norm": 0.2714787721633911, "learning_rate": 6.873977639835829e-05, "loss": 2.6986, "step": 711 }, { "epoch": 0.6056353003721425, "grad_norm": 0.2905879616737366, "learning_rate": 6.848395972992261e-05, "loss": 2.5218, "step": 712 }, { "epoch": 0.6064859117490696, "grad_norm": 0.288647323846817, "learning_rate": 6.822837184990132e-05, "loss": 2.6838, "step": 713 }, { "epoch": 0.6073365231259968, "grad_norm": 0.2918678820133209, "learning_rate": 6.797301461371625e-05, "loss": 2.6513, "step": 714 }, { "epoch": 0.6081871345029239, "grad_norm": 0.26965370774269104, "learning_rate": 6.771788987511469e-05, "loss": 2.5379, "step": 715 }, { "epoch": 0.6090377458798512, "grad_norm": 0.27741289138793945, "learning_rate": 6.746299948615631e-05, "loss": 2.6699, "step": 716 }, { "epoch": 0.6098883572567783, "grad_norm": 0.26440632343292236, "learning_rate": 6.720834529719939e-05, "loss": 2.6012, "step": 717 }, { "epoch": 0.6107389686337055, "grad_norm": 0.27086710929870605, "learning_rate": 6.695392915688759e-05, "loss": 2.5854, "step": 718 }, { "epoch": 0.6115895800106327, "grad_norm": 0.2577430009841919, "learning_rate": 6.66997529121365e-05, "loss": 2.4548, "step": 719 }, { "epoch": 0.6124401913875598, "grad_norm": 0.2519771158695221, "learning_rate": 6.644581840812018e-05, "loss": 2.4582, "step": 720 }, { "epoch": 0.613290802764487, "grad_norm": 0.31657370924949646, "learning_rate": 6.619212748825776e-05, "loss": 2.5846, "step": 721 }, { "epoch": 0.6141414141414141, "grad_norm": 0.28847917914390564, "learning_rate": 6.593868199420017e-05, "loss": 2.7865, "step": 722 }, { "epoch": 0.6149920255183413, "grad_norm": 0.29375341534614563, "learning_rate": 6.568548376581662e-05, "loss": 2.3419, "step": 723 }, { "epoch": 0.6158426368952685, "grad_norm": 0.320289671421051, "learning_rate": 6.543253464118131e-05, "loss": 2.7476, "step": 724 }, { "epoch": 0.6166932482721956, "grad_norm": 0.3045991063117981, "learning_rate": 6.517983645656014e-05, "loss": 2.5995, "step": 725 }, { "epoch": 0.6175438596491228, "grad_norm": 0.3231359124183655, "learning_rate": 6.492739104639727e-05, "loss": 2.6869, "step": 726 }, { "epoch": 0.6183944710260499, "grad_norm": 0.36181551218032837, "learning_rate": 6.467520024330193e-05, "loss": 2.8256, "step": 727 }, { "epoch": 0.6192450824029772, "grad_norm": 0.34118232131004333, "learning_rate": 6.4423265878035e-05, "loss": 2.7321, "step": 728 }, { "epoch": 0.6200956937799043, "grad_norm": 0.3405606746673584, "learning_rate": 6.417158977949575e-05, "loss": 2.6993, "step": 729 }, { "epoch": 0.6209463051568315, "grad_norm": 0.355055570602417, "learning_rate": 6.392017377470866e-05, "loss": 2.7063, "step": 730 }, { "epoch": 0.6217969165337587, "grad_norm": 0.368937611579895, "learning_rate": 6.366901968881002e-05, "loss": 2.682, "step": 731 }, { "epoch": 0.6226475279106858, "grad_norm": 0.3755683898925781, "learning_rate": 6.341812934503469e-05, "loss": 2.7295, "step": 732 }, { "epoch": 0.623498139287613, "grad_norm": 0.37153956294059753, "learning_rate": 6.316750456470303e-05, "loss": 2.7157, "step": 733 }, { "epoch": 0.6243487506645401, "grad_norm": 0.4051001965999603, "learning_rate": 6.291714716720749e-05, "loss": 2.8429, "step": 734 }, { "epoch": 0.6251993620414673, "grad_norm": 0.3929100036621094, "learning_rate": 6.26670589699995e-05, "loss": 2.7148, "step": 735 }, { "epoch": 0.6260499734183945, "grad_norm": 0.4600156843662262, "learning_rate": 6.24172417885762e-05, "loss": 2.765, "step": 736 }, { "epoch": 0.6269005847953216, "grad_norm": 0.42697155475616455, "learning_rate": 6.216769743646733e-05, "loss": 2.6477, "step": 737 }, { "epoch": 0.6277511961722488, "grad_norm": 0.4428333640098572, "learning_rate": 6.191842772522214e-05, "loss": 2.9439, "step": 738 }, { "epoch": 0.6286018075491759, "grad_norm": 0.45895466208457947, "learning_rate": 6.166943446439604e-05, "loss": 2.6916, "step": 739 }, { "epoch": 0.6294524189261032, "grad_norm": 0.4783354103565216, "learning_rate": 6.142071946153751e-05, "loss": 2.6747, "step": 740 }, { "epoch": 0.6303030303030303, "grad_norm": 0.4584222733974457, "learning_rate": 6.117228452217525e-05, "loss": 2.8631, "step": 741 }, { "epoch": 0.6311536416799575, "grad_norm": 0.5074661374092102, "learning_rate": 6.092413144980464e-05, "loss": 2.7687, "step": 742 }, { "epoch": 0.6320042530568847, "grad_norm": 0.500322699546814, "learning_rate": 6.0676262045874976e-05, "loss": 2.6794, "step": 743 }, { "epoch": 0.6328548644338118, "grad_norm": 0.4991973042488098, "learning_rate": 6.04286781097763e-05, "loss": 2.739, "step": 744 }, { "epoch": 0.633705475810739, "grad_norm": 0.5203631520271301, "learning_rate": 6.018138143882621e-05, "loss": 2.5866, "step": 745 }, { "epoch": 0.6345560871876661, "grad_norm": 0.5206765532493591, "learning_rate": 5.9934373828257106e-05, "loss": 2.5869, "step": 746 }, { "epoch": 0.6354066985645933, "grad_norm": 0.5892865061759949, "learning_rate": 5.96876570712028e-05, "loss": 2.6141, "step": 747 }, { "epoch": 0.6362573099415205, "grad_norm": 0.6528813242912292, "learning_rate": 5.944123295868573e-05, "loss": 2.7467, "step": 748 }, { "epoch": 0.6371079213184476, "grad_norm": 0.6801220178604126, "learning_rate": 5.9195103279603956e-05, "loss": 2.8362, "step": 749 }, { "epoch": 0.6379585326953748, "grad_norm": 0.7539176940917969, "learning_rate": 5.894926982071804e-05, "loss": 2.8246, "step": 750 }, { "epoch": 0.6388091440723019, "grad_norm": 0.2966770827770233, "learning_rate": 5.870373436663823e-05, "loss": 2.6906, "step": 751 }, { "epoch": 0.6396597554492292, "grad_norm": 0.27538472414016724, "learning_rate": 5.845849869981137e-05, "loss": 2.4334, "step": 752 }, { "epoch": 0.6405103668261563, "grad_norm": 0.2812007665634155, "learning_rate": 5.821356460050805e-05, "loss": 2.4665, "step": 753 }, { "epoch": 0.6413609782030835, "grad_norm": 0.28760018944740295, "learning_rate": 5.796893384680964e-05, "loss": 2.6608, "step": 754 }, { "epoch": 0.6422115895800107, "grad_norm": 0.2786334455013275, "learning_rate": 5.772460821459542e-05, "loss": 2.4717, "step": 755 }, { "epoch": 0.6430622009569378, "grad_norm": 0.27650853991508484, "learning_rate": 5.7480589477529545e-05, "loss": 2.5303, "step": 756 }, { "epoch": 0.643912812333865, "grad_norm": 0.29573655128479004, "learning_rate": 5.723687940704856e-05, "loss": 2.6558, "step": 757 }, { "epoch": 0.6447634237107921, "grad_norm": 0.2814992070198059, "learning_rate": 5.699347977234799e-05, "loss": 2.4268, "step": 758 }, { "epoch": 0.6456140350877193, "grad_norm": 0.2827305495738983, "learning_rate": 5.675039234036983e-05, "loss": 2.5284, "step": 759 }, { "epoch": 0.6464646464646465, "grad_norm": 0.2877061665058136, "learning_rate": 5.650761887578977e-05, "loss": 2.5597, "step": 760 }, { "epoch": 0.6473152578415736, "grad_norm": 0.2902846038341522, "learning_rate": 5.6265161141004244e-05, "loss": 2.719, "step": 761 }, { "epoch": 0.6481658692185008, "grad_norm": 0.3014174997806549, "learning_rate": 5.602302089611755e-05, "loss": 2.6498, "step": 762 }, { "epoch": 0.6490164805954279, "grad_norm": 0.2754184603691101, "learning_rate": 5.578119989892931e-05, "loss": 2.7288, "step": 763 }, { "epoch": 0.6498670919723551, "grad_norm": 0.27788618206977844, "learning_rate": 5.5539699904921635e-05, "loss": 2.5285, "step": 764 }, { "epoch": 0.6507177033492823, "grad_norm": 0.27178868651390076, "learning_rate": 5.529852266724616e-05, "loss": 2.6693, "step": 765 }, { "epoch": 0.6515683147262095, "grad_norm": 0.2783076763153076, "learning_rate": 5.505766993671162e-05, "loss": 2.6003, "step": 766 }, { "epoch": 0.6524189261031367, "grad_norm": 0.27819153666496277, "learning_rate": 5.481714346177103e-05, "loss": 2.5775, "step": 767 }, { "epoch": 0.6532695374800638, "grad_norm": 0.27801868319511414, "learning_rate": 5.457694498850891e-05, "loss": 2.6342, "step": 768 }, { "epoch": 0.654120148856991, "grad_norm": 0.27529436349868774, "learning_rate": 5.43370762606287e-05, "loss": 2.6376, "step": 769 }, { "epoch": 0.6549707602339181, "grad_norm": 0.2912578582763672, "learning_rate": 5.409753901944006e-05, "loss": 2.8169, "step": 770 }, { "epoch": 0.6558213716108453, "grad_norm": 0.28431615233421326, "learning_rate": 5.385833500384632e-05, "loss": 2.5608, "step": 771 }, { "epoch": 0.6566719829877724, "grad_norm": 0.3062645196914673, "learning_rate": 5.3619465950331646e-05, "loss": 2.6686, "step": 772 }, { "epoch": 0.6575225943646996, "grad_norm": 0.29184243083000183, "learning_rate": 5.3380933592948704e-05, "loss": 2.4963, "step": 773 }, { "epoch": 0.6583732057416268, "grad_norm": 0.3256790041923523, "learning_rate": 5.3142739663305906e-05, "loss": 2.5206, "step": 774 }, { "epoch": 0.6592238171185539, "grad_norm": 0.3323804438114166, "learning_rate": 5.2904885890554836e-05, "loss": 2.7495, "step": 775 }, { "epoch": 0.6600744284954811, "grad_norm": 0.3358718454837799, "learning_rate": 5.266737400137765e-05, "loss": 2.646, "step": 776 }, { "epoch": 0.6609250398724082, "grad_norm": 0.34872984886169434, "learning_rate": 5.24302057199749e-05, "loss": 2.8041, "step": 777 }, { "epoch": 0.6617756512493355, "grad_norm": 0.3432627320289612, "learning_rate": 5.2193382768052436e-05, "loss": 2.6066, "step": 778 }, { "epoch": 0.6626262626262627, "grad_norm": 0.3716273009777069, "learning_rate": 5.19569068648094e-05, "loss": 2.6634, "step": 779 }, { "epoch": 0.6634768740031898, "grad_norm": 0.3645707964897156, "learning_rate": 5.172077972692553e-05, "loss": 2.5782, "step": 780 }, { "epoch": 0.664327485380117, "grad_norm": 0.43927833437919617, "learning_rate": 5.148500306854862e-05, "loss": 2.6775, "step": 781 }, { "epoch": 0.6651780967570441, "grad_norm": 0.4325307607650757, "learning_rate": 5.124957860128237e-05, "loss": 2.6963, "step": 782 }, { "epoch": 0.6660287081339713, "grad_norm": 0.39009419083595276, "learning_rate": 5.101450803417357e-05, "loss": 2.7308, "step": 783 }, { "epoch": 0.6668793195108984, "grad_norm": 0.42211011052131653, "learning_rate": 5.0779793073700044e-05, "loss": 2.8391, "step": 784 }, { "epoch": 0.6677299308878256, "grad_norm": 0.4249938130378723, "learning_rate": 5.054543542375809e-05, "loss": 2.6562, "step": 785 }, { "epoch": 0.6685805422647528, "grad_norm": 0.4374600350856781, "learning_rate": 5.031143678565005e-05, "loss": 2.6552, "step": 786 }, { "epoch": 0.6694311536416799, "grad_norm": 0.471137672662735, "learning_rate": 5.0077798858072156e-05, "loss": 2.6368, "step": 787 }, { "epoch": 0.6702817650186071, "grad_norm": 0.4666772186756134, "learning_rate": 4.984452333710207e-05, "loss": 2.6859, "step": 788 }, { "epoch": 0.6711323763955342, "grad_norm": 0.5150439739227295, "learning_rate": 4.961161191618649e-05, "loss": 2.9448, "step": 789 }, { "epoch": 0.6719829877724615, "grad_norm": 0.4853833019733429, "learning_rate": 4.937906628612905e-05, "loss": 2.7409, "step": 790 }, { "epoch": 0.6728335991493887, "grad_norm": 0.4760463535785675, "learning_rate": 4.914688813507797e-05, "loss": 2.5815, "step": 791 }, { "epoch": 0.6736842105263158, "grad_norm": 0.5204352140426636, "learning_rate": 4.89150791485137e-05, "loss": 2.7374, "step": 792 }, { "epoch": 0.674534821903243, "grad_norm": 0.5126561522483826, "learning_rate": 4.86836410092368e-05, "loss": 2.755, "step": 793 }, { "epoch": 0.6753854332801701, "grad_norm": 0.55113685131073, "learning_rate": 4.845257539735577e-05, "loss": 2.7027, "step": 794 }, { "epoch": 0.6762360446570973, "grad_norm": 0.5475419759750366, "learning_rate": 4.822188399027461e-05, "loss": 2.667, "step": 795 }, { "epoch": 0.6770866560340244, "grad_norm": 0.6212125420570374, "learning_rate": 4.799156846268095e-05, "loss": 2.6865, "step": 796 }, { "epoch": 0.6779372674109516, "grad_norm": 0.6271610260009766, "learning_rate": 4.7761630486533694e-05, "loss": 2.9713, "step": 797 }, { "epoch": 0.6787878787878788, "grad_norm": 0.6181414723396301, "learning_rate": 4.7532071731050975e-05, "loss": 2.8862, "step": 798 }, { "epoch": 0.6796384901648059, "grad_norm": 0.6996387243270874, "learning_rate": 4.730289386269792e-05, "loss": 2.9082, "step": 799 }, { "epoch": 0.6804891015417331, "grad_norm": 0.7603393793106079, "learning_rate": 4.70740985451747e-05, "loss": 3.034, "step": 800 }, { "epoch": 0.6813397129186602, "grad_norm": 0.27659621834754944, "learning_rate": 4.684568743940444e-05, "loss": 2.5099, "step": 801 }, { "epoch": 0.6821903242955875, "grad_norm": 0.29582586884498596, "learning_rate": 4.661766220352097e-05, "loss": 2.5086, "step": 802 }, { "epoch": 0.6830409356725147, "grad_norm": 0.31561413407325745, "learning_rate": 4.639002449285693e-05, "loss": 2.5214, "step": 803 }, { "epoch": 0.6838915470494418, "grad_norm": 0.27388331294059753, "learning_rate": 4.616277595993196e-05, "loss": 2.4724, "step": 804 }, { "epoch": 0.684742158426369, "grad_norm": 0.2823973000049591, "learning_rate": 4.593591825444028e-05, "loss": 2.4987, "step": 805 }, { "epoch": 0.6855927698032961, "grad_norm": 0.3274148106575012, "learning_rate": 4.57094530232389e-05, "loss": 2.6108, "step": 806 }, { "epoch": 0.6864433811802233, "grad_norm": 0.27294856309890747, "learning_rate": 4.5483381910335955e-05, "loss": 2.4774, "step": 807 }, { "epoch": 0.6872939925571504, "grad_norm": 0.2771831750869751, "learning_rate": 4.525770655687821e-05, "loss": 2.4953, "step": 808 }, { "epoch": 0.6881446039340776, "grad_norm": 0.28946414589881897, "learning_rate": 4.5032428601139644e-05, "loss": 2.7516, "step": 809 }, { "epoch": 0.6889952153110048, "grad_norm": 0.2817944884300232, "learning_rate": 4.48075496785092e-05, "loss": 2.6658, "step": 810 }, { "epoch": 0.6898458266879319, "grad_norm": 0.2726970613002777, "learning_rate": 4.4583071421479194e-05, "loss": 2.6478, "step": 811 }, { "epoch": 0.6906964380648591, "grad_norm": 0.2892582416534424, "learning_rate": 4.435899545963332e-05, "loss": 2.6592, "step": 812 }, { "epoch": 0.6915470494417862, "grad_norm": 0.2825833559036255, "learning_rate": 4.4135323419634766e-05, "loss": 2.4431, "step": 813 }, { "epoch": 0.6923976608187135, "grad_norm": 0.27093660831451416, "learning_rate": 4.391205692521453e-05, "loss": 2.4303, "step": 814 }, { "epoch": 0.6932482721956407, "grad_norm": 0.2554977834224701, "learning_rate": 4.368919759715964e-05, "loss": 2.3259, "step": 815 }, { "epoch": 0.6940988835725678, "grad_norm": 0.27814123034477234, "learning_rate": 4.346674705330117e-05, "loss": 2.5495, "step": 816 }, { "epoch": 0.694949494949495, "grad_norm": 0.303751140832901, "learning_rate": 4.32447069085028e-05, "loss": 2.5834, "step": 817 }, { "epoch": 0.6958001063264221, "grad_norm": 0.27305588126182556, "learning_rate": 4.302307877464893e-05, "loss": 2.479, "step": 818 }, { "epoch": 0.6966507177033493, "grad_norm": 0.265235036611557, "learning_rate": 4.280186426063291e-05, "loss": 2.4847, "step": 819 }, { "epoch": 0.6975013290802764, "grad_norm": 0.2711983025074005, "learning_rate": 4.258106497234551e-05, "loss": 2.3499, "step": 820 }, { "epoch": 0.6983519404572036, "grad_norm": 0.30092155933380127, "learning_rate": 4.236068251266324e-05, "loss": 2.7319, "step": 821 }, { "epoch": 0.6992025518341308, "grad_norm": 0.3007088601589203, "learning_rate": 4.214071848143655e-05, "loss": 2.5465, "step": 822 }, { "epoch": 0.7000531632110579, "grad_norm": 0.3158135712146759, "learning_rate": 4.192117447547845e-05, "loss": 2.6333, "step": 823 }, { "epoch": 0.7009037745879851, "grad_norm": 0.3173391819000244, "learning_rate": 4.170205208855281e-05, "loss": 2.8266, "step": 824 }, { "epoch": 0.7017543859649122, "grad_norm": 0.3236771523952484, "learning_rate": 4.148335291136267e-05, "loss": 2.7447, "step": 825 }, { "epoch": 0.7026049973418395, "grad_norm": 0.320290744304657, "learning_rate": 4.1265078531538916e-05, "loss": 2.8345, "step": 826 }, { "epoch": 0.7034556087187667, "grad_norm": 0.33358582854270935, "learning_rate": 4.104723053362867e-05, "loss": 2.848, "step": 827 }, { "epoch": 0.7043062200956938, "grad_norm": 0.35341066122055054, "learning_rate": 4.082981049908362e-05, "loss": 2.7216, "step": 828 }, { "epoch": 0.705156831472621, "grad_norm": 0.35734865069389343, "learning_rate": 4.061282000624885e-05, "loss": 2.702, "step": 829 }, { "epoch": 0.7060074428495481, "grad_norm": 0.3350493013858795, "learning_rate": 4.0396260630351066e-05, "loss": 2.4811, "step": 830 }, { "epoch": 0.7068580542264753, "grad_norm": 0.35712873935699463, "learning_rate": 4.018013394348752e-05, "loss": 2.7484, "step": 831 }, { "epoch": 0.7077086656034024, "grad_norm": 0.37364643812179565, "learning_rate": 3.996444151461417e-05, "loss": 2.6113, "step": 832 }, { "epoch": 0.7085592769803296, "grad_norm": 0.41270262002944946, "learning_rate": 3.9749184909534565e-05, "loss": 2.6654, "step": 833 }, { "epoch": 0.7094098883572568, "grad_norm": 0.3864672780036926, "learning_rate": 3.9534365690888566e-05, "loss": 2.6718, "step": 834 }, { "epoch": 0.7102604997341839, "grad_norm": 0.4115494191646576, "learning_rate": 3.931998541814069e-05, "loss": 2.6621, "step": 835 }, { "epoch": 0.7111111111111111, "grad_norm": 0.4606288969516754, "learning_rate": 3.9106045647569e-05, "loss": 2.512, "step": 836 }, { "epoch": 0.7119617224880382, "grad_norm": 0.42326587438583374, "learning_rate": 3.8892547932253795e-05, "loss": 2.6212, "step": 837 }, { "epoch": 0.7128123338649655, "grad_norm": 0.43627670407295227, "learning_rate": 3.8679493822066314e-05, "loss": 2.596, "step": 838 }, { "epoch": 0.7136629452418927, "grad_norm": 0.4475801885128021, "learning_rate": 3.846688486365748e-05, "loss": 2.64, "step": 839 }, { "epoch": 0.7145135566188198, "grad_norm": 0.4370073080062866, "learning_rate": 3.825472260044658e-05, "loss": 2.5291, "step": 840 }, { "epoch": 0.715364167995747, "grad_norm": 0.4832041561603546, "learning_rate": 3.804300857261025e-05, "loss": 2.659, "step": 841 }, { "epoch": 0.7162147793726741, "grad_norm": 0.4905672073364258, "learning_rate": 3.783174431707119e-05, "loss": 2.7363, "step": 842 }, { "epoch": 0.7170653907496013, "grad_norm": 0.5135957598686218, "learning_rate": 3.762093136748692e-05, "loss": 2.8243, "step": 843 }, { "epoch": 0.7179160021265284, "grad_norm": 0.5257413387298584, "learning_rate": 3.7410571254238834e-05, "loss": 2.8406, "step": 844 }, { "epoch": 0.7187666135034556, "grad_norm": 0.5150455236434937, "learning_rate": 3.7200665504420983e-05, "loss": 2.6053, "step": 845 }, { "epoch": 0.7196172248803828, "grad_norm": 0.5135449767112732, "learning_rate": 3.69912156418289e-05, "loss": 2.4932, "step": 846 }, { "epoch": 0.7204678362573099, "grad_norm": 0.5775859951972961, "learning_rate": 3.678222318694875e-05, "loss": 2.6011, "step": 847 }, { "epoch": 0.7213184476342371, "grad_norm": 0.6324962973594666, "learning_rate": 3.657368965694617e-05, "loss": 2.8145, "step": 848 }, { "epoch": 0.7221690590111642, "grad_norm": 0.6755786538124084, "learning_rate": 3.636561656565519e-05, "loss": 2.7566, "step": 849 }, { "epoch": 0.7230196703880915, "grad_norm": 0.7153156995773315, "learning_rate": 3.615800542356738e-05, "loss": 2.9433, "step": 850 }, { "epoch": 0.7238702817650186, "grad_norm": 0.29322749376296997, "learning_rate": 3.595085773782083e-05, "loss": 2.3433, "step": 851 }, { "epoch": 0.7247208931419458, "grad_norm": 0.28010284900665283, "learning_rate": 3.574417501218913e-05, "loss": 2.5661, "step": 852 }, { "epoch": 0.725571504518873, "grad_norm": 0.27005431056022644, "learning_rate": 3.55379587470706e-05, "loss": 2.4129, "step": 853 }, { "epoch": 0.7264221158958001, "grad_norm": 0.2710054814815521, "learning_rate": 3.533221043947733e-05, "loss": 2.4066, "step": 854 }, { "epoch": 0.7272727272727273, "grad_norm": 0.27026283740997314, "learning_rate": 3.512693158302421e-05, "loss": 2.4139, "step": 855 }, { "epoch": 0.7281233386496544, "grad_norm": 0.2927492558956146, "learning_rate": 3.492212366791831e-05, "loss": 2.5517, "step": 856 }, { "epoch": 0.7289739500265816, "grad_norm": 0.2770216166973114, "learning_rate": 3.471778818094785e-05, "loss": 2.6102, "step": 857 }, { "epoch": 0.7298245614035088, "grad_norm": 0.26914748549461365, "learning_rate": 3.45139266054715e-05, "loss": 2.6047, "step": 858 }, { "epoch": 0.7306751727804359, "grad_norm": 0.27307602763175964, "learning_rate": 3.4310540421407665e-05, "loss": 2.5707, "step": 859 }, { "epoch": 0.7315257841573631, "grad_norm": 0.2949882745742798, "learning_rate": 3.4107631105223525e-05, "loss": 2.502, "step": 860 }, { "epoch": 0.7323763955342902, "grad_norm": 0.26683247089385986, "learning_rate": 3.390520012992474e-05, "loss": 2.5785, "step": 861 }, { "epoch": 0.7332270069112174, "grad_norm": 0.2797079384326935, "learning_rate": 3.370324896504425e-05, "loss": 2.6119, "step": 862 }, { "epoch": 0.7340776182881446, "grad_norm": 0.27322161197662354, "learning_rate": 3.3501779076631864e-05, "loss": 2.5449, "step": 863 }, { "epoch": 0.7349282296650718, "grad_norm": 0.2743871212005615, "learning_rate": 3.330079192724379e-05, "loss": 2.6722, "step": 864 }, { "epoch": 0.735778841041999, "grad_norm": 0.27956199645996094, "learning_rate": 3.3100288975931635e-05, "loss": 2.5033, "step": 865 }, { "epoch": 0.7366294524189261, "grad_norm": 0.2711998522281647, "learning_rate": 3.290027167823204e-05, "loss": 2.5824, "step": 866 }, { "epoch": 0.7374800637958533, "grad_norm": 0.277340292930603, "learning_rate": 3.270074148615615e-05, "loss": 2.5168, "step": 867 }, { "epoch": 0.7383306751727804, "grad_norm": 0.26151034235954285, "learning_rate": 3.250169984817897e-05, "loss": 2.5749, "step": 868 }, { "epoch": 0.7391812865497076, "grad_norm": 0.2847073972225189, "learning_rate": 3.230314820922883e-05, "loss": 2.4749, "step": 869 }, { "epoch": 0.7400318979266348, "grad_norm": 0.26492998003959656, "learning_rate": 3.2105088010677e-05, "loss": 2.5874, "step": 870 }, { "epoch": 0.7408825093035619, "grad_norm": 0.2851366400718689, "learning_rate": 3.1907520690327184e-05, "loss": 2.7108, "step": 871 }, { "epoch": 0.7417331206804891, "grad_norm": 0.29604560136795044, "learning_rate": 3.1710447682405076e-05, "loss": 2.8558, "step": 872 }, { "epoch": 0.7425837320574162, "grad_norm": 0.2940882444381714, "learning_rate": 3.151387041754784e-05, "loss": 2.5961, "step": 873 }, { "epoch": 0.7434343434343434, "grad_norm": 0.3165973126888275, "learning_rate": 3.131779032279397e-05, "loss": 2.5976, "step": 874 }, { "epoch": 0.7442849548112705, "grad_norm": 0.3157351315021515, "learning_rate": 3.112220882157275e-05, "loss": 2.5901, "step": 875 }, { "epoch": 0.7451355661881978, "grad_norm": 0.3334716558456421, "learning_rate": 3.092712733369387e-05, "loss": 2.6966, "step": 876 }, { "epoch": 0.745986177565125, "grad_norm": 0.3281215727329254, "learning_rate": 3.073254727533732e-05, "loss": 2.616, "step": 877 }, { "epoch": 0.7468367889420521, "grad_norm": 0.3478872776031494, "learning_rate": 3.053847005904298e-05, "loss": 2.7568, "step": 878 }, { "epoch": 0.7476874003189793, "grad_norm": 0.35839834809303284, "learning_rate": 3.034489709370033e-05, "loss": 2.7552, "step": 879 }, { "epoch": 0.7485380116959064, "grad_norm": 0.3570314049720764, "learning_rate": 3.0151829784538254e-05, "loss": 2.6189, "step": 880 }, { "epoch": 0.7493886230728336, "grad_norm": 0.3771030008792877, "learning_rate": 2.995926953311504e-05, "loss": 3.0162, "step": 881 }, { "epoch": 0.7502392344497608, "grad_norm": 0.38607585430145264, "learning_rate": 2.9767217737307806e-05, "loss": 2.8731, "step": 882 }, { "epoch": 0.7502392344497608, "eval_loss": 2.657837152481079, "eval_runtime": 80.5036, "eval_samples_per_second": 12.298, "eval_steps_per_second": 6.149, "step": 882 }, { "epoch": 0.7510898458266879, "grad_norm": 0.3817814588546753, "learning_rate": 2.9575675791302703e-05, "loss": 2.7678, "step": 883 }, { "epoch": 0.7519404572036151, "grad_norm": 0.4072461724281311, "learning_rate": 2.9384645085584663e-05, "loss": 2.7004, "step": 884 }, { "epoch": 0.7527910685805422, "grad_norm": 0.3934648036956787, "learning_rate": 2.9194127006927208e-05, "loss": 2.4883, "step": 885 }, { "epoch": 0.7536416799574694, "grad_norm": 0.43659910559654236, "learning_rate": 2.9004122938382617e-05, "loss": 2.7447, "step": 886 }, { "epoch": 0.7544922913343965, "grad_norm": 0.4242823123931885, "learning_rate": 2.881463425927161e-05, "loss": 2.7324, "step": 887 }, { "epoch": 0.7553429027113238, "grad_norm": 0.4546271860599518, "learning_rate": 2.86256623451736e-05, "loss": 2.5985, "step": 888 }, { "epoch": 0.756193514088251, "grad_norm": 0.462896466255188, "learning_rate": 2.8437208567916517e-05, "loss": 2.5522, "step": 889 }, { "epoch": 0.7570441254651781, "grad_norm": 0.47179552912712097, "learning_rate": 2.8249274295566864e-05, "loss": 2.7653, "step": 890 }, { "epoch": 0.7578947368421053, "grad_norm": 0.5069417357444763, "learning_rate": 2.8061860892420012e-05, "loss": 2.8404, "step": 891 }, { "epoch": 0.7587453482190324, "grad_norm": 0.5162703990936279, "learning_rate": 2.787496971898994e-05, "loss": 2.5679, "step": 892 }, { "epoch": 0.7595959595959596, "grad_norm": 0.5141138434410095, "learning_rate": 2.7688602131999565e-05, "loss": 2.7156, "step": 893 }, { "epoch": 0.7604465709728868, "grad_norm": 0.5267017483711243, "learning_rate": 2.7502759484370944e-05, "loss": 2.8324, "step": 894 }, { "epoch": 0.7612971823498139, "grad_norm": 0.5354244112968445, "learning_rate": 2.7317443125215357e-05, "loss": 2.7099, "step": 895 }, { "epoch": 0.7621477937267411, "grad_norm": 0.5597330331802368, "learning_rate": 2.7132654399823444e-05, "loss": 2.7128, "step": 896 }, { "epoch": 0.7629984051036682, "grad_norm": 0.6345269680023193, "learning_rate": 2.6948394649655627e-05, "loss": 2.8904, "step": 897 }, { "epoch": 0.7638490164805954, "grad_norm": 0.5837516784667969, "learning_rate": 2.6764665212332253e-05, "loss": 2.5579, "step": 898 }, { "epoch": 0.7646996278575225, "grad_norm": 0.7032825350761414, "learning_rate": 2.658146742162384e-05, "loss": 2.7115, "step": 899 }, { "epoch": 0.7655502392344498, "grad_norm": 0.6780116558074951, "learning_rate": 2.6398802607441507e-05, "loss": 2.6573, "step": 900 }, { "epoch": 0.766400850611377, "grad_norm": 0.2630327641963959, "learning_rate": 2.6216672095827266e-05, "loss": 2.5077, "step": 901 }, { "epoch": 0.7672514619883041, "grad_norm": 0.2665589153766632, "learning_rate": 2.6035077208944415e-05, "loss": 2.4456, "step": 902 }, { "epoch": 0.7681020733652313, "grad_norm": 0.2564515471458435, "learning_rate": 2.5854019265067853e-05, "loss": 2.2768, "step": 903 }, { "epoch": 0.7689526847421584, "grad_norm": 0.290310263633728, "learning_rate": 2.5673499578574645e-05, "loss": 2.6339, "step": 904 }, { "epoch": 0.7698032961190856, "grad_norm": 0.27893102169036865, "learning_rate": 2.5493519459934423e-05, "loss": 2.6156, "step": 905 }, { "epoch": 0.7706539074960128, "grad_norm": 0.27719345688819885, "learning_rate": 2.531408021569982e-05, "loss": 2.4212, "step": 906 }, { "epoch": 0.7715045188729399, "grad_norm": 0.2813560664653778, "learning_rate": 2.5135183148496978e-05, "loss": 2.5064, "step": 907 }, { "epoch": 0.7723551302498671, "grad_norm": 0.28871771693229675, "learning_rate": 2.4956829557016338e-05, "loss": 2.505, "step": 908 }, { "epoch": 0.7732057416267942, "grad_norm": 0.30079928040504456, "learning_rate": 2.4779020736002834e-05, "loss": 2.4488, "step": 909 }, { "epoch": 0.7740563530037214, "grad_norm": 0.27679336071014404, "learning_rate": 2.4601757976246686e-05, "loss": 2.6442, "step": 910 }, { "epoch": 0.7749069643806485, "grad_norm": 0.27239468693733215, "learning_rate": 2.4425042564574184e-05, "loss": 2.4256, "step": 911 }, { "epoch": 0.7757575757575758, "grad_norm": 0.2782379686832428, "learning_rate": 2.4248875783837987e-05, "loss": 2.5383, "step": 912 }, { "epoch": 0.776608187134503, "grad_norm": 0.28101786971092224, "learning_rate": 2.407325891290817e-05, "loss": 2.5509, "step": 913 }, { "epoch": 0.7774587985114301, "grad_norm": 0.26222068071365356, "learning_rate": 2.3898193226662634e-05, "loss": 2.5718, "step": 914 }, { "epoch": 0.7783094098883573, "grad_norm": 0.28495025634765625, "learning_rate": 2.3723679995978088e-05, "loss": 2.7216, "step": 915 }, { "epoch": 0.7791600212652844, "grad_norm": 0.27799373865127563, "learning_rate": 2.3549720487720738e-05, "loss": 2.4825, "step": 916 }, { "epoch": 0.7800106326422116, "grad_norm": 0.2805662751197815, "learning_rate": 2.3376315964737004e-05, "loss": 2.5418, "step": 917 }, { "epoch": 0.7808612440191387, "grad_norm": 0.2894647717475891, "learning_rate": 2.3203467685844494e-05, "loss": 2.4872, "step": 918 }, { "epoch": 0.7817118553960659, "grad_norm": 0.2725624740123749, "learning_rate": 2.3031176905822805e-05, "loss": 2.6821, "step": 919 }, { "epoch": 0.7825624667729931, "grad_norm": 0.2858252227306366, "learning_rate": 2.2859444875404347e-05, "loss": 2.5256, "step": 920 }, { "epoch": 0.7834130781499202, "grad_norm": 0.27502021193504333, "learning_rate": 2.268827284126539e-05, "loss": 2.6097, "step": 921 }, { "epoch": 0.7842636895268474, "grad_norm": 0.30405184626579285, "learning_rate": 2.2517662046016975e-05, "loss": 2.5872, "step": 922 }, { "epoch": 0.7851143009037745, "grad_norm": 0.3032589256763458, "learning_rate": 2.234761372819577e-05, "loss": 2.6253, "step": 923 }, { "epoch": 0.7859649122807018, "grad_norm": 0.3149045705795288, "learning_rate": 2.2178129122255253e-05, "loss": 2.5631, "step": 924 }, { "epoch": 0.786815523657629, "grad_norm": 0.3490796387195587, "learning_rate": 2.200920945855669e-05, "loss": 2.7784, "step": 925 }, { "epoch": 0.7876661350345561, "grad_norm": 0.332727313041687, "learning_rate": 2.184085596336011e-05, "loss": 2.7369, "step": 926 }, { "epoch": 0.7885167464114833, "grad_norm": 0.345241516828537, "learning_rate": 2.1673069858815554e-05, "loss": 2.734, "step": 927 }, { "epoch": 0.7893673577884104, "grad_norm": 0.356146901845932, "learning_rate": 2.150585236295415e-05, "loss": 2.5664, "step": 928 }, { "epoch": 0.7902179691653376, "grad_norm": 0.34294942021369934, "learning_rate": 2.133920468967915e-05, "loss": 2.7841, "step": 929 }, { "epoch": 0.7910685805422647, "grad_norm": 0.3757612407207489, "learning_rate": 2.1173128048757306e-05, "loss": 2.8389, "step": 930 }, { "epoch": 0.7919191919191919, "grad_norm": 0.37201401591300964, "learning_rate": 2.1007623645810003e-05, "loss": 2.7424, "step": 931 }, { "epoch": 0.7927698032961191, "grad_norm": 0.37049031257629395, "learning_rate": 2.0842692682304444e-05, "loss": 2.7447, "step": 932 }, { "epoch": 0.7936204146730462, "grad_norm": 0.40525051951408386, "learning_rate": 2.0678336355545048e-05, "loss": 2.7354, "step": 933 }, { "epoch": 0.7944710260499734, "grad_norm": 0.37740442156791687, "learning_rate": 2.0514555858664663e-05, "loss": 2.5756, "step": 934 }, { "epoch": 0.7953216374269005, "grad_norm": 0.399179607629776, "learning_rate": 2.0351352380616008e-05, "loss": 2.606, "step": 935 }, { "epoch": 0.7961722488038278, "grad_norm": 0.4020417630672455, "learning_rate": 2.0188727106162874e-05, "loss": 2.5945, "step": 936 }, { "epoch": 0.797022860180755, "grad_norm": 0.4364849627017975, "learning_rate": 2.0026681215871656e-05, "loss": 2.7867, "step": 937 }, { "epoch": 0.7978734715576821, "grad_norm": 0.4335819482803345, "learning_rate": 1.986521588610285e-05, "loss": 2.6452, "step": 938 }, { "epoch": 0.7987240829346093, "grad_norm": 0.44533243775367737, "learning_rate": 1.9704332289002293e-05, "loss": 2.7037, "step": 939 }, { "epoch": 0.7995746943115364, "grad_norm": 0.46524372696876526, "learning_rate": 1.9544031592492763e-05, "loss": 2.7772, "step": 940 }, { "epoch": 0.8004253056884636, "grad_norm": 0.4814035892486572, "learning_rate": 1.9384314960265692e-05, "loss": 2.7603, "step": 941 }, { "epoch": 0.8012759170653907, "grad_norm": 0.5147087574005127, "learning_rate": 1.922518355177232e-05, "loss": 2.8118, "step": 942 }, { "epoch": 0.8021265284423179, "grad_norm": 0.5196573734283447, "learning_rate": 1.906663852221565e-05, "loss": 2.7226, "step": 943 }, { "epoch": 0.8029771398192451, "grad_norm": 0.538750171661377, "learning_rate": 1.890868102254182e-05, "loss": 2.8252, "step": 944 }, { "epoch": 0.8038277511961722, "grad_norm": 0.5554125308990479, "learning_rate": 1.875131219943187e-05, "loss": 2.7985, "step": 945 }, { "epoch": 0.8046783625730994, "grad_norm": 0.5610653162002563, "learning_rate": 1.8594533195293427e-05, "loss": 2.7006, "step": 946 }, { "epoch": 0.8055289739500265, "grad_norm": 0.5772664546966553, "learning_rate": 1.843834514825229e-05, "loss": 2.7453, "step": 947 }, { "epoch": 0.8063795853269538, "grad_norm": 0.7284188866615295, "learning_rate": 1.82827491921443e-05, "loss": 2.7826, "step": 948 }, { "epoch": 0.807230196703881, "grad_norm": 0.6359315514564514, "learning_rate": 1.8127746456507077e-05, "loss": 2.5094, "step": 949 }, { "epoch": 0.8080808080808081, "grad_norm": 0.7618624567985535, "learning_rate": 1.797333806657171e-05, "loss": 2.9762, "step": 950 }, { "epoch": 0.8089314194577353, "grad_norm": 0.2761584520339966, "learning_rate": 1.7819525143254755e-05, "loss": 2.3495, "step": 951 }, { "epoch": 0.8097820308346624, "grad_norm": 0.2787644565105438, "learning_rate": 1.7666308803150043e-05, "loss": 2.6593, "step": 952 }, { "epoch": 0.8106326422115896, "grad_norm": 0.2583043873310089, "learning_rate": 1.751369015852046e-05, "loss": 2.3744, "step": 953 }, { "epoch": 0.8114832535885167, "grad_norm": 0.27526992559432983, "learning_rate": 1.7361670317290012e-05, "loss": 2.424, "step": 954 }, { "epoch": 0.8123338649654439, "grad_norm": 0.2713683843612671, "learning_rate": 1.7210250383035807e-05, "loss": 2.5069, "step": 955 }, { "epoch": 0.8131844763423711, "grad_norm": 0.28076109290122986, "learning_rate": 1.7059431454979824e-05, "loss": 2.4204, "step": 956 }, { "epoch": 0.8140350877192982, "grad_norm": 0.2972472012042999, "learning_rate": 1.6909214627981197e-05, "loss": 2.6109, "step": 957 }, { "epoch": 0.8148856990962254, "grad_norm": 0.31532159447669983, "learning_rate": 1.6759600992528147e-05, "loss": 2.6416, "step": 958 }, { "epoch": 0.8157363104731525, "grad_norm": 0.2699948847293854, "learning_rate": 1.6610591634729965e-05, "loss": 2.4951, "step": 959 }, { "epoch": 0.8165869218500797, "grad_norm": 0.27630969882011414, "learning_rate": 1.6462187636309345e-05, "loss": 2.6336, "step": 960 }, { "epoch": 0.817437533227007, "grad_norm": 0.27848508954048157, "learning_rate": 1.631439007459441e-05, "loss": 2.6848, "step": 961 }, { "epoch": 0.8182881446039341, "grad_norm": 0.2589823007583618, "learning_rate": 1.61672000225108e-05, "loss": 2.6077, "step": 962 }, { "epoch": 0.8191387559808613, "grad_norm": 0.27338507771492004, "learning_rate": 1.6020618548574108e-05, "loss": 2.4618, "step": 963 }, { "epoch": 0.8199893673577884, "grad_norm": 0.26967307925224304, "learning_rate": 1.587464671688187e-05, "loss": 2.4733, "step": 964 }, { "epoch": 0.8208399787347156, "grad_norm": 0.27979278564453125, "learning_rate": 1.5729285587106136e-05, "loss": 2.6789, "step": 965 }, { "epoch": 0.8216905901116427, "grad_norm": 0.263261616230011, "learning_rate": 1.5584536214485457e-05, "loss": 2.5471, "step": 966 }, { "epoch": 0.8225412014885699, "grad_norm": 0.27126452326774597, "learning_rate": 1.5440399649817385e-05, "loss": 2.6185, "step": 967 }, { "epoch": 0.8233918128654971, "grad_norm": 0.2681058943271637, "learning_rate": 1.5296876939450978e-05, "loss": 2.567, "step": 968 }, { "epoch": 0.8242424242424242, "grad_norm": 0.25772204995155334, "learning_rate": 1.5153969125278934e-05, "loss": 2.4961, "step": 969 }, { "epoch": 0.8250930356193514, "grad_norm": 0.27491864562034607, "learning_rate": 1.5011677244730161e-05, "loss": 2.6572, "step": 970 }, { "epoch": 0.8259436469962785, "grad_norm": 0.3069652318954468, "learning_rate": 1.4870002330762289e-05, "loss": 2.6633, "step": 971 }, { "epoch": 0.8267942583732057, "grad_norm": 0.29441124200820923, "learning_rate": 1.4728945411854133e-05, "loss": 2.5631, "step": 972 }, { "epoch": 0.827644869750133, "grad_norm": 0.29318681359291077, "learning_rate": 1.4588507511998162e-05, "loss": 2.7143, "step": 973 }, { "epoch": 0.8284954811270601, "grad_norm": 0.3085382282733917, "learning_rate": 1.4448689650693147e-05, "loss": 2.6399, "step": 974 }, { "epoch": 0.8293460925039873, "grad_norm": 0.3353934586048126, "learning_rate": 1.4309492842936768e-05, "loss": 2.7392, "step": 975 }, { "epoch": 0.8301967038809144, "grad_norm": 0.35874679684638977, "learning_rate": 1.4170918099218166e-05, "loss": 2.6478, "step": 976 }, { "epoch": 0.8310473152578416, "grad_norm": 0.3231167793273926, "learning_rate": 1.4032966425510663e-05, "loss": 2.5808, "step": 977 }, { "epoch": 0.8318979266347687, "grad_norm": 0.3558177351951599, "learning_rate": 1.3895638823264446e-05, "loss": 2.8192, "step": 978 }, { "epoch": 0.8327485380116959, "grad_norm": 0.3447312116622925, "learning_rate": 1.3758936289399348e-05, "loss": 2.439, "step": 979 }, { "epoch": 0.8335991493886231, "grad_norm": 0.35401877760887146, "learning_rate": 1.3622859816297473e-05, "loss": 2.6874, "step": 980 }, { "epoch": 0.8344497607655502, "grad_norm": 0.3697657585144043, "learning_rate": 1.3487410391796162e-05, "loss": 2.6217, "step": 981 }, { "epoch": 0.8353003721424774, "grad_norm": 0.39387422800064087, "learning_rate": 1.3352588999180726e-05, "loss": 2.9086, "step": 982 }, { "epoch": 0.8361509835194045, "grad_norm": 0.4256942570209503, "learning_rate": 1.3218396617177287e-05, "loss": 2.6578, "step": 983 }, { "epoch": 0.8370015948963317, "grad_norm": 0.40965893864631653, "learning_rate": 1.308483421994573e-05, "loss": 2.6781, "step": 984 }, { "epoch": 0.837852206273259, "grad_norm": 0.4041212797164917, "learning_rate": 1.2951902777072655e-05, "loss": 2.8038, "step": 985 }, { "epoch": 0.8387028176501861, "grad_norm": 0.416715532541275, "learning_rate": 1.2819603253564205e-05, "loss": 2.5885, "step": 986 }, { "epoch": 0.8395534290271133, "grad_norm": 0.4417300522327423, "learning_rate": 1.2687936609839235e-05, "loss": 2.8096, "step": 987 }, { "epoch": 0.8404040404040404, "grad_norm": 0.47883424162864685, "learning_rate": 1.2556903801722219e-05, "loss": 2.8092, "step": 988 }, { "epoch": 0.8412546517809676, "grad_norm": 0.4706938862800598, "learning_rate": 1.2426505780436326e-05, "loss": 2.6285, "step": 989 }, { "epoch": 0.8421052631578947, "grad_norm": 0.4550638496875763, "learning_rate": 1.2296743492596586e-05, "loss": 2.5213, "step": 990 }, { "epoch": 0.8429558745348219, "grad_norm": 0.48749181628227234, "learning_rate": 1.2167617880202908e-05, "loss": 2.791, "step": 991 }, { "epoch": 0.8438064859117491, "grad_norm": 0.46483585238456726, "learning_rate": 1.2039129880633349e-05, "loss": 2.7672, "step": 992 }, { "epoch": 0.8446570972886762, "grad_norm": 0.510725200176239, "learning_rate": 1.1911280426637273e-05, "loss": 2.9072, "step": 993 }, { "epoch": 0.8455077086656034, "grad_norm": 0.5274655222892761, "learning_rate": 1.1784070446328476e-05, "loss": 2.8219, "step": 994 }, { "epoch": 0.8463583200425305, "grad_norm": 0.5416675806045532, "learning_rate": 1.1657500863178694e-05, "loss": 2.7237, "step": 995 }, { "epoch": 0.8472089314194577, "grad_norm": 0.5472414493560791, "learning_rate": 1.153157259601062e-05, "loss": 2.68, "step": 996 }, { "epoch": 0.8480595427963848, "grad_norm": 0.5868435502052307, "learning_rate": 1.1406286558991375e-05, "loss": 2.7562, "step": 997 }, { "epoch": 0.8489101541733121, "grad_norm": 0.6191973090171814, "learning_rate": 1.1281643661625895e-05, "loss": 2.7655, "step": 998 }, { "epoch": 0.8497607655502393, "grad_norm": 0.6745545268058777, "learning_rate": 1.1157644808750312e-05, "loss": 2.8605, "step": 999 }, { "epoch": 0.8506113769271664, "grad_norm": 0.697556734085083, "learning_rate": 1.103429090052528e-05, "loss": 2.6958, "step": 1000 }, { "epoch": 0.8514619883040936, "grad_norm": 0.26258981227874756, "learning_rate": 1.0911582832429589e-05, "loss": 2.4457, "step": 1001 }, { "epoch": 0.8523125996810207, "grad_norm": 0.26820749044418335, "learning_rate": 1.0789521495253618e-05, "loss": 2.6249, "step": 1002 }, { "epoch": 0.8531632110579479, "grad_norm": 0.26697129011154175, "learning_rate": 1.0668107775092751e-05, "loss": 2.4464, "step": 1003 }, { "epoch": 0.8540138224348751, "grad_norm": 0.2953515350818634, "learning_rate": 1.0547342553341144e-05, "loss": 2.5413, "step": 1004 }, { "epoch": 0.8548644338118022, "grad_norm": 0.264387309551239, "learning_rate": 1.0427226706685178e-05, "loss": 2.6889, "step": 1005 }, { "epoch": 0.8557150451887294, "grad_norm": 0.2656628489494324, "learning_rate": 1.030776110709718e-05, "loss": 2.5115, "step": 1006 }, { "epoch": 0.8565656565656565, "grad_norm": 0.25700512528419495, "learning_rate": 1.0188946621828976e-05, "loss": 2.5474, "step": 1007 }, { "epoch": 0.8574162679425837, "grad_norm": 0.28423216938972473, "learning_rate": 1.0070784113405763e-05, "loss": 2.5926, "step": 1008 }, { "epoch": 0.8582668793195108, "grad_norm": 0.2653867304325104, "learning_rate": 9.953274439619741e-06, "loss": 2.4022, "step": 1009 }, { "epoch": 0.8591174906964381, "grad_norm": 0.27558040618896484, "learning_rate": 9.836418453523833e-06, "loss": 2.6529, "step": 1010 }, { "epoch": 0.8599681020733653, "grad_norm": 0.25471609830856323, "learning_rate": 9.720217003425647e-06, "loss": 2.5865, "step": 1011 }, { "epoch": 0.8608187134502924, "grad_norm": 0.2609470784664154, "learning_rate": 9.60467093288121e-06, "loss": 2.5492, "step": 1012 }, { "epoch": 0.8616693248272196, "grad_norm": 0.29608237743377686, "learning_rate": 9.489781080688865e-06, "loss": 2.5104, "step": 1013 }, { "epoch": 0.8625199362041467, "grad_norm": 0.29410475492477417, "learning_rate": 9.375548280883128e-06, "loss": 2.6142, "step": 1014 }, { "epoch": 0.8633705475810739, "grad_norm": 0.269397497177124, "learning_rate": 9.261973362728827e-06, "loss": 2.5652, "step": 1015 }, { "epoch": 0.8642211589580011, "grad_norm": 0.28159505128860474, "learning_rate": 9.149057150714801e-06, "loss": 2.7499, "step": 1016 }, { "epoch": 0.8650717703349282, "grad_norm": 0.27257442474365234, "learning_rate": 9.036800464548157e-06, "loss": 2.6305, "step": 1017 }, { "epoch": 0.8659223817118554, "grad_norm": 0.26392441987991333, "learning_rate": 8.92520411914819e-06, "loss": 2.4767, "step": 1018 }, { "epoch": 0.8667729930887825, "grad_norm": 0.2779422998428345, "learning_rate": 8.814268924640468e-06, "loss": 2.6056, "step": 1019 }, { "epoch": 0.8676236044657097, "grad_norm": 0.27914363145828247, "learning_rate": 8.70399568635104e-06, "loss": 2.5612, "step": 1020 }, { "epoch": 0.8684742158426368, "grad_norm": 0.28274431824684143, "learning_rate": 8.594385204800482e-06, "loss": 2.5673, "step": 1021 }, { "epoch": 0.8693248272195641, "grad_norm": 0.30289795994758606, "learning_rate": 8.485438275698154e-06, "loss": 2.6624, "step": 1022 }, { "epoch": 0.8701754385964913, "grad_norm": 0.3084239363670349, "learning_rate": 8.377155689936434e-06, "loss": 2.5199, "step": 1023 }, { "epoch": 0.8710260499734184, "grad_norm": 0.32038623094558716, "learning_rate": 8.269538233584883e-06, "loss": 2.5174, "step": 1024 }, { "epoch": 0.8718766613503456, "grad_norm": 0.32201388478279114, "learning_rate": 8.162586687884654e-06, "loss": 2.5764, "step": 1025 }, { "epoch": 0.8727272727272727, "grad_norm": 0.31848224997520447, "learning_rate": 8.056301829242784e-06, "loss": 2.6327, "step": 1026 }, { "epoch": 0.8735778841041999, "grad_norm": 0.34866663813591003, "learning_rate": 7.950684429226463e-06, "loss": 2.8354, "step": 1027 }, { "epoch": 0.8744284954811271, "grad_norm": 0.3639064431190491, "learning_rate": 7.845735254557606e-06, "loss": 2.7546, "step": 1028 }, { "epoch": 0.8752791068580542, "grad_norm": 0.35733523964881897, "learning_rate": 7.741455067107162e-06, "loss": 2.69, "step": 1029 }, { "epoch": 0.8761297182349814, "grad_norm": 0.35862669348716736, "learning_rate": 7.637844623889556e-06, "loss": 2.5966, "step": 1030 }, { "epoch": 0.8769803296119085, "grad_norm": 0.36205005645751953, "learning_rate": 7.534904677057353e-06, "loss": 2.4966, "step": 1031 }, { "epoch": 0.8778309409888357, "grad_norm": 0.3843689560890198, "learning_rate": 7.4326359738956515e-06, "loss": 2.7942, "step": 1032 }, { "epoch": 0.8786815523657628, "grad_norm": 0.40694114565849304, "learning_rate": 7.331039256816663e-06, "loss": 2.739, "step": 1033 }, { "epoch": 0.87953216374269, "grad_norm": 0.40859633684158325, "learning_rate": 7.230115263354431e-06, "loss": 2.6879, "step": 1034 }, { "epoch": 0.8803827751196173, "grad_norm": 0.41907498240470886, "learning_rate": 7.129864726159408e-06, "loss": 2.5804, "step": 1035 }, { "epoch": 0.8812333864965444, "grad_norm": 0.4525774121284485, "learning_rate": 7.030288372993066e-06, "loss": 2.6814, "step": 1036 }, { "epoch": 0.8820839978734716, "grad_norm": 0.45679178833961487, "learning_rate": 6.931386926722772e-06, "loss": 2.7909, "step": 1037 }, { "epoch": 0.8829346092503987, "grad_norm": 0.47028636932373047, "learning_rate": 6.833161105316421e-06, "loss": 2.8758, "step": 1038 }, { "epoch": 0.8837852206273259, "grad_norm": 0.4830760359764099, "learning_rate": 6.7356116218372566e-06, "loss": 2.6979, "step": 1039 }, { "epoch": 0.8846358320042531, "grad_norm": 0.49996817111968994, "learning_rate": 6.63873918443868e-06, "loss": 2.8513, "step": 1040 }, { "epoch": 0.8854864433811802, "grad_norm": 0.4843011200428009, "learning_rate": 6.542544496359138e-06, "loss": 2.553, "step": 1041 }, { "epoch": 0.8863370547581074, "grad_norm": 0.5989816188812256, "learning_rate": 6.447028255917054e-06, "loss": 2.5985, "step": 1042 }, { "epoch": 0.8871876661350345, "grad_norm": 0.5418705940246582, "learning_rate": 6.352191156505627e-06, "loss": 2.7277, "step": 1043 }, { "epoch": 0.8880382775119617, "grad_norm": 0.5338729619979858, "learning_rate": 6.258033886587911e-06, "loss": 2.9324, "step": 1044 }, { "epoch": 0.8888888888888888, "grad_norm": 0.5353463292121887, "learning_rate": 6.164557129691828e-06, "loss": 2.6506, "step": 1045 }, { "epoch": 0.889739500265816, "grad_norm": 0.6170979142189026, "learning_rate": 6.0717615644051206e-06, "loss": 2.7699, "step": 1046 }, { "epoch": 0.8905901116427433, "grad_norm": 0.6048348546028137, "learning_rate": 5.979647864370486e-06, "loss": 2.7457, "step": 1047 }, { "epoch": 0.8914407230196704, "grad_norm": 0.6213808059692383, "learning_rate": 5.888216698280647e-06, "loss": 2.671, "step": 1048 }, { "epoch": 0.8922913343965976, "grad_norm": 0.6980977654457092, "learning_rate": 5.7974687298735454e-06, "loss": 2.9322, "step": 1049 }, { "epoch": 0.8931419457735247, "grad_norm": 0.762441873550415, "learning_rate": 5.7074046179275255e-06, "loss": 2.9242, "step": 1050 }, { "epoch": 0.8939925571504519, "grad_norm": 0.2550997734069824, "learning_rate": 5.6180250162564455e-06, "loss": 2.3309, "step": 1051 }, { "epoch": 0.8948431685273791, "grad_norm": 0.2799069285392761, "learning_rate": 5.5293305737050825e-06, "loss": 2.6428, "step": 1052 }, { "epoch": 0.8956937799043062, "grad_norm": 0.25860702991485596, "learning_rate": 5.441321934144339e-06, "loss": 2.2591, "step": 1053 }, { "epoch": 0.8965443912812334, "grad_norm": 0.262991726398468, "learning_rate": 5.35399973646653e-06, "loss": 2.3974, "step": 1054 }, { "epoch": 0.8973950026581605, "grad_norm": 0.25796017050743103, "learning_rate": 5.267364614580861e-06, "loss": 2.4487, "step": 1055 }, { "epoch": 0.8982456140350877, "grad_norm": 0.26405441761016846, "learning_rate": 5.181417197408734e-06, "loss": 2.3894, "step": 1056 }, { "epoch": 0.8990962254120148, "grad_norm": 0.27741700410842896, "learning_rate": 5.09615810887919e-06, "loss": 2.5089, "step": 1057 }, { "epoch": 0.899946836788942, "grad_norm": 0.27698126435279846, "learning_rate": 5.011587967924414e-06, "loss": 2.4868, "step": 1058 }, { "epoch": 0.9007974481658693, "grad_norm": 0.2699805796146393, "learning_rate": 4.927707388475255e-06, "loss": 2.4822, "step": 1059 }, { "epoch": 0.9016480595427964, "grad_norm": 0.2688996493816376, "learning_rate": 4.84451697945667e-06, "loss": 2.574, "step": 1060 }, { "epoch": 0.9024986709197236, "grad_norm": 0.2651573419570923, "learning_rate": 4.7620173447834425e-06, "loss": 2.5219, "step": 1061 }, { "epoch": 0.9033492822966507, "grad_norm": 0.27622607350349426, "learning_rate": 4.680209083355713e-06, "loss": 2.6289, "step": 1062 }, { "epoch": 0.9041998936735779, "grad_norm": 0.27151504158973694, "learning_rate": 4.5990927890545935e-06, "loss": 2.625, "step": 1063 }, { "epoch": 0.9050505050505051, "grad_norm": 0.2963936924934387, "learning_rate": 4.518669050737989e-06, "loss": 2.4124, "step": 1064 }, { "epoch": 0.9059011164274322, "grad_norm": 0.2703079879283905, "learning_rate": 4.438938452236219e-06, "loss": 2.482, "step": 1065 }, { "epoch": 0.9067517278043594, "grad_norm": 0.25973039865493774, "learning_rate": 4.359901572347758e-06, "loss": 2.6443, "step": 1066 }, { "epoch": 0.9076023391812865, "grad_norm": 0.27102747559547424, "learning_rate": 4.281558984835143e-06, "loss": 2.6452, "step": 1067 }, { "epoch": 0.9084529505582137, "grad_norm": 0.2813124656677246, "learning_rate": 4.203911258420712e-06, "loss": 2.6012, "step": 1068 }, { "epoch": 0.9093035619351408, "grad_norm": 0.274431437253952, "learning_rate": 4.126958956782545e-06, "loss": 2.6887, "step": 1069 }, { "epoch": 0.910154173312068, "grad_norm": 0.2668261229991913, "learning_rate": 4.050702638550275e-06, "loss": 2.5839, "step": 1070 }, { "epoch": 0.9110047846889953, "grad_norm": 0.2745119631290436, "learning_rate": 3.975142857301117e-06, "loss": 2.4371, "step": 1071 }, { "epoch": 0.9118553960659224, "grad_norm": 0.31567510962486267, "learning_rate": 3.900280161555881e-06, "loss": 2.4209, "step": 1072 }, { "epoch": 0.9127060074428496, "grad_norm": 0.28338539600372314, "learning_rate": 3.826115094774863e-06, "loss": 2.47, "step": 1073 }, { "epoch": 0.9135566188197767, "grad_norm": 0.3005933463573456, "learning_rate": 3.7526481953539915e-06, "loss": 2.5939, "step": 1074 }, { "epoch": 0.9144072301967039, "grad_norm": 0.31587204337120056, "learning_rate": 3.6798799966209497e-06, "loss": 2.6447, "step": 1075 }, { "epoch": 0.915257841573631, "grad_norm": 0.3279857635498047, "learning_rate": 3.607811026831176e-06, "loss": 2.5845, "step": 1076 }, { "epoch": 0.9161084529505582, "grad_norm": 0.3389846980571747, "learning_rate": 3.5364418091641373e-06, "loss": 2.7449, "step": 1077 }, { "epoch": 0.9169590643274854, "grad_norm": 0.3381056487560272, "learning_rate": 3.4657728617195295e-06, "loss": 2.407, "step": 1078 }, { "epoch": 0.9178096757044125, "grad_norm": 0.3725632429122925, "learning_rate": 3.3958046975134495e-06, "loss": 2.8884, "step": 1079 }, { "epoch": 0.9186602870813397, "grad_norm": 0.3629099726676941, "learning_rate": 3.32653782447474e-06, "loss": 2.6872, "step": 1080 }, { "epoch": 0.9195108984582668, "grad_norm": 0.37425747513771057, "learning_rate": 3.25797274544124e-06, "loss": 2.659, "step": 1081 }, { "epoch": 0.920361509835194, "grad_norm": 0.3709467649459839, "learning_rate": 3.1901099581561845e-06, "loss": 2.5593, "step": 1082 }, { "epoch": 0.9212121212121213, "grad_norm": 0.37324512004852295, "learning_rate": 3.122949955264587e-06, "loss": 2.5461, "step": 1083 }, { "epoch": 0.9220627325890484, "grad_norm": 0.39722326397895813, "learning_rate": 3.0564932243095866e-06, "loss": 2.8043, "step": 1084 }, { "epoch": 0.9229133439659756, "grad_norm": 0.40863049030303955, "learning_rate": 2.9907402477290514e-06, "loss": 2.5307, "step": 1085 }, { "epoch": 0.9237639553429027, "grad_norm": 0.4645911157131195, "learning_rate": 2.9256915028519573e-06, "loss": 2.3838, "step": 1086 }, { "epoch": 0.9246145667198299, "grad_norm": 0.42394137382507324, "learning_rate": 2.8613474618949366e-06, "loss": 2.7368, "step": 1087 }, { "epoch": 0.925465178096757, "grad_norm": 0.44541221857070923, "learning_rate": 2.7977085919589254e-06, "loss": 2.6973, "step": 1088 }, { "epoch": 0.9263157894736842, "grad_norm": 0.45734384655952454, "learning_rate": 2.7347753550256872e-06, "loss": 2.8454, "step": 1089 }, { "epoch": 0.9271664008506114, "grad_norm": 0.4581248164176941, "learning_rate": 2.672548207954495e-06, "loss": 2.7356, "step": 1090 }, { "epoch": 0.9280170122275385, "grad_norm": 0.45908254384994507, "learning_rate": 2.6110276024788214e-06, "loss": 2.5562, "step": 1091 }, { "epoch": 0.9288676236044657, "grad_norm": 0.4975565969944, "learning_rate": 2.550213985203076e-06, "loss": 2.8292, "step": 1092 }, { "epoch": 0.9297182349813928, "grad_norm": 0.5445611476898193, "learning_rate": 2.4901077975992838e-06, "loss": 2.9136, "step": 1093 }, { "epoch": 0.93056884635832, "grad_norm": 0.5516471862792969, "learning_rate": 2.4307094760039785e-06, "loss": 3.0927, "step": 1094 }, { "epoch": 0.9314194577352473, "grad_norm": 0.5213038921356201, "learning_rate": 2.3720194516149818e-06, "loss": 2.705, "step": 1095 }, { "epoch": 0.9322700691121744, "grad_norm": 0.5873907208442688, "learning_rate": 2.3140381504882737e-06, "loss": 2.8415, "step": 1096 }, { "epoch": 0.9331206804891016, "grad_norm": 0.6056773662567139, "learning_rate": 2.2567659935349372e-06, "loss": 2.6933, "step": 1097 }, { "epoch": 0.9339712918660287, "grad_norm": 0.6297914981842041, "learning_rate": 2.200203396517997e-06, "loss": 2.8197, "step": 1098 }, { "epoch": 0.9348219032429559, "grad_norm": 0.6759224534034729, "learning_rate": 2.144350770049597e-06, "loss": 2.7496, "step": 1099 }, { "epoch": 0.935672514619883, "grad_norm": 0.7379116415977478, "learning_rate": 2.0892085195878154e-06, "loss": 2.926, "step": 1100 }, { "epoch": 0.9365231259968102, "grad_norm": 0.25760582089424133, "learning_rate": 2.034777045433811e-06, "loss": 2.3794, "step": 1101 }, { "epoch": 0.9373737373737374, "grad_norm": 0.2775615155696869, "learning_rate": 1.9810567427289595e-06, "loss": 2.5169, "step": 1102 }, { "epoch": 0.9382243487506645, "grad_norm": 0.26485174894332886, "learning_rate": 1.92804800145191e-06, "loss": 2.4342, "step": 1103 }, { "epoch": 0.9390749601275917, "grad_norm": 0.24917316436767578, "learning_rate": 1.8757512064157656e-06, "loss": 2.4177, "step": 1104 }, { "epoch": 0.9399255715045188, "grad_norm": 0.25125351548194885, "learning_rate": 1.8241667372653316e-06, "loss": 2.2352, "step": 1105 }, { "epoch": 0.940776182881446, "grad_norm": 0.2972431182861328, "learning_rate": 1.7732949684743594e-06, "loss": 2.4803, "step": 1106 }, { "epoch": 0.9416267942583733, "grad_norm": 0.2540951073169708, "learning_rate": 1.7231362693427288e-06, "loss": 2.5117, "step": 1107 }, { "epoch": 0.9424774056353004, "grad_norm": 0.26120516657829285, "learning_rate": 1.6736910039939157e-06, "loss": 2.3744, "step": 1108 }, { "epoch": 0.9433280170122276, "grad_norm": 0.2617899477481842, "learning_rate": 1.62495953137225e-06, "loss": 2.4657, "step": 1109 }, { "epoch": 0.9441786283891547, "grad_norm": 0.29227423667907715, "learning_rate": 1.576942205240317e-06, "loss": 2.4297, "step": 1110 }, { "epoch": 0.9450292397660819, "grad_norm": 0.2886480689048767, "learning_rate": 1.5296393741764391e-06, "loss": 2.5686, "step": 1111 }, { "epoch": 0.945879851143009, "grad_norm": 0.27778005599975586, "learning_rate": 1.4830513815720759e-06, "loss": 2.5925, "step": 1112 }, { "epoch": 0.9467304625199362, "grad_norm": 0.2743171155452728, "learning_rate": 1.4371785656294046e-06, "loss": 2.6989, "step": 1113 }, { "epoch": 0.9475810738968634, "grad_norm": 0.27193981409072876, "learning_rate": 1.392021259358811e-06, "loss": 2.457, "step": 1114 }, { "epoch": 0.9484316852737905, "grad_norm": 0.2713122069835663, "learning_rate": 1.3475797905764809e-06, "loss": 2.6029, "step": 1115 }, { "epoch": 0.9492822966507177, "grad_norm": 0.25593551993370056, "learning_rate": 1.303854481902067e-06, "loss": 2.6974, "step": 1116 }, { "epoch": 0.9501329080276448, "grad_norm": 0.26953616738319397, "learning_rate": 1.2608456507562705e-06, "loss": 2.6234, "step": 1117 }, { "epoch": 0.950983519404572, "grad_norm": 0.28958868980407715, "learning_rate": 1.2185536093585747e-06, "loss": 2.6315, "step": 1118 }, { "epoch": 0.9518341307814993, "grad_norm": 0.263473242521286, "learning_rate": 1.1769786647250147e-06, "loss": 2.5595, "step": 1119 }, { "epoch": 0.9526847421584264, "grad_norm": 0.26081758737564087, "learning_rate": 1.1361211186658894e-06, "loss": 2.4885, "step": 1120 }, { "epoch": 0.9535353535353536, "grad_norm": 0.275305837392807, "learning_rate": 1.0959812677835968e-06, "loss": 2.794, "step": 1121 }, { "epoch": 0.9543859649122807, "grad_norm": 0.28377169370651245, "learning_rate": 1.0565594034704918e-06, "loss": 2.7399, "step": 1122 }, { "epoch": 0.9552365762892079, "grad_norm": 0.282060831785202, "learning_rate": 1.0178558119067315e-06, "loss": 2.4815, "step": 1123 }, { "epoch": 0.956087187666135, "grad_norm": 0.2975423038005829, "learning_rate": 9.798707740582447e-07, "loss": 2.5839, "step": 1124 }, { "epoch": 0.9569377990430622, "grad_norm": 0.3122364580631256, "learning_rate": 9.42604565674654e-07, "loss": 2.6381, "step": 1125 }, { "epoch": 0.9577884104199894, "grad_norm": 0.3378431797027588, "learning_rate": 9.060574572873237e-07, "loss": 2.5897, "step": 1126 }, { "epoch": 0.9586390217969165, "grad_norm": 0.33718162775039673, "learning_rate": 8.702297142073379e-07, "loss": 2.689, "step": 1127 }, { "epoch": 0.9594896331738437, "grad_norm": 0.34144076704978943, "learning_rate": 8.351215965235915e-07, "loss": 2.4692, "step": 1128 }, { "epoch": 0.9603402445507708, "grad_norm": 0.3324032425880432, "learning_rate": 8.007333591009358e-07, "loss": 2.653, "step": 1129 }, { "epoch": 0.961190855927698, "grad_norm": 0.3601853847503662, "learning_rate": 7.670652515782917e-07, "loss": 2.5995, "step": 1130 }, { "epoch": 0.9620414673046253, "grad_norm": 0.3701609969139099, "learning_rate": 7.341175183668503e-07, "loss": 2.6668, "step": 1131 }, { "epoch": 0.9628920786815524, "grad_norm": 0.37852516770362854, "learning_rate": 7.018903986483083e-07, "loss": 2.7223, "step": 1132 }, { "epoch": 0.9637426900584796, "grad_norm": 0.391939640045166, "learning_rate": 6.703841263730914e-07, "loss": 2.7923, "step": 1133 }, { "epoch": 0.9645933014354067, "grad_norm": 0.38707369565963745, "learning_rate": 6.395989302587113e-07, "loss": 2.6445, "step": 1134 }, { "epoch": 0.9654439128123339, "grad_norm": 0.4049709141254425, "learning_rate": 6.095350337880667e-07, "loss": 2.7038, "step": 1135 }, { "epoch": 0.966294524189261, "grad_norm": 0.41350454092025757, "learning_rate": 5.801926552078563e-07, "loss": 2.6472, "step": 1136 }, { "epoch": 0.9671451355661882, "grad_norm": 0.42945706844329834, "learning_rate": 5.515720075269348e-07, "loss": 2.7802, "step": 1137 }, { "epoch": 0.9679957469431154, "grad_norm": 0.4451653063297272, "learning_rate": 5.236732985148374e-07, "loss": 2.7633, "step": 1138 }, { "epoch": 0.9688463583200425, "grad_norm": 0.46683037281036377, "learning_rate": 4.964967307002244e-07, "loss": 2.8216, "step": 1139 }, { "epoch": 0.9696969696969697, "grad_norm": 0.4562697112560272, "learning_rate": 4.7004250136940543e-07, "loss": 2.7924, "step": 1140 }, { "epoch": 0.9705475810738968, "grad_norm": 0.48352983593940735, "learning_rate": 4.443108025649623e-07, "loss": 2.5207, "step": 1141 }, { "epoch": 0.971398192450824, "grad_norm": 0.4667017459869385, "learning_rate": 4.193018210843058e-07, "loss": 2.6552, "step": 1142 }, { "epoch": 0.9722488038277513, "grad_norm": 0.49360570311546326, "learning_rate": 3.950157384783104e-07, "loss": 2.7523, "step": 1143 }, { "epoch": 0.9730994152046784, "grad_norm": 0.526669442653656, "learning_rate": 3.714527310500371e-07, "loss": 2.7332, "step": 1144 }, { "epoch": 0.9739500265816056, "grad_norm": 0.5592300891876221, "learning_rate": 3.486129698534457e-07, "loss": 2.7264, "step": 1145 }, { "epoch": 0.9748006379585327, "grad_norm": 0.5670251250267029, "learning_rate": 3.264966206921294e-07, "loss": 2.8538, "step": 1146 }, { "epoch": 0.9756512493354599, "grad_norm": 0.5744861960411072, "learning_rate": 3.0510384411812644e-07, "loss": 2.6884, "step": 1147 }, { "epoch": 0.976501860712387, "grad_norm": 0.6117346882820129, "learning_rate": 2.844347954307325e-07, "loss": 2.8192, "step": 1148 }, { "epoch": 0.9773524720893142, "grad_norm": 0.7534440159797668, "learning_rate": 2.644896246754236e-07, "loss": 2.7148, "step": 1149 }, { "epoch": 0.9782030834662414, "grad_norm": 0.7108234167098999, "learning_rate": 2.452684766427349e-07, "loss": 2.8751, "step": 1150 }, { "epoch": 0.9790536948431685, "grad_norm": 0.264021635055542, "learning_rate": 2.2677149086718364e-07, "loss": 2.6035, "step": 1151 }, { "epoch": 0.9799043062200957, "grad_norm": 0.2746794819831848, "learning_rate": 2.0899880162630336e-07, "loss": 2.5935, "step": 1152 }, { "epoch": 0.9807549175970228, "grad_norm": 0.2742723226547241, "learning_rate": 1.9195053793964468e-07, "loss": 2.4199, "step": 1153 }, { "epoch": 0.98160552897395, "grad_norm": 0.2900630235671997, "learning_rate": 1.7562682356786487e-07, "loss": 2.5906, "step": 1154 }, { "epoch": 0.9824561403508771, "grad_norm": 0.26273003220558167, "learning_rate": 1.6002777701175086e-07, "loss": 2.4287, "step": 1155 }, { "epoch": 0.9833067517278044, "grad_norm": 0.28278976678848267, "learning_rate": 1.451535115114866e-07, "loss": 2.6875, "step": 1156 }, { "epoch": 0.9841573631047316, "grad_norm": 0.2644931674003601, "learning_rate": 1.310041350457092e-07, "loss": 2.5806, "step": 1157 }, { "epoch": 0.9850079744816587, "grad_norm": 0.26039785146713257, "learning_rate": 1.1757975033078739e-07, "loss": 2.2813, "step": 1158 }, { "epoch": 0.9858585858585859, "grad_norm": 0.27614063024520874, "learning_rate": 1.0488045482008879e-07, "loss": 2.6836, "step": 1159 }, { "epoch": 0.986709197235513, "grad_norm": 0.27048394083976746, "learning_rate": 9.29063407032249e-08, "loss": 2.5862, "step": 1160 }, { "epoch": 0.9875598086124402, "grad_norm": 0.28808289766311646, "learning_rate": 8.16574949054072e-08, "loss": 2.765, "step": 1161 }, { "epoch": 0.9884104199893674, "grad_norm": 0.323574423789978, "learning_rate": 7.113399908681429e-08, "loss": 2.6432, "step": 1162 }, { "epoch": 0.9892610313662945, "grad_norm": 0.3291761875152588, "learning_rate": 6.133592964201463e-08, "loss": 2.6037, "step": 1163 }, { "epoch": 0.9901116427432217, "grad_norm": 0.345300555229187, "learning_rate": 5.226335769936697e-08, "loss": 2.6076, "step": 1164 }, { "epoch": 0.9909622541201488, "grad_norm": 0.37476396560668945, "learning_rate": 4.391634912056519e-08, "loss": 2.8919, "step": 1165 }, { "epoch": 0.991812865497076, "grad_norm": 0.36915716528892517, "learning_rate": 3.629496450011649e-08, "loss": 2.5514, "step": 1166 }, { "epoch": 0.9926634768740031, "grad_norm": 0.3973197937011719, "learning_rate": 2.9399259164897274e-08, "loss": 2.8279, "step": 1167 }, { "epoch": 0.9935140882509303, "grad_norm": 0.42460885643959045, "learning_rate": 2.322928317378681e-08, "loss": 2.5969, "step": 1168 }, { "epoch": 0.9943646996278576, "grad_norm": 0.4290263056755066, "learning_rate": 1.778508131728973e-08, "loss": 2.7255, "step": 1169 }, { "epoch": 0.9952153110047847, "grad_norm": 0.4642498195171356, "learning_rate": 1.3066693117191886e-08, "loss": 2.7869, "step": 1170 }, { "epoch": 0.9960659223817119, "grad_norm": 0.5059999823570251, "learning_rate": 9.074152826271665e-09, "loss": 2.582, "step": 1171 }, { "epoch": 0.996916533758639, "grad_norm": 0.49888238310813904, "learning_rate": 5.807489428111268e-09, "loss": 2.8126, "step": 1172 }, { "epoch": 0.9977671451355662, "grad_norm": 0.5143745541572571, "learning_rate": 3.2667266368080484e-09, "loss": 2.7024, "step": 1173 }, { "epoch": 0.9986177565124934, "grad_norm": 0.5905907154083252, "learning_rate": 1.4518828968523857e-09, "loss": 2.7198, "step": 1174 }, { "epoch": 0.9994683678894205, "grad_norm": 0.6912564635276794, "learning_rate": 3.629713829500503e-10, "loss": 2.6901, "step": 1175 }, { "epoch": 1.0006379585326954, "grad_norm": 1.383741021156311, "learning_rate": 0.0, "loss": 4.1919, "step": 1176 }, { "epoch": 1.0006379585326954, "eval_loss": 2.6547293663024902, "eval_runtime": 80.6568, "eval_samples_per_second": 12.274, "eval_steps_per_second": 6.137, "step": 1176 } ], "logging_steps": 1, "max_steps": 1176, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 294, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.199442414922629e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }