{ "best_metric": 0.18304488062858582, "best_model_checkpoint": "/ephemeral/models/qwen-predict-describe-solve-fix_300sols/checkpoint-800", "epoch": 4.705882352941177, "eval_steps": 200, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002352941176470588, "grad_norm": 4.433023452758789, "learning_rate": 0.0, "loss": 0.6911, "step": 1 }, { "epoch": 0.004705882352941176, "grad_norm": 2.5758912563323975, "learning_rate": 2.1274605355336317e-06, "loss": 0.5919, "step": 2 }, { "epoch": 0.007058823529411765, "grad_norm": 2.9081146717071533, "learning_rate": 3.3719451705849555e-06, "loss": 0.709, "step": 3 }, { "epoch": 0.009411764705882352, "grad_norm": 2.4994797706604004, "learning_rate": 4.254921071067263e-06, "loss": 0.5489, "step": 4 }, { "epoch": 0.011764705882352941, "grad_norm": 1.9925764799118042, "learning_rate": 4.939810388219653e-06, "loss": 0.55, "step": 5 }, { "epoch": 0.01411764705882353, "grad_norm": 2.2803955078125, "learning_rate": 5.499405706118587e-06, "loss": 0.6045, "step": 6 }, { "epoch": 0.01647058823529412, "grad_norm": 1.894464373588562, "learning_rate": 5.9725368059136466e-06, "loss": 0.5911, "step": 7 }, { "epoch": 0.018823529411764704, "grad_norm": 1.5946424007415771, "learning_rate": 6.382381606600895e-06, "loss": 0.3865, "step": 8 }, { "epoch": 0.021176470588235293, "grad_norm": 1.9036686420440674, "learning_rate": 6.743890341169911e-06, "loss": 0.5159, "step": 9 }, { "epoch": 0.023529411764705882, "grad_norm": 1.7717657089233398, "learning_rate": 7.067270923753286e-06, "loss": 0.4252, "step": 10 }, { "epoch": 0.02588235294117647, "grad_norm": 1.5128735303878784, "learning_rate": 7.359804244028083e-06, "loss": 0.4031, "step": 11 }, { "epoch": 0.02823529411764706, "grad_norm": 1.0792033672332764, "learning_rate": 7.626866241652219e-06, "loss": 0.3337, "step": 12 }, { "epoch": 0.03058823529411765, "grad_norm": 1.0857174396514893, "learning_rate": 7.87253946446637e-06, "loss": 0.3449, "step": 13 }, { "epoch": 0.03294117647058824, "grad_norm": 1.2908791303634644, "learning_rate": 8.099997341447279e-06, "loss": 0.2814, "step": 14 }, { "epoch": 0.03529411764705882, "grad_norm": 1.23220694065094, "learning_rate": 8.311755558804609e-06, "loss": 0.3468, "step": 15 }, { "epoch": 0.03764705882352941, "grad_norm": 1.409195899963379, "learning_rate": 8.509842142134527e-06, "loss": 0.2442, "step": 16 }, { "epoch": 0.04, "grad_norm": 1.5413895845413208, "learning_rate": 8.695915885220267e-06, "loss": 0.4168, "step": 17 }, { "epoch": 0.042352941176470586, "grad_norm": 1.3812085390090942, "learning_rate": 8.871350876703542e-06, "loss": 0.3884, "step": 18 }, { "epoch": 0.04470588235294118, "grad_norm": 1.0942145586013794, "learning_rate": 9.037298142658738e-06, "loss": 0.3026, "step": 19 }, { "epoch": 0.047058823529411764, "grad_norm": 1.1199138164520264, "learning_rate": 9.194731459286916e-06, "loss": 0.257, "step": 20 }, { "epoch": 0.04941176470588235, "grad_norm": 1.1919746398925781, "learning_rate": 9.344481976498602e-06, "loss": 0.3158, "step": 21 }, { "epoch": 0.05176470588235294, "grad_norm": 1.079263687133789, "learning_rate": 9.487264779561716e-06, "loss": 0.289, "step": 22 }, { "epoch": 0.05411764705882353, "grad_norm": 1.22309148311615, "learning_rate": 9.623699541552615e-06, "loss": 0.3196, "step": 23 }, { "epoch": 0.05647058823529412, "grad_norm": 1.4475860595703125, "learning_rate": 9.75432677718585e-06, "loss": 0.384, "step": 24 }, { "epoch": 0.058823529411764705, "grad_norm": 1.122456669807434, "learning_rate": 9.879620776439306e-06, "loss": 0.259, "step": 25 }, { "epoch": 0.0611764705882353, "grad_norm": 1.1532243490219116, "learning_rate": 1e-05, "loss": 0.3943, "step": 26 }, { "epoch": 0.06352941176470588, "grad_norm": 1.1070886850357056, "learning_rate": 9.99999612726692e-06, "loss": 0.3171, "step": 27 }, { "epoch": 0.06588235294117648, "grad_norm": 0.9318668246269226, "learning_rate": 9.99998450907368e-06, "loss": 0.244, "step": 28 }, { "epoch": 0.06823529411764706, "grad_norm": 0.9710826277732849, "learning_rate": 9.999965145438276e-06, "loss": 0.2613, "step": 29 }, { "epoch": 0.07058823529411765, "grad_norm": 1.2915141582489014, "learning_rate": 9.99993803639071e-06, "loss": 0.3058, "step": 30 }, { "epoch": 0.07294117647058823, "grad_norm": 1.0819549560546875, "learning_rate": 9.999903181972983e-06, "loss": 0.2554, "step": 31 }, { "epoch": 0.07529411764705882, "grad_norm": 0.9847341775894165, "learning_rate": 9.999860582239085e-06, "loss": 0.2405, "step": 32 }, { "epoch": 0.07764705882352942, "grad_norm": 1.0567419528961182, "learning_rate": 9.999810237255024e-06, "loss": 0.263, "step": 33 }, { "epoch": 0.08, "grad_norm": 1.2140214443206787, "learning_rate": 9.99975214709879e-06, "loss": 0.4164, "step": 34 }, { "epoch": 0.08235294117647059, "grad_norm": 1.1025761365890503, "learning_rate": 9.999686311860382e-06, "loss": 0.2463, "step": 35 }, { "epoch": 0.08470588235294117, "grad_norm": 1.265760898590088, "learning_rate": 9.99961273164179e-06, "loss": 0.27, "step": 36 }, { "epoch": 0.08705882352941176, "grad_norm": 1.0935895442962646, "learning_rate": 9.999531406557015e-06, "loss": 0.3185, "step": 37 }, { "epoch": 0.08941176470588236, "grad_norm": 0.950699508190155, "learning_rate": 9.999442336732047e-06, "loss": 0.3034, "step": 38 }, { "epoch": 0.09176470588235294, "grad_norm": 1.1541742086410522, "learning_rate": 9.999345522304877e-06, "loss": 0.3925, "step": 39 }, { "epoch": 0.09411764705882353, "grad_norm": 1.2365785837173462, "learning_rate": 9.999240963425495e-06, "loss": 0.3156, "step": 40 }, { "epoch": 0.09647058823529411, "grad_norm": 1.029240369796753, "learning_rate": 9.999128660255886e-06, "loss": 0.3714, "step": 41 }, { "epoch": 0.0988235294117647, "grad_norm": 0.9279409646987915, "learning_rate": 9.999008612970038e-06, "loss": 0.2324, "step": 42 }, { "epoch": 0.1011764705882353, "grad_norm": 1.0728477239608765, "learning_rate": 9.998880821753935e-06, "loss": 0.2796, "step": 43 }, { "epoch": 0.10352941176470588, "grad_norm": 1.1426056623458862, "learning_rate": 9.998745286805555e-06, "loss": 0.35, "step": 44 }, { "epoch": 0.10588235294117647, "grad_norm": 0.9469041228294373, "learning_rate": 9.998602008334877e-06, "loss": 0.2706, "step": 45 }, { "epoch": 0.10823529411764705, "grad_norm": 1.042968511581421, "learning_rate": 9.998450986563874e-06, "loss": 0.2395, "step": 46 }, { "epoch": 0.11058823529411765, "grad_norm": 1.0404611825942993, "learning_rate": 9.998292221726517e-06, "loss": 0.2774, "step": 47 }, { "epoch": 0.11294117647058824, "grad_norm": 1.071613073348999, "learning_rate": 9.99812571406877e-06, "loss": 0.2747, "step": 48 }, { "epoch": 0.11529411764705882, "grad_norm": 1.1366511583328247, "learning_rate": 9.997951463848597e-06, "loss": 0.3502, "step": 49 }, { "epoch": 0.11764705882352941, "grad_norm": 0.8286213278770447, "learning_rate": 9.997769471335955e-06, "loss": 0.1695, "step": 50 }, { "epoch": 0.12, "grad_norm": 1.0261492729187012, "learning_rate": 9.997579736812795e-06, "loss": 0.3437, "step": 51 }, { "epoch": 0.1223529411764706, "grad_norm": 1.0099921226501465, "learning_rate": 9.99738226057306e-06, "loss": 0.3242, "step": 52 }, { "epoch": 0.12470588235294118, "grad_norm": 1.0330893993377686, "learning_rate": 9.997177042922695e-06, "loss": 0.224, "step": 53 }, { "epoch": 0.12705882352941175, "grad_norm": 1.0466816425323486, "learning_rate": 9.996964084179628e-06, "loss": 0.2731, "step": 54 }, { "epoch": 0.12941176470588237, "grad_norm": 1.193463683128357, "learning_rate": 9.996743384673787e-06, "loss": 0.3093, "step": 55 }, { "epoch": 0.13176470588235295, "grad_norm": 0.921506404876709, "learning_rate": 9.996514944747091e-06, "loss": 0.3007, "step": 56 }, { "epoch": 0.13411764705882354, "grad_norm": 1.0090370178222656, "learning_rate": 9.996278764753452e-06, "loss": 0.3456, "step": 57 }, { "epoch": 0.13647058823529412, "grad_norm": 1.0816580057144165, "learning_rate": 9.996034845058766e-06, "loss": 0.3714, "step": 58 }, { "epoch": 0.1388235294117647, "grad_norm": 0.9117510914802551, "learning_rate": 9.99578318604093e-06, "loss": 0.3357, "step": 59 }, { "epoch": 0.1411764705882353, "grad_norm": 1.0711637735366821, "learning_rate": 9.995523788089822e-06, "loss": 0.2744, "step": 60 }, { "epoch": 0.14352941176470588, "grad_norm": 1.0177580118179321, "learning_rate": 9.995256651607319e-06, "loss": 0.3022, "step": 61 }, { "epoch": 0.14588235294117646, "grad_norm": 1.4425690174102783, "learning_rate": 9.994981777007278e-06, "loss": 0.3159, "step": 62 }, { "epoch": 0.14823529411764705, "grad_norm": 1.0196243524551392, "learning_rate": 9.99469916471555e-06, "loss": 0.2887, "step": 63 }, { "epoch": 0.15058823529411763, "grad_norm": 1.0682207345962524, "learning_rate": 9.994408815169969e-06, "loss": 0.2842, "step": 64 }, { "epoch": 0.15294117647058825, "grad_norm": 1.0968844890594482, "learning_rate": 9.99411072882036e-06, "loss": 0.2692, "step": 65 }, { "epoch": 0.15529411764705883, "grad_norm": 1.0505070686340332, "learning_rate": 9.993804906128532e-06, "loss": 0.3175, "step": 66 }, { "epoch": 0.15764705882352942, "grad_norm": 1.092547059059143, "learning_rate": 9.993491347568283e-06, "loss": 0.325, "step": 67 }, { "epoch": 0.16, "grad_norm": 1.7406336069107056, "learning_rate": 9.993170053625389e-06, "loss": 0.3203, "step": 68 }, { "epoch": 0.1623529411764706, "grad_norm": 1.044413447380066, "learning_rate": 9.992841024797617e-06, "loss": 0.2773, "step": 69 }, { "epoch": 0.16470588235294117, "grad_norm": 1.038341760635376, "learning_rate": 9.992504261594713e-06, "loss": 0.2431, "step": 70 }, { "epoch": 0.16705882352941176, "grad_norm": 1.1833492517471313, "learning_rate": 9.992159764538408e-06, "loss": 0.2444, "step": 71 }, { "epoch": 0.16941176470588235, "grad_norm": 1.2170710563659668, "learning_rate": 9.991807534162413e-06, "loss": 0.2228, "step": 72 }, { "epoch": 0.17176470588235293, "grad_norm": 0.9302052855491638, "learning_rate": 9.991447571012417e-06, "loss": 0.2867, "step": 73 }, { "epoch": 0.17411764705882352, "grad_norm": 1.064121127128601, "learning_rate": 9.991079875646099e-06, "loss": 0.2452, "step": 74 }, { "epoch": 0.17647058823529413, "grad_norm": 1.2513593435287476, "learning_rate": 9.990704448633103e-06, "loss": 0.2745, "step": 75 }, { "epoch": 0.17882352941176471, "grad_norm": 0.9871006011962891, "learning_rate": 9.990321290555063e-06, "loss": 0.2392, "step": 76 }, { "epoch": 0.1811764705882353, "grad_norm": 0.8642148375511169, "learning_rate": 9.989930402005583e-06, "loss": 0.2317, "step": 77 }, { "epoch": 0.18352941176470589, "grad_norm": 0.9978500604629517, "learning_rate": 9.989531783590252e-06, "loss": 0.2475, "step": 78 }, { "epoch": 0.18588235294117647, "grad_norm": 0.955782413482666, "learning_rate": 9.989125435926625e-06, "loss": 0.2718, "step": 79 }, { "epoch": 0.18823529411764706, "grad_norm": 1.1206820011138916, "learning_rate": 9.988711359644233e-06, "loss": 0.3259, "step": 80 }, { "epoch": 0.19058823529411764, "grad_norm": 1.0068904161453247, "learning_rate": 9.988289555384586e-06, "loss": 0.2796, "step": 81 }, { "epoch": 0.19294117647058823, "grad_norm": 0.9677363634109497, "learning_rate": 9.987860023801165e-06, "loss": 0.2502, "step": 82 }, { "epoch": 0.1952941176470588, "grad_norm": 0.9485974311828613, "learning_rate": 9.987422765559417e-06, "loss": 0.2072, "step": 83 }, { "epoch": 0.1976470588235294, "grad_norm": 1.0714693069458008, "learning_rate": 9.986977781336767e-06, "loss": 0.2555, "step": 84 }, { "epoch": 0.2, "grad_norm": 0.9058910012245178, "learning_rate": 9.986525071822602e-06, "loss": 0.2461, "step": 85 }, { "epoch": 0.2023529411764706, "grad_norm": 1.0188825130462646, "learning_rate": 9.986064637718286e-06, "loss": 0.314, "step": 86 }, { "epoch": 0.20470588235294118, "grad_norm": 1.0274887084960938, "learning_rate": 9.985596479737142e-06, "loss": 0.3022, "step": 87 }, { "epoch": 0.20705882352941177, "grad_norm": 0.9549496173858643, "learning_rate": 9.985120598604467e-06, "loss": 0.2603, "step": 88 }, { "epoch": 0.20941176470588235, "grad_norm": 0.8760855197906494, "learning_rate": 9.984636995057512e-06, "loss": 0.2504, "step": 89 }, { "epoch": 0.21176470588235294, "grad_norm": 0.8483975529670715, "learning_rate": 9.984145669845508e-06, "loss": 0.2086, "step": 90 }, { "epoch": 0.21411764705882352, "grad_norm": 0.8707842230796814, "learning_rate": 9.98364662372963e-06, "loss": 0.2502, "step": 91 }, { "epoch": 0.2164705882352941, "grad_norm": 0.8667396306991577, "learning_rate": 9.983139857483034e-06, "loss": 0.2578, "step": 92 }, { "epoch": 0.2188235294117647, "grad_norm": 0.8952845931053162, "learning_rate": 9.98262537189082e-06, "loss": 0.2645, "step": 93 }, { "epoch": 0.2211764705882353, "grad_norm": 0.9924861788749695, "learning_rate": 9.982103167750056e-06, "loss": 0.2813, "step": 94 }, { "epoch": 0.2235294117647059, "grad_norm": 0.9458975791931152, "learning_rate": 9.981573245869765e-06, "loss": 0.3098, "step": 95 }, { "epoch": 0.22588235294117648, "grad_norm": 0.921942412853241, "learning_rate": 9.981035607070928e-06, "loss": 0.3035, "step": 96 }, { "epoch": 0.22823529411764706, "grad_norm": 0.8844327330589294, "learning_rate": 9.98049025218648e-06, "loss": 0.2214, "step": 97 }, { "epoch": 0.23058823529411765, "grad_norm": 0.9266485571861267, "learning_rate": 9.979937182061316e-06, "loss": 0.3066, "step": 98 }, { "epoch": 0.23294117647058823, "grad_norm": 1.0528078079223633, "learning_rate": 9.97937639755227e-06, "loss": 0.2184, "step": 99 }, { "epoch": 0.23529411764705882, "grad_norm": 0.8608896136283875, "learning_rate": 9.978807899528144e-06, "loss": 0.2497, "step": 100 }, { "epoch": 0.2376470588235294, "grad_norm": 0.9750809073448181, "learning_rate": 9.97823168886968e-06, "loss": 0.2942, "step": 101 }, { "epoch": 0.24, "grad_norm": 0.7832154631614685, "learning_rate": 9.977647766469571e-06, "loss": 0.1735, "step": 102 }, { "epoch": 0.24235294117647058, "grad_norm": 0.9020270705223083, "learning_rate": 9.977056133232458e-06, "loss": 0.3238, "step": 103 }, { "epoch": 0.2447058823529412, "grad_norm": 0.9636802673339844, "learning_rate": 9.976456790074926e-06, "loss": 0.335, "step": 104 }, { "epoch": 0.24705882352941178, "grad_norm": 0.7856862545013428, "learning_rate": 9.97584973792551e-06, "loss": 0.2109, "step": 105 }, { "epoch": 0.24941176470588236, "grad_norm": 0.9304346442222595, "learning_rate": 9.97523497772468e-06, "loss": 0.2492, "step": 106 }, { "epoch": 0.25176470588235295, "grad_norm": 0.9319616556167603, "learning_rate": 9.974612510424853e-06, "loss": 0.3317, "step": 107 }, { "epoch": 0.2541176470588235, "grad_norm": 1.1289972066879272, "learning_rate": 9.973982336990388e-06, "loss": 0.1714, "step": 108 }, { "epoch": 0.2564705882352941, "grad_norm": 0.9971926808357239, "learning_rate": 9.97334445839758e-06, "loss": 0.2962, "step": 109 }, { "epoch": 0.25882352941176473, "grad_norm": 1.1345818042755127, "learning_rate": 9.972698875634661e-06, "loss": 0.3363, "step": 110 }, { "epoch": 0.2611764705882353, "grad_norm": 0.9479900002479553, "learning_rate": 9.972045589701795e-06, "loss": 0.2474, "step": 111 }, { "epoch": 0.2635294117647059, "grad_norm": 0.8707336187362671, "learning_rate": 9.971384601611089e-06, "loss": 0.206, "step": 112 }, { "epoch": 0.26588235294117646, "grad_norm": 0.8235902786254883, "learning_rate": 9.970715912386574e-06, "loss": 0.2592, "step": 113 }, { "epoch": 0.26823529411764707, "grad_norm": 0.9424839615821838, "learning_rate": 9.970039523064217e-06, "loss": 0.2693, "step": 114 }, { "epoch": 0.27058823529411763, "grad_norm": 0.9064309000968933, "learning_rate": 9.969355434691913e-06, "loss": 0.2844, "step": 115 }, { "epoch": 0.27294117647058824, "grad_norm": 1.0230767726898193, "learning_rate": 9.968663648329486e-06, "loss": 0.2869, "step": 116 }, { "epoch": 0.2752941176470588, "grad_norm": 0.7037626504898071, "learning_rate": 9.96796416504868e-06, "loss": 0.1829, "step": 117 }, { "epoch": 0.2776470588235294, "grad_norm": 0.92657071352005, "learning_rate": 9.967256985933174e-06, "loss": 0.2468, "step": 118 }, { "epoch": 0.28, "grad_norm": 0.8318756818771362, "learning_rate": 9.966542112078561e-06, "loss": 0.2177, "step": 119 }, { "epoch": 0.2823529411764706, "grad_norm": 0.8794146180152893, "learning_rate": 9.965819544592358e-06, "loss": 0.2356, "step": 120 }, { "epoch": 0.2847058823529412, "grad_norm": 0.9447417259216309, "learning_rate": 9.965089284594002e-06, "loss": 0.2601, "step": 121 }, { "epoch": 0.28705882352941176, "grad_norm": 0.8184000849723816, "learning_rate": 9.964351333214845e-06, "loss": 0.2585, "step": 122 }, { "epoch": 0.28941176470588237, "grad_norm": 0.8595922589302063, "learning_rate": 9.963605691598162e-06, "loss": 0.2256, "step": 123 }, { "epoch": 0.2917647058823529, "grad_norm": 0.8226269483566284, "learning_rate": 9.96285236089913e-06, "loss": 0.2207, "step": 124 }, { "epoch": 0.29411764705882354, "grad_norm": 0.8834123015403748, "learning_rate": 9.96209134228485e-06, "loss": 0.2874, "step": 125 }, { "epoch": 0.2964705882352941, "grad_norm": 1.0046043395996094, "learning_rate": 9.961322636934327e-06, "loss": 0.2466, "step": 126 }, { "epoch": 0.2988235294117647, "grad_norm": 0.9730203151702881, "learning_rate": 9.960546246038478e-06, "loss": 0.2938, "step": 127 }, { "epoch": 0.30117647058823527, "grad_norm": 0.71089768409729, "learning_rate": 9.959762170800123e-06, "loss": 0.2252, "step": 128 }, { "epoch": 0.3035294117647059, "grad_norm": 0.9660555720329285, "learning_rate": 9.95897041243399e-06, "loss": 0.3166, "step": 129 }, { "epoch": 0.3058823529411765, "grad_norm": 0.8193936347961426, "learning_rate": 9.958170972166707e-06, "loss": 0.1798, "step": 130 }, { "epoch": 0.30823529411764705, "grad_norm": 0.9156762361526489, "learning_rate": 9.95736385123681e-06, "loss": 0.2675, "step": 131 }, { "epoch": 0.31058823529411766, "grad_norm": 1.1767754554748535, "learning_rate": 9.956549050894726e-06, "loss": 0.3224, "step": 132 }, { "epoch": 0.3129411764705882, "grad_norm": 0.9399208426475525, "learning_rate": 9.955726572402785e-06, "loss": 0.2618, "step": 133 }, { "epoch": 0.31529411764705884, "grad_norm": 0.8305631875991821, "learning_rate": 9.954896417035209e-06, "loss": 0.2003, "step": 134 }, { "epoch": 0.3176470588235294, "grad_norm": 0.8947729468345642, "learning_rate": 9.954058586078114e-06, "loss": 0.2786, "step": 135 }, { "epoch": 0.32, "grad_norm": 1.0286778211593628, "learning_rate": 9.95321308082951e-06, "loss": 0.3107, "step": 136 }, { "epoch": 0.32235294117647056, "grad_norm": 0.9012686014175415, "learning_rate": 9.952359902599294e-06, "loss": 0.3323, "step": 137 }, { "epoch": 0.3247058823529412, "grad_norm": 0.7781707048416138, "learning_rate": 9.951499052709248e-06, "loss": 0.2195, "step": 138 }, { "epoch": 0.3270588235294118, "grad_norm": 1.0555319786071777, "learning_rate": 9.950630532493047e-06, "loss": 0.3243, "step": 139 }, { "epoch": 0.32941176470588235, "grad_norm": 1.149080514907837, "learning_rate": 9.949754343296241e-06, "loss": 0.2251, "step": 140 }, { "epoch": 0.33176470588235296, "grad_norm": 0.7240123748779297, "learning_rate": 9.948870486476263e-06, "loss": 0.2624, "step": 141 }, { "epoch": 0.3341176470588235, "grad_norm": 0.8397669792175293, "learning_rate": 9.94797896340243e-06, "loss": 0.2478, "step": 142 }, { "epoch": 0.33647058823529413, "grad_norm": 0.8223580718040466, "learning_rate": 9.947079775455933e-06, "loss": 0.2478, "step": 143 }, { "epoch": 0.3388235294117647, "grad_norm": 0.8276808261871338, "learning_rate": 9.946172924029834e-06, "loss": 0.2191, "step": 144 }, { "epoch": 0.3411764705882353, "grad_norm": 0.8807602524757385, "learning_rate": 9.945258410529075e-06, "loss": 0.2173, "step": 145 }, { "epoch": 0.34352941176470586, "grad_norm": 0.9126125574111938, "learning_rate": 9.944336236370459e-06, "loss": 0.2485, "step": 146 }, { "epoch": 0.3458823529411765, "grad_norm": 1.0363433361053467, "learning_rate": 9.943406402982665e-06, "loss": 0.2908, "step": 147 }, { "epoch": 0.34823529411764703, "grad_norm": 0.8589110970497131, "learning_rate": 9.942468911806239e-06, "loss": 0.2765, "step": 148 }, { "epoch": 0.35058823529411764, "grad_norm": 0.9365296363830566, "learning_rate": 9.941523764293582e-06, "loss": 0.206, "step": 149 }, { "epoch": 0.35294117647058826, "grad_norm": 1.0245442390441895, "learning_rate": 9.940570961908966e-06, "loss": 0.3027, "step": 150 }, { "epoch": 0.3552941176470588, "grad_norm": 0.7963467836380005, "learning_rate": 9.939610506128515e-06, "loss": 0.2442, "step": 151 }, { "epoch": 0.35764705882352943, "grad_norm": 0.9175562262535095, "learning_rate": 9.938642398440217e-06, "loss": 0.3187, "step": 152 }, { "epoch": 0.36, "grad_norm": 0.9043279886245728, "learning_rate": 9.937666640343908e-06, "loss": 0.2848, "step": 153 }, { "epoch": 0.3623529411764706, "grad_norm": 0.898982584476471, "learning_rate": 9.936683233351281e-06, "loss": 0.2662, "step": 154 }, { "epoch": 0.36470588235294116, "grad_norm": 0.9480587244033813, "learning_rate": 9.935692178985878e-06, "loss": 0.2881, "step": 155 }, { "epoch": 0.36705882352941177, "grad_norm": 0.916759192943573, "learning_rate": 9.934693478783087e-06, "loss": 0.2227, "step": 156 }, { "epoch": 0.36941176470588233, "grad_norm": 0.7724462151527405, "learning_rate": 9.933687134290144e-06, "loss": 0.2822, "step": 157 }, { "epoch": 0.37176470588235294, "grad_norm": 0.9498195052146912, "learning_rate": 9.932673147066124e-06, "loss": 0.2713, "step": 158 }, { "epoch": 0.37411764705882355, "grad_norm": 1.0160996913909912, "learning_rate": 9.931651518681946e-06, "loss": 0.2755, "step": 159 }, { "epoch": 0.3764705882352941, "grad_norm": 0.860191285610199, "learning_rate": 9.930622250720365e-06, "loss": 0.2214, "step": 160 }, { "epoch": 0.3788235294117647, "grad_norm": 0.8782268166542053, "learning_rate": 9.929585344775975e-06, "loss": 0.3036, "step": 161 }, { "epoch": 0.3811764705882353, "grad_norm": 0.80301433801651, "learning_rate": 9.928540802455198e-06, "loss": 0.2329, "step": 162 }, { "epoch": 0.3835294117647059, "grad_norm": 0.8290606737136841, "learning_rate": 9.927488625376292e-06, "loss": 0.2654, "step": 163 }, { "epoch": 0.38588235294117645, "grad_norm": 0.9100065231323242, "learning_rate": 9.926428815169337e-06, "loss": 0.2506, "step": 164 }, { "epoch": 0.38823529411764707, "grad_norm": 0.7641480565071106, "learning_rate": 9.925361373476244e-06, "loss": 0.2407, "step": 165 }, { "epoch": 0.3905882352941176, "grad_norm": 0.6696881651878357, "learning_rate": 9.924286301950745e-06, "loss": 0.2017, "step": 166 }, { "epoch": 0.39294117647058824, "grad_norm": 0.8213082551956177, "learning_rate": 9.923203602258393e-06, "loss": 0.2318, "step": 167 }, { "epoch": 0.3952941176470588, "grad_norm": 0.7914837002754211, "learning_rate": 9.922113276076555e-06, "loss": 0.2587, "step": 168 }, { "epoch": 0.3976470588235294, "grad_norm": 1.0366308689117432, "learning_rate": 9.921015325094423e-06, "loss": 0.26, "step": 169 }, { "epoch": 0.4, "grad_norm": 0.7918279767036438, "learning_rate": 9.919909751012992e-06, "loss": 0.2684, "step": 170 }, { "epoch": 0.4023529411764706, "grad_norm": 0.8626274466514587, "learning_rate": 9.918796555545071e-06, "loss": 0.2717, "step": 171 }, { "epoch": 0.4047058823529412, "grad_norm": 0.7255402207374573, "learning_rate": 9.917675740415272e-06, "loss": 0.2195, "step": 172 }, { "epoch": 0.40705882352941175, "grad_norm": 0.8299323320388794, "learning_rate": 9.916547307360022e-06, "loss": 0.2432, "step": 173 }, { "epoch": 0.40941176470588236, "grad_norm": 0.8069900870323181, "learning_rate": 9.915411258127541e-06, "loss": 0.2633, "step": 174 }, { "epoch": 0.4117647058823529, "grad_norm": 0.9024124145507812, "learning_rate": 9.914267594477854e-06, "loss": 0.2189, "step": 175 }, { "epoch": 0.41411764705882353, "grad_norm": 0.9107270240783691, "learning_rate": 9.913116318182775e-06, "loss": 0.2364, "step": 176 }, { "epoch": 0.4164705882352941, "grad_norm": 0.7446509003639221, "learning_rate": 9.911957431025922e-06, "loss": 0.2172, "step": 177 }, { "epoch": 0.4188235294117647, "grad_norm": 0.7665855884552002, "learning_rate": 9.910790934802694e-06, "loss": 0.3068, "step": 178 }, { "epoch": 0.4211764705882353, "grad_norm": 0.9638316035270691, "learning_rate": 9.909616831320284e-06, "loss": 0.2972, "step": 179 }, { "epoch": 0.4235294117647059, "grad_norm": 0.760521411895752, "learning_rate": 9.908435122397671e-06, "loss": 0.2863, "step": 180 }, { "epoch": 0.4258823529411765, "grad_norm": 0.8095794320106506, "learning_rate": 9.907245809865618e-06, "loss": 0.2363, "step": 181 }, { "epoch": 0.42823529411764705, "grad_norm": 0.8258980512619019, "learning_rate": 9.906048895566659e-06, "loss": 0.2607, "step": 182 }, { "epoch": 0.43058823529411766, "grad_norm": 0.888350784778595, "learning_rate": 9.904844381355115e-06, "loss": 0.2982, "step": 183 }, { "epoch": 0.4329411764705882, "grad_norm": 0.7543793320655823, "learning_rate": 9.903632269097077e-06, "loss": 0.2589, "step": 184 }, { "epoch": 0.43529411764705883, "grad_norm": 0.7566789984703064, "learning_rate": 9.902412560670408e-06, "loss": 0.2076, "step": 185 }, { "epoch": 0.4376470588235294, "grad_norm": 0.7188458442687988, "learning_rate": 9.90118525796474e-06, "loss": 0.2334, "step": 186 }, { "epoch": 0.44, "grad_norm": 0.8086090683937073, "learning_rate": 9.899950362881468e-06, "loss": 0.2025, "step": 187 }, { "epoch": 0.4423529411764706, "grad_norm": 0.7750043869018555, "learning_rate": 9.89870787733375e-06, "loss": 0.203, "step": 188 }, { "epoch": 0.4447058823529412, "grad_norm": 0.8307673931121826, "learning_rate": 9.897457803246504e-06, "loss": 0.2297, "step": 189 }, { "epoch": 0.4470588235294118, "grad_norm": 0.765088677406311, "learning_rate": 9.896200142556411e-06, "loss": 0.239, "step": 190 }, { "epoch": 0.44941176470588234, "grad_norm": 0.7677813172340393, "learning_rate": 9.894934897211891e-06, "loss": 0.2298, "step": 191 }, { "epoch": 0.45176470588235296, "grad_norm": 0.9383118748664856, "learning_rate": 9.89366206917313e-06, "loss": 0.2623, "step": 192 }, { "epoch": 0.4541176470588235, "grad_norm": 0.7743039131164551, "learning_rate": 9.892381660412048e-06, "loss": 0.2474, "step": 193 }, { "epoch": 0.45647058823529413, "grad_norm": 5.441230773925781, "learning_rate": 9.891093672912321e-06, "loss": 0.2867, "step": 194 }, { "epoch": 0.4588235294117647, "grad_norm": 0.9450461864471436, "learning_rate": 9.889798108669361e-06, "loss": 0.2901, "step": 195 }, { "epoch": 0.4611764705882353, "grad_norm": 0.951714277267456, "learning_rate": 9.888494969690317e-06, "loss": 0.2363, "step": 196 }, { "epoch": 0.46352941176470586, "grad_norm": 0.7889963388442993, "learning_rate": 9.887184257994074e-06, "loss": 0.2236, "step": 197 }, { "epoch": 0.46588235294117647, "grad_norm": 0.8649162650108337, "learning_rate": 9.88586597561125e-06, "loss": 0.2528, "step": 198 }, { "epoch": 0.4682352941176471, "grad_norm": 0.8491036891937256, "learning_rate": 9.884540124584194e-06, "loss": 0.2089, "step": 199 }, { "epoch": 0.47058823529411764, "grad_norm": 0.9122304320335388, "learning_rate": 9.883206706966975e-06, "loss": 0.3053, "step": 200 }, { "epoch": 0.47058823529411764, "eval_loss": 0.22298569977283478, "eval_runtime": 3.3794, "eval_samples_per_second": 32.846, "eval_steps_per_second": 1.184, "step": 200 }, { "epoch": 0.47294117647058825, "grad_norm": 0.818816602230072, "learning_rate": 9.881865724825389e-06, "loss": 0.2389, "step": 201 }, { "epoch": 0.4752941176470588, "grad_norm": 0.9249524474143982, "learning_rate": 9.88051718023695e-06, "loss": 0.2278, "step": 202 }, { "epoch": 0.4776470588235294, "grad_norm": 0.7545968890190125, "learning_rate": 9.87916107529089e-06, "loss": 0.245, "step": 203 }, { "epoch": 0.48, "grad_norm": 0.8268839120864868, "learning_rate": 9.87779741208815e-06, "loss": 0.3123, "step": 204 }, { "epoch": 0.4823529411764706, "grad_norm": 0.6470619440078735, "learning_rate": 9.876426192741384e-06, "loss": 0.1806, "step": 205 }, { "epoch": 0.48470588235294115, "grad_norm": 0.8144296407699585, "learning_rate": 9.87504741937495e-06, "loss": 0.295, "step": 206 }, { "epoch": 0.48705882352941177, "grad_norm": 0.7856613993644714, "learning_rate": 9.873661094124908e-06, "loss": 0.2171, "step": 207 }, { "epoch": 0.4894117647058824, "grad_norm": 0.7514359951019287, "learning_rate": 9.872267219139026e-06, "loss": 0.2823, "step": 208 }, { "epoch": 0.49176470588235294, "grad_norm": 0.9623776078224182, "learning_rate": 9.870865796576756e-06, "loss": 0.3025, "step": 209 }, { "epoch": 0.49411764705882355, "grad_norm": 0.7472739219665527, "learning_rate": 9.869456828609253e-06, "loss": 0.2529, "step": 210 }, { "epoch": 0.4964705882352941, "grad_norm": 0.8459761738777161, "learning_rate": 9.868040317419357e-06, "loss": 0.2459, "step": 211 }, { "epoch": 0.4988235294117647, "grad_norm": 0.7439875602722168, "learning_rate": 9.866616265201594e-06, "loss": 0.2803, "step": 212 }, { "epoch": 0.5011764705882353, "grad_norm": 0.79972904920578, "learning_rate": 9.865184674162176e-06, "loss": 0.2462, "step": 213 }, { "epoch": 0.5035294117647059, "grad_norm": 0.8312981128692627, "learning_rate": 9.863745546518993e-06, "loss": 0.2517, "step": 214 }, { "epoch": 0.5058823529411764, "grad_norm": 0.806371808052063, "learning_rate": 9.862298884501609e-06, "loss": 0.2537, "step": 215 }, { "epoch": 0.508235294117647, "grad_norm": 0.7011241316795349, "learning_rate": 9.860844690351265e-06, "loss": 0.1908, "step": 216 }, { "epoch": 0.5105882352941177, "grad_norm": 0.7921483516693115, "learning_rate": 9.859382966320865e-06, "loss": 0.2809, "step": 217 }, { "epoch": 0.5129411764705882, "grad_norm": 0.6877984404563904, "learning_rate": 9.857913714674988e-06, "loss": 0.1888, "step": 218 }, { "epoch": 0.5152941176470588, "grad_norm": 0.814027726650238, "learning_rate": 9.856436937689863e-06, "loss": 0.2354, "step": 219 }, { "epoch": 0.5176470588235295, "grad_norm": 0.7442806363105774, "learning_rate": 9.854952637653387e-06, "loss": 0.2689, "step": 220 }, { "epoch": 0.52, "grad_norm": 1.0849971771240234, "learning_rate": 9.853460816865109e-06, "loss": 0.2159, "step": 221 }, { "epoch": 0.5223529411764706, "grad_norm": 0.6641291975975037, "learning_rate": 9.85196147763623e-06, "loss": 0.1522, "step": 222 }, { "epoch": 0.5247058823529411, "grad_norm": 0.7884066104888916, "learning_rate": 9.850454622289598e-06, "loss": 0.2621, "step": 223 }, { "epoch": 0.5270588235294118, "grad_norm": 0.7045320272445679, "learning_rate": 9.848940253159707e-06, "loss": 0.2056, "step": 224 }, { "epoch": 0.5294117647058824, "grad_norm": 0.9107831120491028, "learning_rate": 9.84741837259269e-06, "loss": 0.2767, "step": 225 }, { "epoch": 0.5317647058823529, "grad_norm": 0.7665836811065674, "learning_rate": 9.845888982946316e-06, "loss": 0.252, "step": 226 }, { "epoch": 0.5341176470588235, "grad_norm": 0.7923669815063477, "learning_rate": 9.84435208658999e-06, "loss": 0.2902, "step": 227 }, { "epoch": 0.5364705882352941, "grad_norm": 0.7973436713218689, "learning_rate": 9.842807685904747e-06, "loss": 0.192, "step": 228 }, { "epoch": 0.5388235294117647, "grad_norm": 0.781619668006897, "learning_rate": 9.841255783283246e-06, "loss": 0.249, "step": 229 }, { "epoch": 0.5411764705882353, "grad_norm": 0.7761977910995483, "learning_rate": 9.839696381129767e-06, "loss": 0.2572, "step": 230 }, { "epoch": 0.5435294117647059, "grad_norm": 0.7399327754974365, "learning_rate": 9.838129481860216e-06, "loss": 0.2574, "step": 231 }, { "epoch": 0.5458823529411765, "grad_norm": 0.7586814165115356, "learning_rate": 9.836555087902106e-06, "loss": 0.2657, "step": 232 }, { "epoch": 0.548235294117647, "grad_norm": 0.9079625606536865, "learning_rate": 9.834973201694563e-06, "loss": 0.2037, "step": 233 }, { "epoch": 0.5505882352941176, "grad_norm": 0.6540083885192871, "learning_rate": 9.833383825688323e-06, "loss": 0.1712, "step": 234 }, { "epoch": 0.5529411764705883, "grad_norm": 0.6759035587310791, "learning_rate": 9.831786962345724e-06, "loss": 0.2246, "step": 235 }, { "epoch": 0.5552941176470588, "grad_norm": 0.7904967069625854, "learning_rate": 9.830182614140702e-06, "loss": 0.2277, "step": 236 }, { "epoch": 0.5576470588235294, "grad_norm": 0.6802937388420105, "learning_rate": 9.828570783558789e-06, "loss": 0.1911, "step": 237 }, { "epoch": 0.56, "grad_norm": 0.8547179698944092, "learning_rate": 9.826951473097115e-06, "loss": 0.29, "step": 238 }, { "epoch": 0.5623529411764706, "grad_norm": 0.8529773950576782, "learning_rate": 9.825324685264392e-06, "loss": 0.2813, "step": 239 }, { "epoch": 0.5647058823529412, "grad_norm": 0.7918345928192139, "learning_rate": 9.823690422580919e-06, "loss": 0.2525, "step": 240 }, { "epoch": 0.5670588235294117, "grad_norm": 0.7326545715332031, "learning_rate": 9.82204868757857e-06, "loss": 0.2311, "step": 241 }, { "epoch": 0.5694117647058824, "grad_norm": 0.8501949906349182, "learning_rate": 9.820399482800805e-06, "loss": 0.3425, "step": 242 }, { "epoch": 0.571764705882353, "grad_norm": 1.027207374572754, "learning_rate": 9.818742810802647e-06, "loss": 0.2463, "step": 243 }, { "epoch": 0.5741176470588235, "grad_norm": 0.9128986597061157, "learning_rate": 9.817078674150695e-06, "loss": 0.2508, "step": 244 }, { "epoch": 0.5764705882352941, "grad_norm": 0.7841103672981262, "learning_rate": 9.815407075423112e-06, "loss": 0.1942, "step": 245 }, { "epoch": 0.5788235294117647, "grad_norm": 0.882373034954071, "learning_rate": 9.813728017209614e-06, "loss": 0.2621, "step": 246 }, { "epoch": 0.5811764705882353, "grad_norm": 0.8034451007843018, "learning_rate": 9.81204150211148e-06, "loss": 0.2406, "step": 247 }, { "epoch": 0.5835294117647059, "grad_norm": 0.7462646961212158, "learning_rate": 9.810347532741545e-06, "loss": 0.1892, "step": 248 }, { "epoch": 0.5858823529411765, "grad_norm": 0.8153924345970154, "learning_rate": 9.808646111724183e-06, "loss": 0.2438, "step": 249 }, { "epoch": 0.5882352941176471, "grad_norm": 0.8731136918067932, "learning_rate": 9.806937241695319e-06, "loss": 0.2548, "step": 250 }, { "epoch": 0.5905882352941176, "grad_norm": 0.7715024352073669, "learning_rate": 9.805220925302417e-06, "loss": 0.2664, "step": 251 }, { "epoch": 0.5929411764705882, "grad_norm": 0.5962529182434082, "learning_rate": 9.803497165204475e-06, "loss": 0.1454, "step": 252 }, { "epoch": 0.5952941176470589, "grad_norm": 0.9136282801628113, "learning_rate": 9.801765964072029e-06, "loss": 0.2846, "step": 253 }, { "epoch": 0.5976470588235294, "grad_norm": 0.8051571846008301, "learning_rate": 9.800027324587135e-06, "loss": 0.2379, "step": 254 }, { "epoch": 0.6, "grad_norm": 0.7990124225616455, "learning_rate": 9.79828124944338e-06, "loss": 0.2392, "step": 255 }, { "epoch": 0.6023529411764705, "grad_norm": 0.7473837733268738, "learning_rate": 9.796527741345869e-06, "loss": 0.2032, "step": 256 }, { "epoch": 0.6047058823529412, "grad_norm": 0.7735739946365356, "learning_rate": 9.794766803011215e-06, "loss": 0.2614, "step": 257 }, { "epoch": 0.6070588235294118, "grad_norm": 0.8545805811882019, "learning_rate": 9.792998437167555e-06, "loss": 0.2328, "step": 258 }, { "epoch": 0.6094117647058823, "grad_norm": 0.6997544169425964, "learning_rate": 9.791222646554522e-06, "loss": 0.2165, "step": 259 }, { "epoch": 0.611764705882353, "grad_norm": 0.7659356594085693, "learning_rate": 9.78943943392326e-06, "loss": 0.1782, "step": 260 }, { "epoch": 0.6141176470588235, "grad_norm": 0.9017452001571655, "learning_rate": 9.787648802036405e-06, "loss": 0.2255, "step": 261 }, { "epoch": 0.6164705882352941, "grad_norm": 0.6030929684638977, "learning_rate": 9.78585075366809e-06, "loss": 0.2201, "step": 262 }, { "epoch": 0.6188235294117647, "grad_norm": 0.7649222612380981, "learning_rate": 9.784045291603943e-06, "loss": 0.2289, "step": 263 }, { "epoch": 0.6211764705882353, "grad_norm": 0.7563538551330566, "learning_rate": 9.782232418641067e-06, "loss": 0.2673, "step": 264 }, { "epoch": 0.6235294117647059, "grad_norm": 0.8194268941879272, "learning_rate": 9.780412137588054e-06, "loss": 0.229, "step": 265 }, { "epoch": 0.6258823529411764, "grad_norm": 0.8760014176368713, "learning_rate": 9.778584451264973e-06, "loss": 0.2767, "step": 266 }, { "epoch": 0.6282352941176471, "grad_norm": 0.6847783923149109, "learning_rate": 9.77674936250336e-06, "loss": 0.1855, "step": 267 }, { "epoch": 0.6305882352941177, "grad_norm": 0.8323301076889038, "learning_rate": 9.774906874146226e-06, "loss": 0.2754, "step": 268 }, { "epoch": 0.6329411764705882, "grad_norm": 0.8181362152099609, "learning_rate": 9.77305698904804e-06, "loss": 0.2666, "step": 269 }, { "epoch": 0.6352941176470588, "grad_norm": 0.937937319278717, "learning_rate": 9.771199710074738e-06, "loss": 0.2792, "step": 270 }, { "epoch": 0.6376470588235295, "grad_norm": 0.6474738717079163, "learning_rate": 9.7693350401037e-06, "loss": 0.2537, "step": 271 }, { "epoch": 0.64, "grad_norm": 0.6516919732093811, "learning_rate": 9.767462982023765e-06, "loss": 0.2083, "step": 272 }, { "epoch": 0.6423529411764706, "grad_norm": 0.7918919920921326, "learning_rate": 9.765583538735216e-06, "loss": 0.2124, "step": 273 }, { "epoch": 0.6447058823529411, "grad_norm": 3.9153850078582764, "learning_rate": 9.763696713149778e-06, "loss": 0.2526, "step": 274 }, { "epoch": 0.6470588235294118, "grad_norm": 0.8294705152511597, "learning_rate": 9.76180250819061e-06, "loss": 0.2627, "step": 275 }, { "epoch": 0.6494117647058824, "grad_norm": 0.652409017086029, "learning_rate": 9.759900926792307e-06, "loss": 0.1823, "step": 276 }, { "epoch": 0.6517647058823529, "grad_norm": 0.7426266074180603, "learning_rate": 9.757991971900888e-06, "loss": 0.2855, "step": 277 }, { "epoch": 0.6541176470588236, "grad_norm": 0.8848517537117004, "learning_rate": 9.756075646473802e-06, "loss": 0.2784, "step": 278 }, { "epoch": 0.6564705882352941, "grad_norm": 0.8466393351554871, "learning_rate": 9.75415195347991e-06, "loss": 0.2336, "step": 279 }, { "epoch": 0.6588235294117647, "grad_norm": 0.7509840726852417, "learning_rate": 9.752220895899489e-06, "loss": 0.2431, "step": 280 }, { "epoch": 0.6611764705882353, "grad_norm": 0.879849374294281, "learning_rate": 9.750282476724227e-06, "loss": 0.1913, "step": 281 }, { "epoch": 0.6635294117647059, "grad_norm": 0.6734027862548828, "learning_rate": 9.748336698957219e-06, "loss": 0.204, "step": 282 }, { "epoch": 0.6658823529411765, "grad_norm": 0.8402525782585144, "learning_rate": 9.746383565612953e-06, "loss": 0.1832, "step": 283 }, { "epoch": 0.668235294117647, "grad_norm": 0.8339890837669373, "learning_rate": 9.74442307971732e-06, "loss": 0.2683, "step": 284 }, { "epoch": 0.6705882352941176, "grad_norm": 0.7547810077667236, "learning_rate": 9.742455244307601e-06, "loss": 0.2905, "step": 285 }, { "epoch": 0.6729411764705883, "grad_norm": 0.8148105144500732, "learning_rate": 9.740480062432459e-06, "loss": 0.3188, "step": 286 }, { "epoch": 0.6752941176470588, "grad_norm": 0.7696908116340637, "learning_rate": 9.73849753715194e-06, "loss": 0.2482, "step": 287 }, { "epoch": 0.6776470588235294, "grad_norm": 0.7691321969032288, "learning_rate": 9.736507671537471e-06, "loss": 0.2521, "step": 288 }, { "epoch": 0.68, "grad_norm": 0.7690189480781555, "learning_rate": 9.734510468671841e-06, "loss": 0.2093, "step": 289 }, { "epoch": 0.6823529411764706, "grad_norm": 0.727536141872406, "learning_rate": 9.73250593164922e-06, "loss": 0.2509, "step": 290 }, { "epoch": 0.6847058823529412, "grad_norm": 1.3121906518936157, "learning_rate": 9.730494063575133e-06, "loss": 0.2904, "step": 291 }, { "epoch": 0.6870588235294117, "grad_norm": 0.7786142230033875, "learning_rate": 9.728474867566458e-06, "loss": 0.2284, "step": 292 }, { "epoch": 0.6894117647058824, "grad_norm": 0.6716507077217102, "learning_rate": 9.726448346751433e-06, "loss": 0.2147, "step": 293 }, { "epoch": 0.691764705882353, "grad_norm": 0.7568901777267456, "learning_rate": 9.724414504269643e-06, "loss": 0.2137, "step": 294 }, { "epoch": 0.6941176470588235, "grad_norm": 0.8764899969100952, "learning_rate": 9.722373343272011e-06, "loss": 0.3572, "step": 295 }, { "epoch": 0.6964705882352941, "grad_norm": 0.7698131799697876, "learning_rate": 9.720324866920808e-06, "loss": 0.2361, "step": 296 }, { "epoch": 0.6988235294117647, "grad_norm": 0.7609451413154602, "learning_rate": 9.718269078389626e-06, "loss": 0.247, "step": 297 }, { "epoch": 0.7011764705882353, "grad_norm": 0.7543407082557678, "learning_rate": 9.716205980863396e-06, "loss": 0.2266, "step": 298 }, { "epoch": 0.7035294117647058, "grad_norm": 1.0032401084899902, "learning_rate": 9.714135577538364e-06, "loss": 0.2443, "step": 299 }, { "epoch": 0.7058823529411765, "grad_norm": 0.6936239004135132, "learning_rate": 9.712057871622104e-06, "loss": 0.2491, "step": 300 }, { "epoch": 0.7082352941176471, "grad_norm": 0.836872398853302, "learning_rate": 9.709972866333493e-06, "loss": 0.2358, "step": 301 }, { "epoch": 0.7105882352941176, "grad_norm": 0.787061870098114, "learning_rate": 9.707880564902723e-06, "loss": 0.1977, "step": 302 }, { "epoch": 0.7129411764705882, "grad_norm": 0.6852730512619019, "learning_rate": 9.705780970571288e-06, "loss": 0.2455, "step": 303 }, { "epoch": 0.7152941176470589, "grad_norm": 0.7924845218658447, "learning_rate": 9.703674086591984e-06, "loss": 0.2497, "step": 304 }, { "epoch": 0.7176470588235294, "grad_norm": 0.7378849387168884, "learning_rate": 9.701559916228891e-06, "loss": 0.2446, "step": 305 }, { "epoch": 0.72, "grad_norm": 0.7610181570053101, "learning_rate": 9.69943846275739e-06, "loss": 0.2598, "step": 306 }, { "epoch": 0.7223529411764706, "grad_norm": 0.6673954725265503, "learning_rate": 9.697309729464135e-06, "loss": 0.1592, "step": 307 }, { "epoch": 0.7247058823529412, "grad_norm": 0.67850661277771, "learning_rate": 9.695173719647064e-06, "loss": 0.184, "step": 308 }, { "epoch": 0.7270588235294118, "grad_norm": 0.8092487454414368, "learning_rate": 9.693030436615383e-06, "loss": 0.2535, "step": 309 }, { "epoch": 0.7294117647058823, "grad_norm": 0.851713240146637, "learning_rate": 9.690879883689572e-06, "loss": 0.2187, "step": 310 }, { "epoch": 0.731764705882353, "grad_norm": 0.728959321975708, "learning_rate": 9.688722064201372e-06, "loss": 0.2897, "step": 311 }, { "epoch": 0.7341176470588235, "grad_norm": 0.728104293346405, "learning_rate": 9.686556981493779e-06, "loss": 0.1961, "step": 312 }, { "epoch": 0.7364705882352941, "grad_norm": 0.7766842842102051, "learning_rate": 9.684384638921043e-06, "loss": 0.2459, "step": 313 }, { "epoch": 0.7388235294117647, "grad_norm": 0.8288973569869995, "learning_rate": 9.682205039848664e-06, "loss": 0.3098, "step": 314 }, { "epoch": 0.7411764705882353, "grad_norm": 0.7754189372062683, "learning_rate": 9.680018187653378e-06, "loss": 0.2391, "step": 315 }, { "epoch": 0.7435294117647059, "grad_norm": 0.6731651425361633, "learning_rate": 9.677824085723168e-06, "loss": 0.214, "step": 316 }, { "epoch": 0.7458823529411764, "grad_norm": 0.9066212773323059, "learning_rate": 9.675622737457234e-06, "loss": 0.2631, "step": 317 }, { "epoch": 0.7482352941176471, "grad_norm": 0.6729580163955688, "learning_rate": 9.67341414626602e-06, "loss": 0.243, "step": 318 }, { "epoch": 0.7505882352941177, "grad_norm": 0.7352074980735779, "learning_rate": 9.671198315571174e-06, "loss": 0.2464, "step": 319 }, { "epoch": 0.7529411764705882, "grad_norm": 0.8593945503234863, "learning_rate": 9.668975248805572e-06, "loss": 0.2161, "step": 320 }, { "epoch": 0.7552941176470588, "grad_norm": 0.7230058908462524, "learning_rate": 9.666744949413295e-06, "loss": 0.2236, "step": 321 }, { "epoch": 0.7576470588235295, "grad_norm": 0.7768293023109436, "learning_rate": 9.664507420849629e-06, "loss": 0.3199, "step": 322 }, { "epoch": 0.76, "grad_norm": 0.667719841003418, "learning_rate": 9.662262666581063e-06, "loss": 0.21, "step": 323 }, { "epoch": 0.7623529411764706, "grad_norm": 0.8577963709831238, "learning_rate": 9.660010690085279e-06, "loss": 0.1749, "step": 324 }, { "epoch": 0.7647058823529411, "grad_norm": 0.7182241678237915, "learning_rate": 9.657751494851144e-06, "loss": 0.2272, "step": 325 }, { "epoch": 0.7670588235294118, "grad_norm": 0.7592055797576904, "learning_rate": 9.655485084378713e-06, "loss": 0.2545, "step": 326 }, { "epoch": 0.7694117647058824, "grad_norm": 0.7897837162017822, "learning_rate": 9.653211462179222e-06, "loss": 0.3142, "step": 327 }, { "epoch": 0.7717647058823529, "grad_norm": 0.784938633441925, "learning_rate": 9.650930631775072e-06, "loss": 0.2458, "step": 328 }, { "epoch": 0.7741176470588236, "grad_norm": 0.671668291091919, "learning_rate": 9.648642596699837e-06, "loss": 0.2059, "step": 329 }, { "epoch": 0.7764705882352941, "grad_norm": 0.6523120999336243, "learning_rate": 9.64634736049825e-06, "loss": 0.2161, "step": 330 }, { "epoch": 0.7788235294117647, "grad_norm": 0.8454598188400269, "learning_rate": 9.644044926726203e-06, "loss": 0.2529, "step": 331 }, { "epoch": 0.7811764705882352, "grad_norm": 0.7416345477104187, "learning_rate": 9.641735298950735e-06, "loss": 0.2346, "step": 332 }, { "epoch": 0.7835294117647059, "grad_norm": 0.6987890601158142, "learning_rate": 9.639418480750036e-06, "loss": 0.1847, "step": 333 }, { "epoch": 0.7858823529411765, "grad_norm": 0.7716751098632812, "learning_rate": 9.637094475713429e-06, "loss": 0.2571, "step": 334 }, { "epoch": 0.788235294117647, "grad_norm": 0.798072338104248, "learning_rate": 9.634763287441376e-06, "loss": 0.249, "step": 335 }, { "epoch": 0.7905882352941176, "grad_norm": 0.6939980387687683, "learning_rate": 9.632424919545466e-06, "loss": 0.2154, "step": 336 }, { "epoch": 0.7929411764705883, "grad_norm": 0.7483590245246887, "learning_rate": 9.630079375648411e-06, "loss": 0.2132, "step": 337 }, { "epoch": 0.7952941176470588, "grad_norm": 0.6839220523834229, "learning_rate": 9.627726659384042e-06, "loss": 0.2416, "step": 338 }, { "epoch": 0.7976470588235294, "grad_norm": 0.7525051236152649, "learning_rate": 9.625366774397298e-06, "loss": 0.2471, "step": 339 }, { "epoch": 0.8, "grad_norm": 0.7243475317955017, "learning_rate": 9.622999724344226e-06, "loss": 0.2454, "step": 340 }, { "epoch": 0.8023529411764706, "grad_norm": 0.7817341089248657, "learning_rate": 9.620625512891977e-06, "loss": 0.2002, "step": 341 }, { "epoch": 0.8047058823529412, "grad_norm": 0.9650923013687134, "learning_rate": 9.618244143718792e-06, "loss": 0.1861, "step": 342 }, { "epoch": 0.8070588235294117, "grad_norm": 0.8425676226615906, "learning_rate": 9.615855620514003e-06, "loss": 0.2427, "step": 343 }, { "epoch": 0.8094117647058824, "grad_norm": 0.7390637397766113, "learning_rate": 9.613459946978027e-06, "loss": 0.261, "step": 344 }, { "epoch": 0.8117647058823529, "grad_norm": 0.6607260704040527, "learning_rate": 9.611057126822353e-06, "loss": 0.1775, "step": 345 }, { "epoch": 0.8141176470588235, "grad_norm": 0.6955421566963196, "learning_rate": 9.608647163769546e-06, "loss": 0.2075, "step": 346 }, { "epoch": 0.8164705882352942, "grad_norm": 0.9047543406486511, "learning_rate": 9.60623006155324e-06, "loss": 0.2572, "step": 347 }, { "epoch": 0.8188235294117647, "grad_norm": 0.747662365436554, "learning_rate": 9.603805823918126e-06, "loss": 0.2461, "step": 348 }, { "epoch": 0.8211764705882353, "grad_norm": 0.739825963973999, "learning_rate": 9.601374454619947e-06, "loss": 0.2604, "step": 349 }, { "epoch": 0.8235294117647058, "grad_norm": 1.0497877597808838, "learning_rate": 9.598935957425497e-06, "loss": 0.1423, "step": 350 }, { "epoch": 0.8258823529411765, "grad_norm": 0.7954054474830627, "learning_rate": 9.596490336112619e-06, "loss": 0.2224, "step": 351 }, { "epoch": 0.8282352941176471, "grad_norm": 0.6153281927108765, "learning_rate": 9.594037594470178e-06, "loss": 0.1853, "step": 352 }, { "epoch": 0.8305882352941176, "grad_norm": 0.6814773082733154, "learning_rate": 9.59157773629809e-06, "loss": 0.2122, "step": 353 }, { "epoch": 0.8329411764705882, "grad_norm": 0.8119487166404724, "learning_rate": 9.589110765407277e-06, "loss": 0.1803, "step": 354 }, { "epoch": 0.8352941176470589, "grad_norm": 0.8650240898132324, "learning_rate": 9.586636685619695e-06, "loss": 0.2922, "step": 355 }, { "epoch": 0.8376470588235294, "grad_norm": 0.6987769603729248, "learning_rate": 9.584155500768305e-06, "loss": 0.2112, "step": 356 }, { "epoch": 0.84, "grad_norm": 0.784200131893158, "learning_rate": 9.581667214697077e-06, "loss": 0.2597, "step": 357 }, { "epoch": 0.8423529411764706, "grad_norm": 0.6876600980758667, "learning_rate": 9.579171831260984e-06, "loss": 0.1876, "step": 358 }, { "epoch": 0.8447058823529412, "grad_norm": 0.7068877220153809, "learning_rate": 9.576669354325997e-06, "loss": 0.2135, "step": 359 }, { "epoch": 0.8470588235294118, "grad_norm": 0.6503164768218994, "learning_rate": 9.574159787769071e-06, "loss": 0.1698, "step": 360 }, { "epoch": 0.8494117647058823, "grad_norm": 0.8055402636528015, "learning_rate": 9.571643135478149e-06, "loss": 0.2457, "step": 361 }, { "epoch": 0.851764705882353, "grad_norm": 0.7570701241493225, "learning_rate": 9.569119401352147e-06, "loss": 0.187, "step": 362 }, { "epoch": 0.8541176470588235, "grad_norm": 0.6111272573471069, "learning_rate": 9.566588589300958e-06, "loss": 0.192, "step": 363 }, { "epoch": 0.8564705882352941, "grad_norm": 0.6852231025695801, "learning_rate": 9.564050703245439e-06, "loss": 0.2224, "step": 364 }, { "epoch": 0.8588235294117647, "grad_norm": 0.6830403208732605, "learning_rate": 9.561505747117403e-06, "loss": 0.2307, "step": 365 }, { "epoch": 0.8611764705882353, "grad_norm": 0.7601058483123779, "learning_rate": 9.558953724859618e-06, "loss": 0.166, "step": 366 }, { "epoch": 0.8635294117647059, "grad_norm": 0.7139108777046204, "learning_rate": 9.556394640425803e-06, "loss": 0.2427, "step": 367 }, { "epoch": 0.8658823529411764, "grad_norm": 0.6391799449920654, "learning_rate": 9.553828497780612e-06, "loss": 0.2005, "step": 368 }, { "epoch": 0.8682352941176471, "grad_norm": 0.7252770662307739, "learning_rate": 9.551255300899638e-06, "loss": 0.1684, "step": 369 }, { "epoch": 0.8705882352941177, "grad_norm": 0.7033180594444275, "learning_rate": 9.548675053769401e-06, "loss": 0.2793, "step": 370 }, { "epoch": 0.8729411764705882, "grad_norm": 0.7709606289863586, "learning_rate": 9.546087760387344e-06, "loss": 0.1516, "step": 371 }, { "epoch": 0.8752941176470588, "grad_norm": 0.8126618266105652, "learning_rate": 9.543493424761829e-06, "loss": 0.2442, "step": 372 }, { "epoch": 0.8776470588235294, "grad_norm": 0.7467337250709534, "learning_rate": 9.54089205091212e-06, "loss": 0.2533, "step": 373 }, { "epoch": 0.88, "grad_norm": 0.7001603245735168, "learning_rate": 9.538283642868396e-06, "loss": 0.1816, "step": 374 }, { "epoch": 0.8823529411764706, "grad_norm": 0.7251049876213074, "learning_rate": 9.535668204671727e-06, "loss": 0.1669, "step": 375 }, { "epoch": 0.8847058823529412, "grad_norm": 0.787778377532959, "learning_rate": 9.533045740374076e-06, "loss": 0.2803, "step": 376 }, { "epoch": 0.8870588235294118, "grad_norm": 0.8853365778923035, "learning_rate": 9.530416254038288e-06, "loss": 0.1656, "step": 377 }, { "epoch": 0.8894117647058823, "grad_norm": 1.031820297241211, "learning_rate": 9.527779749738092e-06, "loss": 0.2159, "step": 378 }, { "epoch": 0.8917647058823529, "grad_norm": 0.8649064302444458, "learning_rate": 9.52513623155809e-06, "loss": 0.2191, "step": 379 }, { "epoch": 0.8941176470588236, "grad_norm": 0.8236756920814514, "learning_rate": 9.522485703593745e-06, "loss": 0.2252, "step": 380 }, { "epoch": 0.8964705882352941, "grad_norm": 0.6848606467247009, "learning_rate": 9.519828169951382e-06, "loss": 0.1968, "step": 381 }, { "epoch": 0.8988235294117647, "grad_norm": 0.6837734580039978, "learning_rate": 9.51716363474818e-06, "loss": 0.2303, "step": 382 }, { "epoch": 0.9011764705882352, "grad_norm": 0.6842684745788574, "learning_rate": 9.514492102112168e-06, "loss": 0.2564, "step": 383 }, { "epoch": 0.9035294117647059, "grad_norm": 0.7511987686157227, "learning_rate": 9.511813576182209e-06, "loss": 0.2642, "step": 384 }, { "epoch": 0.9058823529411765, "grad_norm": 0.7278965711593628, "learning_rate": 9.509128061108008e-06, "loss": 0.1155, "step": 385 }, { "epoch": 0.908235294117647, "grad_norm": 0.5951911211013794, "learning_rate": 9.506435561050094e-06, "loss": 0.1906, "step": 386 }, { "epoch": 0.9105882352941177, "grad_norm": 0.6874554753303528, "learning_rate": 9.503736080179814e-06, "loss": 0.1581, "step": 387 }, { "epoch": 0.9129411764705883, "grad_norm": 0.6258144974708557, "learning_rate": 9.501029622679338e-06, "loss": 0.1412, "step": 388 }, { "epoch": 0.9152941176470588, "grad_norm": 0.7033083438873291, "learning_rate": 9.498316192741636e-06, "loss": 0.1788, "step": 389 }, { "epoch": 0.9176470588235294, "grad_norm": 0.8203756213188171, "learning_rate": 9.495595794570492e-06, "loss": 0.3023, "step": 390 }, { "epoch": 0.92, "grad_norm": 0.7318507432937622, "learning_rate": 9.49286843238047e-06, "loss": 0.2127, "step": 391 }, { "epoch": 0.9223529411764706, "grad_norm": 0.6982404589653015, "learning_rate": 9.490134110396932e-06, "loss": 0.2962, "step": 392 }, { "epoch": 0.9247058823529412, "grad_norm": 0.7355530858039856, "learning_rate": 9.487392832856024e-06, "loss": 0.2378, "step": 393 }, { "epoch": 0.9270588235294117, "grad_norm": 0.6678468585014343, "learning_rate": 9.484644604004667e-06, "loss": 0.2133, "step": 394 }, { "epoch": 0.9294117647058824, "grad_norm": 0.7420473694801331, "learning_rate": 9.481889428100544e-06, "loss": 0.2995, "step": 395 }, { "epoch": 0.9317647058823529, "grad_norm": 0.6463662385940552, "learning_rate": 9.479127309412109e-06, "loss": 0.2276, "step": 396 }, { "epoch": 0.9341176470588235, "grad_norm": 0.6382785439491272, "learning_rate": 9.476358252218571e-06, "loss": 0.1779, "step": 397 }, { "epoch": 0.9364705882352942, "grad_norm": 0.6710162162780762, "learning_rate": 9.473582260809882e-06, "loss": 0.2241, "step": 398 }, { "epoch": 0.9388235294117647, "grad_norm": 0.8188492059707642, "learning_rate": 9.470799339486744e-06, "loss": 0.2687, "step": 399 }, { "epoch": 0.9411764705882353, "grad_norm": 0.7185152173042297, "learning_rate": 9.468009492560595e-06, "loss": 0.2532, "step": 400 }, { "epoch": 0.9411764705882353, "eval_loss": 0.19940543174743652, "eval_runtime": 3.339, "eval_samples_per_second": 33.243, "eval_steps_per_second": 1.198, "step": 400 }, { "epoch": 0.9435294117647058, "grad_norm": 0.7695783972740173, "learning_rate": 9.465212724353601e-06, "loss": 0.2462, "step": 401 }, { "epoch": 0.9458823529411765, "grad_norm": 0.7926376461982727, "learning_rate": 9.462409039198644e-06, "loss": 0.2041, "step": 402 }, { "epoch": 0.9482352941176471, "grad_norm": 0.6705500483512878, "learning_rate": 9.459598441439331e-06, "loss": 0.1797, "step": 403 }, { "epoch": 0.9505882352941176, "grad_norm": 0.7240809202194214, "learning_rate": 9.456780935429977e-06, "loss": 0.2104, "step": 404 }, { "epoch": 0.9529411764705882, "grad_norm": 0.5774074196815491, "learning_rate": 9.453956525535598e-06, "loss": 0.209, "step": 405 }, { "epoch": 0.9552941176470588, "grad_norm": 0.5997632145881653, "learning_rate": 9.451125216131902e-06, "loss": 0.1912, "step": 406 }, { "epoch": 0.9576470588235294, "grad_norm": 0.6875910758972168, "learning_rate": 9.448287011605293e-06, "loss": 0.2119, "step": 407 }, { "epoch": 0.96, "grad_norm": 0.7592631578445435, "learning_rate": 9.445441916352855e-06, "loss": 0.2358, "step": 408 }, { "epoch": 0.9623529411764706, "grad_norm": 0.735801100730896, "learning_rate": 9.442589934782345e-06, "loss": 0.2229, "step": 409 }, { "epoch": 0.9647058823529412, "grad_norm": 0.6529631018638611, "learning_rate": 9.43973107131219e-06, "loss": 0.2171, "step": 410 }, { "epoch": 0.9670588235294117, "grad_norm": 0.7723212838172913, "learning_rate": 9.436865330371478e-06, "loss": 0.2775, "step": 411 }, { "epoch": 0.9694117647058823, "grad_norm": 0.7457647919654846, "learning_rate": 9.433992716399956e-06, "loss": 0.2064, "step": 412 }, { "epoch": 0.971764705882353, "grad_norm": 0.6216557621955872, "learning_rate": 9.431113233848015e-06, "loss": 0.2026, "step": 413 }, { "epoch": 0.9741176470588235, "grad_norm": 0.7274438738822937, "learning_rate": 9.428226887176686e-06, "loss": 0.2252, "step": 414 }, { "epoch": 0.9764705882352941, "grad_norm": 0.7637820243835449, "learning_rate": 9.425333680857637e-06, "loss": 0.2561, "step": 415 }, { "epoch": 0.9788235294117648, "grad_norm": 0.8315963745117188, "learning_rate": 9.422433619373163e-06, "loss": 0.1837, "step": 416 }, { "epoch": 0.9811764705882353, "grad_norm": 0.6814245581626892, "learning_rate": 9.41952670721618e-06, "loss": 0.2756, "step": 417 }, { "epoch": 0.9835294117647059, "grad_norm": 0.6335944533348083, "learning_rate": 9.416612948890214e-06, "loss": 0.2039, "step": 418 }, { "epoch": 0.9858823529411764, "grad_norm": 0.6975286602973938, "learning_rate": 9.413692348909402e-06, "loss": 0.2293, "step": 419 }, { "epoch": 0.9882352941176471, "grad_norm": 0.8188583850860596, "learning_rate": 9.410764911798477e-06, "loss": 0.2587, "step": 420 }, { "epoch": 0.9905882352941177, "grad_norm": 0.746294379234314, "learning_rate": 9.407830642092765e-06, "loss": 0.2708, "step": 421 }, { "epoch": 0.9929411764705882, "grad_norm": 0.7856789231300354, "learning_rate": 9.404889544338178e-06, "loss": 0.262, "step": 422 }, { "epoch": 0.9952941176470588, "grad_norm": 0.5907968878746033, "learning_rate": 9.401941623091208e-06, "loss": 0.2462, "step": 423 }, { "epoch": 0.9976470588235294, "grad_norm": 0.6972366571426392, "learning_rate": 9.398986882918915e-06, "loss": 0.1502, "step": 424 }, { "epoch": 1.0, "grad_norm": 0.631081223487854, "learning_rate": 9.396025328398925e-06, "loss": 0.2018, "step": 425 }, { "epoch": 1.0023529411764707, "grad_norm": 0.5795333385467529, "learning_rate": 9.39305696411942e-06, "loss": 0.1343, "step": 426 }, { "epoch": 1.0047058823529411, "grad_norm": 0.6401455998420715, "learning_rate": 9.390081794679135e-06, "loss": 0.1521, "step": 427 }, { "epoch": 1.0070588235294118, "grad_norm": 0.5998563170433044, "learning_rate": 9.387099824687345e-06, "loss": 0.1766, "step": 428 }, { "epoch": 1.0094117647058825, "grad_norm": 0.6439195275306702, "learning_rate": 9.384111058763862e-06, "loss": 0.1646, "step": 429 }, { "epoch": 1.011764705882353, "grad_norm": 0.5657816529273987, "learning_rate": 9.381115501539025e-06, "loss": 0.1431, "step": 430 }, { "epoch": 1.0141176470588236, "grad_norm": 0.9303725957870483, "learning_rate": 9.378113157653694e-06, "loss": 0.2036, "step": 431 }, { "epoch": 1.016470588235294, "grad_norm": 0.742510974407196, "learning_rate": 9.375104031759248e-06, "loss": 0.201, "step": 432 }, { "epoch": 1.0188235294117647, "grad_norm": 0.6414480209350586, "learning_rate": 9.372088128517568e-06, "loss": 0.1671, "step": 433 }, { "epoch": 1.0211764705882354, "grad_norm": 0.6017137765884399, "learning_rate": 9.369065452601038e-06, "loss": 0.1148, "step": 434 }, { "epoch": 1.0235294117647058, "grad_norm": 0.6361629366874695, "learning_rate": 9.366036008692528e-06, "loss": 0.1334, "step": 435 }, { "epoch": 1.0258823529411765, "grad_norm": 0.7263821363449097, "learning_rate": 9.362999801485405e-06, "loss": 0.1921, "step": 436 }, { "epoch": 1.0282352941176471, "grad_norm": 0.6439031362533569, "learning_rate": 9.359956835683505e-06, "loss": 0.1406, "step": 437 }, { "epoch": 1.0305882352941176, "grad_norm": 0.6672723889350891, "learning_rate": 9.356907116001138e-06, "loss": 0.1582, "step": 438 }, { "epoch": 1.0329411764705883, "grad_norm": 0.6641929149627686, "learning_rate": 9.353850647163072e-06, "loss": 0.1557, "step": 439 }, { "epoch": 1.035294117647059, "grad_norm": 0.6324585676193237, "learning_rate": 9.350787433904543e-06, "loss": 0.1272, "step": 440 }, { "epoch": 1.0376470588235294, "grad_norm": 0.8224608302116394, "learning_rate": 9.347717480971222e-06, "loss": 0.2371, "step": 441 }, { "epoch": 1.04, "grad_norm": 0.6076167821884155, "learning_rate": 9.344640793119232e-06, "loss": 0.1592, "step": 442 }, { "epoch": 1.0423529411764705, "grad_norm": 0.5944980978965759, "learning_rate": 9.341557375115125e-06, "loss": 0.1242, "step": 443 }, { "epoch": 1.0447058823529412, "grad_norm": 0.6085498332977295, "learning_rate": 9.338467231735878e-06, "loss": 0.179, "step": 444 }, { "epoch": 1.0470588235294118, "grad_norm": 0.728657066822052, "learning_rate": 9.335370367768894e-06, "loss": 0.1938, "step": 445 }, { "epoch": 1.0494117647058823, "grad_norm": 0.7110954523086548, "learning_rate": 9.33226678801198e-06, "loss": 0.1753, "step": 446 }, { "epoch": 1.051764705882353, "grad_norm": 0.7372868061065674, "learning_rate": 9.329156497273357e-06, "loss": 0.1973, "step": 447 }, { "epoch": 1.0541176470588236, "grad_norm": 0.772882342338562, "learning_rate": 9.32603950037163e-06, "loss": 0.1922, "step": 448 }, { "epoch": 1.056470588235294, "grad_norm": 0.8365598320960999, "learning_rate": 9.322915802135806e-06, "loss": 0.2142, "step": 449 }, { "epoch": 1.0588235294117647, "grad_norm": 0.7421349883079529, "learning_rate": 9.319785407405266e-06, "loss": 0.175, "step": 450 }, { "epoch": 1.0611764705882354, "grad_norm": 0.6767480969429016, "learning_rate": 9.31664832102977e-06, "loss": 0.1631, "step": 451 }, { "epoch": 1.0635294117647058, "grad_norm": 0.8293920755386353, "learning_rate": 9.313504547869442e-06, "loss": 0.1532, "step": 452 }, { "epoch": 1.0658823529411765, "grad_norm": 0.5916829109191895, "learning_rate": 9.310354092794766e-06, "loss": 0.1543, "step": 453 }, { "epoch": 1.0682352941176472, "grad_norm": 0.7404692769050598, "learning_rate": 9.307196960686582e-06, "loss": 0.1662, "step": 454 }, { "epoch": 1.0705882352941176, "grad_norm": 0.6779770851135254, "learning_rate": 9.304033156436067e-06, "loss": 0.1842, "step": 455 }, { "epoch": 1.0729411764705883, "grad_norm": 1.527043104171753, "learning_rate": 9.300862684944743e-06, "loss": 0.1134, "step": 456 }, { "epoch": 1.0752941176470587, "grad_norm": 0.826732337474823, "learning_rate": 9.297685551124453e-06, "loss": 0.163, "step": 457 }, { "epoch": 1.0776470588235294, "grad_norm": 0.7266581654548645, "learning_rate": 9.294501759897369e-06, "loss": 0.1711, "step": 458 }, { "epoch": 1.08, "grad_norm": 0.7844350934028625, "learning_rate": 9.291311316195972e-06, "loss": 0.1834, "step": 459 }, { "epoch": 1.0823529411764705, "grad_norm": 0.7173324227333069, "learning_rate": 9.288114224963052e-06, "loss": 0.1889, "step": 460 }, { "epoch": 1.0847058823529412, "grad_norm": 0.7329220175743103, "learning_rate": 9.284910491151696e-06, "loss": 0.2073, "step": 461 }, { "epoch": 1.0870588235294119, "grad_norm": 0.6462969779968262, "learning_rate": 9.281700119725281e-06, "loss": 0.161, "step": 462 }, { "epoch": 1.0894117647058823, "grad_norm": 0.4800094664096832, "learning_rate": 9.278483115657473e-06, "loss": 0.1349, "step": 463 }, { "epoch": 1.091764705882353, "grad_norm": 0.6670486330986023, "learning_rate": 9.275259483932205e-06, "loss": 0.1669, "step": 464 }, { "epoch": 1.0941176470588236, "grad_norm": 0.6776825189590454, "learning_rate": 9.272029229543687e-06, "loss": 0.1935, "step": 465 }, { "epoch": 1.096470588235294, "grad_norm": 0.6969578266143799, "learning_rate": 9.268792357496383e-06, "loss": 0.1743, "step": 466 }, { "epoch": 1.0988235294117648, "grad_norm": 0.6803928017616272, "learning_rate": 9.26554887280501e-06, "loss": 0.1497, "step": 467 }, { "epoch": 1.1011764705882352, "grad_norm": 0.6609306335449219, "learning_rate": 9.262298780494532e-06, "loss": 0.1543, "step": 468 }, { "epoch": 1.1035294117647059, "grad_norm": 0.7183146476745605, "learning_rate": 9.259042085600147e-06, "loss": 0.1458, "step": 469 }, { "epoch": 1.1058823529411765, "grad_norm": 0.7717549800872803, "learning_rate": 9.255778793167286e-06, "loss": 0.1801, "step": 470 }, { "epoch": 1.108235294117647, "grad_norm": 0.9376674294471741, "learning_rate": 9.252508908251596e-06, "loss": 0.1861, "step": 471 }, { "epoch": 1.1105882352941177, "grad_norm": 0.7132975459098816, "learning_rate": 9.24923243591894e-06, "loss": 0.2065, "step": 472 }, { "epoch": 1.1129411764705883, "grad_norm": 0.6921381950378418, "learning_rate": 9.24594938124539e-06, "loss": 0.189, "step": 473 }, { "epoch": 1.1152941176470588, "grad_norm": 0.6876912713050842, "learning_rate": 9.24265974931721e-06, "loss": 0.1741, "step": 474 }, { "epoch": 1.1176470588235294, "grad_norm": 0.6130735278129578, "learning_rate": 9.239363545230858e-06, "loss": 0.1397, "step": 475 }, { "epoch": 1.12, "grad_norm": 0.6158244609832764, "learning_rate": 9.23606077409297e-06, "loss": 0.1479, "step": 476 }, { "epoch": 1.1223529411764706, "grad_norm": 0.6705183982849121, "learning_rate": 9.232751441020357e-06, "loss": 0.1876, "step": 477 }, { "epoch": 1.1247058823529412, "grad_norm": 0.5788912773132324, "learning_rate": 9.229435551140004e-06, "loss": 0.1475, "step": 478 }, { "epoch": 1.1270588235294117, "grad_norm": 0.8458239436149597, "learning_rate": 9.226113109589038e-06, "loss": 0.1689, "step": 479 }, { "epoch": 1.1294117647058823, "grad_norm": 0.6838564276695251, "learning_rate": 9.22278412151475e-06, "loss": 0.1673, "step": 480 }, { "epoch": 1.131764705882353, "grad_norm": 0.6072037816047668, "learning_rate": 9.219448592074573e-06, "loss": 0.1766, "step": 481 }, { "epoch": 1.1341176470588235, "grad_norm": 0.6333675980567932, "learning_rate": 9.216106526436062e-06, "loss": 0.1674, "step": 482 }, { "epoch": 1.1364705882352941, "grad_norm": 0.7038118839263916, "learning_rate": 9.212757929776907e-06, "loss": 0.1837, "step": 483 }, { "epoch": 1.1388235294117648, "grad_norm": 0.7606667280197144, "learning_rate": 9.209402807284922e-06, "loss": 0.17, "step": 484 }, { "epoch": 1.1411764705882352, "grad_norm": 0.7557606101036072, "learning_rate": 9.206041164158018e-06, "loss": 0.19, "step": 485 }, { "epoch": 1.143529411764706, "grad_norm": 0.5894985198974609, "learning_rate": 9.202673005604214e-06, "loss": 0.1241, "step": 486 }, { "epoch": 1.1458823529411766, "grad_norm": 0.7073805928230286, "learning_rate": 9.19929833684163e-06, "loss": 0.1713, "step": 487 }, { "epoch": 1.148235294117647, "grad_norm": 0.6408051252365112, "learning_rate": 9.19591716309846e-06, "loss": 0.1564, "step": 488 }, { "epoch": 1.1505882352941177, "grad_norm": 0.6677308678627014, "learning_rate": 9.192529489612982e-06, "loss": 0.1617, "step": 489 }, { "epoch": 1.1529411764705881, "grad_norm": 0.6214562654495239, "learning_rate": 9.18913532163354e-06, "loss": 0.1152, "step": 490 }, { "epoch": 1.1552941176470588, "grad_norm": 0.5902462005615234, "learning_rate": 9.185734664418552e-06, "loss": 0.1935, "step": 491 }, { "epoch": 1.1576470588235295, "grad_norm": 0.7024534344673157, "learning_rate": 9.182327523236469e-06, "loss": 0.2215, "step": 492 }, { "epoch": 1.16, "grad_norm": 0.6627213358879089, "learning_rate": 9.178913903365804e-06, "loss": 0.1837, "step": 493 }, { "epoch": 1.1623529411764706, "grad_norm": 0.6739841103553772, "learning_rate": 9.1754938100951e-06, "loss": 0.1772, "step": 494 }, { "epoch": 1.1647058823529413, "grad_norm": 0.6929228901863098, "learning_rate": 9.17206724872293e-06, "loss": 0.1708, "step": 495 }, { "epoch": 1.1670588235294117, "grad_norm": 0.6083697080612183, "learning_rate": 9.16863422455789e-06, "loss": 0.1332, "step": 496 }, { "epoch": 1.1694117647058824, "grad_norm": 0.6493418216705322, "learning_rate": 9.165194742918582e-06, "loss": 0.1866, "step": 497 }, { "epoch": 1.171764705882353, "grad_norm": 0.7884907722473145, "learning_rate": 9.16174880913362e-06, "loss": 0.1908, "step": 498 }, { "epoch": 1.1741176470588235, "grad_norm": 0.6528869271278381, "learning_rate": 9.158296428541611e-06, "loss": 0.1643, "step": 499 }, { "epoch": 1.1764705882352942, "grad_norm": 0.8330202102661133, "learning_rate": 9.154837606491146e-06, "loss": 0.2369, "step": 500 }, { "epoch": 1.1788235294117646, "grad_norm": 0.6811143159866333, "learning_rate": 9.151372348340802e-06, "loss": 0.1524, "step": 501 }, { "epoch": 1.1811764705882353, "grad_norm": 0.661277174949646, "learning_rate": 9.147900659459123e-06, "loss": 0.1815, "step": 502 }, { "epoch": 1.183529411764706, "grad_norm": 0.7074480652809143, "learning_rate": 9.144422545224615e-06, "loss": 0.1748, "step": 503 }, { "epoch": 1.1858823529411764, "grad_norm": 0.6803154349327087, "learning_rate": 9.140938011025741e-06, "loss": 0.2051, "step": 504 }, { "epoch": 1.188235294117647, "grad_norm": 0.5830198526382446, "learning_rate": 9.13744706226091e-06, "loss": 0.1633, "step": 505 }, { "epoch": 1.1905882352941177, "grad_norm": 0.7339157462120056, "learning_rate": 9.133949704338467e-06, "loss": 0.1803, "step": 506 }, { "epoch": 1.1929411764705882, "grad_norm": 0.6069468259811401, "learning_rate": 9.130445942676689e-06, "loss": 0.1495, "step": 507 }, { "epoch": 1.1952941176470588, "grad_norm": 0.6298292279243469, "learning_rate": 9.12693578270377e-06, "loss": 0.166, "step": 508 }, { "epoch": 1.1976470588235295, "grad_norm": 0.6792426705360413, "learning_rate": 9.123419229857818e-06, "loss": 0.1819, "step": 509 }, { "epoch": 1.2, "grad_norm": 0.6448541283607483, "learning_rate": 9.119896289586849e-06, "loss": 0.1329, "step": 510 }, { "epoch": 1.2023529411764706, "grad_norm": 0.720327615737915, "learning_rate": 9.116366967348772e-06, "loss": 0.1731, "step": 511 }, { "epoch": 1.204705882352941, "grad_norm": 0.6252376437187195, "learning_rate": 9.11283126861138e-06, "loss": 0.1417, "step": 512 }, { "epoch": 1.2070588235294117, "grad_norm": 0.7790701985359192, "learning_rate": 9.109289198852349e-06, "loss": 0.0859, "step": 513 }, { "epoch": 1.2094117647058824, "grad_norm": 0.6499266028404236, "learning_rate": 9.105740763559224e-06, "loss": 0.1947, "step": 514 }, { "epoch": 1.2117647058823529, "grad_norm": 0.8410869240760803, "learning_rate": 9.102185968229413e-06, "loss": 0.1542, "step": 515 }, { "epoch": 1.2141176470588235, "grad_norm": 0.8121353983879089, "learning_rate": 9.098624818370174e-06, "loss": 0.2145, "step": 516 }, { "epoch": 1.2164705882352942, "grad_norm": 0.7620666027069092, "learning_rate": 9.095057319498615e-06, "loss": 0.2139, "step": 517 }, { "epoch": 1.2188235294117646, "grad_norm": 0.6351275444030762, "learning_rate": 9.091483477141673e-06, "loss": 0.153, "step": 518 }, { "epoch": 1.2211764705882353, "grad_norm": 0.810640275478363, "learning_rate": 9.08790329683612e-06, "loss": 0.2252, "step": 519 }, { "epoch": 1.223529411764706, "grad_norm": 0.7784134745597839, "learning_rate": 9.084316784128544e-06, "loss": 0.1327, "step": 520 }, { "epoch": 1.2258823529411764, "grad_norm": 0.651909351348877, "learning_rate": 9.080723944575341e-06, "loss": 0.1954, "step": 521 }, { "epoch": 1.228235294117647, "grad_norm": 0.7753939032554626, "learning_rate": 9.077124783742709e-06, "loss": 0.139, "step": 522 }, { "epoch": 1.2305882352941175, "grad_norm": 0.7542133331298828, "learning_rate": 9.073519307206647e-06, "loss": 0.1595, "step": 523 }, { "epoch": 1.2329411764705882, "grad_norm": 0.7104750871658325, "learning_rate": 9.069907520552929e-06, "loss": 0.1659, "step": 524 }, { "epoch": 1.2352941176470589, "grad_norm": 0.754581093788147, "learning_rate": 9.06628942937711e-06, "loss": 0.202, "step": 525 }, { "epoch": 1.2376470588235293, "grad_norm": 0.508418083190918, "learning_rate": 9.062665039284512e-06, "loss": 0.1196, "step": 526 }, { "epoch": 1.24, "grad_norm": 0.7869056463241577, "learning_rate": 9.05903435589021e-06, "loss": 0.204, "step": 527 }, { "epoch": 1.2423529411764707, "grad_norm": 0.7365128993988037, "learning_rate": 9.055397384819041e-06, "loss": 0.1564, "step": 528 }, { "epoch": 1.244705882352941, "grad_norm": 0.6554048657417297, "learning_rate": 9.051754131705571e-06, "loss": 0.1771, "step": 529 }, { "epoch": 1.2470588235294118, "grad_norm": 0.6004898548126221, "learning_rate": 9.048104602194103e-06, "loss": 0.1589, "step": 530 }, { "epoch": 1.2494117647058824, "grad_norm": 0.7848236560821533, "learning_rate": 9.044448801938664e-06, "loss": 0.1789, "step": 531 }, { "epoch": 1.251764705882353, "grad_norm": 0.7042925357818604, "learning_rate": 9.040786736602999e-06, "loss": 0.1357, "step": 532 }, { "epoch": 1.2541176470588236, "grad_norm": 0.6259021162986755, "learning_rate": 9.037118411860554e-06, "loss": 0.1666, "step": 533 }, { "epoch": 1.256470588235294, "grad_norm": 0.5714264512062073, "learning_rate": 9.033443833394472e-06, "loss": 0.1468, "step": 534 }, { "epoch": 1.2588235294117647, "grad_norm": 0.624652087688446, "learning_rate": 9.029763006897589e-06, "loss": 0.0996, "step": 535 }, { "epoch": 1.2611764705882353, "grad_norm": 0.6405372619628906, "learning_rate": 9.026075938072418e-06, "loss": 0.1182, "step": 536 }, { "epoch": 1.263529411764706, "grad_norm": 0.6104665398597717, "learning_rate": 9.022382632631145e-06, "loss": 0.1331, "step": 537 }, { "epoch": 1.2658823529411765, "grad_norm": 0.8554365038871765, "learning_rate": 9.018683096295617e-06, "loss": 0.1809, "step": 538 }, { "epoch": 1.2682352941176471, "grad_norm": 0.6348249912261963, "learning_rate": 9.014977334797333e-06, "loss": 0.1389, "step": 539 }, { "epoch": 1.2705882352941176, "grad_norm": 0.7563521265983582, "learning_rate": 9.011265353877433e-06, "loss": 0.1569, "step": 540 }, { "epoch": 1.2729411764705882, "grad_norm": 0.6340474486351013, "learning_rate": 9.007547159286704e-06, "loss": 0.1538, "step": 541 }, { "epoch": 1.275294117647059, "grad_norm": 0.5868159532546997, "learning_rate": 9.003822756785548e-06, "loss": 0.1544, "step": 542 }, { "epoch": 1.2776470588235294, "grad_norm": 0.7343094944953918, "learning_rate": 9.000092152143989e-06, "loss": 0.2068, "step": 543 }, { "epoch": 1.28, "grad_norm": 0.6871341466903687, "learning_rate": 8.99635535114166e-06, "loss": 0.1699, "step": 544 }, { "epoch": 1.2823529411764705, "grad_norm": 0.661542534828186, "learning_rate": 8.992612359567792e-06, "loss": 0.154, "step": 545 }, { "epoch": 1.2847058823529411, "grad_norm": 0.6785011291503906, "learning_rate": 8.988863183221207e-06, "loss": 0.1588, "step": 546 }, { "epoch": 1.2870588235294118, "grad_norm": 0.6763486266136169, "learning_rate": 8.985107827910315e-06, "loss": 0.1864, "step": 547 }, { "epoch": 1.2894117647058825, "grad_norm": 0.8712665438652039, "learning_rate": 8.981346299453087e-06, "loss": 0.1948, "step": 548 }, { "epoch": 1.291764705882353, "grad_norm": 0.6498245000839233, "learning_rate": 8.977578603677065e-06, "loss": 0.1655, "step": 549 }, { "epoch": 1.2941176470588236, "grad_norm": 0.6178410649299622, "learning_rate": 8.973804746419348e-06, "loss": 0.1408, "step": 550 }, { "epoch": 1.296470588235294, "grad_norm": 0.5823180079460144, "learning_rate": 8.970024733526574e-06, "loss": 0.1549, "step": 551 }, { "epoch": 1.2988235294117647, "grad_norm": 0.6575651168823242, "learning_rate": 8.966238570854924e-06, "loss": 0.1654, "step": 552 }, { "epoch": 1.3011764705882354, "grad_norm": 0.6671184301376343, "learning_rate": 8.962446264270102e-06, "loss": 0.1623, "step": 553 }, { "epoch": 1.3035294117647058, "grad_norm": 0.5750121474266052, "learning_rate": 8.958647819647331e-06, "loss": 0.1638, "step": 554 }, { "epoch": 1.3058823529411765, "grad_norm": 0.8694437146186829, "learning_rate": 8.954843242871348e-06, "loss": 0.1644, "step": 555 }, { "epoch": 1.308235294117647, "grad_norm": 0.69869065284729, "learning_rate": 8.95103253983638e-06, "loss": 0.1845, "step": 556 }, { "epoch": 1.3105882352941176, "grad_norm": 0.6231600046157837, "learning_rate": 8.947215716446156e-06, "loss": 0.1686, "step": 557 }, { "epoch": 1.3129411764705883, "grad_norm": 0.7214934825897217, "learning_rate": 8.943392778613886e-06, "loss": 0.1614, "step": 558 }, { "epoch": 1.315294117647059, "grad_norm": 0.6902795433998108, "learning_rate": 8.939563732262243e-06, "loss": 0.1712, "step": 559 }, { "epoch": 1.3176470588235294, "grad_norm": 0.6946473121643066, "learning_rate": 8.935728583323372e-06, "loss": 0.1974, "step": 560 }, { "epoch": 1.32, "grad_norm": 0.6899502873420715, "learning_rate": 8.931887337738873e-06, "loss": 0.1338, "step": 561 }, { "epoch": 1.3223529411764705, "grad_norm": 0.6269996166229248, "learning_rate": 8.928040001459784e-06, "loss": 0.1607, "step": 562 }, { "epoch": 1.3247058823529412, "grad_norm": 0.7340877056121826, "learning_rate": 8.924186580446588e-06, "loss": 0.1993, "step": 563 }, { "epoch": 1.3270588235294118, "grad_norm": 0.6818439960479736, "learning_rate": 8.920327080669187e-06, "loss": 0.1887, "step": 564 }, { "epoch": 1.3294117647058823, "grad_norm": 0.6619871258735657, "learning_rate": 8.916461508106908e-06, "loss": 0.1559, "step": 565 }, { "epoch": 1.331764705882353, "grad_norm": 0.6391106247901917, "learning_rate": 8.91258986874848e-06, "loss": 0.1536, "step": 566 }, { "epoch": 1.3341176470588234, "grad_norm": 0.7599229216575623, "learning_rate": 8.908712168592031e-06, "loss": 0.2132, "step": 567 }, { "epoch": 1.336470588235294, "grad_norm": 0.5862919092178345, "learning_rate": 8.904828413645084e-06, "loss": 0.1822, "step": 568 }, { "epoch": 1.3388235294117647, "grad_norm": 0.6597055196762085, "learning_rate": 8.900938609924536e-06, "loss": 0.1872, "step": 569 }, { "epoch": 1.3411764705882354, "grad_norm": 0.6758376359939575, "learning_rate": 8.897042763456662e-06, "loss": 0.2066, "step": 570 }, { "epoch": 1.3435294117647059, "grad_norm": 0.7920562028884888, "learning_rate": 8.893140880277092e-06, "loss": 0.1563, "step": 571 }, { "epoch": 1.3458823529411765, "grad_norm": 0.6722671985626221, "learning_rate": 8.889232966430814e-06, "loss": 0.1447, "step": 572 }, { "epoch": 1.348235294117647, "grad_norm": 0.6042508482933044, "learning_rate": 8.885319027972153e-06, "loss": 0.1614, "step": 573 }, { "epoch": 1.3505882352941176, "grad_norm": 0.6743940114974976, "learning_rate": 8.881399070964774e-06, "loss": 0.2262, "step": 574 }, { "epoch": 1.3529411764705883, "grad_norm": 0.6248114705085754, "learning_rate": 8.877473101481661e-06, "loss": 0.1617, "step": 575 }, { "epoch": 1.3552941176470588, "grad_norm": 0.6989963054656982, "learning_rate": 8.87354112560512e-06, "loss": 0.2101, "step": 576 }, { "epoch": 1.3576470588235294, "grad_norm": 0.6596255898475647, "learning_rate": 8.869603149426746e-06, "loss": 0.1992, "step": 577 }, { "epoch": 1.3599999999999999, "grad_norm": 0.7460675835609436, "learning_rate": 8.865659179047454e-06, "loss": 0.1801, "step": 578 }, { "epoch": 1.3623529411764705, "grad_norm": 0.6744711399078369, "learning_rate": 8.861709220577425e-06, "loss": 0.2033, "step": 579 }, { "epoch": 1.3647058823529412, "grad_norm": 0.6921920776367188, "learning_rate": 8.857753280136128e-06, "loss": 0.2219, "step": 580 }, { "epoch": 1.3670588235294119, "grad_norm": 0.7912100553512573, "learning_rate": 8.853791363852296e-06, "loss": 0.2024, "step": 581 }, { "epoch": 1.3694117647058823, "grad_norm": 0.5829184055328369, "learning_rate": 8.849823477863921e-06, "loss": 0.1301, "step": 582 }, { "epoch": 1.371764705882353, "grad_norm": 0.6439005732536316, "learning_rate": 8.84584962831824e-06, "loss": 0.2196, "step": 583 }, { "epoch": 1.3741176470588234, "grad_norm": 0.8109025955200195, "learning_rate": 8.841869821371736e-06, "loss": 0.2227, "step": 584 }, { "epoch": 1.3764705882352941, "grad_norm": 0.6888384819030762, "learning_rate": 8.837884063190115e-06, "loss": 0.1849, "step": 585 }, { "epoch": 1.3788235294117648, "grad_norm": 0.8079880475997925, "learning_rate": 8.833892359948308e-06, "loss": 0.1842, "step": 586 }, { "epoch": 1.3811764705882352, "grad_norm": 0.6843206286430359, "learning_rate": 8.829894717830452e-06, "loss": 0.121, "step": 587 }, { "epoch": 1.383529411764706, "grad_norm": 0.8064672350883484, "learning_rate": 8.825891143029887e-06, "loss": 0.1738, "step": 588 }, { "epoch": 1.3858823529411763, "grad_norm": 0.8015809059143066, "learning_rate": 8.821881641749141e-06, "loss": 0.214, "step": 589 }, { "epoch": 1.388235294117647, "grad_norm": 0.6182360053062439, "learning_rate": 8.817866220199933e-06, "loss": 0.1369, "step": 590 }, { "epoch": 1.3905882352941177, "grad_norm": 0.6076607704162598, "learning_rate": 8.813844884603143e-06, "loss": 0.1365, "step": 591 }, { "epoch": 1.3929411764705883, "grad_norm": 0.617878258228302, "learning_rate": 8.80981764118882e-06, "loss": 0.1164, "step": 592 }, { "epoch": 1.3952941176470588, "grad_norm": 0.7592839002609253, "learning_rate": 8.805784496196159e-06, "loss": 0.1511, "step": 593 }, { "epoch": 1.3976470588235295, "grad_norm": 0.5386309027671814, "learning_rate": 8.801745455873505e-06, "loss": 0.1293, "step": 594 }, { "epoch": 1.4, "grad_norm": 0.6760174036026001, "learning_rate": 8.797700526478338e-06, "loss": 0.1662, "step": 595 }, { "epoch": 1.4023529411764706, "grad_norm": 0.6481565833091736, "learning_rate": 8.793649714277252e-06, "loss": 0.1544, "step": 596 }, { "epoch": 1.4047058823529412, "grad_norm": 0.5505480766296387, "learning_rate": 8.789593025545962e-06, "loss": 0.1016, "step": 597 }, { "epoch": 1.4070588235294117, "grad_norm": 0.6323831081390381, "learning_rate": 8.785530466569286e-06, "loss": 0.1863, "step": 598 }, { "epoch": 1.4094117647058824, "grad_norm": 0.6490122675895691, "learning_rate": 8.781462043641136e-06, "loss": 0.1466, "step": 599 }, { "epoch": 1.4117647058823528, "grad_norm": 0.6417875289916992, "learning_rate": 8.777387763064506e-06, "loss": 0.1736, "step": 600 }, { "epoch": 1.4117647058823528, "eval_loss": 0.1951720267534256, "eval_runtime": 3.9243, "eval_samples_per_second": 28.285, "eval_steps_per_second": 1.019, "step": 600 }, { "epoch": 1.4141176470588235, "grad_norm": 0.6428032517433167, "learning_rate": 8.773307631151471e-06, "loss": 0.1568, "step": 601 }, { "epoch": 1.4164705882352941, "grad_norm": 0.6442651748657227, "learning_rate": 8.769221654223168e-06, "loss": 0.1788, "step": 602 }, { "epoch": 1.4188235294117648, "grad_norm": 0.6396656632423401, "learning_rate": 8.765129838609787e-06, "loss": 0.1754, "step": 603 }, { "epoch": 1.4211764705882353, "grad_norm": 0.7644724249839783, "learning_rate": 8.761032190650568e-06, "loss": 0.0905, "step": 604 }, { "epoch": 1.423529411764706, "grad_norm": 0.6846699118614197, "learning_rate": 8.75692871669378e-06, "loss": 0.1814, "step": 605 }, { "epoch": 1.4258823529411764, "grad_norm": 0.7070773243904114, "learning_rate": 8.75281942309673e-06, "loss": 0.148, "step": 606 }, { "epoch": 1.428235294117647, "grad_norm": 0.6888110637664795, "learning_rate": 8.748704316225726e-06, "loss": 0.1806, "step": 607 }, { "epoch": 1.4305882352941177, "grad_norm": 0.6245725750923157, "learning_rate": 8.744583402456095e-06, "loss": 0.1361, "step": 608 }, { "epoch": 1.4329411764705882, "grad_norm": 0.7001262903213501, "learning_rate": 8.740456688172154e-06, "loss": 0.1879, "step": 609 }, { "epoch": 1.4352941176470588, "grad_norm": 0.5712476372718811, "learning_rate": 8.736324179767205e-06, "loss": 0.1605, "step": 610 }, { "epoch": 1.4376470588235293, "grad_norm": 0.7866467833518982, "learning_rate": 8.732185883643532e-06, "loss": 0.1051, "step": 611 }, { "epoch": 1.44, "grad_norm": 0.6601927280426025, "learning_rate": 8.72804180621238e-06, "loss": 0.1718, "step": 612 }, { "epoch": 1.4423529411764706, "grad_norm": 0.6708106398582458, "learning_rate": 8.723891953893956e-06, "loss": 0.1668, "step": 613 }, { "epoch": 1.4447058823529413, "grad_norm": 0.6245149374008179, "learning_rate": 8.719736333117407e-06, "loss": 0.172, "step": 614 }, { "epoch": 1.4470588235294117, "grad_norm": 0.6206184029579163, "learning_rate": 8.715574950320826e-06, "loss": 0.1787, "step": 615 }, { "epoch": 1.4494117647058824, "grad_norm": 0.6721671223640442, "learning_rate": 8.711407811951225e-06, "loss": 0.178, "step": 616 }, { "epoch": 1.4517647058823528, "grad_norm": 0.6655483841896057, "learning_rate": 8.707234924464535e-06, "loss": 0.1463, "step": 617 }, { "epoch": 1.4541176470588235, "grad_norm": 0.57068932056427, "learning_rate": 8.703056294325597e-06, "loss": 0.174, "step": 618 }, { "epoch": 1.4564705882352942, "grad_norm": 0.6014278531074524, "learning_rate": 8.698871928008141e-06, "loss": 0.1605, "step": 619 }, { "epoch": 1.4588235294117646, "grad_norm": 0.7309080362319946, "learning_rate": 8.694681831994792e-06, "loss": 0.1867, "step": 620 }, { "epoch": 1.4611764705882353, "grad_norm": 0.660818338394165, "learning_rate": 8.690486012777049e-06, "loss": 0.2043, "step": 621 }, { "epoch": 1.4635294117647057, "grad_norm": 0.7248780727386475, "learning_rate": 8.686284476855276e-06, "loss": 0.1667, "step": 622 }, { "epoch": 1.4658823529411764, "grad_norm": 0.7171116471290588, "learning_rate": 8.682077230738698e-06, "loss": 0.1774, "step": 623 }, { "epoch": 1.468235294117647, "grad_norm": 0.5609745979309082, "learning_rate": 8.677864280945377e-06, "loss": 0.1357, "step": 624 }, { "epoch": 1.4705882352941178, "grad_norm": 0.6351557374000549, "learning_rate": 8.673645634002222e-06, "loss": 0.1874, "step": 625 }, { "epoch": 1.4729411764705882, "grad_norm": 0.7300127744674683, "learning_rate": 8.669421296444965e-06, "loss": 0.2083, "step": 626 }, { "epoch": 1.4752941176470589, "grad_norm": 0.6315834522247314, "learning_rate": 8.665191274818149e-06, "loss": 0.1234, "step": 627 }, { "epoch": 1.4776470588235293, "grad_norm": 0.7388418912887573, "learning_rate": 8.660955575675134e-06, "loss": 0.1891, "step": 628 }, { "epoch": 1.48, "grad_norm": 0.6592962741851807, "learning_rate": 8.656714205578059e-06, "loss": 0.183, "step": 629 }, { "epoch": 1.4823529411764707, "grad_norm": 0.7533652186393738, "learning_rate": 8.652467171097867e-06, "loss": 0.2354, "step": 630 }, { "epoch": 1.484705882352941, "grad_norm": 0.7389617562294006, "learning_rate": 8.648214478814265e-06, "loss": 0.2089, "step": 631 }, { "epoch": 1.4870588235294118, "grad_norm": 0.6428579688072205, "learning_rate": 8.643956135315731e-06, "loss": 0.1316, "step": 632 }, { "epoch": 1.4894117647058824, "grad_norm": 0.6167586445808411, "learning_rate": 8.639692147199492e-06, "loss": 0.1305, "step": 633 }, { "epoch": 1.4917647058823529, "grad_norm": 0.7557345628738403, "learning_rate": 8.635422521071529e-06, "loss": 0.1621, "step": 634 }, { "epoch": 1.4941176470588236, "grad_norm": 0.6332883834838867, "learning_rate": 8.631147263546547e-06, "loss": 0.1642, "step": 635 }, { "epoch": 1.4964705882352942, "grad_norm": 0.8696218729019165, "learning_rate": 8.626866381247985e-06, "loss": 0.1356, "step": 636 }, { "epoch": 1.4988235294117647, "grad_norm": 0.6551492810249329, "learning_rate": 8.62257988080799e-06, "loss": 0.1286, "step": 637 }, { "epoch": 1.5011764705882353, "grad_norm": 0.6086258292198181, "learning_rate": 8.618287768867413e-06, "loss": 0.1606, "step": 638 }, { "epoch": 1.5035294117647058, "grad_norm": 0.6094643473625183, "learning_rate": 8.613990052075808e-06, "loss": 0.1606, "step": 639 }, { "epoch": 1.5058823529411764, "grad_norm": 0.6372036337852478, "learning_rate": 8.609686737091394e-06, "loss": 0.1841, "step": 640 }, { "epoch": 1.5082352941176471, "grad_norm": 0.5628898739814758, "learning_rate": 8.605377830581082e-06, "loss": 0.1532, "step": 641 }, { "epoch": 1.5105882352941178, "grad_norm": 0.6394100785255432, "learning_rate": 8.601063339220434e-06, "loss": 0.1393, "step": 642 }, { "epoch": 1.5129411764705882, "grad_norm": 0.6562402248382568, "learning_rate": 8.596743269693669e-06, "loss": 0.1799, "step": 643 }, { "epoch": 1.5152941176470587, "grad_norm": 0.7296671271324158, "learning_rate": 8.592417628693648e-06, "loss": 0.2033, "step": 644 }, { "epoch": 1.5176470588235293, "grad_norm": 0.6036046147346497, "learning_rate": 8.588086422921858e-06, "loss": 0.1378, "step": 645 }, { "epoch": 1.52, "grad_norm": 0.5552055239677429, "learning_rate": 8.583749659088418e-06, "loss": 0.1056, "step": 646 }, { "epoch": 1.5223529411764707, "grad_norm": 0.6560239195823669, "learning_rate": 8.579407343912043e-06, "loss": 0.1274, "step": 647 }, { "epoch": 1.5247058823529411, "grad_norm": 0.6639567017555237, "learning_rate": 8.575059484120065e-06, "loss": 0.2026, "step": 648 }, { "epoch": 1.5270588235294118, "grad_norm": 0.7654052972793579, "learning_rate": 8.570706086448393e-06, "loss": 0.1524, "step": 649 }, { "epoch": 1.5294117647058822, "grad_norm": 0.7124572396278381, "learning_rate": 8.56634715764152e-06, "loss": 0.1417, "step": 650 }, { "epoch": 1.531764705882353, "grad_norm": 0.7110402584075928, "learning_rate": 8.56198270445251e-06, "loss": 0.2002, "step": 651 }, { "epoch": 1.5341176470588236, "grad_norm": 0.7214094996452332, "learning_rate": 8.557612733642986e-06, "loss": 0.2014, "step": 652 }, { "epoch": 1.5364705882352943, "grad_norm": 0.6378437280654907, "learning_rate": 8.553237251983116e-06, "loss": 0.1773, "step": 653 }, { "epoch": 1.5388235294117647, "grad_norm": 0.6170173287391663, "learning_rate": 8.548856266251603e-06, "loss": 0.1721, "step": 654 }, { "epoch": 1.5411764705882351, "grad_norm": 0.6917825937271118, "learning_rate": 8.544469783235689e-06, "loss": 0.1996, "step": 655 }, { "epoch": 1.5435294117647058, "grad_norm": 0.682044506072998, "learning_rate": 8.540077809731116e-06, "loss": 0.1672, "step": 656 }, { "epoch": 1.5458823529411765, "grad_norm": 0.6709150075912476, "learning_rate": 8.535680352542145e-06, "loss": 0.1522, "step": 657 }, { "epoch": 1.5482352941176472, "grad_norm": 0.7041885256767273, "learning_rate": 8.531277418481528e-06, "loss": 0.1359, "step": 658 }, { "epoch": 1.5505882352941176, "grad_norm": 0.71168053150177, "learning_rate": 8.526869014370505e-06, "loss": 0.229, "step": 659 }, { "epoch": 1.5529411764705883, "grad_norm": 0.8352916240692139, "learning_rate": 8.522455147038784e-06, "loss": 0.1498, "step": 660 }, { "epoch": 1.5552941176470587, "grad_norm": 0.7262111902236938, "learning_rate": 8.518035823324544e-06, "loss": 0.1872, "step": 661 }, { "epoch": 1.5576470588235294, "grad_norm": 0.6159939765930176, "learning_rate": 8.51361105007441e-06, "loss": 0.1562, "step": 662 }, { "epoch": 1.56, "grad_norm": 0.6531649231910706, "learning_rate": 8.509180834143459e-06, "loss": 0.1381, "step": 663 }, { "epoch": 1.5623529411764707, "grad_norm": 0.6642963290214539, "learning_rate": 8.50474518239519e-06, "loss": 0.1587, "step": 664 }, { "epoch": 1.5647058823529412, "grad_norm": 0.7403439283370972, "learning_rate": 8.500304101701533e-06, "loss": 0.1337, "step": 665 }, { "epoch": 1.5670588235294116, "grad_norm": 0.6963414549827576, "learning_rate": 8.49585759894282e-06, "loss": 0.1594, "step": 666 }, { "epoch": 1.5694117647058823, "grad_norm": 0.650708019733429, "learning_rate": 8.49140568100779e-06, "loss": 0.1954, "step": 667 }, { "epoch": 1.571764705882353, "grad_norm": 0.6918757557868958, "learning_rate": 8.486948354793565e-06, "loss": 0.1659, "step": 668 }, { "epoch": 1.5741176470588236, "grad_norm": 0.8411041498184204, "learning_rate": 8.482485627205654e-06, "loss": 0.1441, "step": 669 }, { "epoch": 1.576470588235294, "grad_norm": 0.697413980960846, "learning_rate": 8.478017505157928e-06, "loss": 0.1636, "step": 670 }, { "epoch": 1.5788235294117647, "grad_norm": 0.657934308052063, "learning_rate": 8.473543995572614e-06, "loss": 0.2007, "step": 671 }, { "epoch": 1.5811764705882352, "grad_norm": 0.5506691932678223, "learning_rate": 8.469065105380293e-06, "loss": 0.0918, "step": 672 }, { "epoch": 1.5835294117647059, "grad_norm": 0.6142928600311279, "learning_rate": 8.464580841519875e-06, "loss": 0.1288, "step": 673 }, { "epoch": 1.5858823529411765, "grad_norm": 0.7783555388450623, "learning_rate": 8.460091210938596e-06, "loss": 0.2344, "step": 674 }, { "epoch": 1.5882352941176472, "grad_norm": 0.7248787879943848, "learning_rate": 8.455596220592011e-06, "loss": 0.2108, "step": 675 }, { "epoch": 1.5905882352941176, "grad_norm": 0.5860445499420166, "learning_rate": 8.451095877443974e-06, "loss": 0.12, "step": 676 }, { "epoch": 1.592941176470588, "grad_norm": 0.6906816363334656, "learning_rate": 8.446590188466634e-06, "loss": 0.1913, "step": 677 }, { "epoch": 1.5952941176470588, "grad_norm": 0.574151337146759, "learning_rate": 8.442079160640418e-06, "loss": 0.0993, "step": 678 }, { "epoch": 1.5976470588235294, "grad_norm": 0.6443704962730408, "learning_rate": 8.437562800954033e-06, "loss": 0.1994, "step": 679 }, { "epoch": 1.6, "grad_norm": 0.6046055555343628, "learning_rate": 8.433041116404437e-06, "loss": 0.1859, "step": 680 }, { "epoch": 1.6023529411764705, "grad_norm": 0.700688362121582, "learning_rate": 8.428514113996842e-06, "loss": 0.1519, "step": 681 }, { "epoch": 1.6047058823529412, "grad_norm": 0.7318552732467651, "learning_rate": 8.4239818007447e-06, "loss": 0.2109, "step": 682 }, { "epoch": 1.6070588235294117, "grad_norm": 0.6271805763244629, "learning_rate": 8.419444183669687e-06, "loss": 0.1261, "step": 683 }, { "epoch": 1.6094117647058823, "grad_norm": 0.6707528233528137, "learning_rate": 8.414901269801699e-06, "loss": 0.1458, "step": 684 }, { "epoch": 1.611764705882353, "grad_norm": 0.7167073488235474, "learning_rate": 8.410353066178836e-06, "loss": 0.1383, "step": 685 }, { "epoch": 1.6141176470588237, "grad_norm": 0.7247610688209534, "learning_rate": 8.405799579847396e-06, "loss": 0.1544, "step": 686 }, { "epoch": 1.616470588235294, "grad_norm": 0.6828088760375977, "learning_rate": 8.401240817861857e-06, "loss": 0.1758, "step": 687 }, { "epoch": 1.6188235294117646, "grad_norm": 0.9505071043968201, "learning_rate": 8.396676787284876e-06, "loss": 0.1542, "step": 688 }, { "epoch": 1.6211764705882352, "grad_norm": 0.57636958360672, "learning_rate": 8.392107495187263e-06, "loss": 0.1365, "step": 689 }, { "epoch": 1.6235294117647059, "grad_norm": 0.6831150054931641, "learning_rate": 8.38753294864799e-06, "loss": 0.1897, "step": 690 }, { "epoch": 1.6258823529411766, "grad_norm": 0.7267597317695618, "learning_rate": 8.382953154754164e-06, "loss": 0.1576, "step": 691 }, { "epoch": 1.6282352941176472, "grad_norm": 0.7174981236457825, "learning_rate": 8.378368120601022e-06, "loss": 0.1852, "step": 692 }, { "epoch": 1.6305882352941177, "grad_norm": 0.7351318001747131, "learning_rate": 8.37377785329192e-06, "loss": 0.1661, "step": 693 }, { "epoch": 1.6329411764705881, "grad_norm": 0.6421177387237549, "learning_rate": 8.36918235993832e-06, "loss": 0.147, "step": 694 }, { "epoch": 1.6352941176470588, "grad_norm": 0.6125917434692383, "learning_rate": 8.364581647659781e-06, "loss": 0.1516, "step": 695 }, { "epoch": 1.6376470588235295, "grad_norm": 0.6743831634521484, "learning_rate": 8.359975723583953e-06, "loss": 0.1715, "step": 696 }, { "epoch": 1.6400000000000001, "grad_norm": 0.9951468110084534, "learning_rate": 8.355364594846549e-06, "loss": 0.1397, "step": 697 }, { "epoch": 1.6423529411764706, "grad_norm": 0.6808474659919739, "learning_rate": 8.350748268591357e-06, "loss": 0.1962, "step": 698 }, { "epoch": 1.644705882352941, "grad_norm": 0.5846621990203857, "learning_rate": 8.346126751970208e-06, "loss": 0.1125, "step": 699 }, { "epoch": 1.6470588235294117, "grad_norm": 0.614788830280304, "learning_rate": 8.34150005214298e-06, "loss": 0.1349, "step": 700 }, { "epoch": 1.6494117647058824, "grad_norm": 0.6202725768089294, "learning_rate": 8.336868176277577e-06, "loss": 0.1492, "step": 701 }, { "epoch": 1.651764705882353, "grad_norm": 0.6908979415893555, "learning_rate": 8.332231131549927e-06, "loss": 0.1908, "step": 702 }, { "epoch": 1.6541176470588237, "grad_norm": 0.6942629218101501, "learning_rate": 8.327588925143963e-06, "loss": 0.1699, "step": 703 }, { "epoch": 1.6564705882352941, "grad_norm": 0.739683985710144, "learning_rate": 8.322941564251613e-06, "loss": 0.1893, "step": 704 }, { "epoch": 1.6588235294117646, "grad_norm": 0.6604762673377991, "learning_rate": 8.318289056072791e-06, "loss": 0.174, "step": 705 }, { "epoch": 1.6611764705882353, "grad_norm": 0.6996854543685913, "learning_rate": 8.31363140781539e-06, "loss": 0.1742, "step": 706 }, { "epoch": 1.663529411764706, "grad_norm": 0.6255167722702026, "learning_rate": 8.308968626695265e-06, "loss": 0.1806, "step": 707 }, { "epoch": 1.6658823529411766, "grad_norm": 0.6817100644111633, "learning_rate": 8.304300719936212e-06, "loss": 0.1645, "step": 708 }, { "epoch": 1.668235294117647, "grad_norm": 0.7138106822967529, "learning_rate": 8.299627694769984e-06, "loss": 0.1736, "step": 709 }, { "epoch": 1.6705882352941175, "grad_norm": 0.6868867874145508, "learning_rate": 8.294949558436259e-06, "loss": 0.153, "step": 710 }, { "epoch": 1.6729411764705882, "grad_norm": 0.7232550382614136, "learning_rate": 8.290266318182625e-06, "loss": 0.106, "step": 711 }, { "epoch": 1.6752941176470588, "grad_norm": 0.7219283580780029, "learning_rate": 8.285577981264586e-06, "loss": 0.1767, "step": 712 }, { "epoch": 1.6776470588235295, "grad_norm": 0.6322694420814514, "learning_rate": 8.280884554945538e-06, "loss": 0.1676, "step": 713 }, { "epoch": 1.6800000000000002, "grad_norm": 0.6291202306747437, "learning_rate": 8.276186046496765e-06, "loss": 0.1898, "step": 714 }, { "epoch": 1.6823529411764706, "grad_norm": 0.6266096830368042, "learning_rate": 8.27148246319742e-06, "loss": 0.1229, "step": 715 }, { "epoch": 1.684705882352941, "grad_norm": 0.67790687084198, "learning_rate": 8.266773812334523e-06, "loss": 0.1669, "step": 716 }, { "epoch": 1.6870588235294117, "grad_norm": 1.0965697765350342, "learning_rate": 8.262060101202945e-06, "loss": 0.16, "step": 717 }, { "epoch": 1.6894117647058824, "grad_norm": 0.8378317356109619, "learning_rate": 8.257341337105388e-06, "loss": 0.16, "step": 718 }, { "epoch": 1.691764705882353, "grad_norm": 0.7140002250671387, "learning_rate": 8.252617527352394e-06, "loss": 0.2371, "step": 719 }, { "epoch": 1.6941176470588235, "grad_norm": 0.7533055543899536, "learning_rate": 8.247888679262312e-06, "loss": 0.1682, "step": 720 }, { "epoch": 1.696470588235294, "grad_norm": 0.6073035597801208, "learning_rate": 8.243154800161305e-06, "loss": 0.1651, "step": 721 }, { "epoch": 1.6988235294117646, "grad_norm": 0.6602287888526917, "learning_rate": 8.238415897383325e-06, "loss": 0.2139, "step": 722 }, { "epoch": 1.7011764705882353, "grad_norm": 0.6412165760993958, "learning_rate": 8.233671978270109e-06, "loss": 0.1658, "step": 723 }, { "epoch": 1.703529411764706, "grad_norm": 0.696045458316803, "learning_rate": 8.228923050171166e-06, "loss": 0.1436, "step": 724 }, { "epoch": 1.7058823529411766, "grad_norm": 0.6483584642410278, "learning_rate": 8.224169120443759e-06, "loss": 0.1863, "step": 725 }, { "epoch": 1.708235294117647, "grad_norm": 0.756818950176239, "learning_rate": 8.21941019645291e-06, "loss": 0.1958, "step": 726 }, { "epoch": 1.7105882352941175, "grad_norm": 0.7878131866455078, "learning_rate": 8.214646285571371e-06, "loss": 0.2279, "step": 727 }, { "epoch": 1.7129411764705882, "grad_norm": 0.5739184617996216, "learning_rate": 8.209877395179625e-06, "loss": 0.1045, "step": 728 }, { "epoch": 1.7152941176470589, "grad_norm": 0.6942077875137329, "learning_rate": 8.205103532665863e-06, "loss": 0.1513, "step": 729 }, { "epoch": 1.7176470588235295, "grad_norm": 0.7251983284950256, "learning_rate": 8.200324705425987e-06, "loss": 0.1403, "step": 730 }, { "epoch": 1.72, "grad_norm": 0.6048579216003418, "learning_rate": 8.195540920863579e-06, "loss": 0.1464, "step": 731 }, { "epoch": 1.7223529411764706, "grad_norm": 0.5932012796401978, "learning_rate": 8.190752186389913e-06, "loss": 0.1496, "step": 732 }, { "epoch": 1.724705882352941, "grad_norm": 0.6466701030731201, "learning_rate": 8.185958509423928e-06, "loss": 0.1605, "step": 733 }, { "epoch": 1.7270588235294118, "grad_norm": 0.8789752721786499, "learning_rate": 8.18115989739222e-06, "loss": 0.1659, "step": 734 }, { "epoch": 1.7294117647058824, "grad_norm": 0.6762556433677673, "learning_rate": 8.176356357729027e-06, "loss": 0.1587, "step": 735 }, { "epoch": 1.731764705882353, "grad_norm": 0.6186724305152893, "learning_rate": 8.171547897876225e-06, "loss": 0.1801, "step": 736 }, { "epoch": 1.7341176470588235, "grad_norm": 0.5944317579269409, "learning_rate": 8.166734525283312e-06, "loss": 0.1422, "step": 737 }, { "epoch": 1.736470588235294, "grad_norm": 0.6941853761672974, "learning_rate": 8.161916247407395e-06, "loss": 0.2174, "step": 738 }, { "epoch": 1.7388235294117647, "grad_norm": 0.7367253303527832, "learning_rate": 8.157093071713183e-06, "loss": 0.2138, "step": 739 }, { "epoch": 1.7411764705882353, "grad_norm": 0.7039114236831665, "learning_rate": 8.152265005672973e-06, "loss": 0.169, "step": 740 }, { "epoch": 1.743529411764706, "grad_norm": 0.6248987317085266, "learning_rate": 8.147432056766637e-06, "loss": 0.1685, "step": 741 }, { "epoch": 1.7458823529411764, "grad_norm": 0.7347227334976196, "learning_rate": 8.14259423248161e-06, "loss": 0.1536, "step": 742 }, { "epoch": 1.748235294117647, "grad_norm": 0.6044418811798096, "learning_rate": 8.137751540312887e-06, "loss": 0.1859, "step": 743 }, { "epoch": 1.7505882352941176, "grad_norm": 0.7060794234275818, "learning_rate": 8.132903987762999e-06, "loss": 0.2022, "step": 744 }, { "epoch": 1.7529411764705882, "grad_norm": 0.6494474411010742, "learning_rate": 8.128051582342e-06, "loss": 0.1441, "step": 745 }, { "epoch": 1.755294117647059, "grad_norm": 0.6881875991821289, "learning_rate": 8.12319433156748e-06, "loss": 0.1809, "step": 746 }, { "epoch": 1.7576470588235296, "grad_norm": 0.8490986227989197, "learning_rate": 8.118332242964522e-06, "loss": 0.1746, "step": 747 }, { "epoch": 1.76, "grad_norm": 0.6018800139427185, "learning_rate": 8.113465324065707e-06, "loss": 0.0853, "step": 748 }, { "epoch": 1.7623529411764705, "grad_norm": 0.6241636872291565, "learning_rate": 8.108593582411101e-06, "loss": 0.1503, "step": 749 }, { "epoch": 1.7647058823529411, "grad_norm": 0.6205480098724365, "learning_rate": 8.103717025548241e-06, "loss": 0.1563, "step": 750 }, { "epoch": 1.7670588235294118, "grad_norm": 0.6177165508270264, "learning_rate": 8.098835661032125e-06, "loss": 0.1728, "step": 751 }, { "epoch": 1.7694117647058825, "grad_norm": 0.6711541414260864, "learning_rate": 8.093949496425193e-06, "loss": 0.1318, "step": 752 }, { "epoch": 1.771764705882353, "grad_norm": 0.7834896445274353, "learning_rate": 8.089058539297333e-06, "loss": 0.1553, "step": 753 }, { "epoch": 1.7741176470588236, "grad_norm": 0.5765713453292847, "learning_rate": 8.084162797225847e-06, "loss": 0.1308, "step": 754 }, { "epoch": 1.776470588235294, "grad_norm": 0.670950174331665, "learning_rate": 8.079262277795456e-06, "loss": 0.1756, "step": 755 }, { "epoch": 1.7788235294117647, "grad_norm": 0.6155443787574768, "learning_rate": 8.07435698859828e-06, "loss": 0.185, "step": 756 }, { "epoch": 1.7811764705882354, "grad_norm": 0.7138822078704834, "learning_rate": 8.06944693723383e-06, "loss": 0.1353, "step": 757 }, { "epoch": 1.783529411764706, "grad_norm": 0.6463339328765869, "learning_rate": 8.064532131308994e-06, "loss": 0.2453, "step": 758 }, { "epoch": 1.7858823529411765, "grad_norm": 0.6775984168052673, "learning_rate": 8.059612578438025e-06, "loss": 0.1562, "step": 759 }, { "epoch": 1.788235294117647, "grad_norm": 0.663791298866272, "learning_rate": 8.054688286242531e-06, "loss": 0.1794, "step": 760 }, { "epoch": 1.7905882352941176, "grad_norm": 0.6618211269378662, "learning_rate": 8.049759262351466e-06, "loss": 0.1537, "step": 761 }, { "epoch": 1.7929411764705883, "grad_norm": 0.6180475950241089, "learning_rate": 8.044825514401106e-06, "loss": 0.1299, "step": 762 }, { "epoch": 1.795294117647059, "grad_norm": 0.7657858729362488, "learning_rate": 8.039887050035053e-06, "loss": 0.1831, "step": 763 }, { "epoch": 1.7976470588235294, "grad_norm": 0.6659495234489441, "learning_rate": 8.034943876904217e-06, "loss": 0.1857, "step": 764 }, { "epoch": 1.8, "grad_norm": 0.7488622665405273, "learning_rate": 8.029996002666797e-06, "loss": 0.19, "step": 765 }, { "epoch": 1.8023529411764705, "grad_norm": 0.7977464199066162, "learning_rate": 8.025043434988277e-06, "loss": 0.1649, "step": 766 }, { "epoch": 1.8047058823529412, "grad_norm": 0.6834933161735535, "learning_rate": 8.020086181541414e-06, "loss": 0.1137, "step": 767 }, { "epoch": 1.8070588235294118, "grad_norm": 0.5890786647796631, "learning_rate": 8.015124250006226e-06, "loss": 0.142, "step": 768 }, { "epoch": 1.8094117647058825, "grad_norm": 0.762107253074646, "learning_rate": 8.010157648069976e-06, "loss": 0.1555, "step": 769 }, { "epoch": 1.811764705882353, "grad_norm": 0.6065816879272461, "learning_rate": 8.005186383427159e-06, "loss": 0.1456, "step": 770 }, { "epoch": 1.8141176470588234, "grad_norm": 0.7715053558349609, "learning_rate": 8.0002104637795e-06, "loss": 0.2143, "step": 771 }, { "epoch": 1.816470588235294, "grad_norm": 0.6369625926017761, "learning_rate": 7.995229896835935e-06, "loss": 0.1657, "step": 772 }, { "epoch": 1.8188235294117647, "grad_norm": 0.8399288654327393, "learning_rate": 7.990244690312596e-06, "loss": 0.1446, "step": 773 }, { "epoch": 1.8211764705882354, "grad_norm": 0.663124144077301, "learning_rate": 7.985254851932806e-06, "loss": 0.1903, "step": 774 }, { "epoch": 1.8235294117647058, "grad_norm": 0.7091472148895264, "learning_rate": 7.980260389427063e-06, "loss": 0.1301, "step": 775 }, { "epoch": 1.8258823529411765, "grad_norm": 0.7239973545074463, "learning_rate": 7.975261310533029e-06, "loss": 0.1663, "step": 776 }, { "epoch": 1.828235294117647, "grad_norm": 0.6638039946556091, "learning_rate": 7.970257622995516e-06, "loss": 0.1782, "step": 777 }, { "epoch": 1.8305882352941176, "grad_norm": 0.6879507303237915, "learning_rate": 7.965249334566478e-06, "loss": 0.1932, "step": 778 }, { "epoch": 1.8329411764705883, "grad_norm": 0.8674580454826355, "learning_rate": 7.960236453005e-06, "loss": 0.1511, "step": 779 }, { "epoch": 1.835294117647059, "grad_norm": 0.7680577635765076, "learning_rate": 7.955218986077274e-06, "loss": 0.1853, "step": 780 }, { "epoch": 1.8376470588235294, "grad_norm": 0.6767309308052063, "learning_rate": 7.950196941556608e-06, "loss": 0.1668, "step": 781 }, { "epoch": 1.8399999999999999, "grad_norm": 0.6047348976135254, "learning_rate": 7.945170327223387e-06, "loss": 0.1911, "step": 782 }, { "epoch": 1.8423529411764705, "grad_norm": 0.7597337961196899, "learning_rate": 7.940139150865095e-06, "loss": 0.2077, "step": 783 }, { "epoch": 1.8447058823529412, "grad_norm": 0.7149922847747803, "learning_rate": 7.93510342027626e-06, "loss": 0.1689, "step": 784 }, { "epoch": 1.8470588235294119, "grad_norm": 0.6422591209411621, "learning_rate": 7.930063143258489e-06, "loss": 0.1421, "step": 785 }, { "epoch": 1.8494117647058823, "grad_norm": 0.5299738645553589, "learning_rate": 7.925018327620417e-06, "loss": 0.0903, "step": 786 }, { "epoch": 1.851764705882353, "grad_norm": 0.7970730066299438, "learning_rate": 7.919968981177714e-06, "loss": 0.1904, "step": 787 }, { "epoch": 1.8541176470588234, "grad_norm": 0.5970218181610107, "learning_rate": 7.914915111753074e-06, "loss": 0.1882, "step": 788 }, { "epoch": 1.856470588235294, "grad_norm": 0.7526482343673706, "learning_rate": 7.909856727176191e-06, "loss": 0.1673, "step": 789 }, { "epoch": 1.8588235294117648, "grad_norm": 0.5920747518539429, "learning_rate": 7.904793835283764e-06, "loss": 0.1589, "step": 790 }, { "epoch": 1.8611764705882354, "grad_norm": 0.5940408110618591, "learning_rate": 7.899726443919464e-06, "loss": 0.1654, "step": 791 }, { "epoch": 1.8635294117647059, "grad_norm": 0.6998118162155151, "learning_rate": 7.894654560933939e-06, "loss": 0.1695, "step": 792 }, { "epoch": 1.8658823529411763, "grad_norm": 0.6340261697769165, "learning_rate": 7.889578194184793e-06, "loss": 0.1841, "step": 793 }, { "epoch": 1.868235294117647, "grad_norm": 0.6209082007408142, "learning_rate": 7.88449735153658e-06, "loss": 0.1113, "step": 794 }, { "epoch": 1.8705882352941177, "grad_norm": 0.5461750626564026, "learning_rate": 7.879412040860787e-06, "loss": 0.1233, "step": 795 }, { "epoch": 1.8729411764705883, "grad_norm": 0.6491843461990356, "learning_rate": 7.874322270035818e-06, "loss": 0.1672, "step": 796 }, { "epoch": 1.8752941176470588, "grad_norm": 0.6836069822311401, "learning_rate": 7.869228046946997e-06, "loss": 0.1469, "step": 797 }, { "epoch": 1.8776470588235294, "grad_norm": 0.5914386510848999, "learning_rate": 7.864129379486533e-06, "loss": 0.1507, "step": 798 }, { "epoch": 1.88, "grad_norm": 0.6260696649551392, "learning_rate": 7.859026275553535e-06, "loss": 0.1159, "step": 799 }, { "epoch": 1.8823529411764706, "grad_norm": 1.7066631317138672, "learning_rate": 7.853918743053968e-06, "loss": 0.1506, "step": 800 }, { "epoch": 1.8823529411764706, "eval_loss": 0.18304488062858582, "eval_runtime": 3.8615, "eval_samples_per_second": 28.745, "eval_steps_per_second": 1.036, "step": 800 }, { "epoch": 1.8847058823529412, "grad_norm": 0.5768073797225952, "learning_rate": 7.848806789900676e-06, "loss": 0.1139, "step": 801 }, { "epoch": 1.887058823529412, "grad_norm": 0.6950066089630127, "learning_rate": 7.843690424013336e-06, "loss": 0.1903, "step": 802 }, { "epoch": 1.8894117647058823, "grad_norm": 0.6746254563331604, "learning_rate": 7.838569653318474e-06, "loss": 0.1057, "step": 803 }, { "epoch": 1.8917647058823528, "grad_norm": 0.6903824210166931, "learning_rate": 7.83344448574943e-06, "loss": 0.1482, "step": 804 }, { "epoch": 1.8941176470588235, "grad_norm": 0.693085253238678, "learning_rate": 7.828314929246366e-06, "loss": 0.1545, "step": 805 }, { "epoch": 1.8964705882352941, "grad_norm": 0.6785297989845276, "learning_rate": 7.823180991756231e-06, "loss": 0.1697, "step": 806 }, { "epoch": 1.8988235294117648, "grad_norm": 0.7329038381576538, "learning_rate": 7.818042681232775e-06, "loss": 0.2125, "step": 807 }, { "epoch": 1.9011764705882352, "grad_norm": 0.5725163817405701, "learning_rate": 7.81290000563651e-06, "loss": 0.1277, "step": 808 }, { "epoch": 1.903529411764706, "grad_norm": 0.6030727624893188, "learning_rate": 7.80775297293472e-06, "loss": 0.1381, "step": 809 }, { "epoch": 1.9058823529411764, "grad_norm": 1.1458784341812134, "learning_rate": 7.802601591101439e-06, "loss": 0.1913, "step": 810 }, { "epoch": 1.908235294117647, "grad_norm": 0.5845016837120056, "learning_rate": 7.79744586811743e-06, "loss": 0.1168, "step": 811 }, { "epoch": 1.9105882352941177, "grad_norm": 0.6708900332450867, "learning_rate": 7.792285811970188e-06, "loss": 0.1859, "step": 812 }, { "epoch": 1.9129411764705884, "grad_norm": 0.6742315292358398, "learning_rate": 7.787121430653925e-06, "loss": 0.1423, "step": 813 }, { "epoch": 1.9152941176470588, "grad_norm": 0.6809149980545044, "learning_rate": 7.781952732169545e-06, "loss": 0.1563, "step": 814 }, { "epoch": 1.9176470588235293, "grad_norm": 0.7855863571166992, "learning_rate": 7.776779724524645e-06, "loss": 0.1642, "step": 815 }, { "epoch": 1.92, "grad_norm": 0.5781853199005127, "learning_rate": 7.7716024157335e-06, "loss": 0.1369, "step": 816 }, { "epoch": 1.9223529411764706, "grad_norm": 0.6882116794586182, "learning_rate": 7.766420813817045e-06, "loss": 0.1717, "step": 817 }, { "epoch": 1.9247058823529413, "grad_norm": 0.7523902058601379, "learning_rate": 7.761234926802864e-06, "loss": 0.1131, "step": 818 }, { "epoch": 1.9270588235294117, "grad_norm": 0.7014837265014648, "learning_rate": 7.756044762725188e-06, "loss": 0.154, "step": 819 }, { "epoch": 1.9294117647058824, "grad_norm": 0.7352690100669861, "learning_rate": 7.750850329624868e-06, "loss": 0.1859, "step": 820 }, { "epoch": 1.9317647058823528, "grad_norm": 0.6227951049804688, "learning_rate": 7.745651635549368e-06, "loss": 0.179, "step": 821 }, { "epoch": 1.9341176470588235, "grad_norm": 0.7432917952537537, "learning_rate": 7.740448688552753e-06, "loss": 0.1926, "step": 822 }, { "epoch": 1.9364705882352942, "grad_norm": 0.7734039425849915, "learning_rate": 7.735241496695686e-06, "loss": 0.1409, "step": 823 }, { "epoch": 1.9388235294117648, "grad_norm": 0.7537693977355957, "learning_rate": 7.730030068045393e-06, "loss": 0.1498, "step": 824 }, { "epoch": 1.9411764705882353, "grad_norm": 0.6660434007644653, "learning_rate": 7.724814410675674e-06, "loss": 0.1592, "step": 825 }, { "epoch": 1.9435294117647057, "grad_norm": 0.5958448052406311, "learning_rate": 7.719594532666872e-06, "loss": 0.1194, "step": 826 }, { "epoch": 1.9458823529411764, "grad_norm": 0.5992336273193359, "learning_rate": 7.714370442105882e-06, "loss": 0.1434, "step": 827 }, { "epoch": 1.948235294117647, "grad_norm": 0.5783964395523071, "learning_rate": 7.70914214708611e-06, "loss": 0.1333, "step": 828 }, { "epoch": 1.9505882352941177, "grad_norm": 0.7397099733352661, "learning_rate": 7.703909655707482e-06, "loss": 0.1142, "step": 829 }, { "epoch": 1.9529411764705882, "grad_norm": 0.692244291305542, "learning_rate": 7.698672976076426e-06, "loss": 0.2046, "step": 830 }, { "epoch": 1.9552941176470588, "grad_norm": 0.6519684195518494, "learning_rate": 7.693432116305861e-06, "loss": 0.1438, "step": 831 }, { "epoch": 1.9576470588235293, "grad_norm": 1.1128325462341309, "learning_rate": 7.68818708451518e-06, "loss": 0.1646, "step": 832 }, { "epoch": 1.96, "grad_norm": 0.6476311087608337, "learning_rate": 7.682937888830234e-06, "loss": 0.1598, "step": 833 }, { "epoch": 1.9623529411764706, "grad_norm": 0.7262199521064758, "learning_rate": 7.677684537383334e-06, "loss": 0.126, "step": 834 }, { "epoch": 1.9647058823529413, "grad_norm": 0.6778689026832581, "learning_rate": 7.672427038313223e-06, "loss": 0.1632, "step": 835 }, { "epoch": 1.9670588235294117, "grad_norm": 0.6498765349388123, "learning_rate": 7.667165399765073e-06, "loss": 0.1379, "step": 836 }, { "epoch": 1.9694117647058822, "grad_norm": 0.6398577094078064, "learning_rate": 7.661899629890465e-06, "loss": 0.1975, "step": 837 }, { "epoch": 1.9717647058823529, "grad_norm": 0.5912958383560181, "learning_rate": 7.656629736847388e-06, "loss": 0.1781, "step": 838 }, { "epoch": 1.9741176470588235, "grad_norm": 0.6798421740531921, "learning_rate": 7.65135572880021e-06, "loss": 0.15, "step": 839 }, { "epoch": 1.9764705882352942, "grad_norm": 0.6837742328643799, "learning_rate": 7.646077613919682e-06, "loss": 0.1179, "step": 840 }, { "epoch": 1.9788235294117649, "grad_norm": 0.887593150138855, "learning_rate": 7.640795400382909e-06, "loss": 0.1521, "step": 841 }, { "epoch": 1.9811764705882353, "grad_norm": 0.6628987789154053, "learning_rate": 7.635509096373353e-06, "loss": 0.1693, "step": 842 }, { "epoch": 1.9835294117647058, "grad_norm": 0.7492585778236389, "learning_rate": 7.630218710080813e-06, "loss": 0.1619, "step": 843 }, { "epoch": 1.9858823529411764, "grad_norm": 0.6969640254974365, "learning_rate": 7.624924249701404e-06, "loss": 0.1595, "step": 844 }, { "epoch": 1.988235294117647, "grad_norm": 0.6538494825363159, "learning_rate": 7.619625723437567e-06, "loss": 0.198, "step": 845 }, { "epoch": 1.9905882352941178, "grad_norm": 0.7576302886009216, "learning_rate": 7.61432313949803e-06, "loss": 0.182, "step": 846 }, { "epoch": 1.9929411764705882, "grad_norm": 1.8366644382476807, "learning_rate": 7.60901650609781e-06, "loss": 0.1297, "step": 847 }, { "epoch": 1.9952941176470587, "grad_norm": 0.5837238430976868, "learning_rate": 7.603705831458199e-06, "loss": 0.1776, "step": 848 }, { "epoch": 1.9976470588235293, "grad_norm": 0.7496540546417236, "learning_rate": 7.598391123806754e-06, "loss": 0.1957, "step": 849 }, { "epoch": 2.0, "grad_norm": 0.5854604840278625, "learning_rate": 7.593072391377273e-06, "loss": 0.1194, "step": 850 }, { "epoch": 2.0023529411764707, "grad_norm": 0.5445462465286255, "learning_rate": 7.587749642409792e-06, "loss": 0.1312, "step": 851 }, { "epoch": 2.0047058823529413, "grad_norm": 0.5629523396492004, "learning_rate": 7.582422885150571e-06, "loss": 0.1155, "step": 852 }, { "epoch": 2.0070588235294116, "grad_norm": 0.5732834935188293, "learning_rate": 7.577092127852079e-06, "loss": 0.1014, "step": 853 }, { "epoch": 2.0094117647058822, "grad_norm": 0.7670931220054626, "learning_rate": 7.571757378772982e-06, "loss": 0.1018, "step": 854 }, { "epoch": 2.011764705882353, "grad_norm": 0.5530030131340027, "learning_rate": 7.566418646178128e-06, "loss": 0.097, "step": 855 }, { "epoch": 2.0141176470588236, "grad_norm": 0.5344820618629456, "learning_rate": 7.561075938338543e-06, "loss": 0.0739, "step": 856 }, { "epoch": 2.0164705882352942, "grad_norm": 0.8667047023773193, "learning_rate": 7.555729263531404e-06, "loss": 0.1126, "step": 857 }, { "epoch": 2.018823529411765, "grad_norm": 1.2236225605010986, "learning_rate": 7.550378630040036e-06, "loss": 0.0955, "step": 858 }, { "epoch": 2.021176470588235, "grad_norm": 0.894258439540863, "learning_rate": 7.5450240461539025e-06, "loss": 0.1288, "step": 859 }, { "epoch": 2.023529411764706, "grad_norm": 0.8086972832679749, "learning_rate": 7.5396655201685785e-06, "loss": 0.1208, "step": 860 }, { "epoch": 2.0258823529411765, "grad_norm": 0.7578272223472595, "learning_rate": 7.534303060385754e-06, "loss": 0.0876, "step": 861 }, { "epoch": 2.028235294117647, "grad_norm": 0.7803409695625305, "learning_rate": 7.528936675113205e-06, "loss": 0.0963, "step": 862 }, { "epoch": 2.030588235294118, "grad_norm": 0.7029657363891602, "learning_rate": 7.523566372664798e-06, "loss": 0.0931, "step": 863 }, { "epoch": 2.032941176470588, "grad_norm": 0.6954129338264465, "learning_rate": 7.518192161360464e-06, "loss": 0.1169, "step": 864 }, { "epoch": 2.0352941176470587, "grad_norm": 0.7383202314376831, "learning_rate": 7.51281404952619e-06, "loss": 0.1212, "step": 865 }, { "epoch": 2.0376470588235294, "grad_norm": 0.6631200909614563, "learning_rate": 7.5074320454940044e-06, "loss": 0.0948, "step": 866 }, { "epoch": 2.04, "grad_norm": 0.6731775403022766, "learning_rate": 7.502046157601968e-06, "loss": 0.1006, "step": 867 }, { "epoch": 2.0423529411764707, "grad_norm": 0.7255163788795471, "learning_rate": 7.496656394194155e-06, "loss": 0.1215, "step": 868 }, { "epoch": 2.0447058823529414, "grad_norm": 0.632131040096283, "learning_rate": 7.491262763620652e-06, "loss": 0.1041, "step": 869 }, { "epoch": 2.0470588235294116, "grad_norm": 0.6843262910842896, "learning_rate": 7.485865274237525e-06, "loss": 0.1156, "step": 870 }, { "epoch": 2.0494117647058823, "grad_norm": 0.7596901655197144, "learning_rate": 7.4804639344068295e-06, "loss": 0.0964, "step": 871 }, { "epoch": 2.051764705882353, "grad_norm": 0.6035149097442627, "learning_rate": 7.475058752496578e-06, "loss": 0.1161, "step": 872 }, { "epoch": 2.0541176470588236, "grad_norm": 0.9486169219017029, "learning_rate": 7.469649736880739e-06, "loss": 0.0713, "step": 873 }, { "epoch": 2.0564705882352943, "grad_norm": 0.6306257247924805, "learning_rate": 7.464236895939224e-06, "loss": 0.1333, "step": 874 }, { "epoch": 2.0588235294117645, "grad_norm": 0.7045282125473022, "learning_rate": 7.458820238057862e-06, "loss": 0.1079, "step": 875 }, { "epoch": 2.061176470588235, "grad_norm": 0.8000248670578003, "learning_rate": 7.453399771628403e-06, "loss": 0.143, "step": 876 }, { "epoch": 2.063529411764706, "grad_norm": 0.6590808033943176, "learning_rate": 7.447975505048493e-06, "loss": 0.1008, "step": 877 }, { "epoch": 2.0658823529411765, "grad_norm": 0.6255709528923035, "learning_rate": 7.442547446721666e-06, "loss": 0.0845, "step": 878 }, { "epoch": 2.068235294117647, "grad_norm": 0.7033380270004272, "learning_rate": 7.437115605057335e-06, "loss": 0.1354, "step": 879 }, { "epoch": 2.070588235294118, "grad_norm": 0.633857786655426, "learning_rate": 7.431679988470765e-06, "loss": 0.0932, "step": 880 }, { "epoch": 2.072941176470588, "grad_norm": 0.7986199259757996, "learning_rate": 7.4262406053830825e-06, "loss": 0.1192, "step": 881 }, { "epoch": 2.0752941176470587, "grad_norm": 0.685253381729126, "learning_rate": 7.420797464221235e-06, "loss": 0.0944, "step": 882 }, { "epoch": 2.0776470588235294, "grad_norm": 0.6974480748176575, "learning_rate": 7.415350573418004e-06, "loss": 0.1071, "step": 883 }, { "epoch": 2.08, "grad_norm": 0.7460752725601196, "learning_rate": 7.409899941411974e-06, "loss": 0.0757, "step": 884 }, { "epoch": 2.0823529411764707, "grad_norm": 0.595422089099884, "learning_rate": 7.404445576647522e-06, "loss": 0.0831, "step": 885 }, { "epoch": 2.084705882352941, "grad_norm": 0.6460819244384766, "learning_rate": 7.398987487574818e-06, "loss": 0.1041, "step": 886 }, { "epoch": 2.0870588235294116, "grad_norm": 0.8694797158241272, "learning_rate": 7.393525682649792e-06, "loss": 0.1299, "step": 887 }, { "epoch": 2.0894117647058823, "grad_norm": 0.7238962054252625, "learning_rate": 7.388060170334139e-06, "loss": 0.1097, "step": 888 }, { "epoch": 2.091764705882353, "grad_norm": 0.6399598717689514, "learning_rate": 7.382590959095293e-06, "loss": 0.095, "step": 889 }, { "epoch": 2.0941176470588236, "grad_norm": 0.6618461608886719, "learning_rate": 7.37711805740642e-06, "loss": 0.1021, "step": 890 }, { "epoch": 2.0964705882352943, "grad_norm": 0.6150338053703308, "learning_rate": 7.3716414737463985e-06, "loss": 0.1152, "step": 891 }, { "epoch": 2.0988235294117645, "grad_norm": 0.6016033887863159, "learning_rate": 7.366161216599821e-06, "loss": 0.0823, "step": 892 }, { "epoch": 2.101176470588235, "grad_norm": 0.8280043005943298, "learning_rate": 7.360677294456963e-06, "loss": 0.0886, "step": 893 }, { "epoch": 2.103529411764706, "grad_norm": 0.7198240160942078, "learning_rate": 7.355189715813782e-06, "loss": 0.096, "step": 894 }, { "epoch": 2.1058823529411765, "grad_norm": 0.5953574776649475, "learning_rate": 7.349698489171897e-06, "loss": 0.0915, "step": 895 }, { "epoch": 2.108235294117647, "grad_norm": 0.8057647347450256, "learning_rate": 7.344203623038585e-06, "loss": 0.1474, "step": 896 }, { "epoch": 2.1105882352941174, "grad_norm": 0.687349796295166, "learning_rate": 7.33870512592675e-06, "loss": 0.0959, "step": 897 }, { "epoch": 2.112941176470588, "grad_norm": 0.7956629395484924, "learning_rate": 7.333203006354938e-06, "loss": 0.1047, "step": 898 }, { "epoch": 2.1152941176470588, "grad_norm": 0.6993011832237244, "learning_rate": 7.327697272847291e-06, "loss": 0.1091, "step": 899 }, { "epoch": 2.1176470588235294, "grad_norm": 0.6338966488838196, "learning_rate": 7.322187933933557e-06, "loss": 0.1099, "step": 900 }, { "epoch": 2.12, "grad_norm": 0.5996807217597961, "learning_rate": 7.316674998149072e-06, "loss": 0.0962, "step": 901 }, { "epoch": 2.1223529411764708, "grad_norm": 0.6636237502098083, "learning_rate": 7.311158474034736e-06, "loss": 0.1077, "step": 902 }, { "epoch": 2.124705882352941, "grad_norm": 0.6790735125541687, "learning_rate": 7.305638370137021e-06, "loss": 0.0961, "step": 903 }, { "epoch": 2.1270588235294117, "grad_norm": 0.5587031841278076, "learning_rate": 7.300114695007935e-06, "loss": 0.0703, "step": 904 }, { "epoch": 2.1294117647058823, "grad_norm": 0.6436545252799988, "learning_rate": 7.29458745720502e-06, "loss": 0.1022, "step": 905 }, { "epoch": 2.131764705882353, "grad_norm": 0.7564565539360046, "learning_rate": 7.2890566652913396e-06, "loss": 0.1065, "step": 906 }, { "epoch": 2.1341176470588237, "grad_norm": 0.743167519569397, "learning_rate": 7.283522327835464e-06, "loss": 0.1123, "step": 907 }, { "epoch": 2.1364705882352943, "grad_norm": 0.7537655234336853, "learning_rate": 7.2779844534114554e-06, "loss": 0.1266, "step": 908 }, { "epoch": 2.1388235294117646, "grad_norm": 0.7517241835594177, "learning_rate": 7.272443050598853e-06, "loss": 0.092, "step": 909 }, { "epoch": 2.1411764705882352, "grad_norm": 0.6453754901885986, "learning_rate": 7.266898127982668e-06, "loss": 0.1215, "step": 910 }, { "epoch": 2.143529411764706, "grad_norm": 0.5562135577201843, "learning_rate": 7.261349694153359e-06, "loss": 0.0803, "step": 911 }, { "epoch": 2.1458823529411766, "grad_norm": 0.6285406947135925, "learning_rate": 7.25579775770683e-06, "loss": 0.1171, "step": 912 }, { "epoch": 2.1482352941176472, "grad_norm": 0.7562421560287476, "learning_rate": 7.250242327244404e-06, "loss": 0.1013, "step": 913 }, { "epoch": 2.1505882352941175, "grad_norm": 0.6755399107933044, "learning_rate": 7.244683411372826e-06, "loss": 0.095, "step": 914 }, { "epoch": 2.152941176470588, "grad_norm": 0.6340415477752686, "learning_rate": 7.239121018704233e-06, "loss": 0.0912, "step": 915 }, { "epoch": 2.155294117647059, "grad_norm": 0.6292347311973572, "learning_rate": 7.2335551578561525e-06, "loss": 0.1226, "step": 916 }, { "epoch": 2.1576470588235295, "grad_norm": 0.7732092142105103, "learning_rate": 7.227985837451483e-06, "loss": 0.1128, "step": 917 }, { "epoch": 2.16, "grad_norm": 0.7970018982887268, "learning_rate": 7.2224130661184875e-06, "loss": 0.1062, "step": 918 }, { "epoch": 2.1623529411764704, "grad_norm": 0.6562044024467468, "learning_rate": 7.216836852490769e-06, "loss": 0.0896, "step": 919 }, { "epoch": 2.164705882352941, "grad_norm": 0.6592133045196533, "learning_rate": 7.211257205207266e-06, "loss": 0.1091, "step": 920 }, { "epoch": 2.1670588235294117, "grad_norm": 0.5953753590583801, "learning_rate": 7.205674132912237e-06, "loss": 0.0745, "step": 921 }, { "epoch": 2.1694117647058824, "grad_norm": 0.707253634929657, "learning_rate": 7.200087644255248e-06, "loss": 0.0999, "step": 922 }, { "epoch": 2.171764705882353, "grad_norm": 0.6675316095352173, "learning_rate": 7.194497747891155e-06, "loss": 0.0624, "step": 923 }, { "epoch": 2.1741176470588237, "grad_norm": 0.8266602158546448, "learning_rate": 7.188904452480092e-06, "loss": 0.0898, "step": 924 }, { "epoch": 2.176470588235294, "grad_norm": 1.1246784925460815, "learning_rate": 7.183307766687467e-06, "loss": 0.116, "step": 925 }, { "epoch": 2.1788235294117646, "grad_norm": 0.669424295425415, "learning_rate": 7.1777076991839315e-06, "loss": 0.0914, "step": 926 }, { "epoch": 2.1811764705882353, "grad_norm": 0.6950623989105225, "learning_rate": 7.17210425864538e-06, "loss": 0.0753, "step": 927 }, { "epoch": 2.183529411764706, "grad_norm": 0.7236522436141968, "learning_rate": 7.166497453752934e-06, "loss": 0.0853, "step": 928 }, { "epoch": 2.1858823529411766, "grad_norm": 0.6838486790657043, "learning_rate": 7.160887293192924e-06, "loss": 0.0953, "step": 929 }, { "epoch": 2.1882352941176473, "grad_norm": 0.7558231353759766, "learning_rate": 7.155273785656883e-06, "loss": 0.0875, "step": 930 }, { "epoch": 2.1905882352941175, "grad_norm": 0.5589550137519836, "learning_rate": 7.1496569398415265e-06, "loss": 0.0715, "step": 931 }, { "epoch": 2.192941176470588, "grad_norm": 0.7954613566398621, "learning_rate": 7.144036764448738e-06, "loss": 0.1405, "step": 932 }, { "epoch": 2.195294117647059, "grad_norm": 0.7566816806793213, "learning_rate": 7.138413268185571e-06, "loss": 0.1147, "step": 933 }, { "epoch": 2.1976470588235295, "grad_norm": 0.5963475108146667, "learning_rate": 7.1327864597642124e-06, "loss": 0.0963, "step": 934 }, { "epoch": 2.2, "grad_norm": 0.6255870461463928, "learning_rate": 7.127156347901987e-06, "loss": 0.1031, "step": 935 }, { "epoch": 2.2023529411764704, "grad_norm": 0.7973399758338928, "learning_rate": 7.121522941321333e-06, "loss": 0.1044, "step": 936 }, { "epoch": 2.204705882352941, "grad_norm": 0.6446323990821838, "learning_rate": 7.115886248749797e-06, "loss": 0.1178, "step": 937 }, { "epoch": 2.2070588235294117, "grad_norm": 0.6620628833770752, "learning_rate": 7.110246278920015e-06, "loss": 0.0853, "step": 938 }, { "epoch": 2.2094117647058824, "grad_norm": 0.9005703926086426, "learning_rate": 7.104603040569695e-06, "loss": 0.1025, "step": 939 }, { "epoch": 2.211764705882353, "grad_norm": 0.848835289478302, "learning_rate": 7.09895654244162e-06, "loss": 0.1296, "step": 940 }, { "epoch": 2.2141176470588233, "grad_norm": 0.7494860887527466, "learning_rate": 7.0933067932836145e-06, "loss": 0.1137, "step": 941 }, { "epoch": 2.216470588235294, "grad_norm": 0.6608048677444458, "learning_rate": 7.087653801848539e-06, "loss": 0.0838, "step": 942 }, { "epoch": 2.2188235294117646, "grad_norm": 0.6985036134719849, "learning_rate": 7.081997576894285e-06, "loss": 0.1303, "step": 943 }, { "epoch": 2.2211764705882353, "grad_norm": 0.7739919424057007, "learning_rate": 7.076338127183745e-06, "loss": 0.1284, "step": 944 }, { "epoch": 2.223529411764706, "grad_norm": 0.6924099326133728, "learning_rate": 7.070675461484814e-06, "loss": 0.1202, "step": 945 }, { "epoch": 2.2258823529411766, "grad_norm": 0.613657534122467, "learning_rate": 7.065009588570362e-06, "loss": 0.0905, "step": 946 }, { "epoch": 2.228235294117647, "grad_norm": 0.7288751006126404, "learning_rate": 7.059340517218236e-06, "loss": 0.1067, "step": 947 }, { "epoch": 2.2305882352941175, "grad_norm": 0.6184800863265991, "learning_rate": 7.053668256211235e-06, "loss": 0.0872, "step": 948 }, { "epoch": 2.232941176470588, "grad_norm": 0.666199803352356, "learning_rate": 7.047992814337094e-06, "loss": 0.0995, "step": 949 }, { "epoch": 2.235294117647059, "grad_norm": 0.7280573844909668, "learning_rate": 7.042314200388486e-06, "loss": 0.0909, "step": 950 }, { "epoch": 2.2376470588235295, "grad_norm": 0.6218176484107971, "learning_rate": 7.036632423162992e-06, "loss": 0.1035, "step": 951 }, { "epoch": 2.24, "grad_norm": 0.6345543265342712, "learning_rate": 7.030947491463092e-06, "loss": 0.0958, "step": 952 }, { "epoch": 2.2423529411764704, "grad_norm": 0.6349145770072937, "learning_rate": 7.025259414096159e-06, "loss": 0.1007, "step": 953 }, { "epoch": 2.244705882352941, "grad_norm": 0.7322121262550354, "learning_rate": 7.0195681998744345e-06, "loss": 0.1326, "step": 954 }, { "epoch": 2.2470588235294118, "grad_norm": 0.6535943150520325, "learning_rate": 7.013873857615023e-06, "loss": 0.0725, "step": 955 }, { "epoch": 2.2494117647058824, "grad_norm": 0.6348268389701843, "learning_rate": 7.008176396139873e-06, "loss": 0.1232, "step": 956 }, { "epoch": 2.251764705882353, "grad_norm": 0.6840246319770813, "learning_rate": 7.0024758242757675e-06, "loss": 0.11, "step": 957 }, { "epoch": 2.2541176470588233, "grad_norm": 1.3789271116256714, "learning_rate": 6.996772150854305e-06, "loss": 0.0824, "step": 958 }, { "epoch": 2.256470588235294, "grad_norm": 0.8073804378509521, "learning_rate": 6.991065384711893e-06, "loss": 0.1117, "step": 959 }, { "epoch": 2.2588235294117647, "grad_norm": 0.6958234310150146, "learning_rate": 6.985355534689727e-06, "loss": 0.0994, "step": 960 }, { "epoch": 2.2611764705882353, "grad_norm": 0.7071972489356995, "learning_rate": 6.979642609633782e-06, "loss": 0.1084, "step": 961 }, { "epoch": 2.263529411764706, "grad_norm": 0.7164071798324585, "learning_rate": 6.973926618394796e-06, "loss": 0.0999, "step": 962 }, { "epoch": 2.2658823529411762, "grad_norm": 0.839823842048645, "learning_rate": 6.96820756982826e-06, "loss": 0.081, "step": 963 }, { "epoch": 2.268235294117647, "grad_norm": 0.6861598491668701, "learning_rate": 6.9624854727943954e-06, "loss": 0.106, "step": 964 }, { "epoch": 2.2705882352941176, "grad_norm": 0.7795845866203308, "learning_rate": 6.956760336158155e-06, "loss": 0.1007, "step": 965 }, { "epoch": 2.2729411764705882, "grad_norm": 0.6420612335205078, "learning_rate": 6.951032168789192e-06, "loss": 0.1037, "step": 966 }, { "epoch": 2.275294117647059, "grad_norm": 0.6562736630439758, "learning_rate": 6.945300979561865e-06, "loss": 0.1055, "step": 967 }, { "epoch": 2.2776470588235296, "grad_norm": 0.7601194977760315, "learning_rate": 6.939566777355203e-06, "loss": 0.1268, "step": 968 }, { "epoch": 2.2800000000000002, "grad_norm": 0.6228320598602295, "learning_rate": 6.933829571052909e-06, "loss": 0.0543, "step": 969 }, { "epoch": 2.2823529411764705, "grad_norm": 0.6879302859306335, "learning_rate": 6.928089369543342e-06, "loss": 0.0798, "step": 970 }, { "epoch": 2.284705882352941, "grad_norm": 0.7069498896598816, "learning_rate": 6.9223461817194955e-06, "loss": 0.0759, "step": 971 }, { "epoch": 2.287058823529412, "grad_norm": 0.7364022135734558, "learning_rate": 6.916600016478994e-06, "loss": 0.074, "step": 972 }, { "epoch": 2.2894117647058825, "grad_norm": 0.7046099305152893, "learning_rate": 6.910850882724074e-06, "loss": 0.1089, "step": 973 }, { "epoch": 2.291764705882353, "grad_norm": 0.6596800088882446, "learning_rate": 6.9050987893615695e-06, "loss": 0.1077, "step": 974 }, { "epoch": 2.2941176470588234, "grad_norm": 0.8446831703186035, "learning_rate": 6.8993437453029014e-06, "loss": 0.118, "step": 975 }, { "epoch": 2.296470588235294, "grad_norm": 0.7889611124992371, "learning_rate": 6.893585759464059e-06, "loss": 0.0914, "step": 976 }, { "epoch": 2.2988235294117647, "grad_norm": 0.6980134844779968, "learning_rate": 6.887824840765593e-06, "loss": 0.0949, "step": 977 }, { "epoch": 2.3011764705882354, "grad_norm": 0.7179991602897644, "learning_rate": 6.882060998132598e-06, "loss": 0.1352, "step": 978 }, { "epoch": 2.303529411764706, "grad_norm": 0.7880101799964905, "learning_rate": 6.876294240494691e-06, "loss": 0.0933, "step": 979 }, { "epoch": 2.3058823529411763, "grad_norm": 0.8488783240318298, "learning_rate": 6.870524576786018e-06, "loss": 0.0891, "step": 980 }, { "epoch": 2.308235294117647, "grad_norm": 0.6107221245765686, "learning_rate": 6.864752015945213e-06, "loss": 0.1015, "step": 981 }, { "epoch": 2.3105882352941176, "grad_norm": 0.6565219163894653, "learning_rate": 6.858976566915409e-06, "loss": 0.1248, "step": 982 }, { "epoch": 2.3129411764705883, "grad_norm": 0.7723371386528015, "learning_rate": 6.853198238644209e-06, "loss": 0.1219, "step": 983 }, { "epoch": 2.315294117647059, "grad_norm": 0.758979082107544, "learning_rate": 6.847417040083676e-06, "loss": 0.1119, "step": 984 }, { "epoch": 2.317647058823529, "grad_norm": 0.7551476955413818, "learning_rate": 6.8416329801903245e-06, "loss": 0.0837, "step": 985 }, { "epoch": 2.32, "grad_norm": 0.681731641292572, "learning_rate": 6.835846067925096e-06, "loss": 0.1003, "step": 986 }, { "epoch": 2.3223529411764705, "grad_norm": 1.1742265224456787, "learning_rate": 6.830056312253355e-06, "loss": 0.1369, "step": 987 }, { "epoch": 2.324705882352941, "grad_norm": 0.7118970155715942, "learning_rate": 6.824263722144869e-06, "loss": 0.1082, "step": 988 }, { "epoch": 2.327058823529412, "grad_norm": 0.7051188349723816, "learning_rate": 6.818468306573796e-06, "loss": 0.1022, "step": 989 }, { "epoch": 2.3294117647058825, "grad_norm": 0.7028709053993225, "learning_rate": 6.812670074518676e-06, "loss": 0.0958, "step": 990 }, { "epoch": 2.331764705882353, "grad_norm": 0.6872815489768982, "learning_rate": 6.806869034962407e-06, "loss": 0.1031, "step": 991 }, { "epoch": 2.3341176470588234, "grad_norm": 0.68505859375, "learning_rate": 6.801065196892241e-06, "loss": 0.0985, "step": 992 }, { "epoch": 2.336470588235294, "grad_norm": 0.6905984878540039, "learning_rate": 6.7952585692997615e-06, "loss": 0.105, "step": 993 }, { "epoch": 2.3388235294117647, "grad_norm": 0.7626079320907593, "learning_rate": 6.789449161180877e-06, "loss": 0.1244, "step": 994 }, { "epoch": 2.3411764705882354, "grad_norm": 0.6935927271842957, "learning_rate": 6.783636981535802e-06, "loss": 0.0995, "step": 995 }, { "epoch": 2.343529411764706, "grad_norm": 0.6894261837005615, "learning_rate": 6.777822039369043e-06, "loss": 0.107, "step": 996 }, { "epoch": 2.3458823529411763, "grad_norm": 0.7383339405059814, "learning_rate": 6.77200434368939e-06, "loss": 0.1113, "step": 997 }, { "epoch": 2.348235294117647, "grad_norm": 0.7424079775810242, "learning_rate": 6.7661839035099e-06, "loss": 0.0767, "step": 998 }, { "epoch": 2.3505882352941176, "grad_norm": 0.6830792427062988, "learning_rate": 6.760360727847874e-06, "loss": 0.1026, "step": 999 }, { "epoch": 2.3529411764705883, "grad_norm": 0.596807599067688, "learning_rate": 6.754534825724861e-06, "loss": 0.108, "step": 1000 }, { "epoch": 2.3529411764705883, "eval_loss": 0.190450981259346, "eval_runtime": 3.8594, "eval_samples_per_second": 28.761, "eval_steps_per_second": 1.036, "step": 1000 }, { "epoch": 2.355294117647059, "grad_norm": 0.6934550404548645, "learning_rate": 6.748706206166624e-06, "loss": 0.068, "step": 1001 }, { "epoch": 2.357647058823529, "grad_norm": 0.7162528038024902, "learning_rate": 6.742874878203147e-06, "loss": 0.133, "step": 1002 }, { "epoch": 2.36, "grad_norm": 0.7570451498031616, "learning_rate": 6.7370408508686016e-06, "loss": 0.1215, "step": 1003 }, { "epoch": 2.3623529411764705, "grad_norm": 0.5897073149681091, "learning_rate": 6.731204133201342e-06, "loss": 0.0918, "step": 1004 }, { "epoch": 2.364705882352941, "grad_norm": 0.682389497756958, "learning_rate": 6.725364734243893e-06, "loss": 0.0975, "step": 1005 }, { "epoch": 2.367058823529412, "grad_norm": 0.7141888737678528, "learning_rate": 6.719522663042933e-06, "loss": 0.1264, "step": 1006 }, { "epoch": 2.369411764705882, "grad_norm": 0.6601932048797607, "learning_rate": 6.713677928649284e-06, "loss": 0.125, "step": 1007 }, { "epoch": 2.3717647058823528, "grad_norm": 0.6385452747344971, "learning_rate": 6.707830540117882e-06, "loss": 0.0949, "step": 1008 }, { "epoch": 2.3741176470588234, "grad_norm": 0.7025374174118042, "learning_rate": 6.701980506507792e-06, "loss": 0.0995, "step": 1009 }, { "epoch": 2.376470588235294, "grad_norm": 0.6467106938362122, "learning_rate": 6.696127836882159e-06, "loss": 0.1117, "step": 1010 }, { "epoch": 2.378823529411765, "grad_norm": 0.6720057129859924, "learning_rate": 6.690272540308227e-06, "loss": 0.1139, "step": 1011 }, { "epoch": 2.3811764705882354, "grad_norm": 0.5336529612541199, "learning_rate": 6.684414625857301e-06, "loss": 0.0921, "step": 1012 }, { "epoch": 2.383529411764706, "grad_norm": 0.6144494414329529, "learning_rate": 6.678554102604744e-06, "loss": 0.0898, "step": 1013 }, { "epoch": 2.3858823529411763, "grad_norm": 0.5812954902648926, "learning_rate": 6.672690979629961e-06, "loss": 0.1033, "step": 1014 }, { "epoch": 2.388235294117647, "grad_norm": 0.6536157131195068, "learning_rate": 6.666825266016386e-06, "loss": 0.0917, "step": 1015 }, { "epoch": 2.3905882352941177, "grad_norm": 0.6393827795982361, "learning_rate": 6.660956970851462e-06, "loss": 0.07, "step": 1016 }, { "epoch": 2.3929411764705883, "grad_norm": 0.6544288396835327, "learning_rate": 6.6550861032266365e-06, "loss": 0.0923, "step": 1017 }, { "epoch": 2.395294117647059, "grad_norm": 0.7226643562316895, "learning_rate": 6.64921267223734e-06, "loss": 0.0826, "step": 1018 }, { "epoch": 2.3976470588235292, "grad_norm": 0.7735269069671631, "learning_rate": 6.643336686982975e-06, "loss": 0.1076, "step": 1019 }, { "epoch": 2.4, "grad_norm": 0.6615123152732849, "learning_rate": 6.637458156566902e-06, "loss": 0.0998, "step": 1020 }, { "epoch": 2.4023529411764706, "grad_norm": 0.6815060377120972, "learning_rate": 6.631577090096419e-06, "loss": 0.0983, "step": 1021 }, { "epoch": 2.4047058823529412, "grad_norm": 1.133968710899353, "learning_rate": 6.625693496682764e-06, "loss": 0.1077, "step": 1022 }, { "epoch": 2.407058823529412, "grad_norm": 0.7098557949066162, "learning_rate": 6.619807385441076e-06, "loss": 0.1178, "step": 1023 }, { "epoch": 2.409411764705882, "grad_norm": 0.7487027645111084, "learning_rate": 6.6139187654904055e-06, "loss": 0.0944, "step": 1024 }, { "epoch": 2.411764705882353, "grad_norm": 0.6209661960601807, "learning_rate": 6.60802764595369e-06, "loss": 0.0815, "step": 1025 }, { "epoch": 2.4141176470588235, "grad_norm": 0.7824269533157349, "learning_rate": 6.602134035957728e-06, "loss": 0.1256, "step": 1026 }, { "epoch": 2.416470588235294, "grad_norm": 0.7254578471183777, "learning_rate": 6.5962379446331885e-06, "loss": 0.0808, "step": 1027 }, { "epoch": 2.418823529411765, "grad_norm": 0.7608013153076172, "learning_rate": 6.590339381114579e-06, "loss": 0.0851, "step": 1028 }, { "epoch": 2.4211764705882355, "grad_norm": 0.7685977816581726, "learning_rate": 6.584438354540239e-06, "loss": 0.1707, "step": 1029 }, { "epoch": 2.4235294117647057, "grad_norm": 0.6505150198936462, "learning_rate": 6.578534874052324e-06, "loss": 0.1021, "step": 1030 }, { "epoch": 2.4258823529411764, "grad_norm": 0.6475066542625427, "learning_rate": 6.572628948796787e-06, "loss": 0.0943, "step": 1031 }, { "epoch": 2.428235294117647, "grad_norm": 0.7289161086082458, "learning_rate": 6.566720587923373e-06, "loss": 0.1467, "step": 1032 }, { "epoch": 2.4305882352941177, "grad_norm": 0.47064879536628723, "learning_rate": 6.5608098005856e-06, "loss": 0.0603, "step": 1033 }, { "epoch": 2.4329411764705884, "grad_norm": 0.7041738629341125, "learning_rate": 6.5548965959407454e-06, "loss": 0.1279, "step": 1034 }, { "epoch": 2.435294117647059, "grad_norm": 0.724490761756897, "learning_rate": 6.548980983149827e-06, "loss": 0.1191, "step": 1035 }, { "epoch": 2.4376470588235293, "grad_norm": 0.5963623523712158, "learning_rate": 6.543062971377603e-06, "loss": 0.0767, "step": 1036 }, { "epoch": 2.44, "grad_norm": 0.6918096542358398, "learning_rate": 6.537142569792536e-06, "loss": 0.0936, "step": 1037 }, { "epoch": 2.4423529411764706, "grad_norm": 0.6163070201873779, "learning_rate": 6.531219787566798e-06, "loss": 0.0922, "step": 1038 }, { "epoch": 2.4447058823529413, "grad_norm": 0.7517495155334473, "learning_rate": 6.525294633876254e-06, "loss": 0.1189, "step": 1039 }, { "epoch": 2.447058823529412, "grad_norm": 0.5913711786270142, "learning_rate": 6.5193671179004314e-06, "loss": 0.0886, "step": 1040 }, { "epoch": 2.449411764705882, "grad_norm": 0.7871490716934204, "learning_rate": 6.513437248822526e-06, "loss": 0.0952, "step": 1041 }, { "epoch": 2.451764705882353, "grad_norm": 0.6814084053039551, "learning_rate": 6.507505035829378e-06, "loss": 0.1149, "step": 1042 }, { "epoch": 2.4541176470588235, "grad_norm": 0.7997105717658997, "learning_rate": 6.501570488111455e-06, "loss": 0.099, "step": 1043 }, { "epoch": 2.456470588235294, "grad_norm": 0.7341837286949158, "learning_rate": 6.495633614862844e-06, "loss": 0.1258, "step": 1044 }, { "epoch": 2.458823529411765, "grad_norm": 0.651711642742157, "learning_rate": 6.4896944252812356e-06, "loss": 0.1218, "step": 1045 }, { "epoch": 2.461176470588235, "grad_norm": 0.6028974652290344, "learning_rate": 6.483752928567908e-06, "loss": 0.1036, "step": 1046 }, { "epoch": 2.4635294117647057, "grad_norm": 0.6410581469535828, "learning_rate": 6.477809133927716e-06, "loss": 0.1046, "step": 1047 }, { "epoch": 2.4658823529411764, "grad_norm": 0.6583343744277954, "learning_rate": 6.471863050569071e-06, "loss": 0.114, "step": 1048 }, { "epoch": 2.468235294117647, "grad_norm": 0.6923911571502686, "learning_rate": 6.46591468770393e-06, "loss": 0.0993, "step": 1049 }, { "epoch": 2.4705882352941178, "grad_norm": 0.7015352845191956, "learning_rate": 6.459964054547785e-06, "loss": 0.0983, "step": 1050 }, { "epoch": 2.4729411764705884, "grad_norm": 0.6866098642349243, "learning_rate": 6.454011160319645e-06, "loss": 0.1239, "step": 1051 }, { "epoch": 2.4752941176470586, "grad_norm": 0.9945891499519348, "learning_rate": 6.448056014242018e-06, "loss": 0.1007, "step": 1052 }, { "epoch": 2.4776470588235293, "grad_norm": 0.6444399952888489, "learning_rate": 6.442098625540903e-06, "loss": 0.1038, "step": 1053 }, { "epoch": 2.48, "grad_norm": 0.6762149930000305, "learning_rate": 6.436139003445776e-06, "loss": 0.1304, "step": 1054 }, { "epoch": 2.4823529411764707, "grad_norm": 0.7980263233184814, "learning_rate": 6.4301771571895676e-06, "loss": 0.0991, "step": 1055 }, { "epoch": 2.4847058823529413, "grad_norm": 0.6602880954742432, "learning_rate": 6.424213096008658e-06, "loss": 0.0916, "step": 1056 }, { "epoch": 2.487058823529412, "grad_norm": 0.7106368541717529, "learning_rate": 6.418246829142859e-06, "loss": 0.103, "step": 1057 }, { "epoch": 2.489411764705882, "grad_norm": 0.7124637961387634, "learning_rate": 6.412278365835397e-06, "loss": 0.1336, "step": 1058 }, { "epoch": 2.491764705882353, "grad_norm": 0.7071592211723328, "learning_rate": 6.406307715332905e-06, "loss": 0.1171, "step": 1059 }, { "epoch": 2.4941176470588236, "grad_norm": 0.7698739767074585, "learning_rate": 6.4003348868854e-06, "loss": 0.0854, "step": 1060 }, { "epoch": 2.496470588235294, "grad_norm": 0.7598797678947449, "learning_rate": 6.3943598897462765e-06, "loss": 0.1038, "step": 1061 }, { "epoch": 2.498823529411765, "grad_norm": 0.7425053715705872, "learning_rate": 6.38838273317229e-06, "loss": 0.105, "step": 1062 }, { "epoch": 2.501176470588235, "grad_norm": 0.7727761268615723, "learning_rate": 6.382403426423534e-06, "loss": 0.0967, "step": 1063 }, { "epoch": 2.503529411764706, "grad_norm": 0.6403548717498779, "learning_rate": 6.376421978763442e-06, "loss": 0.0784, "step": 1064 }, { "epoch": 2.5058823529411764, "grad_norm": 4.244771480560303, "learning_rate": 6.370438399458761e-06, "loss": 0.0867, "step": 1065 }, { "epoch": 2.508235294117647, "grad_norm": 0.8002198934555054, "learning_rate": 6.364452697779539e-06, "loss": 0.1065, "step": 1066 }, { "epoch": 2.510588235294118, "grad_norm": 0.7619144320487976, "learning_rate": 6.358464882999114e-06, "loss": 0.1284, "step": 1067 }, { "epoch": 2.512941176470588, "grad_norm": 0.6177496314048767, "learning_rate": 6.352474964394097e-06, "loss": 0.0948, "step": 1068 }, { "epoch": 2.5152941176470587, "grad_norm": 0.7970564961433411, "learning_rate": 6.346482951244358e-06, "loss": 0.1336, "step": 1069 }, { "epoch": 2.5176470588235293, "grad_norm": 0.6393101215362549, "learning_rate": 6.340488852833012e-06, "loss": 0.1072, "step": 1070 }, { "epoch": 2.52, "grad_norm": 0.6577503681182861, "learning_rate": 6.334492678446406e-06, "loss": 0.0824, "step": 1071 }, { "epoch": 2.5223529411764707, "grad_norm": 0.677142322063446, "learning_rate": 6.328494437374102e-06, "loss": 0.1033, "step": 1072 }, { "epoch": 2.524705882352941, "grad_norm": 0.708069920539856, "learning_rate": 6.322494138908863e-06, "loss": 0.1038, "step": 1073 }, { "epoch": 2.527058823529412, "grad_norm": 0.7070394158363342, "learning_rate": 6.316491792346639e-06, "loss": 0.105, "step": 1074 }, { "epoch": 2.5294117647058822, "grad_norm": 0.7055957913398743, "learning_rate": 6.310487406986557e-06, "loss": 0.1015, "step": 1075 }, { "epoch": 2.531764705882353, "grad_norm": 0.6629803776741028, "learning_rate": 6.304480992130901e-06, "loss": 0.0837, "step": 1076 }, { "epoch": 2.5341176470588236, "grad_norm": 0.5241256952285767, "learning_rate": 6.298472557085095e-06, "loss": 0.0672, "step": 1077 }, { "epoch": 2.5364705882352943, "grad_norm": 0.6158060431480408, "learning_rate": 6.292462111157695e-06, "loss": 0.0755, "step": 1078 }, { "epoch": 2.538823529411765, "grad_norm": 0.6893629431724548, "learning_rate": 6.286449663660379e-06, "loss": 0.0911, "step": 1079 }, { "epoch": 2.541176470588235, "grad_norm": 0.6985366940498352, "learning_rate": 6.280435223907915e-06, "loss": 0.0904, "step": 1080 }, { "epoch": 2.543529411764706, "grad_norm": 0.7024869918823242, "learning_rate": 6.274418801218165e-06, "loss": 0.1118, "step": 1081 }, { "epoch": 2.5458823529411765, "grad_norm": 0.7790821194648743, "learning_rate": 6.268400404912058e-06, "loss": 0.0996, "step": 1082 }, { "epoch": 2.548235294117647, "grad_norm": 0.7435693144798279, "learning_rate": 6.262380044313587e-06, "loss": 0.1189, "step": 1083 }, { "epoch": 2.550588235294118, "grad_norm": 0.6556516885757446, "learning_rate": 6.256357728749783e-06, "loss": 0.0884, "step": 1084 }, { "epoch": 2.552941176470588, "grad_norm": 0.692373514175415, "learning_rate": 6.250333467550707e-06, "loss": 0.1105, "step": 1085 }, { "epoch": 2.5552941176470587, "grad_norm": 0.665573000907898, "learning_rate": 6.2443072700494345e-06, "loss": 0.0881, "step": 1086 }, { "epoch": 2.5576470588235294, "grad_norm": 0.7181094884872437, "learning_rate": 6.238279145582042e-06, "loss": 0.1024, "step": 1087 }, { "epoch": 2.56, "grad_norm": 0.6073662638664246, "learning_rate": 6.23224910348759e-06, "loss": 0.0937, "step": 1088 }, { "epoch": 2.5623529411764707, "grad_norm": 0.7319949269294739, "learning_rate": 6.226217153108108e-06, "loss": 0.0823, "step": 1089 }, { "epoch": 2.564705882352941, "grad_norm": 0.8849136829376221, "learning_rate": 6.220183303788587e-06, "loss": 0.1012, "step": 1090 }, { "epoch": 2.5670588235294116, "grad_norm": 0.714245080947876, "learning_rate": 6.214147564876956e-06, "loss": 0.1117, "step": 1091 }, { "epoch": 2.5694117647058823, "grad_norm": 0.9130730032920837, "learning_rate": 6.208109945724069e-06, "loss": 0.1413, "step": 1092 }, { "epoch": 2.571764705882353, "grad_norm": 0.7421652674674988, "learning_rate": 6.202070455683702e-06, "loss": 0.1289, "step": 1093 }, { "epoch": 2.5741176470588236, "grad_norm": 0.6999459266662598, "learning_rate": 6.19602910411252e-06, "loss": 0.112, "step": 1094 }, { "epoch": 2.576470588235294, "grad_norm": 0.6535015106201172, "learning_rate": 6.1899859003700764e-06, "loss": 0.0896, "step": 1095 }, { "epoch": 2.578823529411765, "grad_norm": 0.7015939950942993, "learning_rate": 6.183940853818794e-06, "loss": 0.0783, "step": 1096 }, { "epoch": 2.581176470588235, "grad_norm": 0.7158987522125244, "learning_rate": 6.177893973823951e-06, "loss": 0.1121, "step": 1097 }, { "epoch": 2.583529411764706, "grad_norm": 0.7082965970039368, "learning_rate": 6.171845269753663e-06, "loss": 0.0819, "step": 1098 }, { "epoch": 2.5858823529411765, "grad_norm": 0.6341986656188965, "learning_rate": 6.165794750978876e-06, "loss": 0.0925, "step": 1099 }, { "epoch": 2.588235294117647, "grad_norm": 0.6780461668968201, "learning_rate": 6.159742426873341e-06, "loss": 0.0933, "step": 1100 }, { "epoch": 2.590588235294118, "grad_norm": 0.5701039433479309, "learning_rate": 6.1536883068136156e-06, "loss": 0.0733, "step": 1101 }, { "epoch": 2.592941176470588, "grad_norm": 0.8025659322738647, "learning_rate": 6.14763240017903e-06, "loss": 0.1203, "step": 1102 }, { "epoch": 2.5952941176470588, "grad_norm": 1.3901091814041138, "learning_rate": 6.141574716351688e-06, "loss": 0.1092, "step": 1103 }, { "epoch": 2.5976470588235294, "grad_norm": 0.7321634888648987, "learning_rate": 6.135515264716444e-06, "loss": 0.1271, "step": 1104 }, { "epoch": 2.6, "grad_norm": 0.6548156142234802, "learning_rate": 6.129454054660893e-06, "loss": 0.0917, "step": 1105 }, { "epoch": 2.6023529411764708, "grad_norm": 0.6152363419532776, "learning_rate": 6.123391095575354e-06, "loss": 0.0931, "step": 1106 }, { "epoch": 2.604705882352941, "grad_norm": 0.6835276484489441, "learning_rate": 6.117326396852853e-06, "loss": 0.097, "step": 1107 }, { "epoch": 2.6070588235294117, "grad_norm": 0.8003057837486267, "learning_rate": 6.111259967889117e-06, "loss": 0.1074, "step": 1108 }, { "epoch": 2.6094117647058823, "grad_norm": 0.7461892366409302, "learning_rate": 6.105191818082549e-06, "loss": 0.1141, "step": 1109 }, { "epoch": 2.611764705882353, "grad_norm": 0.5863826274871826, "learning_rate": 6.099121956834215e-06, "loss": 0.0979, "step": 1110 }, { "epoch": 2.6141176470588237, "grad_norm": 0.6728936433792114, "learning_rate": 6.09305039354784e-06, "loss": 0.113, "step": 1111 }, { "epoch": 2.616470588235294, "grad_norm": 0.6706534028053284, "learning_rate": 6.0869771376297824e-06, "loss": 0.11, "step": 1112 }, { "epoch": 2.6188235294117646, "grad_norm": 0.703102707862854, "learning_rate": 6.080902198489021e-06, "loss": 0.0942, "step": 1113 }, { "epoch": 2.621176470588235, "grad_norm": 0.6457929015159607, "learning_rate": 6.074825585537145e-06, "loss": 0.0964, "step": 1114 }, { "epoch": 2.623529411764706, "grad_norm": 0.6167572736740112, "learning_rate": 6.068747308188335e-06, "loss": 0.0989, "step": 1115 }, { "epoch": 2.6258823529411766, "grad_norm": 0.6445097327232361, "learning_rate": 6.0626673758593525e-06, "loss": 0.1126, "step": 1116 }, { "epoch": 2.6282352941176472, "grad_norm": 0.6464284658432007, "learning_rate": 6.05658579796952e-06, "loss": 0.1006, "step": 1117 }, { "epoch": 2.630588235294118, "grad_norm": 0.8194031715393066, "learning_rate": 6.050502583940712e-06, "loss": 0.0916, "step": 1118 }, { "epoch": 2.632941176470588, "grad_norm": 0.7373201847076416, "learning_rate": 6.0444177431973364e-06, "loss": 0.1369, "step": 1119 }, { "epoch": 2.635294117647059, "grad_norm": 0.6702144742012024, "learning_rate": 6.03833128516632e-06, "loss": 0.119, "step": 1120 }, { "epoch": 2.6376470588235295, "grad_norm": 0.735341489315033, "learning_rate": 6.032243219277098e-06, "loss": 0.11, "step": 1121 }, { "epoch": 2.64, "grad_norm": 0.603901207447052, "learning_rate": 6.026153554961593e-06, "loss": 0.0778, "step": 1122 }, { "epoch": 2.642352941176471, "grad_norm": 0.6713098287582397, "learning_rate": 6.020062301654212e-06, "loss": 0.1176, "step": 1123 }, { "epoch": 2.644705882352941, "grad_norm": 0.6505681872367859, "learning_rate": 6.0139694687918125e-06, "loss": 0.1236, "step": 1124 }, { "epoch": 2.6470588235294117, "grad_norm": 0.6279091835021973, "learning_rate": 6.007875065813707e-06, "loss": 0.1193, "step": 1125 }, { "epoch": 2.6494117647058824, "grad_norm": 0.7600767612457275, "learning_rate": 6.001779102161636e-06, "loss": 0.0717, "step": 1126 }, { "epoch": 2.651764705882353, "grad_norm": 0.6093897223472595, "learning_rate": 5.9956815872797625e-06, "loss": 0.0855, "step": 1127 }, { "epoch": 2.6541176470588237, "grad_norm": 0.6462109088897705, "learning_rate": 5.98958253061465e-06, "loss": 0.1047, "step": 1128 }, { "epoch": 2.656470588235294, "grad_norm": 0.5565080642700195, "learning_rate": 5.9834819416152475e-06, "loss": 0.083, "step": 1129 }, { "epoch": 2.6588235294117646, "grad_norm": 0.6923402547836304, "learning_rate": 5.977379829732884e-06, "loss": 0.0788, "step": 1130 }, { "epoch": 2.6611764705882353, "grad_norm": 0.6557579040527344, "learning_rate": 5.971276204421247e-06, "loss": 0.1131, "step": 1131 }, { "epoch": 2.663529411764706, "grad_norm": 0.9365622401237488, "learning_rate": 5.9651710751363624e-06, "loss": 0.1225, "step": 1132 }, { "epoch": 2.6658823529411766, "grad_norm": 0.664203405380249, "learning_rate": 5.959064451336593e-06, "loss": 0.0777, "step": 1133 }, { "epoch": 2.668235294117647, "grad_norm": 0.7735985517501831, "learning_rate": 5.952956342482613e-06, "loss": 0.0775, "step": 1134 }, { "epoch": 2.6705882352941175, "grad_norm": 0.6978706121444702, "learning_rate": 5.946846758037403e-06, "loss": 0.1113, "step": 1135 }, { "epoch": 2.672941176470588, "grad_norm": 0.7602506279945374, "learning_rate": 5.940735707466219e-06, "loss": 0.097, "step": 1136 }, { "epoch": 2.675294117647059, "grad_norm": 0.7475665211677551, "learning_rate": 5.9346232002366e-06, "loss": 0.0933, "step": 1137 }, { "epoch": 2.6776470588235295, "grad_norm": 0.7003555297851562, "learning_rate": 5.928509245818334e-06, "loss": 0.1126, "step": 1138 }, { "epoch": 2.68, "grad_norm": 0.6248278021812439, "learning_rate": 5.922393853683454e-06, "loss": 0.0646, "step": 1139 }, { "epoch": 2.682352941176471, "grad_norm": 0.6409180760383606, "learning_rate": 5.916277033306221e-06, "loss": 0.0982, "step": 1140 }, { "epoch": 2.684705882352941, "grad_norm": 0.6980142593383789, "learning_rate": 5.9101587941631055e-06, "loss": 0.0962, "step": 1141 }, { "epoch": 2.6870588235294117, "grad_norm": 0.6799383759498596, "learning_rate": 5.904039145732781e-06, "loss": 0.1203, "step": 1142 }, { "epoch": 2.6894117647058824, "grad_norm": 0.7584722638130188, "learning_rate": 5.897918097496099e-06, "loss": 0.0889, "step": 1143 }, { "epoch": 2.691764705882353, "grad_norm": 0.6984223127365112, "learning_rate": 5.891795658936083e-06, "loss": 0.1106, "step": 1144 }, { "epoch": 2.6941176470588237, "grad_norm": 0.5967733263969421, "learning_rate": 5.885671839537912e-06, "loss": 0.0678, "step": 1145 }, { "epoch": 2.696470588235294, "grad_norm": 0.7474173903465271, "learning_rate": 5.879546648788899e-06, "loss": 0.1194, "step": 1146 }, { "epoch": 2.6988235294117646, "grad_norm": 0.7953404188156128, "learning_rate": 5.873420096178486e-06, "loss": 0.1255, "step": 1147 }, { "epoch": 2.7011764705882353, "grad_norm": 0.6482937335968018, "learning_rate": 5.8672921911982225e-06, "loss": 0.0761, "step": 1148 }, { "epoch": 2.703529411764706, "grad_norm": 0.6395251750946045, "learning_rate": 5.861162943341755e-06, "loss": 0.0981, "step": 1149 }, { "epoch": 2.7058823529411766, "grad_norm": 1.460011601448059, "learning_rate": 5.855032362104808e-06, "loss": 0.1051, "step": 1150 }, { "epoch": 2.708235294117647, "grad_norm": 0.6398628354072571, "learning_rate": 5.848900456985174e-06, "loss": 0.0681, "step": 1151 }, { "epoch": 2.7105882352941175, "grad_norm": 0.658672034740448, "learning_rate": 5.842767237482695e-06, "loss": 0.1028, "step": 1152 }, { "epoch": 2.712941176470588, "grad_norm": 0.8743035793304443, "learning_rate": 5.836632713099252e-06, "loss": 0.0841, "step": 1153 }, { "epoch": 2.715294117647059, "grad_norm": 0.7478955984115601, "learning_rate": 5.830496893338743e-06, "loss": 0.1142, "step": 1154 }, { "epoch": 2.7176470588235295, "grad_norm": 0.697268545627594, "learning_rate": 5.8243597877070764e-06, "loss": 0.0891, "step": 1155 }, { "epoch": 2.7199999999999998, "grad_norm": 0.6838622689247131, "learning_rate": 5.818221405712154e-06, "loss": 0.1004, "step": 1156 }, { "epoch": 2.722352941176471, "grad_norm": 0.7403920888900757, "learning_rate": 5.812081756863849e-06, "loss": 0.1116, "step": 1157 }, { "epoch": 2.724705882352941, "grad_norm": 0.7088750600814819, "learning_rate": 5.805940850674003e-06, "loss": 0.1177, "step": 1158 }, { "epoch": 2.7270588235294118, "grad_norm": 0.7831651568412781, "learning_rate": 5.799798696656404e-06, "loss": 0.0901, "step": 1159 }, { "epoch": 2.7294117647058824, "grad_norm": 0.688901960849762, "learning_rate": 5.793655304326773e-06, "loss": 0.1041, "step": 1160 }, { "epoch": 2.731764705882353, "grad_norm": 0.7188778519630432, "learning_rate": 5.787510683202748e-06, "loss": 0.1422, "step": 1161 }, { "epoch": 2.7341176470588238, "grad_norm": 0.6448324918746948, "learning_rate": 5.781364842803873e-06, "loss": 0.0984, "step": 1162 }, { "epoch": 2.736470588235294, "grad_norm": 0.7281361222267151, "learning_rate": 5.7752177926515805e-06, "loss": 0.0708, "step": 1163 }, { "epoch": 2.7388235294117647, "grad_norm": 0.6679324507713318, "learning_rate": 5.769069542269172e-06, "loss": 0.0977, "step": 1164 }, { "epoch": 2.7411764705882353, "grad_norm": 0.766491711139679, "learning_rate": 5.76292010118182e-06, "loss": 0.1477, "step": 1165 }, { "epoch": 2.743529411764706, "grad_norm": 0.6500692367553711, "learning_rate": 5.7567694789165304e-06, "loss": 0.1067, "step": 1166 }, { "epoch": 2.7458823529411767, "grad_norm": 0.6257439851760864, "learning_rate": 5.750617685002145e-06, "loss": 0.0773, "step": 1167 }, { "epoch": 2.748235294117647, "grad_norm": 0.7164331078529358, "learning_rate": 5.744464728969318e-06, "loss": 0.1285, "step": 1168 }, { "epoch": 2.7505882352941176, "grad_norm": 0.6272771954536438, "learning_rate": 5.738310620350504e-06, "loss": 0.0754, "step": 1169 }, { "epoch": 2.7529411764705882, "grad_norm": 0.6259792447090149, "learning_rate": 5.732155368679948e-06, "loss": 0.0968, "step": 1170 }, { "epoch": 2.755294117647059, "grad_norm": 0.7445051670074463, "learning_rate": 5.72599898349366e-06, "loss": 0.133, "step": 1171 }, { "epoch": 2.7576470588235296, "grad_norm": 0.8034539818763733, "learning_rate": 5.71984147432941e-06, "loss": 0.1031, "step": 1172 }, { "epoch": 2.76, "grad_norm": 0.6895303130149841, "learning_rate": 5.713682850726706e-06, "loss": 0.1402, "step": 1173 }, { "epoch": 2.7623529411764705, "grad_norm": 0.5808074474334717, "learning_rate": 5.707523122226784e-06, "loss": 0.0939, "step": 1174 }, { "epoch": 2.764705882352941, "grad_norm": 0.6198788285255432, "learning_rate": 5.701362298372595e-06, "loss": 0.0871, "step": 1175 }, { "epoch": 2.767058823529412, "grad_norm": 0.7243955731391907, "learning_rate": 5.6952003887087805e-06, "loss": 0.1055, "step": 1176 }, { "epoch": 2.7694117647058825, "grad_norm": 0.630534291267395, "learning_rate": 5.689037402781668e-06, "loss": 0.0857, "step": 1177 }, { "epoch": 2.7717647058823527, "grad_norm": 0.708491861820221, "learning_rate": 5.682873350139255e-06, "loss": 0.0826, "step": 1178 }, { "epoch": 2.774117647058824, "grad_norm": 0.6357554197311401, "learning_rate": 5.676708240331187e-06, "loss": 0.0997, "step": 1179 }, { "epoch": 2.776470588235294, "grad_norm": 0.698607861995697, "learning_rate": 5.670542082908749e-06, "loss": 0.0838, "step": 1180 }, { "epoch": 2.7788235294117647, "grad_norm": 0.6765342950820923, "learning_rate": 5.664374887424848e-06, "loss": 0.1277, "step": 1181 }, { "epoch": 2.7811764705882354, "grad_norm": 0.7227910757064819, "learning_rate": 5.6582066634340015e-06, "loss": 0.0942, "step": 1182 }, { "epoch": 2.783529411764706, "grad_norm": 0.632839560508728, "learning_rate": 5.65203742049232e-06, "loss": 0.0995, "step": 1183 }, { "epoch": 2.7858823529411767, "grad_norm": 1.29427170753479, "learning_rate": 5.645867168157487e-06, "loss": 0.0926, "step": 1184 }, { "epoch": 2.788235294117647, "grad_norm": 0.9853066205978394, "learning_rate": 5.639695915988762e-06, "loss": 0.1165, "step": 1185 }, { "epoch": 2.7905882352941176, "grad_norm": 0.701269268989563, "learning_rate": 5.633523673546941e-06, "loss": 0.1021, "step": 1186 }, { "epoch": 2.7929411764705883, "grad_norm": 0.5970171093940735, "learning_rate": 5.627350450394361e-06, "loss": 0.0801, "step": 1187 }, { "epoch": 2.795294117647059, "grad_norm": 0.673363447189331, "learning_rate": 5.621176256094876e-06, "loss": 0.0597, "step": 1188 }, { "epoch": 2.7976470588235296, "grad_norm": 0.6723054051399231, "learning_rate": 5.615001100213843e-06, "loss": 0.1256, "step": 1189 }, { "epoch": 2.8, "grad_norm": 1.0035456418991089, "learning_rate": 5.608824992318116e-06, "loss": 0.0958, "step": 1190 }, { "epoch": 2.8023529411764705, "grad_norm": 0.6532413959503174, "learning_rate": 5.602647941976013e-06, "loss": 0.0831, "step": 1191 }, { "epoch": 2.804705882352941, "grad_norm": 0.6606675982475281, "learning_rate": 5.596469958757322e-06, "loss": 0.1155, "step": 1192 }, { "epoch": 2.807058823529412, "grad_norm": 0.61696857213974, "learning_rate": 5.590291052233272e-06, "loss": 0.0746, "step": 1193 }, { "epoch": 2.8094117647058825, "grad_norm": 0.6355022192001343, "learning_rate": 5.58411123197652e-06, "loss": 0.0833, "step": 1194 }, { "epoch": 2.8117647058823527, "grad_norm": 0.9709405899047852, "learning_rate": 5.5779305075611435e-06, "loss": 0.0707, "step": 1195 }, { "epoch": 2.8141176470588234, "grad_norm": 0.6515994071960449, "learning_rate": 5.571748888562616e-06, "loss": 0.0972, "step": 1196 }, { "epoch": 2.816470588235294, "grad_norm": 0.8227649331092834, "learning_rate": 5.565566384557801e-06, "loss": 0.1172, "step": 1197 }, { "epoch": 2.8188235294117647, "grad_norm": 0.6990308165550232, "learning_rate": 5.559383005124931e-06, "loss": 0.104, "step": 1198 }, { "epoch": 2.8211764705882354, "grad_norm": 0.5682240724563599, "learning_rate": 5.5531987598435956e-06, "loss": 0.111, "step": 1199 }, { "epoch": 2.8235294117647056, "grad_norm": 0.6683300137519836, "learning_rate": 5.547013658294726e-06, "loss": 0.1034, "step": 1200 }, { "epoch": 2.8235294117647056, "eval_loss": 0.18654599785804749, "eval_runtime": 3.3365, "eval_samples_per_second": 33.268, "eval_steps_per_second": 1.199, "step": 1200 }, { "epoch": 2.8258823529411767, "grad_norm": 0.6948291063308716, "learning_rate": 5.540827710060577e-06, "loss": 0.1306, "step": 1201 }, { "epoch": 2.828235294117647, "grad_norm": 0.6815454363822937, "learning_rate": 5.53464092472472e-06, "loss": 0.0834, "step": 1202 }, { "epoch": 2.8305882352941176, "grad_norm": 0.8648965954780579, "learning_rate": 5.52845331187202e-06, "loss": 0.0858, "step": 1203 }, { "epoch": 2.8329411764705883, "grad_norm": 0.6334685683250427, "learning_rate": 5.522264881088625e-06, "loss": 0.0974, "step": 1204 }, { "epoch": 2.835294117647059, "grad_norm": 0.6164634823799133, "learning_rate": 5.516075641961949e-06, "loss": 0.0754, "step": 1205 }, { "epoch": 2.8376470588235296, "grad_norm": 0.6705353856086731, "learning_rate": 5.509885604080658e-06, "loss": 0.0989, "step": 1206 }, { "epoch": 2.84, "grad_norm": 0.7529779076576233, "learning_rate": 5.503694777034662e-06, "loss": 0.0892, "step": 1207 }, { "epoch": 2.8423529411764705, "grad_norm": 0.6136559844017029, "learning_rate": 5.497503170415083e-06, "loss": 0.0928, "step": 1208 }, { "epoch": 2.844705882352941, "grad_norm": 0.736171305179596, "learning_rate": 5.4913107938142595e-06, "loss": 0.1041, "step": 1209 }, { "epoch": 2.847058823529412, "grad_norm": 0.5586112141609192, "learning_rate": 5.485117656825717e-06, "loss": 0.0658, "step": 1210 }, { "epoch": 2.8494117647058825, "grad_norm": 0.540181040763855, "learning_rate": 5.478923769044162e-06, "loss": 0.0748, "step": 1211 }, { "epoch": 2.8517647058823528, "grad_norm": 0.753854513168335, "learning_rate": 5.472729140065467e-06, "loss": 0.1234, "step": 1212 }, { "epoch": 2.8541176470588234, "grad_norm": 0.5908348560333252, "learning_rate": 5.466533779486646e-06, "loss": 0.079, "step": 1213 }, { "epoch": 2.856470588235294, "grad_norm": 0.6474031209945679, "learning_rate": 5.46033769690585e-06, "loss": 0.0802, "step": 1214 }, { "epoch": 2.8588235294117648, "grad_norm": 0.6894181966781616, "learning_rate": 5.454140901922351e-06, "loss": 0.0644, "step": 1215 }, { "epoch": 2.8611764705882354, "grad_norm": 0.682490348815918, "learning_rate": 5.447943404136521e-06, "loss": 0.1158, "step": 1216 }, { "epoch": 2.8635294117647057, "grad_norm": 0.6854071617126465, "learning_rate": 5.44174521314982e-06, "loss": 0.0893, "step": 1217 }, { "epoch": 2.8658823529411763, "grad_norm": 0.6678381562232971, "learning_rate": 5.435546338564786e-06, "loss": 0.0771, "step": 1218 }, { "epoch": 2.868235294117647, "grad_norm": 0.7433589696884155, "learning_rate": 5.429346789985017e-06, "loss": 0.1022, "step": 1219 }, { "epoch": 2.8705882352941177, "grad_norm": 0.7393654584884644, "learning_rate": 5.423146577015147e-06, "loss": 0.1075, "step": 1220 }, { "epoch": 2.8729411764705883, "grad_norm": 0.6127851009368896, "learning_rate": 5.416945709260845e-06, "loss": 0.0895, "step": 1221 }, { "epoch": 2.8752941176470586, "grad_norm": 0.6956869959831238, "learning_rate": 5.4107441963287965e-06, "loss": 0.0697, "step": 1222 }, { "epoch": 2.8776470588235297, "grad_norm": 0.9950853586196899, "learning_rate": 5.404542047826683e-06, "loss": 0.0501, "step": 1223 }, { "epoch": 2.88, "grad_norm": 0.5224266648292542, "learning_rate": 5.398339273363172e-06, "loss": 0.0707, "step": 1224 }, { "epoch": 2.8823529411764706, "grad_norm": 1.210813045501709, "learning_rate": 5.3921358825479e-06, "loss": 0.096, "step": 1225 }, { "epoch": 2.8847058823529412, "grad_norm": 0.5669080018997192, "learning_rate": 5.3859318849914576e-06, "loss": 0.0681, "step": 1226 }, { "epoch": 2.887058823529412, "grad_norm": 0.7073667049407959, "learning_rate": 5.379727290305379e-06, "loss": 0.0902, "step": 1227 }, { "epoch": 2.8894117647058826, "grad_norm": 0.6826819181442261, "learning_rate": 5.373522108102118e-06, "loss": 0.1169, "step": 1228 }, { "epoch": 2.891764705882353, "grad_norm": 0.6785362362861633, "learning_rate": 5.367316347995046e-06, "loss": 0.0948, "step": 1229 }, { "epoch": 2.8941176470588235, "grad_norm": 0.6344099044799805, "learning_rate": 5.361110019598422e-06, "loss": 0.089, "step": 1230 }, { "epoch": 2.896470588235294, "grad_norm": 1.11925208568573, "learning_rate": 5.354903132527389e-06, "loss": 0.1408, "step": 1231 }, { "epoch": 2.898823529411765, "grad_norm": 0.6570376753807068, "learning_rate": 5.348695696397956e-06, "loss": 0.0927, "step": 1232 }, { "epoch": 2.9011764705882355, "grad_norm": 0.739099383354187, "learning_rate": 5.342487720826983e-06, "loss": 0.1071, "step": 1233 }, { "epoch": 2.9035294117647057, "grad_norm": 0.6732447147369385, "learning_rate": 5.336279215432164e-06, "loss": 0.0894, "step": 1234 }, { "epoch": 2.9058823529411764, "grad_norm": 0.5628352761268616, "learning_rate": 5.330070189832013e-06, "loss": 0.0766, "step": 1235 }, { "epoch": 2.908235294117647, "grad_norm": 0.9120317101478577, "learning_rate": 5.323860653645854e-06, "loss": 0.0716, "step": 1236 }, { "epoch": 2.9105882352941177, "grad_norm": 0.6545470952987671, "learning_rate": 5.317650616493798e-06, "loss": 0.1152, "step": 1237 }, { "epoch": 2.9129411764705884, "grad_norm": 0.6928826570510864, "learning_rate": 5.311440087996734e-06, "loss": 0.0768, "step": 1238 }, { "epoch": 2.9152941176470586, "grad_norm": 0.7446548342704773, "learning_rate": 5.305229077776312e-06, "loss": 0.1415, "step": 1239 }, { "epoch": 2.9176470588235293, "grad_norm": 0.7031207084655762, "learning_rate": 5.299017595454927e-06, "loss": 0.1305, "step": 1240 }, { "epoch": 2.92, "grad_norm": 0.6536082029342651, "learning_rate": 5.292805650655708e-06, "loss": 0.0847, "step": 1241 }, { "epoch": 2.9223529411764706, "grad_norm": 0.6840224266052246, "learning_rate": 5.286593253002499e-06, "loss": 0.0967, "step": 1242 }, { "epoch": 2.9247058823529413, "grad_norm": 0.5693729519844055, "learning_rate": 5.280380412119843e-06, "loss": 0.0739, "step": 1243 }, { "epoch": 2.9270588235294115, "grad_norm": 0.6424421072006226, "learning_rate": 5.274167137632976e-06, "loss": 0.1106, "step": 1244 }, { "epoch": 2.9294117647058826, "grad_norm": 0.7196018099784851, "learning_rate": 5.2679534391678e-06, "loss": 0.1045, "step": 1245 }, { "epoch": 2.931764705882353, "grad_norm": 0.7127313613891602, "learning_rate": 5.2617393263508754e-06, "loss": 0.11, "step": 1246 }, { "epoch": 2.9341176470588235, "grad_norm": 0.7531421184539795, "learning_rate": 5.255524808809409e-06, "loss": 0.1001, "step": 1247 }, { "epoch": 2.936470588235294, "grad_norm": 0.7842127680778503, "learning_rate": 5.2493098961712245e-06, "loss": 0.1302, "step": 1248 }, { "epoch": 2.938823529411765, "grad_norm": 0.6913720965385437, "learning_rate": 5.243094598064768e-06, "loss": 0.0933, "step": 1249 }, { "epoch": 2.9411764705882355, "grad_norm": 0.6548537611961365, "learning_rate": 5.236878924119077e-06, "loss": 0.078, "step": 1250 }, { "epoch": 2.9435294117647057, "grad_norm": 0.6892358064651489, "learning_rate": 5.230662883963774e-06, "loss": 0.1164, "step": 1251 }, { "epoch": 2.9458823529411764, "grad_norm": 0.6148607134819031, "learning_rate": 5.224446487229048e-06, "loss": 0.0651, "step": 1252 }, { "epoch": 2.948235294117647, "grad_norm": 0.6649748682975769, "learning_rate": 5.218229743545638e-06, "loss": 0.0803, "step": 1253 }, { "epoch": 2.9505882352941177, "grad_norm": 0.6865822076797485, "learning_rate": 5.212012662544824e-06, "loss": 0.0968, "step": 1254 }, { "epoch": 2.9529411764705884, "grad_norm": 0.6624200344085693, "learning_rate": 5.2057952538584076e-06, "loss": 0.074, "step": 1255 }, { "epoch": 2.9552941176470586, "grad_norm": 0.6558935046195984, "learning_rate": 5.199577527118699e-06, "loss": 0.1222, "step": 1256 }, { "epoch": 2.9576470588235293, "grad_norm": 0.7811700105667114, "learning_rate": 5.193359491958497e-06, "loss": 0.0709, "step": 1257 }, { "epoch": 2.96, "grad_norm": 3.986356258392334, "learning_rate": 5.187141158011082e-06, "loss": 0.0888, "step": 1258 }, { "epoch": 2.9623529411764706, "grad_norm": 0.6463606357574463, "learning_rate": 5.180922534910199e-06, "loss": 0.102, "step": 1259 }, { "epoch": 2.9647058823529413, "grad_norm": 0.6067935824394226, "learning_rate": 5.1747036322900345e-06, "loss": 0.104, "step": 1260 }, { "epoch": 2.9670588235294115, "grad_norm": 0.7012919187545776, "learning_rate": 5.168484459785213e-06, "loss": 0.1163, "step": 1261 }, { "epoch": 2.969411764705882, "grad_norm": 0.6424875259399414, "learning_rate": 5.1622650270307795e-06, "loss": 0.1067, "step": 1262 }, { "epoch": 2.971764705882353, "grad_norm": 0.5577113032341003, "learning_rate": 5.156045343662175e-06, "loss": 0.0755, "step": 1263 }, { "epoch": 2.9741176470588235, "grad_norm": 0.7864019870758057, "learning_rate": 5.149825419315233e-06, "loss": 0.1323, "step": 1264 }, { "epoch": 2.976470588235294, "grad_norm": 0.7168054580688477, "learning_rate": 5.14360526362616e-06, "loss": 0.0899, "step": 1265 }, { "epoch": 2.978823529411765, "grad_norm": 0.6741166114807129, "learning_rate": 5.137384886231523e-06, "loss": 0.086, "step": 1266 }, { "epoch": 2.9811764705882355, "grad_norm": 0.5327978730201721, "learning_rate": 5.131164296768226e-06, "loss": 0.0646, "step": 1267 }, { "epoch": 2.9835294117647058, "grad_norm": 0.779046356678009, "learning_rate": 5.124943504873509e-06, "loss": 0.0984, "step": 1268 }, { "epoch": 2.9858823529411764, "grad_norm": 0.581616222858429, "learning_rate": 5.118722520184921e-06, "loss": 0.0757, "step": 1269 }, { "epoch": 2.988235294117647, "grad_norm": 0.6295648217201233, "learning_rate": 5.112501352340311e-06, "loss": 0.0693, "step": 1270 }, { "epoch": 2.9905882352941178, "grad_norm": 0.7035243511199951, "learning_rate": 5.1062800109778155e-06, "loss": 0.0874, "step": 1271 }, { "epoch": 2.9929411764705884, "grad_norm": 0.8655304908752441, "learning_rate": 5.100058505735829e-06, "loss": 0.0881, "step": 1272 }, { "epoch": 2.9952941176470587, "grad_norm": 0.6985899209976196, "learning_rate": 5.093836846253012e-06, "loss": 0.0826, "step": 1273 }, { "epoch": 2.9976470588235293, "grad_norm": 0.7297172546386719, "learning_rate": 5.08761504216826e-06, "loss": 0.1167, "step": 1274 }, { "epoch": 3.0, "grad_norm": 0.7141335606575012, "learning_rate": 5.0813931031206855e-06, "loss": 0.0867, "step": 1275 }, { "epoch": 3.0023529411764707, "grad_norm": 0.5637583136558533, "learning_rate": 5.0751710387496245e-06, "loss": 0.0538, "step": 1276 }, { "epoch": 3.0047058823529413, "grad_norm": 0.38446199893951416, "learning_rate": 5.068948858694593e-06, "loss": 0.0354, "step": 1277 }, { "epoch": 3.0070588235294116, "grad_norm": 0.5045323371887207, "learning_rate": 5.062726572595292e-06, "loss": 0.0593, "step": 1278 }, { "epoch": 3.0094117647058822, "grad_norm": 0.5347929000854492, "learning_rate": 5.056504190091588e-06, "loss": 0.0498, "step": 1279 }, { "epoch": 3.011764705882353, "grad_norm": 0.5768089294433594, "learning_rate": 5.050281720823497e-06, "loss": 0.0779, "step": 1280 }, { "epoch": 3.0141176470588236, "grad_norm": 0.6087992191314697, "learning_rate": 5.044059174431167e-06, "loss": 0.0624, "step": 1281 }, { "epoch": 3.0164705882352942, "grad_norm": 0.6482971906661987, "learning_rate": 5.037836560554864e-06, "loss": 0.0493, "step": 1282 }, { "epoch": 3.018823529411765, "grad_norm": 0.5817829370498657, "learning_rate": 5.031613888834967e-06, "loss": 0.0606, "step": 1283 }, { "epoch": 3.021176470588235, "grad_norm": 0.6545772552490234, "learning_rate": 5.025391168911931e-06, "loss": 0.0553, "step": 1284 }, { "epoch": 3.023529411764706, "grad_norm": 2.922285795211792, "learning_rate": 5.0191684104263e-06, "loss": 0.0444, "step": 1285 }, { "epoch": 3.0258823529411765, "grad_norm": 0.6806549429893494, "learning_rate": 5.012945623018668e-06, "loss": 0.0544, "step": 1286 }, { "epoch": 3.028235294117647, "grad_norm": 0.576906681060791, "learning_rate": 5.0067228163296775e-06, "loss": 0.0461, "step": 1287 }, { "epoch": 3.030588235294118, "grad_norm": 0.7763160467147827, "learning_rate": 5.000500000000001e-06, "loss": 0.0512, "step": 1288 }, { "epoch": 3.032941176470588, "grad_norm": 0.8381593823432922, "learning_rate": 4.994277183670324e-06, "loss": 0.059, "step": 1289 }, { "epoch": 3.0352941176470587, "grad_norm": 0.7899082899093628, "learning_rate": 4.988054376981332e-06, "loss": 0.0616, "step": 1290 }, { "epoch": 3.0376470588235294, "grad_norm": 0.7089722752571106, "learning_rate": 4.981831589573701e-06, "loss": 0.0556, "step": 1291 }, { "epoch": 3.04, "grad_norm": 0.9609414935112, "learning_rate": 4.975608831088069e-06, "loss": 0.0595, "step": 1292 }, { "epoch": 3.0423529411764707, "grad_norm": 0.8558748364448547, "learning_rate": 4.969386111165036e-06, "loss": 0.0578, "step": 1293 }, { "epoch": 3.0447058823529414, "grad_norm": 0.689437210559845, "learning_rate": 4.963163439445136e-06, "loss": 0.0466, "step": 1294 }, { "epoch": 3.0470588235294116, "grad_norm": 0.7311373949050903, "learning_rate": 4.956940825568834e-06, "loss": 0.0555, "step": 1295 }, { "epoch": 3.0494117647058823, "grad_norm": 0.731696605682373, "learning_rate": 4.9507182791765045e-06, "loss": 0.0498, "step": 1296 }, { "epoch": 3.051764705882353, "grad_norm": 0.6579477190971375, "learning_rate": 4.9444958099084125e-06, "loss": 0.039, "step": 1297 }, { "epoch": 3.0541176470588236, "grad_norm": 0.5317938923835754, "learning_rate": 4.938273427404709e-06, "loss": 0.0491, "step": 1298 }, { "epoch": 3.0564705882352943, "grad_norm": 1.3023573160171509, "learning_rate": 4.93205114130541e-06, "loss": 0.0458, "step": 1299 }, { "epoch": 3.0588235294117645, "grad_norm": 1.6159367561340332, "learning_rate": 4.925828961250378e-06, "loss": 0.0531, "step": 1300 }, { "epoch": 3.061176470588235, "grad_norm": 0.7355278134346008, "learning_rate": 4.919606896879313e-06, "loss": 0.0526, "step": 1301 }, { "epoch": 3.063529411764706, "grad_norm": 0.5272924900054932, "learning_rate": 4.913384957831741e-06, "loss": 0.0395, "step": 1302 }, { "epoch": 3.0658823529411765, "grad_norm": 0.6297348737716675, "learning_rate": 4.907163153746989e-06, "loss": 0.0778, "step": 1303 }, { "epoch": 3.068235294117647, "grad_norm": 0.6507878303527832, "learning_rate": 4.900941494264173e-06, "loss": 0.0609, "step": 1304 }, { "epoch": 3.070588235294118, "grad_norm": 0.674199104309082, "learning_rate": 4.894719989022187e-06, "loss": 0.0497, "step": 1305 }, { "epoch": 3.072941176470588, "grad_norm": 0.6062266230583191, "learning_rate": 4.888498647659689e-06, "loss": 0.062, "step": 1306 }, { "epoch": 3.0752941176470587, "grad_norm": 0.5870828032493591, "learning_rate": 4.88227747981508e-06, "loss": 0.0499, "step": 1307 }, { "epoch": 3.0776470588235294, "grad_norm": 0.8497254848480225, "learning_rate": 4.8760564951264926e-06, "loss": 0.0351, "step": 1308 }, { "epoch": 3.08, "grad_norm": 0.611456573009491, "learning_rate": 4.869835703231776e-06, "loss": 0.0476, "step": 1309 }, { "epoch": 3.0823529411764707, "grad_norm": 0.6473087072372437, "learning_rate": 4.863615113768479e-06, "loss": 0.0951, "step": 1310 }, { "epoch": 3.084705882352941, "grad_norm": 0.5935549139976501, "learning_rate": 4.857394736373841e-06, "loss": 0.0534, "step": 1311 }, { "epoch": 3.0870588235294116, "grad_norm": 0.7124994397163391, "learning_rate": 4.851174580684768e-06, "loss": 0.0588, "step": 1312 }, { "epoch": 3.0894117647058823, "grad_norm": 0.6211840510368347, "learning_rate": 4.8449546563378264e-06, "loss": 0.0576, "step": 1313 }, { "epoch": 3.091764705882353, "grad_norm": 0.6576946377754211, "learning_rate": 4.838734972969222e-06, "loss": 0.0639, "step": 1314 }, { "epoch": 3.0941176470588236, "grad_norm": 0.6857396364212036, "learning_rate": 4.832515540214786e-06, "loss": 0.0605, "step": 1315 }, { "epoch": 3.0964705882352943, "grad_norm": 0.7492038011550903, "learning_rate": 4.826296367709966e-06, "loss": 0.041, "step": 1316 }, { "epoch": 3.0988235294117645, "grad_norm": 0.7306371927261353, "learning_rate": 4.820077465089802e-06, "loss": 0.0615, "step": 1317 }, { "epoch": 3.101176470588235, "grad_norm": 0.6502029895782471, "learning_rate": 4.8138588419889184e-06, "loss": 0.0468, "step": 1318 }, { "epoch": 3.103529411764706, "grad_norm": 0.6624422073364258, "learning_rate": 4.807640508041504e-06, "loss": 0.057, "step": 1319 }, { "epoch": 3.1058823529411765, "grad_norm": 0.6203996539115906, "learning_rate": 4.801422472881303e-06, "loss": 0.0575, "step": 1320 }, { "epoch": 3.108235294117647, "grad_norm": 0.6724445819854736, "learning_rate": 4.795204746141593e-06, "loss": 0.0569, "step": 1321 }, { "epoch": 3.1105882352941174, "grad_norm": 0.6261844038963318, "learning_rate": 4.788987337455177e-06, "loss": 0.0559, "step": 1322 }, { "epoch": 3.112941176470588, "grad_norm": 0.856099545955658, "learning_rate": 4.782770256454364e-06, "loss": 0.0611, "step": 1323 }, { "epoch": 3.1152941176470588, "grad_norm": 0.6143358945846558, "learning_rate": 4.776553512770954e-06, "loss": 0.0429, "step": 1324 }, { "epoch": 3.1176470588235294, "grad_norm": 0.798851490020752, "learning_rate": 4.770337116036228e-06, "loss": 0.066, "step": 1325 }, { "epoch": 3.12, "grad_norm": 0.6449416279792786, "learning_rate": 4.764121075880925e-06, "loss": 0.0597, "step": 1326 }, { "epoch": 3.1223529411764708, "grad_norm": 0.7368743419647217, "learning_rate": 4.757905401935233e-06, "loss": 0.0499, "step": 1327 }, { "epoch": 3.124705882352941, "grad_norm": 0.6534743905067444, "learning_rate": 4.751690103828777e-06, "loss": 0.0441, "step": 1328 }, { "epoch": 3.1270588235294117, "grad_norm": 0.6359279155731201, "learning_rate": 4.7454751911905935e-06, "loss": 0.0498, "step": 1329 }, { "epoch": 3.1294117647058823, "grad_norm": 0.6797459125518799, "learning_rate": 4.739260673649125e-06, "loss": 0.0529, "step": 1330 }, { "epoch": 3.131764705882353, "grad_norm": 0.6565910577774048, "learning_rate": 4.733046560832202e-06, "loss": 0.0476, "step": 1331 }, { "epoch": 3.1341176470588237, "grad_norm": 0.7123110294342041, "learning_rate": 4.726832862367025e-06, "loss": 0.0518, "step": 1332 }, { "epoch": 3.1364705882352943, "grad_norm": 0.6119337677955627, "learning_rate": 4.720619587880159e-06, "loss": 0.0566, "step": 1333 }, { "epoch": 3.1388235294117646, "grad_norm": 0.525443434715271, "learning_rate": 4.714406746997504e-06, "loss": 0.0449, "step": 1334 }, { "epoch": 3.1411764705882352, "grad_norm": 0.6112347841262817, "learning_rate": 4.708194349344295e-06, "loss": 0.0449, "step": 1335 }, { "epoch": 3.143529411764706, "grad_norm": 0.6034753918647766, "learning_rate": 4.701982404545075e-06, "loss": 0.0503, "step": 1336 }, { "epoch": 3.1458823529411766, "grad_norm": 0.8081318140029907, "learning_rate": 4.695770922223689e-06, "loss": 0.0521, "step": 1337 }, { "epoch": 3.1482352941176472, "grad_norm": 0.6306876540184021, "learning_rate": 4.689559912003269e-06, "loss": 0.046, "step": 1338 }, { "epoch": 3.1505882352941175, "grad_norm": 0.6746077537536621, "learning_rate": 4.683349383506203e-06, "loss": 0.0642, "step": 1339 }, { "epoch": 3.152941176470588, "grad_norm": 0.6464118957519531, "learning_rate": 4.677139346354146e-06, "loss": 0.0517, "step": 1340 }, { "epoch": 3.155294117647059, "grad_norm": 0.613947331905365, "learning_rate": 4.670929810167989e-06, "loss": 0.0527, "step": 1341 }, { "epoch": 3.1576470588235295, "grad_norm": 0.7289944887161255, "learning_rate": 4.664720784567837e-06, "loss": 0.0527, "step": 1342 }, { "epoch": 3.16, "grad_norm": 0.6219155788421631, "learning_rate": 4.658512279173017e-06, "loss": 0.0469, "step": 1343 }, { "epoch": 3.1623529411764704, "grad_norm": 0.7180870771408081, "learning_rate": 4.652304303602045e-06, "loss": 0.0651, "step": 1344 }, { "epoch": 3.164705882352941, "grad_norm": 0.7903926968574524, "learning_rate": 4.646096867472612e-06, "loss": 0.0701, "step": 1345 }, { "epoch": 3.1670588235294117, "grad_norm": 0.5225538015365601, "learning_rate": 4.639889980401581e-06, "loss": 0.0353, "step": 1346 }, { "epoch": 3.1694117647058824, "grad_norm": 0.7014821171760559, "learning_rate": 4.633683652004956e-06, "loss": 0.0632, "step": 1347 }, { "epoch": 3.171764705882353, "grad_norm": 0.5845239162445068, "learning_rate": 4.627477891897882e-06, "loss": 0.0497, "step": 1348 }, { "epoch": 3.1741176470588237, "grad_norm": 0.688871443271637, "learning_rate": 4.621272709694623e-06, "loss": 0.0408, "step": 1349 }, { "epoch": 3.176470588235294, "grad_norm": 0.7417445182800293, "learning_rate": 4.615068115008544e-06, "loss": 0.0418, "step": 1350 }, { "epoch": 3.1788235294117646, "grad_norm": 0.5674678683280945, "learning_rate": 4.6088641174521015e-06, "loss": 0.0517, "step": 1351 }, { "epoch": 3.1811764705882353, "grad_norm": 0.6825995445251465, "learning_rate": 4.60266072663683e-06, "loss": 0.0586, "step": 1352 }, { "epoch": 3.183529411764706, "grad_norm": 0.7254673838615417, "learning_rate": 4.596457952173318e-06, "loss": 0.0727, "step": 1353 }, { "epoch": 3.1858823529411766, "grad_norm": 0.6877577900886536, "learning_rate": 4.590255803671204e-06, "loss": 0.0442, "step": 1354 }, { "epoch": 3.1882352941176473, "grad_norm": 0.6820171475410461, "learning_rate": 4.584054290739156e-06, "loss": 0.069, "step": 1355 }, { "epoch": 3.1905882352941175, "grad_norm": 0.7768347263336182, "learning_rate": 4.577853422984856e-06, "loss": 0.0721, "step": 1356 }, { "epoch": 3.192941176470588, "grad_norm": 0.8253313302993774, "learning_rate": 4.571653210014987e-06, "loss": 0.0549, "step": 1357 }, { "epoch": 3.195294117647059, "grad_norm": 0.5451977252960205, "learning_rate": 4.565453661435214e-06, "loss": 0.0395, "step": 1358 }, { "epoch": 3.1976470588235295, "grad_norm": 0.738277792930603, "learning_rate": 4.5592547868501795e-06, "loss": 0.054, "step": 1359 }, { "epoch": 3.2, "grad_norm": 0.7344027757644653, "learning_rate": 4.553056595863481e-06, "loss": 0.0662, "step": 1360 }, { "epoch": 3.2023529411764704, "grad_norm": 0.7200692296028137, "learning_rate": 4.54685909807765e-06, "loss": 0.0481, "step": 1361 }, { "epoch": 3.204705882352941, "grad_norm": 0.6517437696456909, "learning_rate": 4.540662303094149e-06, "loss": 0.055, "step": 1362 }, { "epoch": 3.2070588235294117, "grad_norm": 0.6691633462905884, "learning_rate": 4.534466220513356e-06, "loss": 0.0556, "step": 1363 }, { "epoch": 3.2094117647058824, "grad_norm": 0.7753266096115112, "learning_rate": 4.528270859934535e-06, "loss": 0.0755, "step": 1364 }, { "epoch": 3.211764705882353, "grad_norm": 0.6496615409851074, "learning_rate": 4.5220762309558375e-06, "loss": 0.0609, "step": 1365 }, { "epoch": 3.2141176470588233, "grad_norm": 0.7249099016189575, "learning_rate": 4.515882343174285e-06, "loss": 0.0736, "step": 1366 }, { "epoch": 3.216470588235294, "grad_norm": 0.6120924353599548, "learning_rate": 4.509689206185743e-06, "loss": 0.0586, "step": 1367 }, { "epoch": 3.2188235294117646, "grad_norm": 0.6575921177864075, "learning_rate": 4.50349682958492e-06, "loss": 0.0546, "step": 1368 }, { "epoch": 3.2211764705882353, "grad_norm": 0.49071040749549866, "learning_rate": 4.49730522296534e-06, "loss": 0.0341, "step": 1369 }, { "epoch": 3.223529411764706, "grad_norm": 0.6412584185600281, "learning_rate": 4.491114395919342e-06, "loss": 0.0391, "step": 1370 }, { "epoch": 3.2258823529411766, "grad_norm": 0.7760428786277771, "learning_rate": 4.4849243580380535e-06, "loss": 0.0448, "step": 1371 }, { "epoch": 3.228235294117647, "grad_norm": 0.6417972445487976, "learning_rate": 4.478735118911377e-06, "loss": 0.0483, "step": 1372 }, { "epoch": 3.2305882352941175, "grad_norm": 0.5705522298812866, "learning_rate": 4.4725466881279815e-06, "loss": 0.0438, "step": 1373 }, { "epoch": 3.232941176470588, "grad_norm": 0.6917790770530701, "learning_rate": 4.4663590752752815e-06, "loss": 0.0759, "step": 1374 }, { "epoch": 3.235294117647059, "grad_norm": 0.7724515199661255, "learning_rate": 4.460172289939424e-06, "loss": 0.0425, "step": 1375 }, { "epoch": 3.2376470588235295, "grad_norm": 0.8895376920700073, "learning_rate": 4.4539863417052756e-06, "loss": 0.047, "step": 1376 }, { "epoch": 3.24, "grad_norm": 0.7286106944084167, "learning_rate": 4.447801240156406e-06, "loss": 0.0588, "step": 1377 }, { "epoch": 3.2423529411764704, "grad_norm": 0.9696557521820068, "learning_rate": 4.44161699487507e-06, "loss": 0.0478, "step": 1378 }, { "epoch": 3.244705882352941, "grad_norm": 0.6850109100341797, "learning_rate": 4.435433615442202e-06, "loss": 0.0445, "step": 1379 }, { "epoch": 3.2470588235294118, "grad_norm": 0.6105085015296936, "learning_rate": 4.429251111437386e-06, "loss": 0.0482, "step": 1380 }, { "epoch": 3.2494117647058824, "grad_norm": 0.7285106182098389, "learning_rate": 4.423069492438859e-06, "loss": 0.0617, "step": 1381 }, { "epoch": 3.251764705882353, "grad_norm": 0.9300916194915771, "learning_rate": 4.416888768023482e-06, "loss": 0.0668, "step": 1382 }, { "epoch": 3.2541176470588233, "grad_norm": 0.6072533130645752, "learning_rate": 4.4107089477667295e-06, "loss": 0.0558, "step": 1383 }, { "epoch": 3.256470588235294, "grad_norm": 0.8089953064918518, "learning_rate": 4.4045300412426774e-06, "loss": 0.0618, "step": 1384 }, { "epoch": 3.2588235294117647, "grad_norm": 0.6976902484893799, "learning_rate": 4.3983520580239884e-06, "loss": 0.063, "step": 1385 }, { "epoch": 3.2611764705882353, "grad_norm": 0.5679690837860107, "learning_rate": 4.392175007681885e-06, "loss": 0.0348, "step": 1386 }, { "epoch": 3.263529411764706, "grad_norm": 0.6394214034080505, "learning_rate": 4.385998899786156e-06, "loss": 0.0486, "step": 1387 }, { "epoch": 3.2658823529411762, "grad_norm": 1.2692238092422485, "learning_rate": 4.379823743905127e-06, "loss": 0.0666, "step": 1388 }, { "epoch": 3.268235294117647, "grad_norm": 0.7926836013793945, "learning_rate": 4.37364954960564e-06, "loss": 0.0705, "step": 1389 }, { "epoch": 3.2705882352941176, "grad_norm": 0.9845795631408691, "learning_rate": 4.367476326453061e-06, "loss": 0.0473, "step": 1390 }, { "epoch": 3.2729411764705882, "grad_norm": 0.6222259402275085, "learning_rate": 4.3613040840112385e-06, "loss": 0.0528, "step": 1391 }, { "epoch": 3.275294117647059, "grad_norm": 0.7852926254272461, "learning_rate": 4.355132831842512e-06, "loss": 0.0456, "step": 1392 }, { "epoch": 3.2776470588235296, "grad_norm": 0.648204505443573, "learning_rate": 4.348962579507682e-06, "loss": 0.0555, "step": 1393 }, { "epoch": 3.2800000000000002, "grad_norm": 0.6236404776573181, "learning_rate": 4.342793336566e-06, "loss": 0.0434, "step": 1394 }, { "epoch": 3.2823529411764705, "grad_norm": 0.6712266206741333, "learning_rate": 4.336625112575153e-06, "loss": 0.0508, "step": 1395 }, { "epoch": 3.284705882352941, "grad_norm": 0.5677075386047363, "learning_rate": 4.330457917091253e-06, "loss": 0.0551, "step": 1396 }, { "epoch": 3.287058823529412, "grad_norm": 0.7619417905807495, "learning_rate": 4.3242917596688135e-06, "loss": 0.0536, "step": 1397 }, { "epoch": 3.2894117647058825, "grad_norm": 0.6548598408699036, "learning_rate": 4.3181266498607445e-06, "loss": 0.0391, "step": 1398 }, { "epoch": 3.291764705882353, "grad_norm": 0.6144687533378601, "learning_rate": 4.311962597218333e-06, "loss": 0.0603, "step": 1399 }, { "epoch": 3.2941176470588234, "grad_norm": 0.7327597141265869, "learning_rate": 4.305799611291222e-06, "loss": 0.0593, "step": 1400 }, { "epoch": 3.2941176470588234, "eval_loss": 0.20811158418655396, "eval_runtime": 3.3533, "eval_samples_per_second": 33.102, "eval_steps_per_second": 1.193, "step": 1400 }, { "epoch": 3.296470588235294, "grad_norm": 0.6819949150085449, "learning_rate": 4.299637701627408e-06, "loss": 0.0617, "step": 1401 }, { "epoch": 3.2988235294117647, "grad_norm": 0.7088370323181152, "learning_rate": 4.293476877773217e-06, "loss": 0.0707, "step": 1402 }, { "epoch": 3.3011764705882354, "grad_norm": 0.7433046698570251, "learning_rate": 4.287317149273296e-06, "loss": 0.0696, "step": 1403 }, { "epoch": 3.303529411764706, "grad_norm": 0.6657242774963379, "learning_rate": 4.281158525670593e-06, "loss": 0.049, "step": 1404 }, { "epoch": 3.3058823529411763, "grad_norm": 0.708250880241394, "learning_rate": 4.275001016506342e-06, "loss": 0.0421, "step": 1405 }, { "epoch": 3.308235294117647, "grad_norm": 0.633381187915802, "learning_rate": 4.2688446313200526e-06, "loss": 0.0434, "step": 1406 }, { "epoch": 3.3105882352941176, "grad_norm": 0.70111083984375, "learning_rate": 4.262689379649498e-06, "loss": 0.063, "step": 1407 }, { "epoch": 3.3129411764705883, "grad_norm": 0.9045040011405945, "learning_rate": 4.256535271030685e-06, "loss": 0.0725, "step": 1408 }, { "epoch": 3.315294117647059, "grad_norm": 0.7571228742599487, "learning_rate": 4.2503823149978565e-06, "loss": 0.0805, "step": 1409 }, { "epoch": 3.317647058823529, "grad_norm": 0.8218108415603638, "learning_rate": 4.244230521083473e-06, "loss": 0.0708, "step": 1410 }, { "epoch": 3.32, "grad_norm": 0.684745192527771, "learning_rate": 4.23807989881818e-06, "loss": 0.0677, "step": 1411 }, { "epoch": 3.3223529411764705, "grad_norm": 0.5586913824081421, "learning_rate": 4.231930457730829e-06, "loss": 0.0469, "step": 1412 }, { "epoch": 3.324705882352941, "grad_norm": 0.7413168549537659, "learning_rate": 4.225782207348422e-06, "loss": 0.0723, "step": 1413 }, { "epoch": 3.327058823529412, "grad_norm": 1.1209150552749634, "learning_rate": 4.219635157196127e-06, "loss": 0.0839, "step": 1414 }, { "epoch": 3.3294117647058825, "grad_norm": 0.8276891708374023, "learning_rate": 4.213489316797254e-06, "loss": 0.0665, "step": 1415 }, { "epoch": 3.331764705882353, "grad_norm": 0.8164598345756531, "learning_rate": 4.207344695673227e-06, "loss": 0.0604, "step": 1416 }, { "epoch": 3.3341176470588234, "grad_norm": 0.5691380500793457, "learning_rate": 4.201201303343595e-06, "loss": 0.0513, "step": 1417 }, { "epoch": 3.336470588235294, "grad_norm": 0.6901618838310242, "learning_rate": 4.195059149325997e-06, "loss": 0.0623, "step": 1418 }, { "epoch": 3.3388235294117647, "grad_norm": 0.6162860989570618, "learning_rate": 4.188918243136152e-06, "loss": 0.0494, "step": 1419 }, { "epoch": 3.3411764705882354, "grad_norm": 0.8268863558769226, "learning_rate": 4.182778594287847e-06, "loss": 0.0694, "step": 1420 }, { "epoch": 3.343529411764706, "grad_norm": 0.5481549501419067, "learning_rate": 4.176640212292925e-06, "loss": 0.0407, "step": 1421 }, { "epoch": 3.3458823529411763, "grad_norm": 0.7684838771820068, "learning_rate": 4.170503106661258e-06, "loss": 0.0433, "step": 1422 }, { "epoch": 3.348235294117647, "grad_norm": 0.6201825141906738, "learning_rate": 4.1643672869007505e-06, "loss": 0.0538, "step": 1423 }, { "epoch": 3.3505882352941176, "grad_norm": 0.6142993569374084, "learning_rate": 4.158232762517306e-06, "loss": 0.0374, "step": 1424 }, { "epoch": 3.3529411764705883, "grad_norm": 0.673474133014679, "learning_rate": 4.1520995430148275e-06, "loss": 0.0565, "step": 1425 }, { "epoch": 3.355294117647059, "grad_norm": 0.7072544097900391, "learning_rate": 4.145967637895194e-06, "loss": 0.0624, "step": 1426 }, { "epoch": 3.357647058823529, "grad_norm": 0.6413081288337708, "learning_rate": 4.139837056658247e-06, "loss": 0.0565, "step": 1427 }, { "epoch": 3.36, "grad_norm": 0.5527366399765015, "learning_rate": 4.133707808801779e-06, "loss": 0.0415, "step": 1428 }, { "epoch": 3.3623529411764705, "grad_norm": 0.6953883171081543, "learning_rate": 4.127579903821516e-06, "loss": 0.0567, "step": 1429 }, { "epoch": 3.364705882352941, "grad_norm": 0.9099201560020447, "learning_rate": 4.121453351211102e-06, "loss": 0.073, "step": 1430 }, { "epoch": 3.367058823529412, "grad_norm": 0.6609534025192261, "learning_rate": 4.115328160462088e-06, "loss": 0.0566, "step": 1431 }, { "epoch": 3.369411764705882, "grad_norm": 0.8391723036766052, "learning_rate": 4.109204341063918e-06, "loss": 0.0646, "step": 1432 }, { "epoch": 3.3717647058823528, "grad_norm": 0.6450514197349548, "learning_rate": 4.103081902503901e-06, "loss": 0.0642, "step": 1433 }, { "epoch": 3.3741176470588234, "grad_norm": 0.609714686870575, "learning_rate": 4.096960854267222e-06, "loss": 0.0388, "step": 1434 }, { "epoch": 3.376470588235294, "grad_norm": 0.7976754307746887, "learning_rate": 4.090841205836897e-06, "loss": 0.0666, "step": 1435 }, { "epoch": 3.378823529411765, "grad_norm": 0.6603575944900513, "learning_rate": 4.084722966693781e-06, "loss": 0.0562, "step": 1436 }, { "epoch": 3.3811764705882354, "grad_norm": 0.7049462795257568, "learning_rate": 4.078606146316549e-06, "loss": 0.0553, "step": 1437 }, { "epoch": 3.383529411764706, "grad_norm": 0.5858755111694336, "learning_rate": 4.072490754181668e-06, "loss": 0.0551, "step": 1438 }, { "epoch": 3.3858823529411763, "grad_norm": 0.5769577622413635, "learning_rate": 4.066376799763401e-06, "loss": 0.051, "step": 1439 }, { "epoch": 3.388235294117647, "grad_norm": 0.6870687007904053, "learning_rate": 4.060264292533782e-06, "loss": 0.0598, "step": 1440 }, { "epoch": 3.3905882352941177, "grad_norm": 0.7189241051673889, "learning_rate": 4.054153241962599e-06, "loss": 0.0758, "step": 1441 }, { "epoch": 3.3929411764705883, "grad_norm": 0.5692880749702454, "learning_rate": 4.048043657517387e-06, "loss": 0.0406, "step": 1442 }, { "epoch": 3.395294117647059, "grad_norm": 0.7098594903945923, "learning_rate": 4.041935548663408e-06, "loss": 0.0439, "step": 1443 }, { "epoch": 3.3976470588235292, "grad_norm": 0.6775091290473938, "learning_rate": 4.035828924863638e-06, "loss": 0.0552, "step": 1444 }, { "epoch": 3.4, "grad_norm": 0.6740248799324036, "learning_rate": 4.029723795578754e-06, "loss": 0.0439, "step": 1445 }, { "epoch": 3.4023529411764706, "grad_norm": 0.6803441643714905, "learning_rate": 4.023620170267116e-06, "loss": 0.0481, "step": 1446 }, { "epoch": 3.4047058823529412, "grad_norm": 0.6193303465843201, "learning_rate": 4.017518058384753e-06, "loss": 0.0505, "step": 1447 }, { "epoch": 3.407058823529412, "grad_norm": 0.6889247894287109, "learning_rate": 4.0114174693853525e-06, "loss": 0.0532, "step": 1448 }, { "epoch": 3.409411764705882, "grad_norm": 0.6709504723548889, "learning_rate": 4.005318412720238e-06, "loss": 0.0541, "step": 1449 }, { "epoch": 3.411764705882353, "grad_norm": 0.9423147439956665, "learning_rate": 3.999220897838364e-06, "loss": 0.0487, "step": 1450 }, { "epoch": 3.4141176470588235, "grad_norm": 0.7411242723464966, "learning_rate": 3.9931249341862955e-06, "loss": 0.0694, "step": 1451 }, { "epoch": 3.416470588235294, "grad_norm": 0.6223944425582886, "learning_rate": 3.987030531208189e-06, "loss": 0.0663, "step": 1452 }, { "epoch": 3.418823529411765, "grad_norm": 0.5782997012138367, "learning_rate": 3.980937698345788e-06, "loss": 0.0494, "step": 1453 }, { "epoch": 3.4211764705882355, "grad_norm": 0.7587346434593201, "learning_rate": 3.974846445038407e-06, "loss": 0.0549, "step": 1454 }, { "epoch": 3.4235294117647057, "grad_norm": 0.6465654373168945, "learning_rate": 3.968756780722904e-06, "loss": 0.0588, "step": 1455 }, { "epoch": 3.4258823529411764, "grad_norm": 0.5092988610267639, "learning_rate": 3.96266871483368e-06, "loss": 0.0334, "step": 1456 }, { "epoch": 3.428235294117647, "grad_norm": 0.7225087285041809, "learning_rate": 3.956582256802667e-06, "loss": 0.0439, "step": 1457 }, { "epoch": 3.4305882352941177, "grad_norm": 0.7965821623802185, "learning_rate": 3.9504974160592886e-06, "loss": 0.0649, "step": 1458 }, { "epoch": 3.4329411764705884, "grad_norm": 0.6133378148078918, "learning_rate": 3.944414202030482e-06, "loss": 0.0498, "step": 1459 }, { "epoch": 3.435294117647059, "grad_norm": 0.7459843158721924, "learning_rate": 3.938332624140648e-06, "loss": 0.0582, "step": 1460 }, { "epoch": 3.4376470588235293, "grad_norm": 0.5898981094360352, "learning_rate": 3.932252691811665e-06, "loss": 0.0488, "step": 1461 }, { "epoch": 3.44, "grad_norm": 0.8492389917373657, "learning_rate": 3.926174414462857e-06, "loss": 0.0541, "step": 1462 }, { "epoch": 3.4423529411764706, "grad_norm": 0.5698546171188354, "learning_rate": 3.920097801510979e-06, "loss": 0.043, "step": 1463 }, { "epoch": 3.4447058823529413, "grad_norm": 0.856282651424408, "learning_rate": 3.914022862370218e-06, "loss": 0.0587, "step": 1464 }, { "epoch": 3.447058823529412, "grad_norm": 0.707055926322937, "learning_rate": 3.907949606452161e-06, "loss": 0.0346, "step": 1465 }, { "epoch": 3.449411764705882, "grad_norm": 0.7313753962516785, "learning_rate": 3.9018780431657855e-06, "loss": 0.0537, "step": 1466 }, { "epoch": 3.451764705882353, "grad_norm": 0.679523766040802, "learning_rate": 3.895808181917453e-06, "loss": 0.0501, "step": 1467 }, { "epoch": 3.4541176470588235, "grad_norm": 1.7952200174331665, "learning_rate": 3.889740032110884e-06, "loss": 0.0576, "step": 1468 }, { "epoch": 3.456470588235294, "grad_norm": 1.0010805130004883, "learning_rate": 3.883673603147146e-06, "loss": 0.0573, "step": 1469 }, { "epoch": 3.458823529411765, "grad_norm": 0.6495124101638794, "learning_rate": 3.877608904424648e-06, "loss": 0.0545, "step": 1470 }, { "epoch": 3.461176470588235, "grad_norm": 0.7434952855110168, "learning_rate": 3.871545945339109e-06, "loss": 0.0633, "step": 1471 }, { "epoch": 3.4635294117647057, "grad_norm": 0.6245753169059753, "learning_rate": 3.865484735283558e-06, "loss": 0.0608, "step": 1472 }, { "epoch": 3.4658823529411764, "grad_norm": 0.656943678855896, "learning_rate": 3.859425283648315e-06, "loss": 0.0443, "step": 1473 }, { "epoch": 3.468235294117647, "grad_norm": 0.6339210867881775, "learning_rate": 3.853367599820972e-06, "loss": 0.0443, "step": 1474 }, { "epoch": 3.4705882352941178, "grad_norm": 0.6373116970062256, "learning_rate": 3.847311693186386e-06, "loss": 0.0531, "step": 1475 }, { "epoch": 3.4729411764705884, "grad_norm": 0.6633740067481995, "learning_rate": 3.84125757312666e-06, "loss": 0.0584, "step": 1476 }, { "epoch": 3.4752941176470586, "grad_norm": 0.8607824444770813, "learning_rate": 3.835205249021127e-06, "loss": 0.0378, "step": 1477 }, { "epoch": 3.4776470588235293, "grad_norm": 0.8656811118125916, "learning_rate": 3.829154730246337e-06, "loss": 0.0443, "step": 1478 }, { "epoch": 3.48, "grad_norm": 0.5924955010414124, "learning_rate": 3.823106026176051e-06, "loss": 0.0352, "step": 1479 }, { "epoch": 3.4823529411764707, "grad_norm": 0.7391366362571716, "learning_rate": 3.817059146181206e-06, "loss": 0.0552, "step": 1480 }, { "epoch": 3.4847058823529413, "grad_norm": 0.6958239674568176, "learning_rate": 3.811014099629926e-06, "loss": 0.0534, "step": 1481 }, { "epoch": 3.487058823529412, "grad_norm": 0.6780948638916016, "learning_rate": 3.804970895887482e-06, "loss": 0.052, "step": 1482 }, { "epoch": 3.489411764705882, "grad_norm": 0.564693808555603, "learning_rate": 3.7989295443162992e-06, "loss": 0.0396, "step": 1483 }, { "epoch": 3.491764705882353, "grad_norm": 0.6049733757972717, "learning_rate": 3.7928900542759323e-06, "loss": 0.0463, "step": 1484 }, { "epoch": 3.4941176470588236, "grad_norm": 0.665569543838501, "learning_rate": 3.7868524351230464e-06, "loss": 0.0455, "step": 1485 }, { "epoch": 3.496470588235294, "grad_norm": 0.5430455803871155, "learning_rate": 3.780816696211414e-06, "loss": 0.0428, "step": 1486 }, { "epoch": 3.498823529411765, "grad_norm": 0.6424660682678223, "learning_rate": 3.7747828468918923e-06, "loss": 0.0598, "step": 1487 }, { "epoch": 3.501176470588235, "grad_norm": 0.5927805304527283, "learning_rate": 3.7687508965124113e-06, "loss": 0.0533, "step": 1488 }, { "epoch": 3.503529411764706, "grad_norm": 0.6050775051116943, "learning_rate": 3.762720854417958e-06, "loss": 0.0632, "step": 1489 }, { "epoch": 3.5058823529411764, "grad_norm": 0.5748914480209351, "learning_rate": 3.7566927299505655e-06, "loss": 0.0494, "step": 1490 }, { "epoch": 3.508235294117647, "grad_norm": 0.7387210130691528, "learning_rate": 3.7506665324492935e-06, "loss": 0.0615, "step": 1491 }, { "epoch": 3.510588235294118, "grad_norm": 0.7126765251159668, "learning_rate": 3.7446422712502184e-06, "loss": 0.0502, "step": 1492 }, { "epoch": 3.512941176470588, "grad_norm": 0.7414655089378357, "learning_rate": 3.7386199556864145e-06, "loss": 0.0513, "step": 1493 }, { "epoch": 3.5152941176470587, "grad_norm": 0.6380066275596619, "learning_rate": 3.7325995950879423e-06, "loss": 0.0411, "step": 1494 }, { "epoch": 3.5176470588235293, "grad_norm": 0.7190722823143005, "learning_rate": 3.7265811987818375e-06, "loss": 0.054, "step": 1495 }, { "epoch": 3.52, "grad_norm": 0.5374762415885925, "learning_rate": 3.7205647760920866e-06, "loss": 0.0421, "step": 1496 }, { "epoch": 3.5223529411764707, "grad_norm": 0.681560218334198, "learning_rate": 3.7145503363396223e-06, "loss": 0.043, "step": 1497 }, { "epoch": 3.524705882352941, "grad_norm": 0.6806833744049072, "learning_rate": 3.7085378888423048e-06, "loss": 0.0482, "step": 1498 }, { "epoch": 3.527058823529412, "grad_norm": 0.6298288106918335, "learning_rate": 3.7025274429149065e-06, "loss": 0.0501, "step": 1499 }, { "epoch": 3.5294117647058822, "grad_norm": 0.6265859007835388, "learning_rate": 3.6965190078690995e-06, "loss": 0.0491, "step": 1500 }, { "epoch": 3.531764705882353, "grad_norm": 0.8476573824882507, "learning_rate": 3.690512593013443e-06, "loss": 0.0752, "step": 1501 }, { "epoch": 3.5341176470588236, "grad_norm": 0.8293654918670654, "learning_rate": 3.6845082076533617e-06, "loss": 0.0761, "step": 1502 }, { "epoch": 3.5364705882352943, "grad_norm": 0.6539145708084106, "learning_rate": 3.67850586109114e-06, "loss": 0.0446, "step": 1503 }, { "epoch": 3.538823529411765, "grad_norm": 0.7799049019813538, "learning_rate": 3.6725055626259002e-06, "loss": 0.0486, "step": 1504 }, { "epoch": 3.541176470588235, "grad_norm": 0.7144519686698914, "learning_rate": 3.666507321553594e-06, "loss": 0.0753, "step": 1505 }, { "epoch": 3.543529411764706, "grad_norm": 0.6979714632034302, "learning_rate": 3.66051114716699e-06, "loss": 0.0463, "step": 1506 }, { "epoch": 3.5458823529411765, "grad_norm": 0.6714532375335693, "learning_rate": 3.654517048755643e-06, "loss": 0.0454, "step": 1507 }, { "epoch": 3.548235294117647, "grad_norm": 0.5966318845748901, "learning_rate": 3.648525035605903e-06, "loss": 0.0405, "step": 1508 }, { "epoch": 3.550588235294118, "grad_norm": 0.8320118188858032, "learning_rate": 3.6425351170008878e-06, "loss": 0.062, "step": 1509 }, { "epoch": 3.552941176470588, "grad_norm": 0.7221941351890564, "learning_rate": 3.6365473022204616e-06, "loss": 0.0576, "step": 1510 }, { "epoch": 3.5552941176470587, "grad_norm": 0.5722947716712952, "learning_rate": 3.6305616005412393e-06, "loss": 0.0495, "step": 1511 }, { "epoch": 3.5576470588235294, "grad_norm": 0.7725701332092285, "learning_rate": 3.6245780212365587e-06, "loss": 0.0552, "step": 1512 }, { "epoch": 3.56, "grad_norm": 0.6416175365447998, "learning_rate": 3.618596573576467e-06, "loss": 0.0652, "step": 1513 }, { "epoch": 3.5623529411764707, "grad_norm": 0.844175398349762, "learning_rate": 3.612617266827713e-06, "loss": 0.067, "step": 1514 }, { "epoch": 3.564705882352941, "grad_norm": 0.6434468030929565, "learning_rate": 3.6066401102537236e-06, "loss": 0.0421, "step": 1515 }, { "epoch": 3.5670588235294116, "grad_norm": 0.6941683292388916, "learning_rate": 3.6006651131146007e-06, "loss": 0.0431, "step": 1516 }, { "epoch": 3.5694117647058823, "grad_norm": 0.6977798938751221, "learning_rate": 3.594692284667096e-06, "loss": 0.0505, "step": 1517 }, { "epoch": 3.571764705882353, "grad_norm": 0.6180133819580078, "learning_rate": 3.5887216341646035e-06, "loss": 0.0537, "step": 1518 }, { "epoch": 3.5741176470588236, "grad_norm": 0.6713327169418335, "learning_rate": 3.5827531708571423e-06, "loss": 0.0643, "step": 1519 }, { "epoch": 3.576470588235294, "grad_norm": 1.0031547546386719, "learning_rate": 3.576786903991344e-06, "loss": 0.0646, "step": 1520 }, { "epoch": 3.578823529411765, "grad_norm": 0.733528733253479, "learning_rate": 3.5708228428104346e-06, "loss": 0.0681, "step": 1521 }, { "epoch": 3.581176470588235, "grad_norm": 0.5416224598884583, "learning_rate": 3.564860996554226e-06, "loss": 0.0416, "step": 1522 }, { "epoch": 3.583529411764706, "grad_norm": 0.6355197429656982, "learning_rate": 3.558901374459098e-06, "loss": 0.0647, "step": 1523 }, { "epoch": 3.5858823529411765, "grad_norm": 0.6344274878501892, "learning_rate": 3.552943985757983e-06, "loss": 0.0553, "step": 1524 }, { "epoch": 3.588235294117647, "grad_norm": 0.6710776686668396, "learning_rate": 3.5469888396803576e-06, "loss": 0.0526, "step": 1525 }, { "epoch": 3.590588235294118, "grad_norm": 0.8313812613487244, "learning_rate": 3.541035945452216e-06, "loss": 0.0501, "step": 1526 }, { "epoch": 3.592941176470588, "grad_norm": 0.8184390664100647, "learning_rate": 3.5350853122960705e-06, "loss": 0.0545, "step": 1527 }, { "epoch": 3.5952941176470588, "grad_norm": 0.5932326316833496, "learning_rate": 3.5291369494309326e-06, "loss": 0.0407, "step": 1528 }, { "epoch": 3.5976470588235294, "grad_norm": 0.7754701972007751, "learning_rate": 3.523190866072286e-06, "loss": 0.0703, "step": 1529 }, { "epoch": 3.6, "grad_norm": 1.4121326208114624, "learning_rate": 3.5172470714320916e-06, "loss": 0.0675, "step": 1530 }, { "epoch": 3.6023529411764708, "grad_norm": 0.738274335861206, "learning_rate": 3.511305574718767e-06, "loss": 0.051, "step": 1531 }, { "epoch": 3.604705882352941, "grad_norm": 0.5938087105751038, "learning_rate": 3.5053663851371574e-06, "loss": 0.04, "step": 1532 }, { "epoch": 3.6070588235294117, "grad_norm": 0.5753822326660156, "learning_rate": 3.4994295118885464e-06, "loss": 0.041, "step": 1533 }, { "epoch": 3.6094117647058823, "grad_norm": 0.7501354217529297, "learning_rate": 3.4934949641706234e-06, "loss": 0.0648, "step": 1534 }, { "epoch": 3.611764705882353, "grad_norm": 0.6991965770721436, "learning_rate": 3.487562751177474e-06, "loss": 0.0596, "step": 1535 }, { "epoch": 3.6141176470588237, "grad_norm": 0.7730932831764221, "learning_rate": 3.4816328820995703e-06, "loss": 0.0544, "step": 1536 }, { "epoch": 3.616470588235294, "grad_norm": 0.9618079662322998, "learning_rate": 3.475705366123748e-06, "loss": 0.0399, "step": 1537 }, { "epoch": 3.6188235294117646, "grad_norm": 0.6133480668067932, "learning_rate": 3.4697802124332015e-06, "loss": 0.0427, "step": 1538 }, { "epoch": 3.621176470588235, "grad_norm": 0.6616939306259155, "learning_rate": 3.4638574302074663e-06, "loss": 0.0621, "step": 1539 }, { "epoch": 3.623529411764706, "grad_norm": 0.889970600605011, "learning_rate": 3.4579370286223994e-06, "loss": 0.0471, "step": 1540 }, { "epoch": 3.6258823529411766, "grad_norm": 0.829371988773346, "learning_rate": 3.4520190168501726e-06, "loss": 0.0666, "step": 1541 }, { "epoch": 3.6282352941176472, "grad_norm": 0.6455196738243103, "learning_rate": 3.446103404059257e-06, "loss": 0.0545, "step": 1542 }, { "epoch": 3.630588235294118, "grad_norm": 1.2159844636917114, "learning_rate": 3.4401901994144014e-06, "loss": 0.0388, "step": 1543 }, { "epoch": 3.632941176470588, "grad_norm": 0.6022303700447083, "learning_rate": 3.4342794120766287e-06, "loss": 0.0414, "step": 1544 }, { "epoch": 3.635294117647059, "grad_norm": 0.6545528769493103, "learning_rate": 3.4283710512032157e-06, "loss": 0.0401, "step": 1545 }, { "epoch": 3.6376470588235295, "grad_norm": 0.8615573048591614, "learning_rate": 3.422465125947678e-06, "loss": 0.0592, "step": 1546 }, { "epoch": 3.64, "grad_norm": 0.6839662194252014, "learning_rate": 3.416561645459763e-06, "loss": 0.0523, "step": 1547 }, { "epoch": 3.642352941176471, "grad_norm": 0.6358193159103394, "learning_rate": 3.4106606188854225e-06, "loss": 0.0567, "step": 1548 }, { "epoch": 3.644705882352941, "grad_norm": 0.7091001868247986, "learning_rate": 3.404762055366813e-06, "loss": 0.074, "step": 1549 }, { "epoch": 3.6470588235294117, "grad_norm": 0.7803909182548523, "learning_rate": 3.3988659640422737e-06, "loss": 0.0486, "step": 1550 }, { "epoch": 3.6494117647058824, "grad_norm": 0.6990489363670349, "learning_rate": 3.3929723540463135e-06, "loss": 0.0484, "step": 1551 }, { "epoch": 3.651764705882353, "grad_norm": 0.6860995292663574, "learning_rate": 3.3870812345095937e-06, "loss": 0.0516, "step": 1552 }, { "epoch": 3.6541176470588237, "grad_norm": 0.5654842853546143, "learning_rate": 3.381192614558926e-06, "loss": 0.0495, "step": 1553 }, { "epoch": 3.656470588235294, "grad_norm": 0.6316255927085876, "learning_rate": 3.3753065033172385e-06, "loss": 0.0482, "step": 1554 }, { "epoch": 3.6588235294117646, "grad_norm": 0.7249330282211304, "learning_rate": 3.3694229099035803e-06, "loss": 0.0476, "step": 1555 }, { "epoch": 3.6611764705882353, "grad_norm": 0.6643503308296204, "learning_rate": 3.3635418434331006e-06, "loss": 0.0373, "step": 1556 }, { "epoch": 3.663529411764706, "grad_norm": 0.7446545362472534, "learning_rate": 3.357663313017025e-06, "loss": 0.0384, "step": 1557 }, { "epoch": 3.6658823529411766, "grad_norm": 0.7732558846473694, "learning_rate": 3.351787327762662e-06, "loss": 0.0552, "step": 1558 }, { "epoch": 3.668235294117647, "grad_norm": 0.8720780611038208, "learning_rate": 3.345913896773364e-06, "loss": 0.042, "step": 1559 }, { "epoch": 3.6705882352941175, "grad_norm": 0.6133928894996643, "learning_rate": 3.340043029148539e-06, "loss": 0.0394, "step": 1560 }, { "epoch": 3.672941176470588, "grad_norm": 0.6236804723739624, "learning_rate": 3.334174733983616e-06, "loss": 0.0731, "step": 1561 }, { "epoch": 3.675294117647059, "grad_norm": 0.6509265899658203, "learning_rate": 3.3283090203700395e-06, "loss": 0.0624, "step": 1562 }, { "epoch": 3.6776470588235295, "grad_norm": 0.5995845794677734, "learning_rate": 3.3224458973952566e-06, "loss": 0.0512, "step": 1563 }, { "epoch": 3.68, "grad_norm": 0.7175416350364685, "learning_rate": 3.3165853741427007e-06, "loss": 0.0616, "step": 1564 }, { "epoch": 3.682352941176471, "grad_norm": 0.699326753616333, "learning_rate": 3.3107274596917744e-06, "loss": 0.0628, "step": 1565 }, { "epoch": 3.684705882352941, "grad_norm": 0.7520231604576111, "learning_rate": 3.3048721631178416e-06, "loss": 0.0373, "step": 1566 }, { "epoch": 3.6870588235294117, "grad_norm": 0.7419445514678955, "learning_rate": 3.299019493492211e-06, "loss": 0.0657, "step": 1567 }, { "epoch": 3.6894117647058824, "grad_norm": 0.5454927682876587, "learning_rate": 3.2931694598821183e-06, "loss": 0.0466, "step": 1568 }, { "epoch": 3.691764705882353, "grad_norm": 0.7106801271438599, "learning_rate": 3.287322071350719e-06, "loss": 0.0494, "step": 1569 }, { "epoch": 3.6941176470588237, "grad_norm": 0.80731600522995, "learning_rate": 3.2814773369570674e-06, "loss": 0.0692, "step": 1570 }, { "epoch": 3.696470588235294, "grad_norm": 0.6571000814437866, "learning_rate": 3.2756352657561086e-06, "loss": 0.0367, "step": 1571 }, { "epoch": 3.6988235294117646, "grad_norm": 0.7181046605110168, "learning_rate": 3.269795866798661e-06, "loss": 0.0568, "step": 1572 }, { "epoch": 3.7011764705882353, "grad_norm": 0.8991043567657471, "learning_rate": 3.263959149131401e-06, "loss": 0.0525, "step": 1573 }, { "epoch": 3.703529411764706, "grad_norm": 0.6854153275489807, "learning_rate": 3.2581251217968528e-06, "loss": 0.0548, "step": 1574 }, { "epoch": 3.7058823529411766, "grad_norm": 0.5120577812194824, "learning_rate": 3.2522937938333767e-06, "loss": 0.0346, "step": 1575 }, { "epoch": 3.708235294117647, "grad_norm": 0.6433867812156677, "learning_rate": 3.246465174275141e-06, "loss": 0.0611, "step": 1576 }, { "epoch": 3.7105882352941175, "grad_norm": 0.697925329208374, "learning_rate": 3.2406392721521255e-06, "loss": 0.0493, "step": 1577 }, { "epoch": 3.712941176470588, "grad_norm": 0.5316213965415955, "learning_rate": 3.2348160964901025e-06, "loss": 0.0458, "step": 1578 }, { "epoch": 3.715294117647059, "grad_norm": 0.5620549321174622, "learning_rate": 3.2289956563106094e-06, "loss": 0.0333, "step": 1579 }, { "epoch": 3.7176470588235295, "grad_norm": 0.7556501626968384, "learning_rate": 3.2231779606309595e-06, "loss": 0.0534, "step": 1580 }, { "epoch": 3.7199999999999998, "grad_norm": 0.6395542025566101, "learning_rate": 3.2173630184641997e-06, "loss": 0.0483, "step": 1581 }, { "epoch": 3.722352941176471, "grad_norm": 0.9265744090080261, "learning_rate": 3.2115508388191237e-06, "loss": 0.0533, "step": 1582 }, { "epoch": 3.724705882352941, "grad_norm": 0.746200442314148, "learning_rate": 3.20574143070024e-06, "loss": 0.064, "step": 1583 }, { "epoch": 3.7270588235294118, "grad_norm": 0.5577819347381592, "learning_rate": 3.1999348031077593e-06, "loss": 0.0304, "step": 1584 }, { "epoch": 3.7294117647058824, "grad_norm": 0.6770521998405457, "learning_rate": 3.194130965037593e-06, "loss": 0.0479, "step": 1585 }, { "epoch": 3.731764705882353, "grad_norm": 0.717755138874054, "learning_rate": 3.188329925481325e-06, "loss": 0.0667, "step": 1586 }, { "epoch": 3.7341176470588238, "grad_norm": 0.7452983260154724, "learning_rate": 3.1825316934262046e-06, "loss": 0.0658, "step": 1587 }, { "epoch": 3.736470588235294, "grad_norm": 0.7463365197181702, "learning_rate": 3.176736277855133e-06, "loss": 0.0707, "step": 1588 }, { "epoch": 3.7388235294117647, "grad_norm": 0.6792470216751099, "learning_rate": 3.1709436877466466e-06, "loss": 0.0519, "step": 1589 }, { "epoch": 3.7411764705882353, "grad_norm": 0.6617421507835388, "learning_rate": 3.1651539320749048e-06, "loss": 0.0489, "step": 1590 }, { "epoch": 3.743529411764706, "grad_norm": 0.6813374161720276, "learning_rate": 3.1593670198096763e-06, "loss": 0.0651, "step": 1591 }, { "epoch": 3.7458823529411767, "grad_norm": 0.5145179033279419, "learning_rate": 3.1535829599163244e-06, "loss": 0.0349, "step": 1592 }, { "epoch": 3.748235294117647, "grad_norm": 0.8012895584106445, "learning_rate": 3.147801761355792e-06, "loss": 0.0552, "step": 1593 }, { "epoch": 3.7505882352941176, "grad_norm": 0.6948776841163635, "learning_rate": 3.1420234330845927e-06, "loss": 0.0708, "step": 1594 }, { "epoch": 3.7529411764705882, "grad_norm": 0.6374104022979736, "learning_rate": 3.1362479840547883e-06, "loss": 0.0505, "step": 1595 }, { "epoch": 3.755294117647059, "grad_norm": 0.8743393421173096, "learning_rate": 3.130475423213983e-06, "loss": 0.0547, "step": 1596 }, { "epoch": 3.7576470588235296, "grad_norm": 0.6177039742469788, "learning_rate": 3.124705759505309e-06, "loss": 0.0398, "step": 1597 }, { "epoch": 3.76, "grad_norm": 0.572316586971283, "learning_rate": 3.118939001867404e-06, "loss": 0.0475, "step": 1598 }, { "epoch": 3.7623529411764705, "grad_norm": 0.7171497941017151, "learning_rate": 3.113175159234406e-06, "loss": 0.072, "step": 1599 }, { "epoch": 3.764705882352941, "grad_norm": 0.5975511074066162, "learning_rate": 3.1074142405359424e-06, "loss": 0.0605, "step": 1600 }, { "epoch": 3.764705882352941, "eval_loss": 0.2078520953655243, "eval_runtime": 3.332, "eval_samples_per_second": 33.314, "eval_steps_per_second": 1.2, "step": 1600 }, { "epoch": 3.767058823529412, "grad_norm": 0.6895652413368225, "learning_rate": 3.1016562546970994e-06, "loss": 0.0499, "step": 1601 }, { "epoch": 3.7694117647058825, "grad_norm": 0.8133839964866638, "learning_rate": 3.0959012106384327e-06, "loss": 0.0573, "step": 1602 }, { "epoch": 3.7717647058823527, "grad_norm": 0.6378521919250488, "learning_rate": 3.090149117275928e-06, "loss": 0.0521, "step": 1603 }, { "epoch": 3.774117647058824, "grad_norm": 0.709263801574707, "learning_rate": 3.084399983521006e-06, "loss": 0.0433, "step": 1604 }, { "epoch": 3.776470588235294, "grad_norm": 0.6668934226036072, "learning_rate": 3.078653818280507e-06, "loss": 0.0772, "step": 1605 }, { "epoch": 3.7788235294117647, "grad_norm": 0.732501208782196, "learning_rate": 3.0729106304566592e-06, "loss": 0.0391, "step": 1606 }, { "epoch": 3.7811764705882354, "grad_norm": 0.626349151134491, "learning_rate": 3.0671704289470916e-06, "loss": 0.0623, "step": 1607 }, { "epoch": 3.783529411764706, "grad_norm": 0.7870427370071411, "learning_rate": 3.0614332226447984e-06, "loss": 0.0648, "step": 1608 }, { "epoch": 3.7858823529411767, "grad_norm": 0.7330435514450073, "learning_rate": 3.055699020438136e-06, "loss": 0.0525, "step": 1609 }, { "epoch": 3.788235294117647, "grad_norm": 0.6182593107223511, "learning_rate": 3.0499678312108065e-06, "loss": 0.0465, "step": 1610 }, { "epoch": 3.7905882352941176, "grad_norm": 0.6962723135948181, "learning_rate": 3.0442396638418457e-06, "loss": 0.0662, "step": 1611 }, { "epoch": 3.7929411764705883, "grad_norm": 0.6408659219741821, "learning_rate": 3.038514527205605e-06, "loss": 0.0627, "step": 1612 }, { "epoch": 3.795294117647059, "grad_norm": 0.5173056125640869, "learning_rate": 3.0327924301717426e-06, "loss": 0.0368, "step": 1613 }, { "epoch": 3.7976470588235296, "grad_norm": 0.6127952933311462, "learning_rate": 3.0270733816052056e-06, "loss": 0.0512, "step": 1614 }, { "epoch": 3.8, "grad_norm": 0.6519802212715149, "learning_rate": 3.0213573903662196e-06, "loss": 0.0565, "step": 1615 }, { "epoch": 3.8023529411764705, "grad_norm": 0.6610091328620911, "learning_rate": 3.015644465310275e-06, "loss": 0.0478, "step": 1616 }, { "epoch": 3.804705882352941, "grad_norm": 0.6177372932434082, "learning_rate": 3.009934615288108e-06, "loss": 0.0435, "step": 1617 }, { "epoch": 3.807058823529412, "grad_norm": 0.6565794944763184, "learning_rate": 3.004227849145695e-06, "loss": 0.0396, "step": 1618 }, { "epoch": 3.8094117647058825, "grad_norm": 0.5739056468009949, "learning_rate": 2.9985241757242347e-06, "loss": 0.0465, "step": 1619 }, { "epoch": 3.8117647058823527, "grad_norm": 0.5591033101081848, "learning_rate": 2.9928236038601274e-06, "loss": 0.0496, "step": 1620 }, { "epoch": 3.8141176470588234, "grad_norm": 0.750174880027771, "learning_rate": 2.987126142384977e-06, "loss": 0.0693, "step": 1621 }, { "epoch": 3.816470588235294, "grad_norm": 0.6897544264793396, "learning_rate": 2.9814318001255672e-06, "loss": 0.0595, "step": 1622 }, { "epoch": 3.8188235294117647, "grad_norm": 0.6736388802528381, "learning_rate": 2.9757405859038425e-06, "loss": 0.0593, "step": 1623 }, { "epoch": 3.8211764705882354, "grad_norm": 0.5495582222938538, "learning_rate": 2.9700525085369103e-06, "loss": 0.0307, "step": 1624 }, { "epoch": 3.8235294117647056, "grad_norm": 0.6217741966247559, "learning_rate": 2.9643675768370113e-06, "loss": 0.0456, "step": 1625 }, { "epoch": 3.8258823529411767, "grad_norm": 0.5974146723747253, "learning_rate": 2.9586857996115137e-06, "loss": 0.0437, "step": 1626 }, { "epoch": 3.828235294117647, "grad_norm": 0.680789589881897, "learning_rate": 2.953007185662907e-06, "loss": 0.0446, "step": 1627 }, { "epoch": 3.8305882352941176, "grad_norm": 0.5888544321060181, "learning_rate": 2.9473317437887666e-06, "loss": 0.055, "step": 1628 }, { "epoch": 3.8329411764705883, "grad_norm": 0.5317657589912415, "learning_rate": 2.941659482781764e-06, "loss": 0.0474, "step": 1629 }, { "epoch": 3.835294117647059, "grad_norm": 0.7098889350891113, "learning_rate": 2.9359904114296397e-06, "loss": 0.0645, "step": 1630 }, { "epoch": 3.8376470588235296, "grad_norm": 0.6749153137207031, "learning_rate": 2.9303245385151883e-06, "loss": 0.0627, "step": 1631 }, { "epoch": 3.84, "grad_norm": 0.5657575130462646, "learning_rate": 2.924661872816256e-06, "loss": 0.0447, "step": 1632 }, { "epoch": 3.8423529411764705, "grad_norm": 0.648543119430542, "learning_rate": 2.919002423105717e-06, "loss": 0.0641, "step": 1633 }, { "epoch": 3.844705882352941, "grad_norm": 0.5736582279205322, "learning_rate": 2.913346198151463e-06, "loss": 0.0412, "step": 1634 }, { "epoch": 3.847058823529412, "grad_norm": 0.6214462518692017, "learning_rate": 2.90769320671639e-06, "loss": 0.0507, "step": 1635 }, { "epoch": 3.8494117647058825, "grad_norm": 1.1984723806381226, "learning_rate": 2.902043457558382e-06, "loss": 0.0349, "step": 1636 }, { "epoch": 3.8517647058823528, "grad_norm": 0.7699630856513977, "learning_rate": 2.896396959430305e-06, "loss": 0.0611, "step": 1637 }, { "epoch": 3.8541176470588234, "grad_norm": 0.7370188236236572, "learning_rate": 2.8907537210799896e-06, "loss": 0.0471, "step": 1638 }, { "epoch": 3.856470588235294, "grad_norm": 0.8397631049156189, "learning_rate": 2.8851137512502048e-06, "loss": 0.0623, "step": 1639 }, { "epoch": 3.8588235294117648, "grad_norm": 0.713860273361206, "learning_rate": 2.8794770586786662e-06, "loss": 0.0555, "step": 1640 }, { "epoch": 3.8611764705882354, "grad_norm": 0.7423220276832581, "learning_rate": 2.8738436520980156e-06, "loss": 0.07, "step": 1641 }, { "epoch": 3.8635294117647057, "grad_norm": 0.7277423143386841, "learning_rate": 2.868213540235788e-06, "loss": 0.0539, "step": 1642 }, { "epoch": 3.8658823529411763, "grad_norm": 0.7192392945289612, "learning_rate": 2.8625867318144283e-06, "loss": 0.0585, "step": 1643 }, { "epoch": 3.868235294117647, "grad_norm": 0.6056866645812988, "learning_rate": 2.856963235551261e-06, "loss": 0.0498, "step": 1644 }, { "epoch": 3.8705882352941177, "grad_norm": 0.6310097575187683, "learning_rate": 2.851343060158476e-06, "loss": 0.0418, "step": 1645 }, { "epoch": 3.8729411764705883, "grad_norm": 0.7208617925643921, "learning_rate": 2.8457262143431192e-06, "loss": 0.0703, "step": 1646 }, { "epoch": 3.8752941176470586, "grad_norm": 0.632614254951477, "learning_rate": 2.840112706807076e-06, "loss": 0.0529, "step": 1647 }, { "epoch": 3.8776470588235297, "grad_norm": 0.792515754699707, "learning_rate": 2.8345025462470675e-06, "loss": 0.059, "step": 1648 }, { "epoch": 3.88, "grad_norm": 0.7114555239677429, "learning_rate": 2.8288957413546216e-06, "loss": 0.0685, "step": 1649 }, { "epoch": 3.8823529411764706, "grad_norm": 0.6725842356681824, "learning_rate": 2.8232923008160694e-06, "loss": 0.0449, "step": 1650 }, { "epoch": 3.8847058823529412, "grad_norm": 0.6409413814544678, "learning_rate": 2.817692233312535e-06, "loss": 0.0488, "step": 1651 }, { "epoch": 3.887058823529412, "grad_norm": 0.6377838253974915, "learning_rate": 2.81209554751991e-06, "loss": 0.058, "step": 1652 }, { "epoch": 3.8894117647058826, "grad_norm": 0.6340404748916626, "learning_rate": 2.8065022521088474e-06, "loss": 0.0509, "step": 1653 }, { "epoch": 3.891764705882353, "grad_norm": 0.6542562246322632, "learning_rate": 2.8009123557447543e-06, "loss": 0.0521, "step": 1654 }, { "epoch": 3.8941176470588235, "grad_norm": 0.561510443687439, "learning_rate": 2.795325867087764e-06, "loss": 0.0418, "step": 1655 }, { "epoch": 3.896470588235294, "grad_norm": 0.7120569944381714, "learning_rate": 2.789742794792734e-06, "loss": 0.0632, "step": 1656 }, { "epoch": 3.898823529411765, "grad_norm": 0.7488998174667358, "learning_rate": 2.784163147509232e-06, "loss": 0.063, "step": 1657 }, { "epoch": 3.9011764705882355, "grad_norm": 0.6038960814476013, "learning_rate": 2.7785869338815143e-06, "loss": 0.0416, "step": 1658 }, { "epoch": 3.9035294117647057, "grad_norm": 0.7641870975494385, "learning_rate": 2.7730141625485164e-06, "loss": 0.0639, "step": 1659 }, { "epoch": 3.9058823529411764, "grad_norm": 0.7091242074966431, "learning_rate": 2.767444842143849e-06, "loss": 0.0586, "step": 1660 }, { "epoch": 3.908235294117647, "grad_norm": 0.9158393144607544, "learning_rate": 2.7618789812957696e-06, "loss": 0.0565, "step": 1661 }, { "epoch": 3.9105882352941177, "grad_norm": 0.6919254660606384, "learning_rate": 2.756316588627176e-06, "loss": 0.0611, "step": 1662 }, { "epoch": 3.9129411764705884, "grad_norm": 0.8638816475868225, "learning_rate": 2.750757672755597e-06, "loss": 0.065, "step": 1663 }, { "epoch": 3.9152941176470586, "grad_norm": 0.5773096084594727, "learning_rate": 2.745202242293171e-06, "loss": 0.0528, "step": 1664 }, { "epoch": 3.9176470588235293, "grad_norm": 0.5660383105278015, "learning_rate": 2.739650305846642e-06, "loss": 0.0458, "step": 1665 }, { "epoch": 3.92, "grad_norm": 0.6585059762001038, "learning_rate": 2.734101872017334e-06, "loss": 0.0362, "step": 1666 }, { "epoch": 3.9223529411764706, "grad_norm": 0.7854761481285095, "learning_rate": 2.7285569494011476e-06, "loss": 0.0707, "step": 1667 }, { "epoch": 3.9247058823529413, "grad_norm": 0.6969524621963501, "learning_rate": 2.7230155465885484e-06, "loss": 0.0511, "step": 1668 }, { "epoch": 3.9270588235294115, "grad_norm": 0.7440138459205627, "learning_rate": 2.717477672164538e-06, "loss": 0.0335, "step": 1669 }, { "epoch": 3.9294117647058826, "grad_norm": 0.5925984978675842, "learning_rate": 2.7119433347086605e-06, "loss": 0.0463, "step": 1670 }, { "epoch": 3.931764705882353, "grad_norm": 0.7008888125419617, "learning_rate": 2.706412542794981e-06, "loss": 0.0632, "step": 1671 }, { "epoch": 3.9341176470588235, "grad_norm": 0.820094645023346, "learning_rate": 2.700885304992067e-06, "loss": 0.0701, "step": 1672 }, { "epoch": 3.936470588235294, "grad_norm": 0.7981771230697632, "learning_rate": 2.695361629862979e-06, "loss": 0.0518, "step": 1673 }, { "epoch": 3.938823529411765, "grad_norm": 1.2317931652069092, "learning_rate": 2.6898415259652633e-06, "loss": 0.0332, "step": 1674 }, { "epoch": 3.9411764705882355, "grad_norm": 0.9631883502006531, "learning_rate": 2.684325001850931e-06, "loss": 0.0645, "step": 1675 }, { "epoch": 3.9435294117647057, "grad_norm": 0.6429290771484375, "learning_rate": 2.6788120660664442e-06, "loss": 0.0483, "step": 1676 }, { "epoch": 3.9458823529411764, "grad_norm": 0.701875627040863, "learning_rate": 2.673302727152711e-06, "loss": 0.0692, "step": 1677 }, { "epoch": 3.948235294117647, "grad_norm": 0.6788539886474609, "learning_rate": 2.6677969936450644e-06, "loss": 0.0725, "step": 1678 }, { "epoch": 3.9505882352941177, "grad_norm": 0.9383338689804077, "learning_rate": 2.6622948740732507e-06, "loss": 0.0508, "step": 1679 }, { "epoch": 3.9529411764705884, "grad_norm": 0.6625303030014038, "learning_rate": 2.6567963769614177e-06, "loss": 0.054, "step": 1680 }, { "epoch": 3.9552941176470586, "grad_norm": 0.8969583511352539, "learning_rate": 2.6513015108281036e-06, "loss": 0.0609, "step": 1681 }, { "epoch": 3.9576470588235293, "grad_norm": 0.7066641449928284, "learning_rate": 2.645810284186221e-06, "loss": 0.0609, "step": 1682 }, { "epoch": 3.96, "grad_norm": 0.7366204261779785, "learning_rate": 2.640322705543038e-06, "loss": 0.0706, "step": 1683 }, { "epoch": 3.9623529411764706, "grad_norm": 0.7286041378974915, "learning_rate": 2.6348387834001785e-06, "loss": 0.056, "step": 1684 }, { "epoch": 3.9647058823529413, "grad_norm": 0.6399794220924377, "learning_rate": 2.6293585262536037e-06, "loss": 0.0526, "step": 1685 }, { "epoch": 3.9670588235294115, "grad_norm": 0.7203150391578674, "learning_rate": 2.6238819425935824e-06, "loss": 0.039, "step": 1686 }, { "epoch": 3.969411764705882, "grad_norm": 0.6771878004074097, "learning_rate": 2.6184090409047065e-06, "loss": 0.0652, "step": 1687 }, { "epoch": 3.971764705882353, "grad_norm": 0.6680408716201782, "learning_rate": 2.6129398296658627e-06, "loss": 0.0458, "step": 1688 }, { "epoch": 3.9741176470588235, "grad_norm": 0.5898144841194153, "learning_rate": 2.6074743173502082e-06, "loss": 0.0346, "step": 1689 }, { "epoch": 3.976470588235294, "grad_norm": 0.7494096159934998, "learning_rate": 2.602012512425183e-06, "loss": 0.077, "step": 1690 }, { "epoch": 3.978823529411765, "grad_norm": 0.6727002859115601, "learning_rate": 2.5965544233524786e-06, "loss": 0.056, "step": 1691 }, { "epoch": 3.9811764705882355, "grad_norm": 0.6718494296073914, "learning_rate": 2.591100058588029e-06, "loss": 0.0591, "step": 1692 }, { "epoch": 3.9835294117647058, "grad_norm": 0.5671539306640625, "learning_rate": 2.585649426581997e-06, "loss": 0.0359, "step": 1693 }, { "epoch": 3.9858823529411764, "grad_norm": 0.6335062384605408, "learning_rate": 2.580202535778764e-06, "loss": 0.0468, "step": 1694 }, { "epoch": 3.988235294117647, "grad_norm": 0.5970304012298584, "learning_rate": 2.5747593946169184e-06, "loss": 0.0315, "step": 1695 }, { "epoch": 3.9905882352941178, "grad_norm": 0.7124571204185486, "learning_rate": 2.5693200115292355e-06, "loss": 0.0723, "step": 1696 }, { "epoch": 3.9929411764705884, "grad_norm": 0.5812715888023376, "learning_rate": 2.5638843949426673e-06, "loss": 0.0503, "step": 1697 }, { "epoch": 3.9952941176470587, "grad_norm": 0.6741745471954346, "learning_rate": 2.558452553278336e-06, "loss": 0.0462, "step": 1698 }, { "epoch": 3.9976470588235293, "grad_norm": 0.8010451197624207, "learning_rate": 2.5530244949515103e-06, "loss": 0.0607, "step": 1699 }, { "epoch": 4.0, "grad_norm": 0.5861409306526184, "learning_rate": 2.5476002283715984e-06, "loss": 0.041, "step": 1700 }, { "epoch": 4.00235294117647, "grad_norm": 0.42000025510787964, "learning_rate": 2.5421797619421384e-06, "loss": 0.047, "step": 1701 }, { "epoch": 4.004705882352941, "grad_norm": 0.4812847673892975, "learning_rate": 2.536763104060778e-06, "loss": 0.0229, "step": 1702 }, { "epoch": 4.007058823529412, "grad_norm": 0.4776272475719452, "learning_rate": 2.5313502631192594e-06, "loss": 0.026, "step": 1703 }, { "epoch": 4.009411764705883, "grad_norm": 0.3527643084526062, "learning_rate": 2.5259412475034226e-06, "loss": 0.0193, "step": 1704 }, { "epoch": 4.011764705882353, "grad_norm": 0.4161747992038727, "learning_rate": 2.5205360655931726e-06, "loss": 0.0245, "step": 1705 }, { "epoch": 4.014117647058823, "grad_norm": 0.45115095376968384, "learning_rate": 2.5151347257624757e-06, "loss": 0.0291, "step": 1706 }, { "epoch": 4.016470588235294, "grad_norm": 0.4003223776817322, "learning_rate": 2.5097372363793503e-06, "loss": 0.0256, "step": 1707 }, { "epoch": 4.0188235294117645, "grad_norm": 0.5670434236526489, "learning_rate": 2.5043436058058464e-06, "loss": 0.0332, "step": 1708 }, { "epoch": 4.021176470588236, "grad_norm": 0.39886030554771423, "learning_rate": 2.4989538423980337e-06, "loss": 0.0289, "step": 1709 }, { "epoch": 4.023529411764706, "grad_norm": 0.42442089319229126, "learning_rate": 2.4935679545059973e-06, "loss": 0.0234, "step": 1710 }, { "epoch": 4.025882352941177, "grad_norm": 0.48420706391334534, "learning_rate": 2.4881859504738106e-06, "loss": 0.0261, "step": 1711 }, { "epoch": 4.028235294117647, "grad_norm": 0.44125667214393616, "learning_rate": 2.4828078386395357e-06, "loss": 0.0201, "step": 1712 }, { "epoch": 4.030588235294117, "grad_norm": 0.4883314073085785, "learning_rate": 2.477433627335202e-06, "loss": 0.0193, "step": 1713 }, { "epoch": 4.0329411764705885, "grad_norm": 0.5374500155448914, "learning_rate": 2.4720633248867947e-06, "loss": 0.0268, "step": 1714 }, { "epoch": 4.035294117647059, "grad_norm": 0.7481744289398193, "learning_rate": 2.46669693961425e-06, "loss": 0.0325, "step": 1715 }, { "epoch": 4.03764705882353, "grad_norm": 0.4469453990459442, "learning_rate": 2.4613344798314224e-06, "loss": 0.0203, "step": 1716 }, { "epoch": 4.04, "grad_norm": 0.5141384601593018, "learning_rate": 2.4559759538460976e-06, "loss": 0.0218, "step": 1717 }, { "epoch": 4.04235294117647, "grad_norm": 0.45099106431007385, "learning_rate": 2.4506213699599634e-06, "loss": 0.0294, "step": 1718 }, { "epoch": 4.044705882352941, "grad_norm": 0.5943623781204224, "learning_rate": 2.445270736468598e-06, "loss": 0.0211, "step": 1719 }, { "epoch": 4.047058823529412, "grad_norm": 0.6896787285804749, "learning_rate": 2.439924061661457e-06, "loss": 0.0314, "step": 1720 }, { "epoch": 4.049411764705883, "grad_norm": 0.6494241952896118, "learning_rate": 2.434581353821872e-06, "loss": 0.0302, "step": 1721 }, { "epoch": 4.051764705882353, "grad_norm": 0.6108441948890686, "learning_rate": 2.42924262122702e-06, "loss": 0.0337, "step": 1722 }, { "epoch": 4.054117647058823, "grad_norm": 0.5080441832542419, "learning_rate": 2.423907872147921e-06, "loss": 0.0196, "step": 1723 }, { "epoch": 4.056470588235294, "grad_norm": 0.6315309405326843, "learning_rate": 2.41857711484943e-06, "loss": 0.0272, "step": 1724 }, { "epoch": 4.0588235294117645, "grad_norm": 0.5516512393951416, "learning_rate": 2.4132503575902092e-06, "loss": 0.0206, "step": 1725 }, { "epoch": 4.061176470588236, "grad_norm": 0.7889315485954285, "learning_rate": 2.4079276086227298e-06, "loss": 0.031, "step": 1726 }, { "epoch": 4.063529411764706, "grad_norm": 0.6405814290046692, "learning_rate": 2.402608876193247e-06, "loss": 0.0255, "step": 1727 }, { "epoch": 4.065882352941176, "grad_norm": 0.680702805519104, "learning_rate": 2.3972941685418016e-06, "loss": 0.0332, "step": 1728 }, { "epoch": 4.068235294117647, "grad_norm": 0.5502400398254395, "learning_rate": 2.3919834939021933e-06, "loss": 0.039, "step": 1729 }, { "epoch": 4.070588235294117, "grad_norm": 0.42983266711235046, "learning_rate": 2.386676860501972e-06, "loss": 0.0189, "step": 1730 }, { "epoch": 4.0729411764705885, "grad_norm": 0.5533312559127808, "learning_rate": 2.3813742765624325e-06, "loss": 0.0228, "step": 1731 }, { "epoch": 4.075294117647059, "grad_norm": 0.5819824934005737, "learning_rate": 2.3760757502985964e-06, "loss": 0.025, "step": 1732 }, { "epoch": 4.07764705882353, "grad_norm": 0.5771346092224121, "learning_rate": 2.3707812899191896e-06, "loss": 0.027, "step": 1733 }, { "epoch": 4.08, "grad_norm": 0.5885286927223206, "learning_rate": 2.3654909036266466e-06, "loss": 0.0236, "step": 1734 }, { "epoch": 4.08235294117647, "grad_norm": 0.9046328067779541, "learning_rate": 2.360204599617094e-06, "loss": 0.0369, "step": 1735 }, { "epoch": 4.084705882352941, "grad_norm": 0.44469839334487915, "learning_rate": 2.3549223860803205e-06, "loss": 0.0166, "step": 1736 }, { "epoch": 4.087058823529412, "grad_norm": 0.4798140823841095, "learning_rate": 2.3496442711997915e-06, "loss": 0.023, "step": 1737 }, { "epoch": 4.089411764705883, "grad_norm": 0.6299778819084167, "learning_rate": 2.3443702631526126e-06, "loss": 0.0224, "step": 1738 }, { "epoch": 4.091764705882353, "grad_norm": 0.5644456148147583, "learning_rate": 2.3391003701095354e-06, "loss": 0.0189, "step": 1739 }, { "epoch": 4.094117647058823, "grad_norm": 0.7675298452377319, "learning_rate": 2.33383460023493e-06, "loss": 0.0275, "step": 1740 }, { "epoch": 4.096470588235294, "grad_norm": 0.5961945056915283, "learning_rate": 2.328572961686778e-06, "loss": 0.0192, "step": 1741 }, { "epoch": 4.0988235294117645, "grad_norm": 0.46581706404685974, "learning_rate": 2.3233154626166678e-06, "loss": 0.0249, "step": 1742 }, { "epoch": 4.101176470588236, "grad_norm": 0.9032256007194519, "learning_rate": 2.3180621111697676e-06, "loss": 0.0302, "step": 1743 }, { "epoch": 4.103529411764706, "grad_norm": 0.5416139960289001, "learning_rate": 2.312812915484822e-06, "loss": 0.0206, "step": 1744 }, { "epoch": 4.105882352941176, "grad_norm": 0.6123484373092651, "learning_rate": 2.3075678836941395e-06, "loss": 0.0334, "step": 1745 }, { "epoch": 4.108235294117647, "grad_norm": 0.6024317741394043, "learning_rate": 2.3023270239235758e-06, "loss": 0.0282, "step": 1746 }, { "epoch": 4.110588235294117, "grad_norm": 0.48972463607788086, "learning_rate": 2.29709034429252e-06, "loss": 0.024, "step": 1747 }, { "epoch": 4.1129411764705885, "grad_norm": 0.5379170179367065, "learning_rate": 2.291857852913893e-06, "loss": 0.0218, "step": 1748 }, { "epoch": 4.115294117647059, "grad_norm": 0.7932527661323547, "learning_rate": 2.28662955789412e-06, "loss": 0.0278, "step": 1749 }, { "epoch": 4.117647058823529, "grad_norm": 0.5772182941436768, "learning_rate": 2.281405467333126e-06, "loss": 0.0243, "step": 1750 }, { "epoch": 4.12, "grad_norm": 0.5694379806518555, "learning_rate": 2.2761855893243277e-06, "loss": 0.0296, "step": 1751 }, { "epoch": 4.12235294117647, "grad_norm": 0.5875977277755737, "learning_rate": 2.270969931954609e-06, "loss": 0.0343, "step": 1752 }, { "epoch": 4.124705882352941, "grad_norm": 1.0962599515914917, "learning_rate": 2.2657585033043155e-06, "loss": 0.0282, "step": 1753 }, { "epoch": 4.127058823529412, "grad_norm": 0.5852847695350647, "learning_rate": 2.260551311447248e-06, "loss": 0.0277, "step": 1754 }, { "epoch": 4.129411764705883, "grad_norm": 0.7450524568557739, "learning_rate": 2.2553483644506354e-06, "loss": 0.0295, "step": 1755 }, { "epoch": 4.131764705882353, "grad_norm": 0.5639746189117432, "learning_rate": 2.2501496703751338e-06, "loss": 0.0275, "step": 1756 }, { "epoch": 4.134117647058823, "grad_norm": 0.5845755934715271, "learning_rate": 2.2449552372748133e-06, "loss": 0.021, "step": 1757 }, { "epoch": 4.136470588235294, "grad_norm": 1.017167568206787, "learning_rate": 2.239765073197135e-06, "loss": 0.0285, "step": 1758 }, { "epoch": 4.138823529411765, "grad_norm": 0.4522583484649658, "learning_rate": 2.234579186182959e-06, "loss": 0.0184, "step": 1759 }, { "epoch": 4.141176470588236, "grad_norm": 0.5055395364761353, "learning_rate": 2.2293975842665015e-06, "loss": 0.0213, "step": 1760 }, { "epoch": 4.143529411764706, "grad_norm": 0.5410231947898865, "learning_rate": 2.224220275475355e-06, "loss": 0.0211, "step": 1761 }, { "epoch": 4.145882352941176, "grad_norm": 0.5190861225128174, "learning_rate": 2.219047267830458e-06, "loss": 0.0247, "step": 1762 }, { "epoch": 4.148235294117647, "grad_norm": 0.6215575337409973, "learning_rate": 2.2138785693460775e-06, "loss": 0.0212, "step": 1763 }, { "epoch": 4.1505882352941175, "grad_norm": 1.5099438428878784, "learning_rate": 2.2087141880298117e-06, "loss": 0.0393, "step": 1764 }, { "epoch": 4.152941176470589, "grad_norm": 0.5458499193191528, "learning_rate": 2.2035541318825716e-06, "loss": 0.0198, "step": 1765 }, { "epoch": 4.155294117647059, "grad_norm": 0.5073500871658325, "learning_rate": 2.198398408898563e-06, "loss": 0.0208, "step": 1766 }, { "epoch": 4.157647058823529, "grad_norm": 0.5976425409317017, "learning_rate": 2.1932470270652783e-06, "loss": 0.0187, "step": 1767 }, { "epoch": 4.16, "grad_norm": 0.5079370737075806, "learning_rate": 2.18809999436349e-06, "loss": 0.0252, "step": 1768 }, { "epoch": 4.16235294117647, "grad_norm": 0.6899617910385132, "learning_rate": 2.182957318767227e-06, "loss": 0.0291, "step": 1769 }, { "epoch": 4.1647058823529415, "grad_norm": 0.5577676892280579, "learning_rate": 2.17781900824377e-06, "loss": 0.0286, "step": 1770 }, { "epoch": 4.167058823529412, "grad_norm": 0.7420270442962646, "learning_rate": 2.1726850707536355e-06, "loss": 0.0395, "step": 1771 }, { "epoch": 4.169411764705882, "grad_norm": 0.560803234577179, "learning_rate": 2.1675555142505706e-06, "loss": 0.0211, "step": 1772 }, { "epoch": 4.171764705882353, "grad_norm": 0.624365508556366, "learning_rate": 2.1624303466815286e-06, "loss": 0.0312, "step": 1773 }, { "epoch": 4.174117647058823, "grad_norm": 0.5360134840011597, "learning_rate": 2.157309575986665e-06, "loss": 0.0247, "step": 1774 }, { "epoch": 4.176470588235294, "grad_norm": 0.499871164560318, "learning_rate": 2.152193210099327e-06, "loss": 0.0243, "step": 1775 }, { "epoch": 4.178823529411765, "grad_norm": 0.5868552327156067, "learning_rate": 2.1470812569460337e-06, "loss": 0.0257, "step": 1776 }, { "epoch": 4.181176470588236, "grad_norm": 0.5563998818397522, "learning_rate": 2.1419737244464676e-06, "loss": 0.0271, "step": 1777 }, { "epoch": 4.183529411764706, "grad_norm": 0.6100590825080872, "learning_rate": 2.136870620513466e-06, "loss": 0.0309, "step": 1778 }, { "epoch": 4.185882352941176, "grad_norm": 0.5671743750572205, "learning_rate": 2.1317719530530055e-06, "loss": 0.0333, "step": 1779 }, { "epoch": 4.188235294117647, "grad_norm": 0.57925945520401, "learning_rate": 2.1266777299641817e-06, "loss": 0.0227, "step": 1780 }, { "epoch": 4.1905882352941175, "grad_norm": 0.5916570425033569, "learning_rate": 2.121587959139215e-06, "loss": 0.0311, "step": 1781 }, { "epoch": 4.192941176470589, "grad_norm": 0.5728688836097717, "learning_rate": 2.1165026484634217e-06, "loss": 0.0293, "step": 1782 }, { "epoch": 4.195294117647059, "grad_norm": 0.5919227004051208, "learning_rate": 2.1114218058152083e-06, "loss": 0.0323, "step": 1783 }, { "epoch": 4.197647058823529, "grad_norm": 0.6490453481674194, "learning_rate": 2.106345439066064e-06, "loss": 0.0291, "step": 1784 }, { "epoch": 4.2, "grad_norm": 0.5592430830001831, "learning_rate": 2.101273556080537e-06, "loss": 0.0244, "step": 1785 }, { "epoch": 4.20235294117647, "grad_norm": 0.5288647413253784, "learning_rate": 2.0962061647162378e-06, "loss": 0.0211, "step": 1786 }, { "epoch": 4.2047058823529415, "grad_norm": 0.7493144869804382, "learning_rate": 2.0911432728238095e-06, "loss": 0.0225, "step": 1787 }, { "epoch": 4.207058823529412, "grad_norm": 0.5457441806793213, "learning_rate": 2.086084888246927e-06, "loss": 0.019, "step": 1788 }, { "epoch": 4.209411764705882, "grad_norm": 0.520625114440918, "learning_rate": 2.081031018822288e-06, "loss": 0.0152, "step": 1789 }, { "epoch": 4.211764705882353, "grad_norm": 0.4774334132671356, "learning_rate": 2.075981672379586e-06, "loss": 0.0247, "step": 1790 }, { "epoch": 4.214117647058823, "grad_norm": 0.4951927959918976, "learning_rate": 2.070936856741512e-06, "loss": 0.0261, "step": 1791 }, { "epoch": 4.216470588235294, "grad_norm": 0.5156142711639404, "learning_rate": 2.06589657972374e-06, "loss": 0.0282, "step": 1792 }, { "epoch": 4.218823529411765, "grad_norm": 0.43942171335220337, "learning_rate": 2.0608608491349084e-06, "loss": 0.02, "step": 1793 }, { "epoch": 4.221176470588235, "grad_norm": 0.7986873388290405, "learning_rate": 2.055829672776612e-06, "loss": 0.0263, "step": 1794 }, { "epoch": 4.223529411764706, "grad_norm": 0.5600565075874329, "learning_rate": 2.0508030584433944e-06, "loss": 0.0254, "step": 1795 }, { "epoch": 4.225882352941176, "grad_norm": 0.5514658093452454, "learning_rate": 2.045781013922727e-06, "loss": 0.0266, "step": 1796 }, { "epoch": 4.228235294117647, "grad_norm": 0.6986518502235413, "learning_rate": 2.0407635469950017e-06, "loss": 0.0363, "step": 1797 }, { "epoch": 4.2305882352941175, "grad_norm": 0.7387487888336182, "learning_rate": 2.0357506654335226e-06, "loss": 0.0362, "step": 1798 }, { "epoch": 4.232941176470589, "grad_norm": 0.5657845139503479, "learning_rate": 2.030742377004486e-06, "loss": 0.0313, "step": 1799 }, { "epoch": 4.235294117647059, "grad_norm": 0.45734772086143494, "learning_rate": 2.0257386894669726e-06, "loss": 0.0196, "step": 1800 }, { "epoch": 4.235294117647059, "eval_loss": 0.24141359329223633, "eval_runtime": 3.3295, "eval_samples_per_second": 33.338, "eval_steps_per_second": 1.201, "step": 1800 }, { "epoch": 4.237647058823529, "grad_norm": 0.5300816297531128, "learning_rate": 2.020739610572938e-06, "loss": 0.0262, "step": 1801 }, { "epoch": 4.24, "grad_norm": 0.5023676753044128, "learning_rate": 2.015745148067195e-06, "loss": 0.0239, "step": 1802 }, { "epoch": 4.24235294117647, "grad_norm": 0.503661572933197, "learning_rate": 2.0107553096874056e-06, "loss": 0.0149, "step": 1803 }, { "epoch": 4.2447058823529416, "grad_norm": 0.5376380681991577, "learning_rate": 2.0057701031640656e-06, "loss": 0.0195, "step": 1804 }, { "epoch": 4.247058823529412, "grad_norm": 0.6690245866775513, "learning_rate": 2.0007895362204993e-06, "loss": 0.0316, "step": 1805 }, { "epoch": 4.249411764705882, "grad_norm": 0.5719420313835144, "learning_rate": 1.9958136165728436e-06, "loss": 0.0119, "step": 1806 }, { "epoch": 4.251764705882353, "grad_norm": 0.5765411853790283, "learning_rate": 1.990842351930027e-06, "loss": 0.0331, "step": 1807 }, { "epoch": 4.254117647058823, "grad_norm": 0.5285260081291199, "learning_rate": 1.9858757499937737e-06, "loss": 0.0205, "step": 1808 }, { "epoch": 4.2564705882352944, "grad_norm": 0.5836981534957886, "learning_rate": 1.9809138184585873e-06, "loss": 0.0243, "step": 1809 }, { "epoch": 4.258823529411765, "grad_norm": 0.937655508518219, "learning_rate": 1.9759565650117247e-06, "loss": 0.0314, "step": 1810 }, { "epoch": 4.261176470588235, "grad_norm": 0.4412587583065033, "learning_rate": 1.9710039973332035e-06, "loss": 0.0218, "step": 1811 }, { "epoch": 4.263529411764706, "grad_norm": 0.49271538853645325, "learning_rate": 1.9660561230957836e-06, "loss": 0.0184, "step": 1812 }, { "epoch": 4.265882352941176, "grad_norm": 0.6505703330039978, "learning_rate": 1.961112949964947e-06, "loss": 0.0314, "step": 1813 }, { "epoch": 4.268235294117647, "grad_norm": 0.6777704358100891, "learning_rate": 1.9561744855988964e-06, "loss": 0.0348, "step": 1814 }, { "epoch": 4.270588235294118, "grad_norm": 0.5567635893821716, "learning_rate": 1.951240737648536e-06, "loss": 0.0362, "step": 1815 }, { "epoch": 4.272941176470589, "grad_norm": 0.7478465437889099, "learning_rate": 1.9463117137574706e-06, "loss": 0.035, "step": 1816 }, { "epoch": 4.275294117647059, "grad_norm": 0.7008923292160034, "learning_rate": 1.9413874215619777e-06, "loss": 0.0307, "step": 1817 }, { "epoch": 4.277647058823529, "grad_norm": 0.6434463858604431, "learning_rate": 1.9364678686910073e-06, "loss": 0.0334, "step": 1818 }, { "epoch": 4.28, "grad_norm": 0.693565845489502, "learning_rate": 1.9315530627661714e-06, "loss": 0.0283, "step": 1819 }, { "epoch": 4.2823529411764705, "grad_norm": 0.6502346396446228, "learning_rate": 1.9266430114017225e-06, "loss": 0.0221, "step": 1820 }, { "epoch": 4.284705882352942, "grad_norm": 0.6489312648773193, "learning_rate": 1.9217377222045445e-06, "loss": 0.0262, "step": 1821 }, { "epoch": 4.287058823529412, "grad_norm": 0.7954316735267639, "learning_rate": 1.9168372027741543e-06, "loss": 0.0249, "step": 1822 }, { "epoch": 4.289411764705882, "grad_norm": 0.9463852047920227, "learning_rate": 1.9119414607026692e-06, "loss": 0.0369, "step": 1823 }, { "epoch": 4.291764705882353, "grad_norm": 0.613831639289856, "learning_rate": 1.9070505035748066e-06, "loss": 0.0299, "step": 1824 }, { "epoch": 4.294117647058823, "grad_norm": 0.6280023455619812, "learning_rate": 1.9021643389678774e-06, "loss": 0.0248, "step": 1825 }, { "epoch": 4.2964705882352945, "grad_norm": 0.750869631767273, "learning_rate": 1.8972829744517605e-06, "loss": 0.0346, "step": 1826 }, { "epoch": 4.298823529411765, "grad_norm": 0.4741457402706146, "learning_rate": 1.892406417588899e-06, "loss": 0.0278, "step": 1827 }, { "epoch": 4.301176470588235, "grad_norm": 0.6332969665527344, "learning_rate": 1.8875346759342935e-06, "loss": 0.0288, "step": 1828 }, { "epoch": 4.303529411764706, "grad_norm": 0.6008880138397217, "learning_rate": 1.88266775703548e-06, "loss": 0.0265, "step": 1829 }, { "epoch": 4.305882352941176, "grad_norm": 0.46322059631347656, "learning_rate": 1.8778056684325199e-06, "loss": 0.0175, "step": 1830 }, { "epoch": 4.308235294117647, "grad_norm": 0.47674229741096497, "learning_rate": 1.8729484176579998e-06, "loss": 0.0237, "step": 1831 }, { "epoch": 4.310588235294118, "grad_norm": 0.6052824854850769, "learning_rate": 1.868096012237003e-06, "loss": 0.0267, "step": 1832 }, { "epoch": 4.312941176470588, "grad_norm": 0.5301565527915955, "learning_rate": 1.8632484596871128e-06, "loss": 0.0216, "step": 1833 }, { "epoch": 4.315294117647059, "grad_norm": 0.6715896725654602, "learning_rate": 1.8584057675183892e-06, "loss": 0.0347, "step": 1834 }, { "epoch": 4.317647058823529, "grad_norm": 0.6616461277008057, "learning_rate": 1.8535679432333624e-06, "loss": 0.0394, "step": 1835 }, { "epoch": 4.32, "grad_norm": 0.6533105969429016, "learning_rate": 1.8487349943270291e-06, "loss": 0.0189, "step": 1836 }, { "epoch": 4.3223529411764705, "grad_norm": 0.5595614314079285, "learning_rate": 1.843906928286818e-06, "loss": 0.0292, "step": 1837 }, { "epoch": 4.324705882352941, "grad_norm": 0.5416598916053772, "learning_rate": 1.8390837525926052e-06, "loss": 0.0259, "step": 1838 }, { "epoch": 4.327058823529412, "grad_norm": 0.7100399732589722, "learning_rate": 1.8342654747166893e-06, "loss": 0.0343, "step": 1839 }, { "epoch": 4.329411764705882, "grad_norm": 0.6283623576164246, "learning_rate": 1.8294521021237762e-06, "loss": 0.0365, "step": 1840 }, { "epoch": 4.331764705882353, "grad_norm": 0.530127227306366, "learning_rate": 1.8246436422709726e-06, "loss": 0.0239, "step": 1841 }, { "epoch": 4.334117647058823, "grad_norm": 0.6457006931304932, "learning_rate": 1.8198401026077804e-06, "loss": 0.0289, "step": 1842 }, { "epoch": 4.3364705882352945, "grad_norm": 0.6381149291992188, "learning_rate": 1.8150414905760721e-06, "loss": 0.0334, "step": 1843 }, { "epoch": 4.338823529411765, "grad_norm": 0.7035207152366638, "learning_rate": 1.8102478136100862e-06, "loss": 0.0178, "step": 1844 }, { "epoch": 4.341176470588235, "grad_norm": 2.345923662185669, "learning_rate": 1.8054590791364223e-06, "loss": 0.0233, "step": 1845 }, { "epoch": 4.343529411764706, "grad_norm": 1.9358124732971191, "learning_rate": 1.8006752945740172e-06, "loss": 0.0235, "step": 1846 }, { "epoch": 4.345882352941176, "grad_norm": 0.512031078338623, "learning_rate": 1.7958964673341389e-06, "loss": 0.0175, "step": 1847 }, { "epoch": 4.348235294117647, "grad_norm": 0.5193244218826294, "learning_rate": 1.7911226048203763e-06, "loss": 0.0254, "step": 1848 }, { "epoch": 4.350588235294118, "grad_norm": 0.660871148109436, "learning_rate": 1.7863537144286291e-06, "loss": 0.0802, "step": 1849 }, { "epoch": 4.352941176470588, "grad_norm": 0.6197521090507507, "learning_rate": 1.7815898035470916e-06, "loss": 0.0276, "step": 1850 }, { "epoch": 4.355294117647059, "grad_norm": 0.5837772488594055, "learning_rate": 1.7768308795562423e-06, "loss": 0.0278, "step": 1851 }, { "epoch": 4.357647058823529, "grad_norm": 0.5951693058013916, "learning_rate": 1.7720769498288356e-06, "loss": 0.0295, "step": 1852 }, { "epoch": 4.36, "grad_norm": 0.6612610816955566, "learning_rate": 1.7673280217298933e-06, "loss": 0.0284, "step": 1853 }, { "epoch": 4.3623529411764705, "grad_norm": 0.6521152853965759, "learning_rate": 1.7625841026166758e-06, "loss": 0.0307, "step": 1854 }, { "epoch": 4.364705882352941, "grad_norm": 0.5666381120681763, "learning_rate": 1.7578451998386944e-06, "loss": 0.0183, "step": 1855 }, { "epoch": 4.367058823529412, "grad_norm": 0.5937379002571106, "learning_rate": 1.75311132073769e-06, "loss": 0.0297, "step": 1856 }, { "epoch": 4.369411764705882, "grad_norm": 0.6581999063491821, "learning_rate": 1.7483824726476084e-06, "loss": 0.0295, "step": 1857 }, { "epoch": 4.371764705882353, "grad_norm": 0.5768826007843018, "learning_rate": 1.7436586628946139e-06, "loss": 0.0235, "step": 1858 }, { "epoch": 4.374117647058823, "grad_norm": 1.217420220375061, "learning_rate": 1.7389398987970566e-06, "loss": 0.0308, "step": 1859 }, { "epoch": 4.376470588235295, "grad_norm": 0.6339168548583984, "learning_rate": 1.7342261876654772e-06, "loss": 0.0213, "step": 1860 }, { "epoch": 4.378823529411765, "grad_norm": 0.6864333748817444, "learning_rate": 1.7295175368025815e-06, "loss": 0.0268, "step": 1861 }, { "epoch": 4.381176470588235, "grad_norm": 0.5396808981895447, "learning_rate": 1.7248139535032363e-06, "loss": 0.0289, "step": 1862 }, { "epoch": 4.383529411764706, "grad_norm": 0.509730339050293, "learning_rate": 1.7201154450544632e-06, "loss": 0.0281, "step": 1863 }, { "epoch": 4.385882352941176, "grad_norm": 0.5762155652046204, "learning_rate": 1.7154220187354168e-06, "loss": 0.023, "step": 1864 }, { "epoch": 4.3882352941176475, "grad_norm": 0.5934739112854004, "learning_rate": 1.710733681817376e-06, "loss": 0.0327, "step": 1865 }, { "epoch": 4.390588235294118, "grad_norm": 0.6144732236862183, "learning_rate": 1.7060504415637425e-06, "loss": 0.0249, "step": 1866 }, { "epoch": 4.392941176470588, "grad_norm": 0.6597561240196228, "learning_rate": 1.7013723052300155e-06, "loss": 0.0185, "step": 1867 }, { "epoch": 4.395294117647059, "grad_norm": 0.6036880016326904, "learning_rate": 1.6966992800637874e-06, "loss": 0.0224, "step": 1868 }, { "epoch": 4.397647058823529, "grad_norm": 0.5688931345939636, "learning_rate": 1.6920313733047377e-06, "loss": 0.0315, "step": 1869 }, { "epoch": 4.4, "grad_norm": 0.5357118248939514, "learning_rate": 1.6873685921846101e-06, "loss": 0.0179, "step": 1870 }, { "epoch": 4.402352941176471, "grad_norm": 0.5910123586654663, "learning_rate": 1.6827109439272084e-06, "loss": 0.0234, "step": 1871 }, { "epoch": 4.404705882352941, "grad_norm": 0.8906912803649902, "learning_rate": 1.6780584357483887e-06, "loss": 0.0244, "step": 1872 }, { "epoch": 4.407058823529412, "grad_norm": 0.6063958406448364, "learning_rate": 1.673411074856039e-06, "loss": 0.0235, "step": 1873 }, { "epoch": 4.409411764705882, "grad_norm": 0.5685113668441772, "learning_rate": 1.6687688684500732e-06, "loss": 0.0327, "step": 1874 }, { "epoch": 4.411764705882353, "grad_norm": 0.6706697940826416, "learning_rate": 1.664131823722424e-06, "loss": 0.0282, "step": 1875 }, { "epoch": 4.4141176470588235, "grad_norm": 0.7646396160125732, "learning_rate": 1.6594999478570231e-06, "loss": 0.0402, "step": 1876 }, { "epoch": 4.416470588235294, "grad_norm": 0.5035815834999084, "learning_rate": 1.6548732480297935e-06, "loss": 0.027, "step": 1877 }, { "epoch": 4.418823529411765, "grad_norm": 0.651404619216919, "learning_rate": 1.650251731408645e-06, "loss": 0.0212, "step": 1878 }, { "epoch": 4.421176470588235, "grad_norm": 0.5370557904243469, "learning_rate": 1.6456354051534506e-06, "loss": 0.0185, "step": 1879 }, { "epoch": 4.423529411764706, "grad_norm": 0.6163151264190674, "learning_rate": 1.6410242764160497e-06, "loss": 0.0342, "step": 1880 }, { "epoch": 4.425882352941176, "grad_norm": 0.9767559766769409, "learning_rate": 1.6364183523402188e-06, "loss": 0.0307, "step": 1881 }, { "epoch": 4.428235294117647, "grad_norm": 0.5366573929786682, "learning_rate": 1.6318176400616805e-06, "loss": 0.0263, "step": 1882 }, { "epoch": 4.430588235294118, "grad_norm": 0.810806393623352, "learning_rate": 1.6272221467080835e-06, "loss": 0.021, "step": 1883 }, { "epoch": 4.432941176470588, "grad_norm": 0.40564659237861633, "learning_rate": 1.6226318793989799e-06, "loss": 0.0227, "step": 1884 }, { "epoch": 4.435294117647059, "grad_norm": 0.798107922077179, "learning_rate": 1.6180468452458365e-06, "loss": 0.0379, "step": 1885 }, { "epoch": 4.437647058823529, "grad_norm": 0.584542453289032, "learning_rate": 1.613467051352011e-06, "loss": 0.0313, "step": 1886 }, { "epoch": 4.44, "grad_norm": 0.531349241733551, "learning_rate": 1.608892504812739e-06, "loss": 0.0226, "step": 1887 }, { "epoch": 4.442352941176471, "grad_norm": 0.712742805480957, "learning_rate": 1.6043232127151262e-06, "loss": 0.033, "step": 1888 }, { "epoch": 4.444705882352941, "grad_norm": 0.5294598937034607, "learning_rate": 1.5997591821381433e-06, "loss": 0.0185, "step": 1889 }, { "epoch": 4.447058823529412, "grad_norm": 0.5600643754005432, "learning_rate": 1.595200420152605e-06, "loss": 0.0296, "step": 1890 }, { "epoch": 4.449411764705882, "grad_norm": 0.6043305993080139, "learning_rate": 1.5906469338211652e-06, "loss": 0.0303, "step": 1891 }, { "epoch": 4.451764705882353, "grad_norm": 0.5694990754127502, "learning_rate": 1.5860987301983018e-06, "loss": 0.0211, "step": 1892 }, { "epoch": 4.4541176470588235, "grad_norm": 0.658371090888977, "learning_rate": 1.5815558163303143e-06, "loss": 0.0342, "step": 1893 }, { "epoch": 4.456470588235294, "grad_norm": 0.6810142397880554, "learning_rate": 1.5770181992553025e-06, "loss": 0.0209, "step": 1894 }, { "epoch": 4.458823529411765, "grad_norm": 0.5600376725196838, "learning_rate": 1.5724858860031588e-06, "loss": 0.0313, "step": 1895 }, { "epoch": 4.461176470588235, "grad_norm": 0.7715686559677124, "learning_rate": 1.5679588835955653e-06, "loss": 0.0223, "step": 1896 }, { "epoch": 4.463529411764706, "grad_norm": 0.563930094242096, "learning_rate": 1.56343719904597e-06, "loss": 0.0244, "step": 1897 }, { "epoch": 4.465882352941176, "grad_norm": 0.4103754460811615, "learning_rate": 1.5589208393595828e-06, "loss": 0.0193, "step": 1898 }, { "epoch": 4.4682352941176475, "grad_norm": 0.7052960395812988, "learning_rate": 1.5544098115333678e-06, "loss": 0.0274, "step": 1899 }, { "epoch": 4.470588235294118, "grad_norm": 0.5612783432006836, "learning_rate": 1.5499041225560287e-06, "loss": 0.0275, "step": 1900 }, { "epoch": 4.472941176470588, "grad_norm": 0.6451455950737, "learning_rate": 1.5454037794079905e-06, "loss": 0.0262, "step": 1901 }, { "epoch": 4.475294117647059, "grad_norm": 0.7640933990478516, "learning_rate": 1.5409087890614056e-06, "loss": 0.024, "step": 1902 }, { "epoch": 4.477647058823529, "grad_norm": 0.6170369982719421, "learning_rate": 1.5364191584801278e-06, "loss": 0.0449, "step": 1903 }, { "epoch": 4.48, "grad_norm": 0.629677414894104, "learning_rate": 1.5319348946197082e-06, "loss": 0.0284, "step": 1904 }, { "epoch": 4.482352941176471, "grad_norm": 0.5779640674591064, "learning_rate": 1.5274560044273864e-06, "loss": 0.019, "step": 1905 }, { "epoch": 4.484705882352941, "grad_norm": 0.6657842993736267, "learning_rate": 1.5229824948420732e-06, "loss": 0.0314, "step": 1906 }, { "epoch": 4.487058823529412, "grad_norm": 0.5493777990341187, "learning_rate": 1.5185143727943466e-06, "loss": 0.0316, "step": 1907 }, { "epoch": 4.489411764705882, "grad_norm": 0.5363492369651794, "learning_rate": 1.5140516452064358e-06, "loss": 0.0245, "step": 1908 }, { "epoch": 4.491764705882353, "grad_norm": 0.43891069293022156, "learning_rate": 1.509594318992212e-06, "loss": 0.0167, "step": 1909 }, { "epoch": 4.4941176470588236, "grad_norm": 0.7133246064186096, "learning_rate": 1.5051424010571813e-06, "loss": 0.0255, "step": 1910 }, { "epoch": 4.496470588235294, "grad_norm": 0.7026239037513733, "learning_rate": 1.5006958982984699e-06, "loss": 0.0289, "step": 1911 }, { "epoch": 4.498823529411765, "grad_norm": 0.6471855044364929, "learning_rate": 1.49625481760481e-06, "loss": 0.0213, "step": 1912 }, { "epoch": 4.501176470588235, "grad_norm": 0.467190682888031, "learning_rate": 1.4918191658565426e-06, "loss": 0.0145, "step": 1913 }, { "epoch": 4.503529411764706, "grad_norm": 0.585096001625061, "learning_rate": 1.4873889499255908e-06, "loss": 0.0205, "step": 1914 }, { "epoch": 4.5058823529411764, "grad_norm": 0.6410333514213562, "learning_rate": 1.4829641766754576e-06, "loss": 0.0319, "step": 1915 }, { "epoch": 4.508235294117647, "grad_norm": 0.6922761797904968, "learning_rate": 1.478544852961217e-06, "loss": 0.0355, "step": 1916 }, { "epoch": 4.510588235294118, "grad_norm": 0.621031641960144, "learning_rate": 1.474130985629497e-06, "loss": 0.0349, "step": 1917 }, { "epoch": 4.512941176470588, "grad_norm": 0.7330631017684937, "learning_rate": 1.4697225815184716e-06, "loss": 0.0282, "step": 1918 }, { "epoch": 4.515294117647059, "grad_norm": 0.6273748874664307, "learning_rate": 1.465319647457856e-06, "loss": 0.0341, "step": 1919 }, { "epoch": 4.517647058823529, "grad_norm": 0.4696935713291168, "learning_rate": 1.460922190268886e-06, "loss": 0.0191, "step": 1920 }, { "epoch": 4.52, "grad_norm": 0.5957695841789246, "learning_rate": 1.4565302167643133e-06, "loss": 0.0236, "step": 1921 }, { "epoch": 4.522352941176471, "grad_norm": 0.5579785108566284, "learning_rate": 1.4521437337483966e-06, "loss": 0.0226, "step": 1922 }, { "epoch": 4.524705882352941, "grad_norm": 0.5443218350410461, "learning_rate": 1.4477627480168863e-06, "loss": 0.0232, "step": 1923 }, { "epoch": 4.527058823529412, "grad_norm": 0.6180779337882996, "learning_rate": 1.4433872663570153e-06, "loss": 0.0255, "step": 1924 }, { "epoch": 4.529411764705882, "grad_norm": 0.5214064717292786, "learning_rate": 1.4390172955474893e-06, "loss": 0.0281, "step": 1925 }, { "epoch": 4.5317647058823525, "grad_norm": 0.5840471386909485, "learning_rate": 1.4346528423584796e-06, "loss": 0.0308, "step": 1926 }, { "epoch": 4.534117647058824, "grad_norm": 0.6670432090759277, "learning_rate": 1.4302939135516102e-06, "loss": 0.0244, "step": 1927 }, { "epoch": 4.536470588235294, "grad_norm": 0.552777111530304, "learning_rate": 1.4259405158799373e-06, "loss": 0.0172, "step": 1928 }, { "epoch": 4.538823529411765, "grad_norm": 0.9015364050865173, "learning_rate": 1.421592656087957e-06, "loss": 0.0254, "step": 1929 }, { "epoch": 4.541176470588235, "grad_norm": 0.660683274269104, "learning_rate": 1.417250340911586e-06, "loss": 0.0251, "step": 1930 }, { "epoch": 4.543529411764706, "grad_norm": 0.4990255832672119, "learning_rate": 1.4129135770781427e-06, "loss": 0.0191, "step": 1931 }, { "epoch": 4.5458823529411765, "grad_norm": 0.5530819892883301, "learning_rate": 1.4085823713063528e-06, "loss": 0.0248, "step": 1932 }, { "epoch": 4.548235294117647, "grad_norm": 0.9866442084312439, "learning_rate": 1.4042567303063307e-06, "loss": 0.0426, "step": 1933 }, { "epoch": 4.550588235294118, "grad_norm": 0.7778128385543823, "learning_rate": 1.399936660779566e-06, "loss": 0.0243, "step": 1934 }, { "epoch": 4.552941176470588, "grad_norm": 0.6484590172767639, "learning_rate": 1.3956221694189193e-06, "loss": 0.0302, "step": 1935 }, { "epoch": 4.555294117647059, "grad_norm": 0.6753600835800171, "learning_rate": 1.391313262908606e-06, "loss": 0.0343, "step": 1936 }, { "epoch": 4.557647058823529, "grad_norm": 0.5519704818725586, "learning_rate": 1.3870099479241947e-06, "loss": 0.0277, "step": 1937 }, { "epoch": 4.5600000000000005, "grad_norm": 0.5672491192817688, "learning_rate": 1.3827122311325872e-06, "loss": 0.0256, "step": 1938 }, { "epoch": 4.562352941176471, "grad_norm": 0.6417268514633179, "learning_rate": 1.3784201191920108e-06, "loss": 0.0283, "step": 1939 }, { "epoch": 4.564705882352941, "grad_norm": 0.5707656145095825, "learning_rate": 1.3741336187520163e-06, "loss": 0.0277, "step": 1940 }, { "epoch": 4.567058823529412, "grad_norm": 0.687130868434906, "learning_rate": 1.3698527364534539e-06, "loss": 0.0305, "step": 1941 }, { "epoch": 4.569411764705882, "grad_norm": 0.6455793380737305, "learning_rate": 1.3655774789284723e-06, "loss": 0.0337, "step": 1942 }, { "epoch": 4.571764705882353, "grad_norm": 0.6945592164993286, "learning_rate": 1.3613078528005083e-06, "loss": 0.0366, "step": 1943 }, { "epoch": 4.574117647058824, "grad_norm": 0.603651762008667, "learning_rate": 1.3570438646842715e-06, "loss": 0.0252, "step": 1944 }, { "epoch": 4.576470588235294, "grad_norm": 0.6734772324562073, "learning_rate": 1.3527855211857351e-06, "loss": 0.019, "step": 1945 }, { "epoch": 4.578823529411765, "grad_norm": 0.6563680171966553, "learning_rate": 1.3485328289021345e-06, "loss": 0.0306, "step": 1946 }, { "epoch": 4.581176470588235, "grad_norm": 0.5644499063491821, "learning_rate": 1.3442857944219434e-06, "loss": 0.0226, "step": 1947 }, { "epoch": 4.583529411764706, "grad_norm": 0.43741509318351746, "learning_rate": 1.3400444243248694e-06, "loss": 0.0213, "step": 1948 }, { "epoch": 4.5858823529411765, "grad_norm": 0.5435033440589905, "learning_rate": 1.3358087251818516e-06, "loss": 0.0199, "step": 1949 }, { "epoch": 4.588235294117647, "grad_norm": 0.9395979642868042, "learning_rate": 1.331578703555037e-06, "loss": 0.0384, "step": 1950 }, { "epoch": 4.590588235294118, "grad_norm": 0.47106558084487915, "learning_rate": 1.327354365997778e-06, "loss": 0.0238, "step": 1951 }, { "epoch": 4.592941176470588, "grad_norm": 0.6912057995796204, "learning_rate": 1.3231357190546236e-06, "loss": 0.0347, "step": 1952 }, { "epoch": 4.595294117647059, "grad_norm": 0.5549225807189941, "learning_rate": 1.318922769261303e-06, "loss": 0.0249, "step": 1953 }, { "epoch": 4.597647058823529, "grad_norm": 0.5445212721824646, "learning_rate": 1.3147155231447234e-06, "loss": 0.0261, "step": 1954 }, { "epoch": 4.6, "grad_norm": 0.676288902759552, "learning_rate": 1.3105139872229512e-06, "loss": 0.0327, "step": 1955 }, { "epoch": 4.602352941176471, "grad_norm": 0.6602302193641663, "learning_rate": 1.3063181680052075e-06, "loss": 0.0264, "step": 1956 }, { "epoch": 4.604705882352941, "grad_norm": 0.48593994975090027, "learning_rate": 1.30212807199186e-06, "loss": 0.0291, "step": 1957 }, { "epoch": 4.607058823529412, "grad_norm": 0.44223693013191223, "learning_rate": 1.2979437056744054e-06, "loss": 0.0207, "step": 1958 }, { "epoch": 4.609411764705882, "grad_norm": 0.5054108500480652, "learning_rate": 1.293765075535465e-06, "loss": 0.0208, "step": 1959 }, { "epoch": 4.6117647058823525, "grad_norm": 0.4915538430213928, "learning_rate": 1.2895921880487755e-06, "loss": 0.0312, "step": 1960 }, { "epoch": 4.614117647058824, "grad_norm": 0.6358614563941956, "learning_rate": 1.285425049679175e-06, "loss": 0.0298, "step": 1961 }, { "epoch": 4.616470588235294, "grad_norm": 0.5898299813270569, "learning_rate": 1.2812636668825923e-06, "loss": 0.0301, "step": 1962 }, { "epoch": 4.618823529411765, "grad_norm": 0.6169513463973999, "learning_rate": 1.277108046106046e-06, "loss": 0.0281, "step": 1963 }, { "epoch": 4.621176470588235, "grad_norm": 0.5170131325721741, "learning_rate": 1.2729581937876214e-06, "loss": 0.0211, "step": 1964 }, { "epoch": 4.623529411764705, "grad_norm": 0.5551052689552307, "learning_rate": 1.2688141163564696e-06, "loss": 0.033, "step": 1965 }, { "epoch": 4.625882352941177, "grad_norm": 0.8127591609954834, "learning_rate": 1.2646758202327963e-06, "loss": 0.0164, "step": 1966 }, { "epoch": 4.628235294117647, "grad_norm": 0.5993316173553467, "learning_rate": 1.260543311827848e-06, "loss": 0.0297, "step": 1967 }, { "epoch": 4.630588235294118, "grad_norm": 0.4705202579498291, "learning_rate": 1.256416597543905e-06, "loss": 0.0203, "step": 1968 }, { "epoch": 4.632941176470588, "grad_norm": 0.508584201335907, "learning_rate": 1.2522956837742743e-06, "loss": 0.0193, "step": 1969 }, { "epoch": 4.635294117647058, "grad_norm": 0.6193340420722961, "learning_rate": 1.2481805769032724e-06, "loss": 0.0285, "step": 1970 }, { "epoch": 4.6376470588235295, "grad_norm": 0.577470600605011, "learning_rate": 1.244071283306221e-06, "loss": 0.0261, "step": 1971 }, { "epoch": 4.64, "grad_norm": 0.6645365357398987, "learning_rate": 1.2399678093494343e-06, "loss": 0.0254, "step": 1972 }, { "epoch": 4.642352941176471, "grad_norm": 0.611129105091095, "learning_rate": 1.2358701613902134e-06, "loss": 0.029, "step": 1973 }, { "epoch": 4.644705882352941, "grad_norm": 0.5139645934104919, "learning_rate": 1.231778345776835e-06, "loss": 0.0264, "step": 1974 }, { "epoch": 4.647058823529412, "grad_norm": 0.6887354254722595, "learning_rate": 1.22769236884853e-06, "loss": 0.0334, "step": 1975 }, { "epoch": 4.649411764705882, "grad_norm": 0.6671368479728699, "learning_rate": 1.2236122369354943e-06, "loss": 0.0349, "step": 1976 }, { "epoch": 4.651764705882353, "grad_norm": 0.6127507090568542, "learning_rate": 1.2195379563588675e-06, "loss": 0.0385, "step": 1977 }, { "epoch": 4.654117647058824, "grad_norm": 0.6465680003166199, "learning_rate": 1.2154695334307158e-06, "loss": 0.0353, "step": 1978 }, { "epoch": 4.656470588235294, "grad_norm": 0.7277697324752808, "learning_rate": 1.211406974454038e-06, "loss": 0.0194, "step": 1979 }, { "epoch": 4.658823529411765, "grad_norm": 0.590830385684967, "learning_rate": 1.2073502857227485e-06, "loss": 0.028, "step": 1980 }, { "epoch": 4.661176470588235, "grad_norm": 0.6606682538986206, "learning_rate": 1.203299473521663e-06, "loss": 0.0334, "step": 1981 }, { "epoch": 4.663529411764706, "grad_norm": 0.7053754329681396, "learning_rate": 1.1992545441264946e-06, "loss": 0.0317, "step": 1982 }, { "epoch": 4.665882352941177, "grad_norm": 0.5804036259651184, "learning_rate": 1.195215503803842e-06, "loss": 0.0218, "step": 1983 }, { "epoch": 4.668235294117647, "grad_norm": 0.5991600751876831, "learning_rate": 1.1911823588111826e-06, "loss": 0.0322, "step": 1984 }, { "epoch": 4.670588235294118, "grad_norm": 0.6972777843475342, "learning_rate": 1.1871551153968586e-06, "loss": 0.0221, "step": 1985 }, { "epoch": 4.672941176470588, "grad_norm": 0.6447551846504211, "learning_rate": 1.1831337798000677e-06, "loss": 0.031, "step": 1986 }, { "epoch": 4.675294117647059, "grad_norm": 0.5735760927200317, "learning_rate": 1.179118358250859e-06, "loss": 0.0239, "step": 1987 }, { "epoch": 4.6776470588235295, "grad_norm": 0.6017490029335022, "learning_rate": 1.175108856970116e-06, "loss": 0.0303, "step": 1988 }, { "epoch": 4.68, "grad_norm": 0.4663926959037781, "learning_rate": 1.1711052821695493e-06, "loss": 0.0223, "step": 1989 }, { "epoch": 4.682352941176471, "grad_norm": 0.5765492916107178, "learning_rate": 1.1671076400516936e-06, "loss": 0.0248, "step": 1990 }, { "epoch": 4.684705882352941, "grad_norm": 0.6318277716636658, "learning_rate": 1.1631159368098865e-06, "loss": 0.0237, "step": 1991 }, { "epoch": 4.687058823529412, "grad_norm": 0.6639541983604431, "learning_rate": 1.1591301786282651e-06, "loss": 0.0246, "step": 1992 }, { "epoch": 4.689411764705882, "grad_norm": 0.5595539212226868, "learning_rate": 1.1551503716817612e-06, "loss": 0.0239, "step": 1993 }, { "epoch": 4.691764705882353, "grad_norm": 0.673437237739563, "learning_rate": 1.151176522136082e-06, "loss": 0.038, "step": 1994 }, { "epoch": 4.694117647058824, "grad_norm": 0.7285794615745544, "learning_rate": 1.147208636147705e-06, "loss": 0.0255, "step": 1995 }, { "epoch": 4.696470588235294, "grad_norm": 0.576962947845459, "learning_rate": 1.1432467198638731e-06, "loss": 0.0279, "step": 1996 }, { "epoch": 4.698823529411765, "grad_norm": 0.5219523310661316, "learning_rate": 1.1392907794225766e-06, "loss": 0.0284, "step": 1997 }, { "epoch": 4.701176470588235, "grad_norm": 0.574146032333374, "learning_rate": 1.1353408209525474e-06, "loss": 0.0134, "step": 1998 }, { "epoch": 4.7035294117647055, "grad_norm": 0.6474567651748657, "learning_rate": 1.1313968505732544e-06, "loss": 0.0301, "step": 1999 }, { "epoch": 4.705882352941177, "grad_norm": 0.66566002368927, "learning_rate": 1.1274588743948826e-06, "loss": 0.031, "step": 2000 }, { "epoch": 4.705882352941177, "eval_loss": 0.24587702751159668, "eval_runtime": 3.5639, "eval_samples_per_second": 31.146, "eval_steps_per_second": 1.122, "step": 2000 } ], "logging_steps": 1.0, "max_steps": 2550, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 200, "total_flos": 5.005290770278646e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }