{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 200, "global_step": 922, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021691973969631237, "grad_norm": 0.18814353593860253, "learning_rate": 1.0752688172043011e-06, "loss": 0.7804, "step": 1 }, { "epoch": 0.004338394793926247, "grad_norm": 0.14783898318898897, "learning_rate": 2.1505376344086023e-06, "loss": 0.6624, "step": 2 }, { "epoch": 0.006507592190889371, "grad_norm": 0.19503814642198758, "learning_rate": 3.225806451612903e-06, "loss": 0.7092, "step": 3 }, { "epoch": 0.008676789587852495, "grad_norm": 0.1618902610161204, "learning_rate": 4.3010752688172045e-06, "loss": 0.7077, "step": 4 }, { "epoch": 0.010845986984815618, "grad_norm": 0.12013865404820073, "learning_rate": 5.376344086021506e-06, "loss": 0.6043, "step": 5 }, { "epoch": 0.013015184381778741, "grad_norm": 0.1374084610184008, "learning_rate": 6.451612903225806e-06, "loss": 0.6217, "step": 6 }, { "epoch": 0.015184381778741865, "grad_norm": 0.14626420092970213, "learning_rate": 7.526881720430108e-06, "loss": 0.6813, "step": 7 }, { "epoch": 0.01735357917570499, "grad_norm": 0.13817440123812896, "learning_rate": 8.602150537634409e-06, "loss": 0.5804, "step": 8 }, { "epoch": 0.019522776572668113, "grad_norm": 0.15287513050585858, "learning_rate": 9.67741935483871e-06, "loss": 0.623, "step": 9 }, { "epoch": 0.021691973969631236, "grad_norm": 0.1598803439887024, "learning_rate": 1.0752688172043012e-05, "loss": 0.6743, "step": 10 }, { "epoch": 0.02386117136659436, "grad_norm": 0.19385413161643053, "learning_rate": 1.1827956989247313e-05, "loss": 0.7609, "step": 11 }, { "epoch": 0.026030368763557483, "grad_norm": 0.22754758020045657, "learning_rate": 1.2903225806451613e-05, "loss": 0.6324, "step": 12 }, { "epoch": 0.028199566160520606, "grad_norm": 0.17059715163003164, "learning_rate": 1.3978494623655914e-05, "loss": 0.6333, "step": 13 }, { "epoch": 0.03036876355748373, "grad_norm": 0.18873401138092946, "learning_rate": 1.5053763440860215e-05, "loss": 0.7387, "step": 14 }, { "epoch": 0.03253796095444685, "grad_norm": 0.18496670943100807, "learning_rate": 1.6129032258064517e-05, "loss": 0.6586, "step": 15 }, { "epoch": 0.03470715835140998, "grad_norm": 0.17005766008579865, "learning_rate": 1.7204301075268818e-05, "loss": 0.6018, "step": 16 }, { "epoch": 0.0368763557483731, "grad_norm": 0.19850157253361694, "learning_rate": 1.827956989247312e-05, "loss": 0.7417, "step": 17 }, { "epoch": 0.039045553145336226, "grad_norm": 0.16224315084212368, "learning_rate": 1.935483870967742e-05, "loss": 0.4922, "step": 18 }, { "epoch": 0.04121475054229935, "grad_norm": 0.17598848375010048, "learning_rate": 2.0430107526881722e-05, "loss": 0.6445, "step": 19 }, { "epoch": 0.04338394793926247, "grad_norm": 0.18904672614562715, "learning_rate": 2.1505376344086024e-05, "loss": 0.6504, "step": 20 }, { "epoch": 0.0455531453362256, "grad_norm": 0.17265224349161137, "learning_rate": 2.258064516129032e-05, "loss": 0.6245, "step": 21 }, { "epoch": 0.04772234273318872, "grad_norm": 0.2017634897229929, "learning_rate": 2.3655913978494626e-05, "loss": 0.6795, "step": 22 }, { "epoch": 0.049891540130151846, "grad_norm": 0.18337107237337277, "learning_rate": 2.4731182795698928e-05, "loss": 0.5461, "step": 23 }, { "epoch": 0.052060737527114966, "grad_norm": 0.1819879312384368, "learning_rate": 2.5806451612903226e-05, "loss": 0.6217, "step": 24 }, { "epoch": 0.05422993492407809, "grad_norm": 0.1796615077383316, "learning_rate": 2.6881720430107527e-05, "loss": 0.6185, "step": 25 }, { "epoch": 0.05639913232104121, "grad_norm": 0.154004978022556, "learning_rate": 2.7956989247311828e-05, "loss": 0.5814, "step": 26 }, { "epoch": 0.05856832971800434, "grad_norm": 0.15691846371505647, "learning_rate": 2.9032258064516133e-05, "loss": 0.5873, "step": 27 }, { "epoch": 0.06073752711496746, "grad_norm": 0.13885516449257118, "learning_rate": 3.010752688172043e-05, "loss": 0.5345, "step": 28 }, { "epoch": 0.06290672451193059, "grad_norm": 0.15573868807503372, "learning_rate": 3.118279569892473e-05, "loss": 0.5894, "step": 29 }, { "epoch": 0.0650759219088937, "grad_norm": 0.14860751998326854, "learning_rate": 3.2258064516129034e-05, "loss": 0.5983, "step": 30 }, { "epoch": 0.06724511930585683, "grad_norm": 0.12573553934841997, "learning_rate": 3.3333333333333335e-05, "loss": 0.5542, "step": 31 }, { "epoch": 0.06941431670281996, "grad_norm": 0.12949334468283782, "learning_rate": 3.4408602150537636e-05, "loss": 0.5369, "step": 32 }, { "epoch": 0.07158351409978309, "grad_norm": 0.1541384187233925, "learning_rate": 3.548387096774194e-05, "loss": 0.6545, "step": 33 }, { "epoch": 0.0737527114967462, "grad_norm": 0.1342436339209465, "learning_rate": 3.655913978494624e-05, "loss": 0.5753, "step": 34 }, { "epoch": 0.07592190889370933, "grad_norm": 0.13687890281351986, "learning_rate": 3.763440860215054e-05, "loss": 0.5251, "step": 35 }, { "epoch": 0.07809110629067245, "grad_norm": 0.14224847495035575, "learning_rate": 3.870967741935484e-05, "loss": 0.5494, "step": 36 }, { "epoch": 0.08026030368763558, "grad_norm": 0.1368222255571023, "learning_rate": 3.978494623655914e-05, "loss": 0.5793, "step": 37 }, { "epoch": 0.0824295010845987, "grad_norm": 0.14032242415063437, "learning_rate": 4.0860215053763444e-05, "loss": 0.5738, "step": 38 }, { "epoch": 0.08459869848156182, "grad_norm": 0.1077744610907224, "learning_rate": 4.1935483870967746e-05, "loss": 0.4712, "step": 39 }, { "epoch": 0.08676789587852494, "grad_norm": 0.1269827092811396, "learning_rate": 4.301075268817205e-05, "loss": 0.5071, "step": 40 }, { "epoch": 0.08893709327548807, "grad_norm": 0.15098582104711297, "learning_rate": 4.408602150537635e-05, "loss": 0.5202, "step": 41 }, { "epoch": 0.0911062906724512, "grad_norm": 0.13316194280918697, "learning_rate": 4.516129032258064e-05, "loss": 0.4469, "step": 42 }, { "epoch": 0.09327548806941431, "grad_norm": 0.13385436180609817, "learning_rate": 4.6236559139784944e-05, "loss": 0.4936, "step": 43 }, { "epoch": 0.09544468546637744, "grad_norm": 0.1456999546987616, "learning_rate": 4.731182795698925e-05, "loss": 0.4559, "step": 44 }, { "epoch": 0.09761388286334056, "grad_norm": 0.13569281275379386, "learning_rate": 4.8387096774193554e-05, "loss": 0.4314, "step": 45 }, { "epoch": 0.09978308026030369, "grad_norm": 0.15212176484932607, "learning_rate": 4.9462365591397855e-05, "loss": 0.4876, "step": 46 }, { "epoch": 0.1019522776572668, "grad_norm": 0.12839200001749063, "learning_rate": 5.053763440860215e-05, "loss": 0.4748, "step": 47 }, { "epoch": 0.10412147505422993, "grad_norm": 0.12862351761485963, "learning_rate": 5.161290322580645e-05, "loss": 0.4125, "step": 48 }, { "epoch": 0.10629067245119306, "grad_norm": 0.12676342192056295, "learning_rate": 5.268817204301075e-05, "loss": 0.425, "step": 49 }, { "epoch": 0.10845986984815618, "grad_norm": 0.12433212919621899, "learning_rate": 5.3763440860215054e-05, "loss": 0.4082, "step": 50 }, { "epoch": 0.11062906724511931, "grad_norm": 0.13762864881467574, "learning_rate": 5.4838709677419355e-05, "loss": 0.4342, "step": 51 }, { "epoch": 0.11279826464208242, "grad_norm": 0.16375581387185012, "learning_rate": 5.5913978494623656e-05, "loss": 0.5276, "step": 52 }, { "epoch": 0.11496746203904555, "grad_norm": 0.14903663671440864, "learning_rate": 5.6989247311827965e-05, "loss": 0.4966, "step": 53 }, { "epoch": 0.11713665943600868, "grad_norm": 0.13085616750795798, "learning_rate": 5.8064516129032266e-05, "loss": 0.4177, "step": 54 }, { "epoch": 0.1193058568329718, "grad_norm": 0.1399410088881321, "learning_rate": 5.913978494623657e-05, "loss": 0.4759, "step": 55 }, { "epoch": 0.12147505422993492, "grad_norm": 0.17552603285175208, "learning_rate": 6.021505376344086e-05, "loss": 0.3852, "step": 56 }, { "epoch": 0.12364425162689804, "grad_norm": 0.150563045551466, "learning_rate": 6.129032258064517e-05, "loss": 0.4456, "step": 57 }, { "epoch": 0.12581344902386118, "grad_norm": 0.15103853247096755, "learning_rate": 6.236559139784946e-05, "loss": 0.486, "step": 58 }, { "epoch": 0.1279826464208243, "grad_norm": 0.13897437214670214, "learning_rate": 6.344086021505376e-05, "loss": 0.393, "step": 59 }, { "epoch": 0.1301518438177874, "grad_norm": 0.13998124057548186, "learning_rate": 6.451612903225807e-05, "loss": 0.3703, "step": 60 }, { "epoch": 0.13232104121475055, "grad_norm": 0.14910024036690483, "learning_rate": 6.559139784946236e-05, "loss": 0.4163, "step": 61 }, { "epoch": 0.13449023861171366, "grad_norm": 0.1426571111554546, "learning_rate": 6.666666666666667e-05, "loss": 0.3021, "step": 62 }, { "epoch": 0.13665943600867678, "grad_norm": 0.2255435027702441, "learning_rate": 6.774193548387096e-05, "loss": 0.348, "step": 63 }, { "epoch": 0.13882863340563992, "grad_norm": 0.15326407709554848, "learning_rate": 6.881720430107527e-05, "loss": 0.4446, "step": 64 }, { "epoch": 0.14099783080260303, "grad_norm": 0.15096250596140368, "learning_rate": 6.989247311827958e-05, "loss": 0.4356, "step": 65 }, { "epoch": 0.14316702819956617, "grad_norm": 0.14931162214298302, "learning_rate": 7.096774193548388e-05, "loss": 0.3921, "step": 66 }, { "epoch": 0.14533622559652928, "grad_norm": 0.13357955043603964, "learning_rate": 7.204301075268818e-05, "loss": 0.285, "step": 67 }, { "epoch": 0.1475054229934924, "grad_norm": 0.14467881960033757, "learning_rate": 7.311827956989248e-05, "loss": 0.3828, "step": 68 }, { "epoch": 0.14967462039045554, "grad_norm": 0.13526505831717678, "learning_rate": 7.419354838709677e-05, "loss": 0.3523, "step": 69 }, { "epoch": 0.15184381778741865, "grad_norm": 0.17267391484274627, "learning_rate": 7.526881720430108e-05, "loss": 0.4506, "step": 70 }, { "epoch": 0.1540130151843818, "grad_norm": 0.15979585190145928, "learning_rate": 7.634408602150538e-05, "loss": 0.3862, "step": 71 }, { "epoch": 0.1561822125813449, "grad_norm": 0.15434731006679933, "learning_rate": 7.741935483870968e-05, "loss": 0.3694, "step": 72 }, { "epoch": 0.15835140997830802, "grad_norm": 0.14815314471149627, "learning_rate": 7.849462365591398e-05, "loss": 0.3644, "step": 73 }, { "epoch": 0.16052060737527116, "grad_norm": 0.15387792339587666, "learning_rate": 7.956989247311829e-05, "loss": 0.4095, "step": 74 }, { "epoch": 0.16268980477223427, "grad_norm": 0.15135351623798715, "learning_rate": 8.064516129032258e-05, "loss": 0.4128, "step": 75 }, { "epoch": 0.1648590021691974, "grad_norm": 0.13006929468252665, "learning_rate": 8.172043010752689e-05, "loss": 0.2786, "step": 76 }, { "epoch": 0.16702819956616052, "grad_norm": 0.17289441324627974, "learning_rate": 8.27956989247312e-05, "loss": 0.4148, "step": 77 }, { "epoch": 0.16919739696312364, "grad_norm": 0.15172742838256395, "learning_rate": 8.387096774193549e-05, "loss": 0.3758, "step": 78 }, { "epoch": 0.17136659436008678, "grad_norm": 0.16805860911767084, "learning_rate": 8.494623655913979e-05, "loss": 0.3755, "step": 79 }, { "epoch": 0.1735357917570499, "grad_norm": 0.13731854156086779, "learning_rate": 8.60215053763441e-05, "loss": 0.3031, "step": 80 }, { "epoch": 0.175704989154013, "grad_norm": 0.15898088745954764, "learning_rate": 8.709677419354839e-05, "loss": 0.3785, "step": 81 }, { "epoch": 0.17787418655097614, "grad_norm": 0.14869088820217116, "learning_rate": 8.81720430107527e-05, "loss": 0.3831, "step": 82 }, { "epoch": 0.18004338394793926, "grad_norm": 0.15285942249523343, "learning_rate": 8.924731182795699e-05, "loss": 0.3302, "step": 83 }, { "epoch": 0.1822125813449024, "grad_norm": 0.15925128432243588, "learning_rate": 9.032258064516129e-05, "loss": 0.389, "step": 84 }, { "epoch": 0.1843817787418655, "grad_norm": 0.1538607349870438, "learning_rate": 9.13978494623656e-05, "loss": 0.366, "step": 85 }, { "epoch": 0.18655097613882862, "grad_norm": 0.16016055121309444, "learning_rate": 9.247311827956989e-05, "loss": 0.3554, "step": 86 }, { "epoch": 0.18872017353579176, "grad_norm": 0.14283942143425307, "learning_rate": 9.35483870967742e-05, "loss": 0.3336, "step": 87 }, { "epoch": 0.19088937093275488, "grad_norm": 0.16354104148599777, "learning_rate": 9.46236559139785e-05, "loss": 0.4439, "step": 88 }, { "epoch": 0.19305856832971802, "grad_norm": 0.18296179182683667, "learning_rate": 9.56989247311828e-05, "loss": 0.4064, "step": 89 }, { "epoch": 0.19522776572668113, "grad_norm": 0.15460229153045282, "learning_rate": 9.677419354838711e-05, "loss": 0.3407, "step": 90 }, { "epoch": 0.19739696312364424, "grad_norm": 0.15215604875393682, "learning_rate": 9.78494623655914e-05, "loss": 0.3553, "step": 91 }, { "epoch": 0.19956616052060738, "grad_norm": 0.1597213335615803, "learning_rate": 9.892473118279571e-05, "loss": 0.3458, "step": 92 }, { "epoch": 0.2017353579175705, "grad_norm": 0.15328246596239517, "learning_rate": 0.0001, "loss": 0.342, "step": 93 }, { "epoch": 0.2039045553145336, "grad_norm": 0.15921412490863757, "learning_rate": 9.999964097046781e-05, "loss": 0.3843, "step": 94 }, { "epoch": 0.20607375271149675, "grad_norm": 0.15390306970937798, "learning_rate": 9.999856388702731e-05, "loss": 0.3442, "step": 95 }, { "epoch": 0.20824295010845986, "grad_norm": 0.15618794488353185, "learning_rate": 9.999676876514667e-05, "loss": 0.3396, "step": 96 }, { "epoch": 0.210412147505423, "grad_norm": 0.13701461086257688, "learning_rate": 9.999425563060602e-05, "loss": 0.2914, "step": 97 }, { "epoch": 0.21258134490238612, "grad_norm": 0.17727970752759084, "learning_rate": 9.999102451949688e-05, "loss": 0.3815, "step": 98 }, { "epoch": 0.21475054229934923, "grad_norm": 0.17606273323635682, "learning_rate": 9.998707547822186e-05, "loss": 0.377, "step": 99 }, { "epoch": 0.21691973969631237, "grad_norm": 0.16285779873289172, "learning_rate": 9.998240856349383e-05, "loss": 0.3221, "step": 100 }, { "epoch": 0.21908893709327548, "grad_norm": 0.17385851069798777, "learning_rate": 9.997702384233523e-05, "loss": 0.3539, "step": 101 }, { "epoch": 0.22125813449023862, "grad_norm": 0.15796293985116644, "learning_rate": 9.9970921392077e-05, "loss": 0.3407, "step": 102 }, { "epoch": 0.22342733188720174, "grad_norm": 0.16210590012135875, "learning_rate": 9.996410130035751e-05, "loss": 0.3433, "step": 103 }, { "epoch": 0.22559652928416485, "grad_norm": 0.1815484690495641, "learning_rate": 9.995656366512138e-05, "loss": 0.3576, "step": 104 }, { "epoch": 0.227765726681128, "grad_norm": 0.16370570821991282, "learning_rate": 9.994830859461793e-05, "loss": 0.3713, "step": 105 }, { "epoch": 0.2299349240780911, "grad_norm": 0.16719534192377292, "learning_rate": 9.993933620739974e-05, "loss": 0.3868, "step": 106 }, { "epoch": 0.23210412147505424, "grad_norm": 0.19714493676306685, "learning_rate": 9.992964663232086e-05, "loss": 0.3378, "step": 107 }, { "epoch": 0.23427331887201736, "grad_norm": 0.14799736868617666, "learning_rate": 9.991924000853505e-05, "loss": 0.3252, "step": 108 }, { "epoch": 0.23644251626898047, "grad_norm": 0.15459646771807045, "learning_rate": 9.990811648549374e-05, "loss": 0.349, "step": 109 }, { "epoch": 0.2386117136659436, "grad_norm": 0.15814773704274712, "learning_rate": 9.989627622294384e-05, "loss": 0.3459, "step": 110 }, { "epoch": 0.24078091106290672, "grad_norm": 0.1573452355191302, "learning_rate": 9.988371939092551e-05, "loss": 0.3436, "step": 111 }, { "epoch": 0.24295010845986983, "grad_norm": 0.17857552638412025, "learning_rate": 9.987044616976969e-05, "loss": 0.3515, "step": 112 }, { "epoch": 0.24511930585683298, "grad_norm": 0.17548675076361434, "learning_rate": 9.985645675009551e-05, "loss": 0.4118, "step": 113 }, { "epoch": 0.2472885032537961, "grad_norm": 0.16436582516347584, "learning_rate": 9.984175133280758e-05, "loss": 0.3623, "step": 114 }, { "epoch": 0.24945770065075923, "grad_norm": 0.14320940017640504, "learning_rate": 9.982633012909304e-05, "loss": 0.296, "step": 115 }, { "epoch": 0.25162689804772237, "grad_norm": 0.16287542910761935, "learning_rate": 9.981019336041861e-05, "loss": 0.296, "step": 116 }, { "epoch": 0.25379609544468545, "grad_norm": 0.16861388898681715, "learning_rate": 9.979334125852735e-05, "loss": 0.337, "step": 117 }, { "epoch": 0.2559652928416486, "grad_norm": 0.15898794759349474, "learning_rate": 9.977577406543535e-05, "loss": 0.3329, "step": 118 }, { "epoch": 0.25813449023861174, "grad_norm": 0.1734384500805255, "learning_rate": 9.975749203342823e-05, "loss": 0.3463, "step": 119 }, { "epoch": 0.2603036876355748, "grad_norm": 0.1687288542407359, "learning_rate": 9.97384954250576e-05, "loss": 0.3716, "step": 120 }, { "epoch": 0.26247288503253796, "grad_norm": 0.15571310481512463, "learning_rate": 9.971878451313719e-05, "loss": 0.3174, "step": 121 }, { "epoch": 0.2646420824295011, "grad_norm": 0.1519598125985657, "learning_rate": 9.969835958073897e-05, "loss": 0.2869, "step": 122 }, { "epoch": 0.2668112798264642, "grad_norm": 0.1631777318786355, "learning_rate": 9.967722092118909e-05, "loss": 0.3269, "step": 123 }, { "epoch": 0.26898047722342733, "grad_norm": 0.15802300841876601, "learning_rate": 9.965536883806368e-05, "loss": 0.3369, "step": 124 }, { "epoch": 0.27114967462039047, "grad_norm": 0.15777870559434587, "learning_rate": 9.963280364518448e-05, "loss": 0.3434, "step": 125 }, { "epoch": 0.27331887201735355, "grad_norm": 0.14460302054911775, "learning_rate": 9.96095256666143e-05, "loss": 0.2965, "step": 126 }, { "epoch": 0.2754880694143167, "grad_norm": 0.17074141051395292, "learning_rate": 9.958553523665242e-05, "loss": 0.3627, "step": 127 }, { "epoch": 0.27765726681127983, "grad_norm": 0.16289437443944546, "learning_rate": 9.956083269982973e-05, "loss": 0.3269, "step": 128 }, { "epoch": 0.279826464208243, "grad_norm": 0.19184991332619222, "learning_rate": 9.953541841090388e-05, "loss": 0.3729, "step": 129 }, { "epoch": 0.28199566160520606, "grad_norm": 0.14220941041693624, "learning_rate": 9.950929273485404e-05, "loss": 0.3038, "step": 130 }, { "epoch": 0.2841648590021692, "grad_norm": 0.15132399774722008, "learning_rate": 9.948245604687581e-05, "loss": 0.3162, "step": 131 }, { "epoch": 0.28633405639913234, "grad_norm": 0.1567696241360261, "learning_rate": 9.945490873237571e-05, "loss": 0.3041, "step": 132 }, { "epoch": 0.2885032537960954, "grad_norm": 0.14672088173624798, "learning_rate": 9.942665118696575e-05, "loss": 0.2985, "step": 133 }, { "epoch": 0.29067245119305857, "grad_norm": 0.1769806338719645, "learning_rate": 9.939768381645762e-05, "loss": 0.3648, "step": 134 }, { "epoch": 0.2928416485900217, "grad_norm": 0.1600455826423261, "learning_rate": 9.9368007036857e-05, "loss": 0.3428, "step": 135 }, { "epoch": 0.2950108459869848, "grad_norm": 0.15482109589656579, "learning_rate": 9.933762127435751e-05, "loss": 0.3211, "step": 136 }, { "epoch": 0.29718004338394793, "grad_norm": 0.1376974659275616, "learning_rate": 9.930652696533458e-05, "loss": 0.2655, "step": 137 }, { "epoch": 0.2993492407809111, "grad_norm": 0.1604514505800568, "learning_rate": 9.927472455633921e-05, "loss": 0.3424, "step": 138 }, { "epoch": 0.30151843817787416, "grad_norm": 0.14831623076631825, "learning_rate": 9.92422145040916e-05, "loss": 0.2929, "step": 139 }, { "epoch": 0.3036876355748373, "grad_norm": 0.14497199913662426, "learning_rate": 9.920899727547446e-05, "loss": 0.3063, "step": 140 }, { "epoch": 0.30585683297180044, "grad_norm": 0.1686161432311634, "learning_rate": 9.917507334752647e-05, "loss": 0.3695, "step": 141 }, { "epoch": 0.3080260303687636, "grad_norm": 0.1415843035366969, "learning_rate": 9.914044320743527e-05, "loss": 0.2989, "step": 142 }, { "epoch": 0.31019522776572667, "grad_norm": 0.1537154117560618, "learning_rate": 9.91051073525306e-05, "loss": 0.323, "step": 143 }, { "epoch": 0.3123644251626898, "grad_norm": 0.19421465105677843, "learning_rate": 9.90690662902771e-05, "loss": 0.3331, "step": 144 }, { "epoch": 0.31453362255965295, "grad_norm": 0.15822998528244542, "learning_rate": 9.903232053826696e-05, "loss": 0.2934, "step": 145 }, { "epoch": 0.31670281995661603, "grad_norm": 0.15358991548479717, "learning_rate": 9.89948706242126e-05, "loss": 0.3206, "step": 146 }, { "epoch": 0.3188720173535792, "grad_norm": 0.14959200755836038, "learning_rate": 9.895671708593903e-05, "loss": 0.3114, "step": 147 }, { "epoch": 0.3210412147505423, "grad_norm": 0.16473377943898643, "learning_rate": 9.891786047137615e-05, "loss": 0.3312, "step": 148 }, { "epoch": 0.3232104121475054, "grad_norm": 0.1648174790366256, "learning_rate": 9.887830133855079e-05, "loss": 0.3597, "step": 149 }, { "epoch": 0.32537960954446854, "grad_norm": 0.1887880278076816, "learning_rate": 9.883804025557888e-05, "loss": 0.3968, "step": 150 }, { "epoch": 0.3275488069414317, "grad_norm": 0.17273118800696421, "learning_rate": 9.879707780065712e-05, "loss": 0.3731, "step": 151 }, { "epoch": 0.3297180043383948, "grad_norm": 0.16253972479744438, "learning_rate": 9.875541456205473e-05, "loss": 0.3408, "step": 152 }, { "epoch": 0.3318872017353579, "grad_norm": 0.1583694488182022, "learning_rate": 9.871305113810505e-05, "loss": 0.3121, "step": 153 }, { "epoch": 0.33405639913232105, "grad_norm": 0.16161186391084162, "learning_rate": 9.86699881371969e-05, "loss": 0.3331, "step": 154 }, { "epoch": 0.3362255965292842, "grad_norm": 0.15558997399940616, "learning_rate": 9.862622617776582e-05, "loss": 0.3136, "step": 155 }, { "epoch": 0.3383947939262473, "grad_norm": 0.14717176320577952, "learning_rate": 9.858176588828526e-05, "loss": 0.3227, "step": 156 }, { "epoch": 0.3405639913232104, "grad_norm": 0.15604482540053785, "learning_rate": 9.85366079072575e-05, "loss": 0.3093, "step": 157 }, { "epoch": 0.34273318872017355, "grad_norm": 0.164872478623103, "learning_rate": 9.849075288320446e-05, "loss": 0.3231, "step": 158 }, { "epoch": 0.34490238611713664, "grad_norm": 0.18011689706998146, "learning_rate": 9.84442014746585e-05, "loss": 0.3571, "step": 159 }, { "epoch": 0.3470715835140998, "grad_norm": 0.2281829326423908, "learning_rate": 9.839695435015279e-05, "loss": 0.4504, "step": 160 }, { "epoch": 0.3492407809110629, "grad_norm": 0.17155618609511838, "learning_rate": 9.83490121882119e-05, "loss": 0.3318, "step": 161 }, { "epoch": 0.351409978308026, "grad_norm": 0.1451699212552566, "learning_rate": 9.830037567734187e-05, "loss": 0.27, "step": 162 }, { "epoch": 0.35357917570498915, "grad_norm": 0.16172715854603054, "learning_rate": 9.825104551602047e-05, "loss": 0.2993, "step": 163 }, { "epoch": 0.3557483731019523, "grad_norm": 0.16625581355250432, "learning_rate": 9.820102241268708e-05, "loss": 0.3447, "step": 164 }, { "epoch": 0.3579175704989154, "grad_norm": 0.17497945522123493, "learning_rate": 9.815030708573256e-05, "loss": 0.3604, "step": 165 }, { "epoch": 0.3600867678958785, "grad_norm": 0.16549978577941174, "learning_rate": 9.809890026348891e-05, "loss": 0.3325, "step": 166 }, { "epoch": 0.36225596529284165, "grad_norm": 0.15123204350737743, "learning_rate": 9.804680268421885e-05, "loss": 0.3047, "step": 167 }, { "epoch": 0.3644251626898048, "grad_norm": 0.1653117817038064, "learning_rate": 9.799401509610511e-05, "loss": 0.3336, "step": 168 }, { "epoch": 0.3665943600867679, "grad_norm": 0.15941088150160534, "learning_rate": 9.794053825723983e-05, "loss": 0.3415, "step": 169 }, { "epoch": 0.368763557483731, "grad_norm": 0.1569489292887316, "learning_rate": 9.788637293561363e-05, "loss": 0.3086, "step": 170 }, { "epoch": 0.37093275488069416, "grad_norm": 0.15521077438298633, "learning_rate": 9.783151990910446e-05, "loss": 0.3129, "step": 171 }, { "epoch": 0.37310195227765725, "grad_norm": 0.15853168180990132, "learning_rate": 9.777597996546661e-05, "loss": 0.3197, "step": 172 }, { "epoch": 0.3752711496746204, "grad_norm": 0.17021980724333097, "learning_rate": 9.771975390231927e-05, "loss": 0.3438, "step": 173 }, { "epoch": 0.3774403470715835, "grad_norm": 0.1467658403488705, "learning_rate": 9.766284252713511e-05, "loss": 0.2868, "step": 174 }, { "epoch": 0.3796095444685466, "grad_norm": 0.19325560102675848, "learning_rate": 9.760524665722874e-05, "loss": 0.4445, "step": 175 }, { "epoch": 0.38177874186550975, "grad_norm": 0.1676049985924069, "learning_rate": 9.754696711974486e-05, "loss": 0.3596, "step": 176 }, { "epoch": 0.3839479392624729, "grad_norm": 0.1517046172036335, "learning_rate": 9.748800475164648e-05, "loss": 0.2814, "step": 177 }, { "epoch": 0.38611713665943603, "grad_norm": 0.1499206779996939, "learning_rate": 9.742836039970287e-05, "loss": 0.3078, "step": 178 }, { "epoch": 0.3882863340563991, "grad_norm": 0.16835317016600657, "learning_rate": 9.736803492047736e-05, "loss": 0.346, "step": 179 }, { "epoch": 0.39045553145336226, "grad_norm": 0.1735161690202731, "learning_rate": 9.730702918031511e-05, "loss": 0.3339, "step": 180 }, { "epoch": 0.3926247288503254, "grad_norm": 0.178265268230418, "learning_rate": 9.724534405533061e-05, "loss": 0.3581, "step": 181 }, { "epoch": 0.3947939262472885, "grad_norm": 0.15928651246621914, "learning_rate": 9.718298043139513e-05, "loss": 0.3158, "step": 182 }, { "epoch": 0.3969631236442516, "grad_norm": 0.15680437561632446, "learning_rate": 9.711993920412395e-05, "loss": 0.3612, "step": 183 }, { "epoch": 0.39913232104121477, "grad_norm": 0.16147834287905172, "learning_rate": 9.70562212788636e-05, "loss": 0.3213, "step": 184 }, { "epoch": 0.40130151843817785, "grad_norm": 0.16810458922445323, "learning_rate": 9.699182757067875e-05, "loss": 0.3715, "step": 185 }, { "epoch": 0.403470715835141, "grad_norm": 0.16273652493527896, "learning_rate": 9.69267590043391e-05, "loss": 0.2901, "step": 186 }, { "epoch": 0.40563991323210413, "grad_norm": 0.17226018154021203, "learning_rate": 9.686101651430612e-05, "loss": 0.3774, "step": 187 }, { "epoch": 0.4078091106290672, "grad_norm": 0.18016919749286983, "learning_rate": 9.679460104471965e-05, "loss": 0.3602, "step": 188 }, { "epoch": 0.40997830802603036, "grad_norm": 0.15507719486138638, "learning_rate": 9.672751354938429e-05, "loss": 0.3129, "step": 189 }, { "epoch": 0.4121475054229935, "grad_norm": 0.1582451132361843, "learning_rate": 9.66597549917557e-05, "loss": 0.3183, "step": 190 }, { "epoch": 0.41431670281995664, "grad_norm": 0.14849683648502013, "learning_rate": 9.659132634492684e-05, "loss": 0.3103, "step": 191 }, { "epoch": 0.4164859002169197, "grad_norm": 0.24758668306536105, "learning_rate": 9.652222859161388e-05, "loss": 0.3157, "step": 192 }, { "epoch": 0.41865509761388287, "grad_norm": 0.17938378050442594, "learning_rate": 9.645246272414221e-05, "loss": 0.3715, "step": 193 }, { "epoch": 0.420824295010846, "grad_norm": 0.14292889306164755, "learning_rate": 9.63820297444321e-05, "loss": 0.281, "step": 194 }, { "epoch": 0.4229934924078091, "grad_norm": 0.15121757619907555, "learning_rate": 9.63109306639843e-05, "loss": 0.285, "step": 195 }, { "epoch": 0.42516268980477223, "grad_norm": 0.15479911159606385, "learning_rate": 9.623916650386564e-05, "loss": 0.333, "step": 196 }, { "epoch": 0.42733188720173537, "grad_norm": 0.19245481852848167, "learning_rate": 9.61667382946942e-05, "loss": 0.3803, "step": 197 }, { "epoch": 0.42950108459869846, "grad_norm": 0.16251333268925855, "learning_rate": 9.609364707662467e-05, "loss": 0.3326, "step": 198 }, { "epoch": 0.4316702819956616, "grad_norm": 0.15843432631003965, "learning_rate": 9.601989389933323e-05, "loss": 0.3151, "step": 199 }, { "epoch": 0.43383947939262474, "grad_norm": 0.1804329476464318, "learning_rate": 9.594547982200266e-05, "loss": 0.3436, "step": 200 }, { "epoch": 0.43383947939262474, "eval_loss": 0.33424264192581177, "eval_runtime": 39.7662, "eval_samples_per_second": 0.478, "eval_steps_per_second": 0.126, "step": 200 }, { "epoch": 0.4360086767895879, "grad_norm": 0.19645387562310979, "learning_rate": 9.5870405913307e-05, "loss": 0.3159, "step": 201 }, { "epoch": 0.43817787418655096, "grad_norm": 0.1644546677880722, "learning_rate": 9.579467325139627e-05, "loss": 0.3315, "step": 202 }, { "epoch": 0.4403470715835141, "grad_norm": 0.15812376696679412, "learning_rate": 9.571828292388096e-05, "loss": 0.3348, "step": 203 }, { "epoch": 0.44251626898047725, "grad_norm": 0.1675206237509223, "learning_rate": 9.56412360278164e-05, "loss": 0.3721, "step": 204 }, { "epoch": 0.44468546637744033, "grad_norm": 0.14308548515478076, "learning_rate": 9.556353366968705e-05, "loss": 0.2651, "step": 205 }, { "epoch": 0.44685466377440347, "grad_norm": 0.15386997755578125, "learning_rate": 9.548517696539054e-05, "loss": 0.2701, "step": 206 }, { "epoch": 0.4490238611713666, "grad_norm": 0.14640343506659434, "learning_rate": 9.540616704022173e-05, "loss": 0.2496, "step": 207 }, { "epoch": 0.4511930585683297, "grad_norm": 0.16521172350141197, "learning_rate": 9.532650502885646e-05, "loss": 0.3337, "step": 208 }, { "epoch": 0.45336225596529284, "grad_norm": 0.1540772430165373, "learning_rate": 9.524619207533532e-05, "loss": 0.3043, "step": 209 }, { "epoch": 0.455531453362256, "grad_norm": 0.16165589280753936, "learning_rate": 9.516522933304721e-05, "loss": 0.3276, "step": 210 }, { "epoch": 0.45770065075921906, "grad_norm": 0.1883035011589188, "learning_rate": 9.508361796471272e-05, "loss": 0.3584, "step": 211 }, { "epoch": 0.4598698481561822, "grad_norm": 0.14934674223988761, "learning_rate": 9.500135914236755e-05, "loss": 0.2946, "step": 212 }, { "epoch": 0.46203904555314534, "grad_norm": 0.16930578402110355, "learning_rate": 9.491845404734551e-05, "loss": 0.3423, "step": 213 }, { "epoch": 0.4642082429501085, "grad_norm": 0.16818902583571363, "learning_rate": 9.483490387026174e-05, "loss": 0.3245, "step": 214 }, { "epoch": 0.46637744034707157, "grad_norm": 0.1646282158556431, "learning_rate": 9.475070981099545e-05, "loss": 0.3377, "step": 215 }, { "epoch": 0.4685466377440347, "grad_norm": 0.15601143737978804, "learning_rate": 9.466587307867281e-05, "loss": 0.3181, "step": 216 }, { "epoch": 0.47071583514099785, "grad_norm": 0.16754840015499342, "learning_rate": 9.458039489164951e-05, "loss": 0.3682, "step": 217 }, { "epoch": 0.47288503253796094, "grad_norm": 0.1426551523167691, "learning_rate": 9.449427647749328e-05, "loss": 0.267, "step": 218 }, { "epoch": 0.4750542299349241, "grad_norm": 0.16599632915307785, "learning_rate": 9.440751907296628e-05, "loss": 0.3181, "step": 219 }, { "epoch": 0.4772234273318872, "grad_norm": 0.1620746064207198, "learning_rate": 9.432012392400733e-05, "loss": 0.3676, "step": 220 }, { "epoch": 0.4793926247288503, "grad_norm": 0.15120817805826722, "learning_rate": 9.423209228571398e-05, "loss": 0.3158, "step": 221 }, { "epoch": 0.48156182212581344, "grad_norm": 0.14803905174097284, "learning_rate": 9.414342542232462e-05, "loss": 0.2836, "step": 222 }, { "epoch": 0.4837310195227766, "grad_norm": 0.15220763579005372, "learning_rate": 9.405412460720006e-05, "loss": 0.2827, "step": 223 }, { "epoch": 0.48590021691973967, "grad_norm": 0.15404544864962286, "learning_rate": 9.396419112280555e-05, "loss": 0.3025, "step": 224 }, { "epoch": 0.4880694143167028, "grad_norm": 0.15073679537979376, "learning_rate": 9.387362626069216e-05, "loss": 0.3144, "step": 225 }, { "epoch": 0.49023861171366595, "grad_norm": 0.1610803970826281, "learning_rate": 9.378243132147825e-05, "loss": 0.2977, "step": 226 }, { "epoch": 0.4924078091106291, "grad_norm": 0.16316823817190756, "learning_rate": 9.369060761483095e-05, "loss": 0.3128, "step": 227 }, { "epoch": 0.4945770065075922, "grad_norm": 0.163300375716653, "learning_rate": 9.359815645944709e-05, "loss": 0.3438, "step": 228 }, { "epoch": 0.4967462039045553, "grad_norm": 0.14836269729887547, "learning_rate": 9.35050791830345e-05, "loss": 0.2525, "step": 229 }, { "epoch": 0.49891540130151846, "grad_norm": 0.14711791523393897, "learning_rate": 9.341137712229282e-05, "loss": 0.2713, "step": 230 }, { "epoch": 0.5010845986984815, "grad_norm": 0.1563819160365463, "learning_rate": 9.331705162289433e-05, "loss": 0.2919, "step": 231 }, { "epoch": 0.5032537960954447, "grad_norm": 0.1723872547155802, "learning_rate": 9.322210403946461e-05, "loss": 0.3121, "step": 232 }, { "epoch": 0.5054229934924078, "grad_norm": 0.17043959065030484, "learning_rate": 9.312653573556316e-05, "loss": 0.3561, "step": 233 }, { "epoch": 0.5075921908893709, "grad_norm": 0.14967506341505113, "learning_rate": 9.303034808366367e-05, "loss": 0.282, "step": 234 }, { "epoch": 0.5097613882863341, "grad_norm": 0.3206125419158534, "learning_rate": 9.293354246513448e-05, "loss": 0.3589, "step": 235 }, { "epoch": 0.5119305856832972, "grad_norm": 0.15389716021946065, "learning_rate": 9.283612027021862e-05, "loss": 0.3168, "step": 236 }, { "epoch": 0.5140997830802603, "grad_norm": 0.16629395409267134, "learning_rate": 9.273808289801388e-05, "loss": 0.2989, "step": 237 }, { "epoch": 0.5162689804772235, "grad_norm": 0.17350303956302596, "learning_rate": 9.263943175645275e-05, "loss": 0.3515, "step": 238 }, { "epoch": 0.5184381778741866, "grad_norm": 0.16804219683380595, "learning_rate": 9.254016826228215e-05, "loss": 0.3296, "step": 239 }, { "epoch": 0.5206073752711496, "grad_norm": 0.15382387009648443, "learning_rate": 9.244029384104311e-05, "loss": 0.2929, "step": 240 }, { "epoch": 0.5227765726681128, "grad_norm": 0.1817152430149277, "learning_rate": 9.233980992705031e-05, "loss": 0.409, "step": 241 }, { "epoch": 0.5249457700650759, "grad_norm": 0.15284178391247238, "learning_rate": 9.223871796337147e-05, "loss": 0.29, "step": 242 }, { "epoch": 0.527114967462039, "grad_norm": 0.17168700031201556, "learning_rate": 9.213701940180657e-05, "loss": 0.3596, "step": 243 }, { "epoch": 0.5292841648590022, "grad_norm": 0.16236461998196408, "learning_rate": 9.203471570286711e-05, "loss": 0.3531, "step": 244 }, { "epoch": 0.5314533622559653, "grad_norm": 0.17510095685795815, "learning_rate": 9.193180833575506e-05, "loss": 0.4172, "step": 245 }, { "epoch": 0.5336225596529284, "grad_norm": 0.1618295606935838, "learning_rate": 9.182829877834176e-05, "loss": 0.3086, "step": 246 }, { "epoch": 0.5357917570498916, "grad_norm": 0.16138627788900253, "learning_rate": 9.172418851714676e-05, "loss": 0.3409, "step": 247 }, { "epoch": 0.5379609544468547, "grad_norm": 0.15375456963212109, "learning_rate": 9.161947904731636e-05, "loss": 0.3306, "step": 248 }, { "epoch": 0.5401301518438177, "grad_norm": 0.14093510508892232, "learning_rate": 9.151417187260226e-05, "loss": 0.2723, "step": 249 }, { "epoch": 0.5422993492407809, "grad_norm": 0.1566994003310743, "learning_rate": 9.140826850533987e-05, "loss": 0.3047, "step": 250 }, { "epoch": 0.544468546637744, "grad_norm": 0.16263764510479803, "learning_rate": 9.130177046642667e-05, "loss": 0.3313, "step": 251 }, { "epoch": 0.5466377440347071, "grad_norm": 0.2635396490290928, "learning_rate": 9.119467928530027e-05, "loss": 0.3137, "step": 252 }, { "epoch": 0.5488069414316703, "grad_norm": 0.19432404691982347, "learning_rate": 9.108699649991659e-05, "loss": 0.3271, "step": 253 }, { "epoch": 0.5509761388286334, "grad_norm": 0.16321335163395914, "learning_rate": 9.097872365672757e-05, "loss": 0.3464, "step": 254 }, { "epoch": 0.5531453362255966, "grad_norm": 0.1674780863277491, "learning_rate": 9.086986231065917e-05, "loss": 0.3479, "step": 255 }, { "epoch": 0.5553145336225597, "grad_norm": 0.16720487351831387, "learning_rate": 9.076041402508893e-05, "loss": 0.3765, "step": 256 }, { "epoch": 0.5574837310195228, "grad_norm": 0.1768233975227357, "learning_rate": 9.06503803718235e-05, "loss": 0.3611, "step": 257 }, { "epoch": 0.559652928416486, "grad_norm": 0.1658738115128469, "learning_rate": 9.053976293107612e-05, "loss": 0.3425, "step": 258 }, { "epoch": 0.561822125813449, "grad_norm": 0.15075392628393477, "learning_rate": 9.042856329144393e-05, "loss": 0.2831, "step": 259 }, { "epoch": 0.5639913232104121, "grad_norm": 0.15462905264522137, "learning_rate": 9.031678304988509e-05, "loss": 0.3359, "step": 260 }, { "epoch": 0.5661605206073753, "grad_norm": 0.1486760035515965, "learning_rate": 9.020442381169593e-05, "loss": 0.3036, "step": 261 }, { "epoch": 0.5683297180043384, "grad_norm": 0.14672655464193096, "learning_rate": 9.009148719048785e-05, "loss": 0.2655, "step": 262 }, { "epoch": 0.5704989154013015, "grad_norm": 0.14525138583221342, "learning_rate": 8.99779748081641e-05, "loss": 0.2893, "step": 263 }, { "epoch": 0.5726681127982647, "grad_norm": 0.1649767278464621, "learning_rate": 8.986388829489663e-05, "loss": 0.3416, "step": 264 }, { "epoch": 0.5748373101952278, "grad_norm": 0.1486891920011246, "learning_rate": 8.97492292891025e-05, "loss": 0.3157, "step": 265 }, { "epoch": 0.5770065075921909, "grad_norm": 0.16844270718485851, "learning_rate": 8.96339994374205e-05, "loss": 0.3434, "step": 266 }, { "epoch": 0.579175704989154, "grad_norm": 0.15180653954202514, "learning_rate": 8.951820039468741e-05, "loss": 0.2775, "step": 267 }, { "epoch": 0.5813449023861171, "grad_norm": 0.15671071191539224, "learning_rate": 8.940183382391429e-05, "loss": 0.2713, "step": 268 }, { "epoch": 0.5835140997830802, "grad_norm": 0.16147849592493357, "learning_rate": 8.928490139626253e-05, "loss": 0.3173, "step": 269 }, { "epoch": 0.5856832971800434, "grad_norm": 0.19489581469202696, "learning_rate": 8.916740479101995e-05, "loss": 0.3289, "step": 270 }, { "epoch": 0.5878524945770065, "grad_norm": 0.14998764666371173, "learning_rate": 8.90493456955766e-05, "loss": 0.2677, "step": 271 }, { "epoch": 0.5900216919739696, "grad_norm": 0.15423637992388417, "learning_rate": 8.893072580540053e-05, "loss": 0.3233, "step": 272 }, { "epoch": 0.5921908893709328, "grad_norm": 0.1618307748258286, "learning_rate": 8.88115468240135e-05, "loss": 0.335, "step": 273 }, { "epoch": 0.5943600867678959, "grad_norm": 0.28889311411388907, "learning_rate": 8.869181046296647e-05, "loss": 0.3162, "step": 274 }, { "epoch": 0.596529284164859, "grad_norm": 0.16811888898141328, "learning_rate": 8.857151844181502e-05, "loss": 0.3321, "step": 275 }, { "epoch": 0.5986984815618221, "grad_norm": 0.15974545714844673, "learning_rate": 8.845067248809469e-05, "loss": 0.3424, "step": 276 }, { "epoch": 0.6008676789587852, "grad_norm": 0.14627135337253672, "learning_rate": 8.83292743372961e-05, "loss": 0.2914, "step": 277 }, { "epoch": 0.6030368763557483, "grad_norm": 0.15447685130795957, "learning_rate": 8.820732573284012e-05, "loss": 0.3175, "step": 278 }, { "epoch": 0.6052060737527115, "grad_norm": 0.17049697629599625, "learning_rate": 8.808482842605277e-05, "loss": 0.3328, "step": 279 }, { "epoch": 0.6073752711496746, "grad_norm": 0.15776013624499943, "learning_rate": 8.796178417614007e-05, "loss": 0.3109, "step": 280 }, { "epoch": 0.6095444685466378, "grad_norm": 0.16222342988686625, "learning_rate": 8.783819475016282e-05, "loss": 0.3748, "step": 281 }, { "epoch": 0.6117136659436009, "grad_norm": 0.2331822347475535, "learning_rate": 8.771406192301113e-05, "loss": 0.3138, "step": 282 }, { "epoch": 0.613882863340564, "grad_norm": 0.15710540429632355, "learning_rate": 8.758938747737909e-05, "loss": 0.3204, "step": 283 }, { "epoch": 0.6160520607375272, "grad_norm": 0.1696778017954279, "learning_rate": 8.746417320373896e-05, "loss": 0.3545, "step": 284 }, { "epoch": 0.6182212581344902, "grad_norm": 0.160133581870411, "learning_rate": 8.733842090031565e-05, "loss": 0.3079, "step": 285 }, { "epoch": 0.6203904555314533, "grad_norm": 0.15694259520801862, "learning_rate": 8.72121323730608e-05, "loss": 0.311, "step": 286 }, { "epoch": 0.6225596529284165, "grad_norm": 0.1585388671488109, "learning_rate": 8.708530943562683e-05, "loss": 0.3366, "step": 287 }, { "epoch": 0.6247288503253796, "grad_norm": 0.1637524110812479, "learning_rate": 8.695795390934094e-05, "loss": 0.3351, "step": 288 }, { "epoch": 0.6268980477223427, "grad_norm": 0.17338170936952552, "learning_rate": 8.683006762317891e-05, "loss": 0.3683, "step": 289 }, { "epoch": 0.6290672451193059, "grad_norm": 0.15279053222915287, "learning_rate": 8.670165241373891e-05, "loss": 0.2839, "step": 290 }, { "epoch": 0.631236442516269, "grad_norm": 0.15271124650902063, "learning_rate": 8.657271012521504e-05, "loss": 0.274, "step": 291 }, { "epoch": 0.6334056399132321, "grad_norm": 0.20119865632692643, "learning_rate": 8.644324260937085e-05, "loss": 0.2766, "step": 292 }, { "epoch": 0.6355748373101953, "grad_norm": 0.16358970886050292, "learning_rate": 8.631325172551284e-05, "loss": 0.3223, "step": 293 }, { "epoch": 0.6377440347071583, "grad_norm": 0.16024548206523562, "learning_rate": 8.618273934046364e-05, "loss": 0.3255, "step": 294 }, { "epoch": 0.6399132321041214, "grad_norm": 0.18933994539002422, "learning_rate": 8.60517073285353e-05, "loss": 0.4288, "step": 295 }, { "epoch": 0.6420824295010846, "grad_norm": 0.14871982406160117, "learning_rate": 8.592015757150225e-05, "loss": 0.3048, "step": 296 }, { "epoch": 0.6442516268980477, "grad_norm": 0.1634546407796359, "learning_rate": 8.578809195857445e-05, "loss": 0.3775, "step": 297 }, { "epoch": 0.6464208242950108, "grad_norm": 0.16293792940515708, "learning_rate": 8.565551238637006e-05, "loss": 0.3142, "step": 298 }, { "epoch": 0.648590021691974, "grad_norm": 0.14930487066180928, "learning_rate": 8.552242075888838e-05, "loss": 0.2613, "step": 299 }, { "epoch": 0.6507592190889371, "grad_norm": 0.14814023625501535, "learning_rate": 8.538881898748241e-05, "loss": 0.3157, "step": 300 }, { "epoch": 0.6529284164859002, "grad_norm": 0.15429106087854486, "learning_rate": 8.525470899083138e-05, "loss": 0.3023, "step": 301 }, { "epoch": 0.6550976138828634, "grad_norm": 0.1526889270326536, "learning_rate": 8.51200926949133e-05, "loss": 0.2622, "step": 302 }, { "epoch": 0.6572668112798264, "grad_norm": 0.16836180192848554, "learning_rate": 8.498497203297716e-05, "loss": 0.3181, "step": 303 }, { "epoch": 0.6594360086767896, "grad_norm": 0.1568597185268113, "learning_rate": 8.48493489455153e-05, "loss": 0.3164, "step": 304 }, { "epoch": 0.6616052060737527, "grad_norm": 0.1717660325284911, "learning_rate": 8.47132253802355e-05, "loss": 0.3144, "step": 305 }, { "epoch": 0.6637744034707158, "grad_norm": 0.15864917205877277, "learning_rate": 8.457660329203289e-05, "loss": 0.3057, "step": 306 }, { "epoch": 0.665943600867679, "grad_norm": 0.16569505505355422, "learning_rate": 8.443948464296211e-05, "loss": 0.3327, "step": 307 }, { "epoch": 0.6681127982646421, "grad_norm": 0.16057628499246557, "learning_rate": 8.430187140220889e-05, "loss": 0.291, "step": 308 }, { "epoch": 0.6702819956616052, "grad_norm": 0.14767031347275292, "learning_rate": 8.416376554606195e-05, "loss": 0.2795, "step": 309 }, { "epoch": 0.6724511930585684, "grad_norm": 0.15854515513637216, "learning_rate": 8.402516905788455e-05, "loss": 0.3009, "step": 310 }, { "epoch": 0.6746203904555315, "grad_norm": 0.14713035448630263, "learning_rate": 8.388608392808593e-05, "loss": 0.2793, "step": 311 }, { "epoch": 0.6767895878524945, "grad_norm": 0.16017456682784914, "learning_rate": 8.37465121540929e-05, "loss": 0.305, "step": 312 }, { "epoch": 0.6789587852494577, "grad_norm": 0.15120391692718194, "learning_rate": 8.360645574032098e-05, "loss": 0.2749, "step": 313 }, { "epoch": 0.6811279826464208, "grad_norm": 0.15560001063747786, "learning_rate": 8.346591669814572e-05, "loss": 0.3186, "step": 314 }, { "epoch": 0.6832971800433839, "grad_norm": 0.1613679084226523, "learning_rate": 8.332489704587381e-05, "loss": 0.3328, "step": 315 }, { "epoch": 0.6854663774403471, "grad_norm": 0.15605720716726337, "learning_rate": 8.318339880871402e-05, "loss": 0.3077, "step": 316 }, { "epoch": 0.6876355748373102, "grad_norm": 0.15488519210128188, "learning_rate": 8.304142401874818e-05, "loss": 0.3256, "step": 317 }, { "epoch": 0.6898047722342733, "grad_norm": 0.1403558799176201, "learning_rate": 8.2898974714902e-05, "loss": 0.2849, "step": 318 }, { "epoch": 0.6919739696312365, "grad_norm": 0.16073415458894003, "learning_rate": 8.275605294291576e-05, "loss": 0.3148, "step": 319 }, { "epoch": 0.6941431670281996, "grad_norm": 0.14531860683004322, "learning_rate": 8.261266075531493e-05, "loss": 0.2949, "step": 320 }, { "epoch": 0.6963123644251626, "grad_norm": 0.153620854119112, "learning_rate": 8.24688002113807e-05, "loss": 0.3024, "step": 321 }, { "epoch": 0.6984815618221258, "grad_norm": 0.1598435331827638, "learning_rate": 8.232447337712045e-05, "loss": 0.3116, "step": 322 }, { "epoch": 0.7006507592190889, "grad_norm": 0.16126012816422514, "learning_rate": 8.217968232523798e-05, "loss": 0.3302, "step": 323 }, { "epoch": 0.702819956616052, "grad_norm": 0.167819401197193, "learning_rate": 8.203442913510386e-05, "loss": 0.2961, "step": 324 }, { "epoch": 0.7049891540130152, "grad_norm": 0.16091517631557192, "learning_rate": 8.188871589272547e-05, "loss": 0.3058, "step": 325 }, { "epoch": 0.7071583514099783, "grad_norm": 0.15072907511506445, "learning_rate": 8.174254469071711e-05, "loss": 0.2768, "step": 326 }, { "epoch": 0.7093275488069414, "grad_norm": 0.15463419748008012, "learning_rate": 8.15959176282699e-05, "loss": 0.2698, "step": 327 }, { "epoch": 0.7114967462039046, "grad_norm": 0.15765765627941916, "learning_rate": 8.144883681112168e-05, "loss": 0.3327, "step": 328 }, { "epoch": 0.7136659436008677, "grad_norm": 0.1625944955327596, "learning_rate": 8.130130435152671e-05, "loss": 0.3115, "step": 329 }, { "epoch": 0.7158351409978309, "grad_norm": 0.14663689685956108, "learning_rate": 8.115332236822543e-05, "loss": 0.2813, "step": 330 }, { "epoch": 0.7180043383947939, "grad_norm": 0.14366366452382384, "learning_rate": 8.100489298641387e-05, "loss": 0.252, "step": 331 }, { "epoch": 0.720173535791757, "grad_norm": 0.14062872637814394, "learning_rate": 8.085601833771332e-05, "loss": 0.2426, "step": 332 }, { "epoch": 0.7223427331887202, "grad_norm": 0.151319434703271, "learning_rate": 8.070670056013963e-05, "loss": 0.2804, "step": 333 }, { "epoch": 0.7245119305856833, "grad_norm": 0.17339443782515734, "learning_rate": 8.055694179807241e-05, "loss": 0.3386, "step": 334 }, { "epoch": 0.7266811279826464, "grad_norm": 0.162057871765424, "learning_rate": 8.040674420222442e-05, "loss": 0.3059, "step": 335 }, { "epoch": 0.7288503253796096, "grad_norm": 0.17452086702897832, "learning_rate": 8.025610992961059e-05, "loss": 0.323, "step": 336 }, { "epoch": 0.7310195227765727, "grad_norm": 0.15218002802937822, "learning_rate": 8.010504114351699e-05, "loss": 0.2892, "step": 337 }, { "epoch": 0.7331887201735358, "grad_norm": 0.1786691433855208, "learning_rate": 7.995354001346984e-05, "loss": 0.2993, "step": 338 }, { "epoch": 0.735357917570499, "grad_norm": 0.15572465398388327, "learning_rate": 7.980160871520434e-05, "loss": 0.2751, "step": 339 }, { "epoch": 0.737527114967462, "grad_norm": 0.14944653365494096, "learning_rate": 7.964924943063341e-05, "loss": 0.295, "step": 340 }, { "epoch": 0.7396963123644251, "grad_norm": 0.15137015815284247, "learning_rate": 7.949646434781637e-05, "loss": 0.2928, "step": 341 }, { "epoch": 0.7418655097613883, "grad_norm": 0.13520870208050645, "learning_rate": 7.934325566092749e-05, "loss": 0.2392, "step": 342 }, { "epoch": 0.7440347071583514, "grad_norm": 0.17474668202456908, "learning_rate": 7.918962557022445e-05, "loss": 0.3343, "step": 343 }, { "epoch": 0.7462039045553145, "grad_norm": 0.15920430785387085, "learning_rate": 7.903557628201689e-05, "loss": 0.3108, "step": 344 }, { "epoch": 0.7483731019522777, "grad_norm": 0.14495723872575775, "learning_rate": 7.888111000863455e-05, "loss": 0.2723, "step": 345 }, { "epoch": 0.7505422993492408, "grad_norm": 0.16184390654404834, "learning_rate": 7.872622896839556e-05, "loss": 0.2729, "step": 346 }, { "epoch": 0.7527114967462039, "grad_norm": 0.16644660892068602, "learning_rate": 7.857093538557465e-05, "loss": 0.3267, "step": 347 }, { "epoch": 0.754880694143167, "grad_norm": 0.15541714907955761, "learning_rate": 7.841523149037109e-05, "loss": 0.3217, "step": 348 }, { "epoch": 0.7570498915401301, "grad_norm": 0.15934934206985665, "learning_rate": 7.825911951887677e-05, "loss": 0.2913, "step": 349 }, { "epoch": 0.7592190889370932, "grad_norm": 0.1578728957708445, "learning_rate": 7.810260171304399e-05, "loss": 0.3093, "step": 350 }, { "epoch": 0.7613882863340564, "grad_norm": 0.15490528067171122, "learning_rate": 7.794568032065337e-05, "loss": 0.3009, "step": 351 }, { "epoch": 0.7635574837310195, "grad_norm": 0.16699421067292403, "learning_rate": 7.778835759528145e-05, "loss": 0.3656, "step": 352 }, { "epoch": 0.7657266811279827, "grad_norm": 0.22044364766801572, "learning_rate": 7.763063579626839e-05, "loss": 0.334, "step": 353 }, { "epoch": 0.7678958785249458, "grad_norm": 0.19643699768744521, "learning_rate": 7.747251718868557e-05, "loss": 0.3231, "step": 354 }, { "epoch": 0.7700650759219089, "grad_norm": 0.20910555862615757, "learning_rate": 7.731400404330298e-05, "loss": 0.3161, "step": 355 }, { "epoch": 0.7722342733188721, "grad_norm": 0.16468231811761894, "learning_rate": 7.715509863655661e-05, "loss": 0.2979, "step": 356 }, { "epoch": 0.7744034707158352, "grad_norm": 0.1367744772298653, "learning_rate": 7.699580325051583e-05, "loss": 0.2166, "step": 357 }, { "epoch": 0.7765726681127982, "grad_norm": 0.16273122371163448, "learning_rate": 7.683612017285056e-05, "loss": 0.2847, "step": 358 }, { "epoch": 0.7787418655097614, "grad_norm": 0.15669925061953327, "learning_rate": 7.667605169679842e-05, "loss": 0.2777, "step": 359 }, { "epoch": 0.7809110629067245, "grad_norm": 0.17725284818408057, "learning_rate": 7.651560012113182e-05, "loss": 0.3337, "step": 360 }, { "epoch": 0.7830802603036876, "grad_norm": 0.17427480006189128, "learning_rate": 7.635476775012493e-05, "loss": 0.3665, "step": 361 }, { "epoch": 0.7852494577006508, "grad_norm": 0.17414268214659534, "learning_rate": 7.619355689352056e-05, "loss": 0.3309, "step": 362 }, { "epoch": 0.7874186550976139, "grad_norm": 0.17512481390885523, "learning_rate": 7.60319698664971e-05, "loss": 0.3328, "step": 363 }, { "epoch": 0.789587852494577, "grad_norm": 0.15289988099891824, "learning_rate": 7.587000898963508e-05, "loss": 0.2515, "step": 364 }, { "epoch": 0.7917570498915402, "grad_norm": 0.1624765535782596, "learning_rate": 7.570767658888405e-05, "loss": 0.2518, "step": 365 }, { "epoch": 0.7939262472885033, "grad_norm": 0.1617918820535666, "learning_rate": 7.554497499552902e-05, "loss": 0.3053, "step": 366 }, { "epoch": 0.7960954446854663, "grad_norm": 0.16383299523189962, "learning_rate": 7.538190654615711e-05, "loss": 0.3033, "step": 367 }, { "epoch": 0.7982646420824295, "grad_norm": 0.1740801344083908, "learning_rate": 7.521847358262384e-05, "loss": 0.3324, "step": 368 }, { "epoch": 0.8004338394793926, "grad_norm": 0.16040212743690085, "learning_rate": 7.505467845201965e-05, "loss": 0.3583, "step": 369 }, { "epoch": 0.8026030368763557, "grad_norm": 0.1581087352900414, "learning_rate": 7.48905235066361e-05, "loss": 0.3184, "step": 370 }, { "epoch": 0.8047722342733189, "grad_norm": 0.15619092562938275, "learning_rate": 7.472601110393212e-05, "loss": 0.3248, "step": 371 }, { "epoch": 0.806941431670282, "grad_norm": 0.1595073883740819, "learning_rate": 7.456114360650015e-05, "loss": 0.3324, "step": 372 }, { "epoch": 0.8091106290672451, "grad_norm": 0.16937228457587902, "learning_rate": 7.439592338203221e-05, "loss": 0.272, "step": 373 }, { "epoch": 0.8112798264642083, "grad_norm": 0.21955900610207899, "learning_rate": 7.423035280328589e-05, "loss": 0.2876, "step": 374 }, { "epoch": 0.8134490238611713, "grad_norm": 0.1622542075335067, "learning_rate": 7.406443424805031e-05, "loss": 0.3292, "step": 375 }, { "epoch": 0.8156182212581344, "grad_norm": 0.1539483896012786, "learning_rate": 7.389817009911188e-05, "loss": 0.3103, "step": 376 }, { "epoch": 0.8177874186550976, "grad_norm": 0.15631416958623467, "learning_rate": 7.373156274422022e-05, "loss": 0.2877, "step": 377 }, { "epoch": 0.8199566160520607, "grad_norm": 0.16629989037855605, "learning_rate": 7.356461457605373e-05, "loss": 0.2955, "step": 378 }, { "epoch": 0.8221258134490239, "grad_norm": 0.17821592166706385, "learning_rate": 7.339732799218535e-05, "loss": 0.3395, "step": 379 }, { "epoch": 0.824295010845987, "grad_norm": 0.16495627039143948, "learning_rate": 7.322970539504802e-05, "loss": 0.2975, "step": 380 }, { "epoch": 0.8264642082429501, "grad_norm": 0.1658072464206595, "learning_rate": 7.306174919190025e-05, "loss": 0.3178, "step": 381 }, { "epoch": 0.8286334056399133, "grad_norm": 0.15286362948439744, "learning_rate": 7.28934617947915e-05, "loss": 0.2405, "step": 382 }, { "epoch": 0.8308026030368764, "grad_norm": 0.25352133804652655, "learning_rate": 7.272484562052762e-05, "loss": 0.3318, "step": 383 }, { "epoch": 0.8329718004338394, "grad_norm": 0.16883132171707005, "learning_rate": 7.255590309063604e-05, "loss": 0.2697, "step": 384 }, { "epoch": 0.8351409978308026, "grad_norm": 0.17506322037825633, "learning_rate": 7.238663663133108e-05, "loss": 0.2947, "step": 385 }, { "epoch": 0.8373101952277657, "grad_norm": 0.162905554800095, "learning_rate": 7.221704867347901e-05, "loss": 0.316, "step": 386 }, { "epoch": 0.8394793926247288, "grad_norm": 0.16905944662023994, "learning_rate": 7.204714165256325e-05, "loss": 0.3287, "step": 387 }, { "epoch": 0.841648590021692, "grad_norm": 0.15092794187286074, "learning_rate": 7.187691800864936e-05, "loss": 0.2997, "step": 388 }, { "epoch": 0.8438177874186551, "grad_norm": 0.1355944525377151, "learning_rate": 7.170638018634993e-05, "loss": 0.2278, "step": 389 }, { "epoch": 0.8459869848156182, "grad_norm": 0.17675290778005076, "learning_rate": 7.153553063478953e-05, "loss": 0.3531, "step": 390 }, { "epoch": 0.8481561822125814, "grad_norm": 0.1534412441507167, "learning_rate": 7.136437180756954e-05, "loss": 0.2781, "step": 391 }, { "epoch": 0.8503253796095445, "grad_norm": 0.16137646593619714, "learning_rate": 7.119290616273294e-05, "loss": 0.2834, "step": 392 }, { "epoch": 0.8524945770065075, "grad_norm": 0.15944482254384695, "learning_rate": 7.10211361627289e-05, "loss": 0.2979, "step": 393 }, { "epoch": 0.8546637744034707, "grad_norm": 0.1656541893638849, "learning_rate": 7.084906427437757e-05, "loss": 0.2898, "step": 394 }, { "epoch": 0.8568329718004338, "grad_norm": 0.17951549268207853, "learning_rate": 7.06766929688345e-05, "loss": 0.4042, "step": 395 }, { "epoch": 0.8590021691973969, "grad_norm": 0.1566199701101273, "learning_rate": 7.050402472155526e-05, "loss": 0.3079, "step": 396 }, { "epoch": 0.8611713665943601, "grad_norm": 0.15458950588314033, "learning_rate": 7.03310620122599e-05, "loss": 0.2778, "step": 397 }, { "epoch": 0.8633405639913232, "grad_norm": 0.17104556871315288, "learning_rate": 7.015780732489717e-05, "loss": 0.3175, "step": 398 }, { "epoch": 0.8655097613882863, "grad_norm": 0.19835635840780777, "learning_rate": 6.99842631476091e-05, "loss": 0.3446, "step": 399 }, { "epoch": 0.8676789587852495, "grad_norm": 0.17218889670168014, "learning_rate": 6.981043197269505e-05, "loss": 0.3048, "step": 400 }, { "epoch": 0.8676789587852495, "eval_loss": 0.31443339586257935, "eval_runtime": 39.7245, "eval_samples_per_second": 0.478, "eval_steps_per_second": 0.126, "step": 400 }, { "epoch": 0.8698481561822126, "grad_norm": 0.16106337443031235, "learning_rate": 6.963631629657606e-05, "loss": 0.282, "step": 401 }, { "epoch": 0.8720173535791758, "grad_norm": 0.1549710087992951, "learning_rate": 6.946191861975888e-05, "loss": 0.2923, "step": 402 }, { "epoch": 0.8741865509761388, "grad_norm": 0.17143963315653743, "learning_rate": 6.928724144680022e-05, "loss": 0.304, "step": 403 }, { "epoch": 0.8763557483731019, "grad_norm": 0.16661780813221244, "learning_rate": 6.911228728627059e-05, "loss": 0.3294, "step": 404 }, { "epoch": 0.8785249457700651, "grad_norm": 0.16403696305382504, "learning_rate": 6.893705865071842e-05, "loss": 0.3014, "step": 405 }, { "epoch": 0.8806941431670282, "grad_norm": 0.15319848299652775, "learning_rate": 6.876155805663389e-05, "loss": 0.2691, "step": 406 }, { "epoch": 0.8828633405639913, "grad_norm": 0.17118141480932222, "learning_rate": 6.858578802441288e-05, "loss": 0.3132, "step": 407 }, { "epoch": 0.8850325379609545, "grad_norm": 0.16178546049978892, "learning_rate": 6.840975107832067e-05, "loss": 0.2564, "step": 408 }, { "epoch": 0.8872017353579176, "grad_norm": 0.16291335526409656, "learning_rate": 6.823344974645576e-05, "loss": 0.2873, "step": 409 }, { "epoch": 0.8893709327548807, "grad_norm": 0.1609692593321932, "learning_rate": 6.805688656071354e-05, "loss": 0.305, "step": 410 }, { "epoch": 0.8915401301518439, "grad_norm": 0.14901262397966952, "learning_rate": 6.788006405674992e-05, "loss": 0.2673, "step": 411 }, { "epoch": 0.8937093275488069, "grad_norm": 0.16659840958516206, "learning_rate": 6.770298477394495e-05, "loss": 0.3277, "step": 412 }, { "epoch": 0.89587852494577, "grad_norm": 0.1713410771401561, "learning_rate": 6.75256512553663e-05, "loss": 0.3714, "step": 413 }, { "epoch": 0.8980477223427332, "grad_norm": 0.1818967831157803, "learning_rate": 6.734806604773277e-05, "loss": 0.3811, "step": 414 }, { "epoch": 0.9002169197396963, "grad_norm": 0.21925123588651396, "learning_rate": 6.717023170137774e-05, "loss": 0.3056, "step": 415 }, { "epoch": 0.9023861171366594, "grad_norm": 0.15314422603554104, "learning_rate": 6.69921507702125e-05, "loss": 0.2837, "step": 416 }, { "epoch": 0.9045553145336226, "grad_norm": 0.16258221254945943, "learning_rate": 6.681382581168956e-05, "loss": 0.3085, "step": 417 }, { "epoch": 0.9067245119305857, "grad_norm": 0.16032298987917648, "learning_rate": 6.663525938676603e-05, "loss": 0.3223, "step": 418 }, { "epoch": 0.9088937093275488, "grad_norm": 0.16085131851237103, "learning_rate": 6.645645405986665e-05, "loss": 0.303, "step": 419 }, { "epoch": 0.911062906724512, "grad_norm": 0.1650727655299475, "learning_rate": 6.627741239884716e-05, "loss": 0.3276, "step": 420 }, { "epoch": 0.913232104121475, "grad_norm": 0.15360660494909031, "learning_rate": 6.609813697495731e-05, "loss": 0.2751, "step": 421 }, { "epoch": 0.9154013015184381, "grad_norm": 0.15157055567722505, "learning_rate": 6.591863036280398e-05, "loss": 0.3085, "step": 422 }, { "epoch": 0.9175704989154013, "grad_norm": 0.15803099826725212, "learning_rate": 6.573889514031415e-05, "loss": 0.2939, "step": 423 }, { "epoch": 0.9197396963123644, "grad_norm": 0.1585787128334511, "learning_rate": 6.555893388869793e-05, "loss": 0.2503, "step": 424 }, { "epoch": 0.9219088937093276, "grad_norm": 0.17931795418506688, "learning_rate": 6.537874919241149e-05, "loss": 0.3886, "step": 425 }, { "epoch": 0.9240780911062907, "grad_norm": 0.15343352858299092, "learning_rate": 6.519834363911992e-05, "loss": 0.2793, "step": 426 }, { "epoch": 0.9262472885032538, "grad_norm": 0.15328728201248645, "learning_rate": 6.501771981966007e-05, "loss": 0.2644, "step": 427 }, { "epoch": 0.928416485900217, "grad_norm": 0.16064411420877225, "learning_rate": 6.483688032800337e-05, "loss": 0.3188, "step": 428 }, { "epoch": 0.93058568329718, "grad_norm": 0.1679425352918729, "learning_rate": 6.465582776121852e-05, "loss": 0.3602, "step": 429 }, { "epoch": 0.9327548806941431, "grad_norm": 0.16379595982132292, "learning_rate": 6.447456471943427e-05, "loss": 0.2769, "step": 430 }, { "epoch": 0.9349240780911063, "grad_norm": 0.15943476584744723, "learning_rate": 6.429309380580202e-05, "loss": 0.2702, "step": 431 }, { "epoch": 0.9370932754880694, "grad_norm": 0.15321548417679473, "learning_rate": 6.411141762645846e-05, "loss": 0.2463, "step": 432 }, { "epoch": 0.9392624728850325, "grad_norm": 0.15564677916963437, "learning_rate": 6.392953879048813e-05, "loss": 0.2885, "step": 433 }, { "epoch": 0.9414316702819957, "grad_norm": 0.1631290118518175, "learning_rate": 6.374745990988598e-05, "loss": 0.2748, "step": 434 }, { "epoch": 0.9436008676789588, "grad_norm": 0.14619225672799796, "learning_rate": 6.356518359951982e-05, "loss": 0.249, "step": 435 }, { "epoch": 0.9457700650759219, "grad_norm": 0.1571277122359372, "learning_rate": 6.338271247709278e-05, "loss": 0.2533, "step": 436 }, { "epoch": 0.9479392624728851, "grad_norm": 0.1550916491663443, "learning_rate": 6.320004916310573e-05, "loss": 0.2677, "step": 437 }, { "epoch": 0.9501084598698482, "grad_norm": 0.16661022388778246, "learning_rate": 6.301719628081965e-05, "loss": 0.2878, "step": 438 }, { "epoch": 0.9522776572668112, "grad_norm": 0.2040573789571519, "learning_rate": 6.283415645621791e-05, "loss": 0.2703, "step": 439 }, { "epoch": 0.9544468546637744, "grad_norm": 0.16469631306397306, "learning_rate": 6.265093231796864e-05, "loss": 0.3037, "step": 440 }, { "epoch": 0.9566160520607375, "grad_norm": 0.15770760260143657, "learning_rate": 6.246752649738686e-05, "loss": 0.2657, "step": 441 }, { "epoch": 0.9587852494577006, "grad_norm": 0.15983199407271448, "learning_rate": 6.228394162839686e-05, "loss": 0.2764, "step": 442 }, { "epoch": 0.9609544468546638, "grad_norm": 0.16489136986239505, "learning_rate": 6.210018034749421e-05, "loss": 0.2955, "step": 443 }, { "epoch": 0.9631236442516269, "grad_norm": 0.22721431990817345, "learning_rate": 6.191624529370796e-05, "loss": 0.3395, "step": 444 }, { "epoch": 0.96529284164859, "grad_norm": 0.157681168570052, "learning_rate": 6.173213910856277e-05, "loss": 0.3018, "step": 445 }, { "epoch": 0.9674620390455532, "grad_norm": 0.1576772973443425, "learning_rate": 6.154786443604098e-05, "loss": 0.3013, "step": 446 }, { "epoch": 0.9696312364425163, "grad_norm": 0.1579242415527196, "learning_rate": 6.13634239225445e-05, "loss": 0.294, "step": 447 }, { "epoch": 0.9718004338394793, "grad_norm": 0.20593023409205996, "learning_rate": 6.117882021685704e-05, "loss": 0.2883, "step": 448 }, { "epoch": 0.9739696312364425, "grad_norm": 0.16203552803868196, "learning_rate": 6.099405597010585e-05, "loss": 0.2957, "step": 449 }, { "epoch": 0.9761388286334056, "grad_norm": 0.1614999286165849, "learning_rate": 6.0809133835723774e-05, "loss": 0.2885, "step": 450 }, { "epoch": 0.9783080260303688, "grad_norm": 0.16900731974324282, "learning_rate": 6.0624056469411125e-05, "loss": 0.301, "step": 451 }, { "epoch": 0.9804772234273319, "grad_norm": 0.17129193051244268, "learning_rate": 6.043882652909752e-05, "loss": 0.2701, "step": 452 }, { "epoch": 0.982646420824295, "grad_norm": 0.19758216454445776, "learning_rate": 6.025344667490369e-05, "loss": 0.3732, "step": 453 }, { "epoch": 0.9848156182212582, "grad_norm": 0.15788414206460427, "learning_rate": 6.006791956910334e-05, "loss": 0.286, "step": 454 }, { "epoch": 0.9869848156182213, "grad_norm": 0.1686049946630881, "learning_rate": 5.9882247876084865e-05, "loss": 0.3423, "step": 455 }, { "epoch": 0.9891540130151844, "grad_norm": 0.1623316957404726, "learning_rate": 5.969643426231309e-05, "loss": 0.2972, "step": 456 }, { "epoch": 0.9913232104121475, "grad_norm": 0.16633964946287502, "learning_rate": 5.951048139629105e-05, "loss": 0.3207, "step": 457 }, { "epoch": 0.9934924078091106, "grad_norm": 0.15578806657191704, "learning_rate": 5.932439194852153e-05, "loss": 0.2821, "step": 458 }, { "epoch": 0.9956616052060737, "grad_norm": 0.141680355386348, "learning_rate": 5.9138168591468845e-05, "loss": 0.211, "step": 459 }, { "epoch": 0.9978308026030369, "grad_norm": 0.1626311176710574, "learning_rate": 5.8951813999520375e-05, "loss": 0.2776, "step": 460 }, { "epoch": 1.0, "grad_norm": 0.2168432501312755, "learning_rate": 5.876533084894821e-05, "loss": 0.3189, "step": 461 }, { "epoch": 1.002169197396963, "grad_norm": 0.15195564980233445, "learning_rate": 5.8578721817870666e-05, "loss": 0.2746, "step": 462 }, { "epoch": 1.0043383947939262, "grad_norm": 0.1491438360530865, "learning_rate": 5.839198958621388e-05, "loss": 0.2053, "step": 463 }, { "epoch": 1.0065075921908895, "grad_norm": 0.15367262427759873, "learning_rate": 5.820513683567328e-05, "loss": 0.2371, "step": 464 }, { "epoch": 1.0086767895878526, "grad_norm": 0.1511433155050122, "learning_rate": 5.801816624967509e-05, "loss": 0.2213, "step": 465 }, { "epoch": 1.0108459869848156, "grad_norm": 0.1663892898536712, "learning_rate": 5.783108051333779e-05, "loss": 0.2576, "step": 466 }, { "epoch": 1.0130151843817787, "grad_norm": 0.17685832695456444, "learning_rate": 5.764388231343356e-05, "loss": 0.3009, "step": 467 }, { "epoch": 1.0151843817787418, "grad_norm": 0.21787472552900225, "learning_rate": 5.745657433834968e-05, "loss": 0.2636, "step": 468 }, { "epoch": 1.017353579175705, "grad_norm": 0.19123373552730527, "learning_rate": 5.726915927804995e-05, "loss": 0.2743, "step": 469 }, { "epoch": 1.0195227765726682, "grad_norm": 0.17768041272481136, "learning_rate": 5.7081639824036e-05, "loss": 0.2623, "step": 470 }, { "epoch": 1.0216919739696313, "grad_norm": 0.1697916576481495, "learning_rate": 5.6894018669308735e-05, "loss": 0.2272, "step": 471 }, { "epoch": 1.0238611713665944, "grad_norm": 0.17650211499609383, "learning_rate": 5.670629850832956e-05, "loss": 0.2761, "step": 472 }, { "epoch": 1.0260303687635575, "grad_norm": 0.18214913995299273, "learning_rate": 5.6518482036981725e-05, "loss": 0.2459, "step": 473 }, { "epoch": 1.0281995661605206, "grad_norm": 0.17835571006990567, "learning_rate": 5.633057195253164e-05, "loss": 0.2468, "step": 474 }, { "epoch": 1.0303687635574836, "grad_norm": 0.2069539317038426, "learning_rate": 5.614257095359009e-05, "loss": 0.3213, "step": 475 }, { "epoch": 1.032537960954447, "grad_norm": 0.19136911570719517, "learning_rate": 5.5954481740073505e-05, "loss": 0.2823, "step": 476 }, { "epoch": 1.03470715835141, "grad_norm": 0.19262401192871317, "learning_rate": 5.5766307013165156e-05, "loss": 0.2595, "step": 477 }, { "epoch": 1.0368763557483731, "grad_norm": 0.1860152304131639, "learning_rate": 5.557804947527645e-05, "loss": 0.2801, "step": 478 }, { "epoch": 1.0390455531453362, "grad_norm": 0.16924664525627553, "learning_rate": 5.5389711830007984e-05, "loss": 0.2308, "step": 479 }, { "epoch": 1.0412147505422993, "grad_norm": 0.1816528885351813, "learning_rate": 5.5201296782110845e-05, "loss": 0.265, "step": 480 }, { "epoch": 1.0433839479392624, "grad_norm": 0.16993844286408233, "learning_rate": 5.501280703744769e-05, "loss": 0.2538, "step": 481 }, { "epoch": 1.0455531453362257, "grad_norm": 0.1949073398717587, "learning_rate": 5.48242453029539e-05, "loss": 0.3056, "step": 482 }, { "epoch": 1.0477223427331888, "grad_norm": 0.16891171117764792, "learning_rate": 5.463561428659875e-05, "loss": 0.2197, "step": 483 }, { "epoch": 1.0498915401301518, "grad_norm": 0.18215169314661425, "learning_rate": 5.444691669734643e-05, "loss": 0.2447, "step": 484 }, { "epoch": 1.052060737527115, "grad_norm": 0.182026157244086, "learning_rate": 5.425815524511726e-05, "loss": 0.2978, "step": 485 }, { "epoch": 1.054229934924078, "grad_norm": 0.17881791802607705, "learning_rate": 5.406933264074866e-05, "loss": 0.2469, "step": 486 }, { "epoch": 1.056399132321041, "grad_norm": 0.18807032896417783, "learning_rate": 5.3880451595956294e-05, "loss": 0.297, "step": 487 }, { "epoch": 1.0585683297180044, "grad_norm": 0.20875267489125912, "learning_rate": 5.369151482329506e-05, "loss": 0.282, "step": 488 }, { "epoch": 1.0607375271149675, "grad_norm": 0.19549344502300633, "learning_rate": 5.350252503612024e-05, "loss": 0.3833, "step": 489 }, { "epoch": 1.0629067245119306, "grad_norm": 0.17937531323085554, "learning_rate": 5.331348494854841e-05, "loss": 0.2622, "step": 490 }, { "epoch": 1.0650759219088937, "grad_norm": 0.1870147791370534, "learning_rate": 5.3124397275418524e-05, "loss": 0.2322, "step": 491 }, { "epoch": 1.0672451193058567, "grad_norm": 0.1951898037972042, "learning_rate": 5.2935264732252965e-05, "loss": 0.2357, "step": 492 }, { "epoch": 1.06941431670282, "grad_norm": 0.18975302549273446, "learning_rate": 5.274609003521846e-05, "loss": 0.2419, "step": 493 }, { "epoch": 1.0715835140997831, "grad_norm": 0.20439478227045751, "learning_rate": 5.255687590108711e-05, "loss": 0.3275, "step": 494 }, { "epoch": 1.0737527114967462, "grad_norm": 0.18609479985163507, "learning_rate": 5.236762504719742e-05, "loss": 0.2754, "step": 495 }, { "epoch": 1.0759219088937093, "grad_norm": 0.1830303728930595, "learning_rate": 5.217834019141521e-05, "loss": 0.2595, "step": 496 }, { "epoch": 1.0780911062906724, "grad_norm": 0.20315831733475495, "learning_rate": 5.1989024052094605e-05, "loss": 0.3317, "step": 497 }, { "epoch": 1.0802603036876355, "grad_norm": 0.18316883083338042, "learning_rate": 5.1799679348039e-05, "loss": 0.2458, "step": 498 }, { "epoch": 1.0824295010845988, "grad_norm": 0.2041065318820468, "learning_rate": 5.1610308798462016e-05, "loss": 0.3169, "step": 499 }, { "epoch": 1.0845986984815619, "grad_norm": 0.1944264750547026, "learning_rate": 5.142091512294844e-05, "loss": 0.2903, "step": 500 }, { "epoch": 1.086767895878525, "grad_norm": 0.18918880781763017, "learning_rate": 5.123150104141521e-05, "loss": 0.2638, "step": 501 }, { "epoch": 1.088937093275488, "grad_norm": 0.18983712841034528, "learning_rate": 5.104206927407225e-05, "loss": 0.2871, "step": 502 }, { "epoch": 1.0911062906724511, "grad_norm": 0.20226737440215853, "learning_rate": 5.085262254138353e-05, "loss": 0.3285, "step": 503 }, { "epoch": 1.0932754880694142, "grad_norm": 0.19118754054883752, "learning_rate": 5.0663163564027935e-05, "loss": 0.2594, "step": 504 }, { "epoch": 1.0954446854663775, "grad_norm": 0.2138619141242844, "learning_rate": 5.047369506286017e-05, "loss": 0.2832, "step": 505 }, { "epoch": 1.0976138828633406, "grad_norm": 0.3269054916680425, "learning_rate": 5.028421975887173e-05, "loss": 0.2988, "step": 506 }, { "epoch": 1.0997830802603037, "grad_norm": 0.19040645241202558, "learning_rate": 5.00947403731518e-05, "loss": 0.2712, "step": 507 }, { "epoch": 1.1019522776572668, "grad_norm": 0.1967133503261764, "learning_rate": 4.99052596268482e-05, "loss": 0.2815, "step": 508 }, { "epoch": 1.1041214750542299, "grad_norm": 0.1903074378491903, "learning_rate": 4.9715780241128286e-05, "loss": 0.2737, "step": 509 }, { "epoch": 1.106290672451193, "grad_norm": 0.18119699874786585, "learning_rate": 4.952630493713984e-05, "loss": 0.269, "step": 510 }, { "epoch": 1.1084598698481563, "grad_norm": 0.221788539906692, "learning_rate": 4.9336836435972076e-05, "loss": 0.2707, "step": 511 }, { "epoch": 1.1106290672451193, "grad_norm": 0.1928601828912626, "learning_rate": 4.914737745861646e-05, "loss": 0.2226, "step": 512 }, { "epoch": 1.1127982646420824, "grad_norm": 0.21645221349069002, "learning_rate": 4.895793072592776e-05, "loss": 0.2286, "step": 513 }, { "epoch": 1.1149674620390455, "grad_norm": 0.1970144418295259, "learning_rate": 4.8768498958584795e-05, "loss": 0.2825, "step": 514 }, { "epoch": 1.1171366594360086, "grad_norm": 0.1957160462190446, "learning_rate": 4.8579084877051565e-05, "loss": 0.2391, "step": 515 }, { "epoch": 1.119305856832972, "grad_norm": 0.21402378244369188, "learning_rate": 4.838969120153798e-05, "loss": 0.2555, "step": 516 }, { "epoch": 1.121475054229935, "grad_norm": 0.21877951289590283, "learning_rate": 4.820032065196101e-05, "loss": 0.334, "step": 517 }, { "epoch": 1.123644251626898, "grad_norm": 0.21929824905146836, "learning_rate": 4.801097594790539e-05, "loss": 0.2973, "step": 518 }, { "epoch": 1.1258134490238612, "grad_norm": 0.20619197296358086, "learning_rate": 4.78216598085848e-05, "loss": 0.3018, "step": 519 }, { "epoch": 1.1279826464208242, "grad_norm": 0.19555135005105484, "learning_rate": 4.763237495280258e-05, "loss": 0.2462, "step": 520 }, { "epoch": 1.1301518438177873, "grad_norm": 0.21114282630013761, "learning_rate": 4.74431240989129e-05, "loss": 0.327, "step": 521 }, { "epoch": 1.1323210412147506, "grad_norm": 0.20234364960770707, "learning_rate": 4.725390996478155e-05, "loss": 0.266, "step": 522 }, { "epoch": 1.1344902386117137, "grad_norm": 0.19961583213636722, "learning_rate": 4.706473526774705e-05, "loss": 0.255, "step": 523 }, { "epoch": 1.1366594360086768, "grad_norm": 0.19946499474596735, "learning_rate": 4.6875602724581474e-05, "loss": 0.2567, "step": 524 }, { "epoch": 1.13882863340564, "grad_norm": 0.190031071960285, "learning_rate": 4.668651505145161e-05, "loss": 0.2536, "step": 525 }, { "epoch": 1.140997830802603, "grad_norm": 0.19912223902629478, "learning_rate": 4.649747496387976e-05, "loss": 0.2814, "step": 526 }, { "epoch": 1.1431670281995663, "grad_norm": 0.2029251712939136, "learning_rate": 4.630848517670495e-05, "loss": 0.2357, "step": 527 }, { "epoch": 1.1453362255965294, "grad_norm": 0.1906141032119321, "learning_rate": 4.611954840404371e-05, "loss": 0.2814, "step": 528 }, { "epoch": 1.1475054229934925, "grad_norm": 0.188787160150689, "learning_rate": 4.593066735925135e-05, "loss": 0.2696, "step": 529 }, { "epoch": 1.1496746203904555, "grad_norm": 0.2343991376514014, "learning_rate": 4.574184475488274e-05, "loss": 0.2429, "step": 530 }, { "epoch": 1.1518438177874186, "grad_norm": 0.19283832286439154, "learning_rate": 4.5553083302653576e-05, "loss": 0.2433, "step": 531 }, { "epoch": 1.1540130151843817, "grad_norm": 0.21911781620446846, "learning_rate": 4.5364385713401256e-05, "loss": 0.2821, "step": 532 }, { "epoch": 1.1561822125813448, "grad_norm": 0.22969903578133888, "learning_rate": 4.517575469704611e-05, "loss": 0.2776, "step": 533 }, { "epoch": 1.158351409978308, "grad_norm": 0.20410973608962502, "learning_rate": 4.498719296255231e-05, "loss": 0.2549, "step": 534 }, { "epoch": 1.1605206073752712, "grad_norm": 0.2146495328698227, "learning_rate": 4.4798703217889166e-05, "loss": 0.2809, "step": 535 }, { "epoch": 1.1626898047722343, "grad_norm": 0.20839631573723869, "learning_rate": 4.461028816999203e-05, "loss": 0.2974, "step": 536 }, { "epoch": 1.1648590021691974, "grad_norm": 0.19237241347004658, "learning_rate": 4.442195052472357e-05, "loss": 0.2636, "step": 537 }, { "epoch": 1.1670281995661604, "grad_norm": 0.18275212969570778, "learning_rate": 4.423369298683485e-05, "loss": 0.2346, "step": 538 }, { "epoch": 1.1691973969631237, "grad_norm": 0.21224147212685224, "learning_rate": 4.404551825992651e-05, "loss": 0.2942, "step": 539 }, { "epoch": 1.1713665943600868, "grad_norm": 0.1947203315252727, "learning_rate": 4.385742904640993e-05, "loss": 0.239, "step": 540 }, { "epoch": 1.17353579175705, "grad_norm": 0.2014217555241961, "learning_rate": 4.366942804746837e-05, "loss": 0.282, "step": 541 }, { "epoch": 1.175704989154013, "grad_norm": 0.20005402924569843, "learning_rate": 4.3481517963018294e-05, "loss": 0.2567, "step": 542 }, { "epoch": 1.177874186550976, "grad_norm": 0.19683374608842036, "learning_rate": 4.329370149167046e-05, "loss": 0.2422, "step": 543 }, { "epoch": 1.1800433839479392, "grad_norm": 0.18295647045924832, "learning_rate": 4.310598133069128e-05, "loss": 0.2608, "step": 544 }, { "epoch": 1.1822125813449025, "grad_norm": 0.25320255904534206, "learning_rate": 4.291836017596401e-05, "loss": 0.2522, "step": 545 }, { "epoch": 1.1843817787418656, "grad_norm": 0.25012095918625965, "learning_rate": 4.273084072195008e-05, "loss": 0.2392, "step": 546 }, { "epoch": 1.1865509761388287, "grad_norm": 0.2042418325652416, "learning_rate": 4.2543425661650325e-05, "loss": 0.2632, "step": 547 }, { "epoch": 1.1887201735357917, "grad_norm": 0.20971549718156754, "learning_rate": 4.2356117686566464e-05, "loss": 0.3056, "step": 548 }, { "epoch": 1.1908893709327548, "grad_norm": 0.22494235538454008, "learning_rate": 4.2168919486662225e-05, "loss": 0.1999, "step": 549 }, { "epoch": 1.1930585683297181, "grad_norm": 0.19594451325509984, "learning_rate": 4.1981833750324934e-05, "loss": 0.2237, "step": 550 }, { "epoch": 1.1952277657266812, "grad_norm": 0.20048311843021904, "learning_rate": 4.179486316432674e-05, "loss": 0.2457, "step": 551 }, { "epoch": 1.1973969631236443, "grad_norm": 0.2133231404323406, "learning_rate": 4.1608010413786145e-05, "loss": 0.2428, "step": 552 }, { "epoch": 1.1995661605206074, "grad_norm": 0.22391429220016193, "learning_rate": 4.1421278182129345e-05, "loss": 0.2835, "step": 553 }, { "epoch": 1.2017353579175705, "grad_norm": 0.20513974766270474, "learning_rate": 4.1234669151051814e-05, "loss": 0.2837, "step": 554 }, { "epoch": 1.2039045553145336, "grad_norm": 0.21910779105815384, "learning_rate": 4.104818600047963e-05, "loss": 0.2552, "step": 555 }, { "epoch": 1.2060737527114966, "grad_norm": 0.20347018099490702, "learning_rate": 4.0861831408531174e-05, "loss": 0.2825, "step": 556 }, { "epoch": 1.20824295010846, "grad_norm": 0.20019781967770364, "learning_rate": 4.067560805147848e-05, "loss": 0.2646, "step": 557 }, { "epoch": 1.210412147505423, "grad_norm": 0.19985925625011156, "learning_rate": 4.048951860370897e-05, "loss": 0.2339, "step": 558 }, { "epoch": 1.2125813449023861, "grad_norm": 0.20804567399279866, "learning_rate": 4.030356573768691e-05, "loss": 0.2685, "step": 559 }, { "epoch": 1.2147505422993492, "grad_norm": 0.210586914513503, "learning_rate": 4.0117752123915166e-05, "loss": 0.2949, "step": 560 }, { "epoch": 1.2169197396963123, "grad_norm": 0.19395560738783052, "learning_rate": 3.9932080430896674e-05, "loss": 0.2581, "step": 561 }, { "epoch": 1.2190889370932756, "grad_norm": 0.19335081571244175, "learning_rate": 3.974655332509632e-05, "loss": 0.2398, "step": 562 }, { "epoch": 1.2212581344902387, "grad_norm": 0.21139167236935025, "learning_rate": 3.956117347090249e-05, "loss": 0.294, "step": 563 }, { "epoch": 1.2234273318872018, "grad_norm": 0.2136565272145805, "learning_rate": 3.937594353058888e-05, "loss": 0.249, "step": 564 }, { "epoch": 1.2255965292841648, "grad_norm": 0.20983577033903353, "learning_rate": 3.9190866164276224e-05, "loss": 0.277, "step": 565 }, { "epoch": 1.227765726681128, "grad_norm": 0.22523284294717572, "learning_rate": 3.900594402989416e-05, "loss": 0.2965, "step": 566 }, { "epoch": 1.229934924078091, "grad_norm": 0.19361498352831719, "learning_rate": 3.8821179783142976e-05, "loss": 0.2468, "step": 567 }, { "epoch": 1.2321041214750543, "grad_norm": 0.20349907006204035, "learning_rate": 3.863657607745551e-05, "loss": 0.2707, "step": 568 }, { "epoch": 1.2342733188720174, "grad_norm": 0.21437394856709535, "learning_rate": 3.8452135563959035e-05, "loss": 0.2482, "step": 569 }, { "epoch": 1.2364425162689805, "grad_norm": 0.4418854079923846, "learning_rate": 3.8267860891437224e-05, "loss": 0.2526, "step": 570 }, { "epoch": 1.2386117136659436, "grad_norm": 0.20107219218854192, "learning_rate": 3.8083754706292044e-05, "loss": 0.2302, "step": 571 }, { "epoch": 1.2407809110629067, "grad_norm": 0.22317690892260889, "learning_rate": 3.7899819652505805e-05, "loss": 0.3018, "step": 572 }, { "epoch": 1.2429501084598698, "grad_norm": 0.24225532136972353, "learning_rate": 3.771605837160315e-05, "loss": 0.2945, "step": 573 }, { "epoch": 1.245119305856833, "grad_norm": 0.21311877910331786, "learning_rate": 3.753247350261314e-05, "loss": 0.268, "step": 574 }, { "epoch": 1.2472885032537961, "grad_norm": 0.21276496906364534, "learning_rate": 3.734906768203137e-05, "loss": 0.2796, "step": 575 }, { "epoch": 1.2494577006507592, "grad_norm": 0.21666315750271123, "learning_rate": 3.7165843543782094e-05, "loss": 0.3048, "step": 576 }, { "epoch": 1.2516268980477223, "grad_norm": 0.2117830975027958, "learning_rate": 3.698280371918035e-05, "loss": 0.3134, "step": 577 }, { "epoch": 1.2537960954446854, "grad_norm": 0.21204614791084325, "learning_rate": 3.679995083689427e-05, "loss": 0.2583, "step": 578 }, { "epoch": 1.2559652928416485, "grad_norm": 0.20266543030388112, "learning_rate": 3.6617287522907215e-05, "loss": 0.2187, "step": 579 }, { "epoch": 1.2581344902386118, "grad_norm": 0.19799515726039077, "learning_rate": 3.643481640048019e-05, "loss": 0.2486, "step": 580 }, { "epoch": 1.2603036876355749, "grad_norm": 0.22965267318434665, "learning_rate": 3.6252540090114014e-05, "loss": 0.3141, "step": 581 }, { "epoch": 1.262472885032538, "grad_norm": 0.20300927451116837, "learning_rate": 3.607046120951187e-05, "loss": 0.2481, "step": 582 }, { "epoch": 1.264642082429501, "grad_norm": 0.19473972219878735, "learning_rate": 3.588858237354154e-05, "loss": 0.2425, "step": 583 }, { "epoch": 1.2668112798264641, "grad_norm": 0.19460999115969896, "learning_rate": 3.5706906194197995e-05, "loss": 0.2695, "step": 584 }, { "epoch": 1.2689804772234274, "grad_norm": 0.2088031103817509, "learning_rate": 3.552543528056573e-05, "loss": 0.2894, "step": 585 }, { "epoch": 1.2711496746203905, "grad_norm": 0.20225624597683012, "learning_rate": 3.534417223878149e-05, "loss": 0.2532, "step": 586 }, { "epoch": 1.2733188720173536, "grad_norm": 0.1925296097866438, "learning_rate": 3.516311967199664e-05, "loss": 0.2544, "step": 587 }, { "epoch": 1.2754880694143167, "grad_norm": 0.19510001512503033, "learning_rate": 3.498228018033994e-05, "loss": 0.262, "step": 588 }, { "epoch": 1.2776572668112798, "grad_norm": 0.2049108757868876, "learning_rate": 3.4801656360880083e-05, "loss": 0.2562, "step": 589 }, { "epoch": 1.2798264642082429, "grad_norm": 0.2266927606829423, "learning_rate": 3.4621250807588524e-05, "loss": 0.3551, "step": 590 }, { "epoch": 1.281995661605206, "grad_norm": 0.21569872794030454, "learning_rate": 3.444106611130209e-05, "loss": 0.2774, "step": 591 }, { "epoch": 1.2841648590021693, "grad_norm": 0.21023989849968328, "learning_rate": 3.4261104859685865e-05, "loss": 0.2622, "step": 592 }, { "epoch": 1.2863340563991323, "grad_norm": 0.19106084312223887, "learning_rate": 3.408136963719605e-05, "loss": 0.2252, "step": 593 }, { "epoch": 1.2885032537960954, "grad_norm": 0.19995458008751701, "learning_rate": 3.39018630250427e-05, "loss": 0.2372, "step": 594 }, { "epoch": 1.2906724511930585, "grad_norm": 0.22200947357348208, "learning_rate": 3.3722587601152855e-05, "loss": 0.2321, "step": 595 }, { "epoch": 1.2928416485900218, "grad_norm": 0.1950829833237249, "learning_rate": 3.354354594013337e-05, "loss": 0.2241, "step": 596 }, { "epoch": 1.295010845986985, "grad_norm": 0.20036238676413232, "learning_rate": 3.336474061323399e-05, "loss": 0.2055, "step": 597 }, { "epoch": 1.297180043383948, "grad_norm": 0.2107825415915727, "learning_rate": 3.318617418831044e-05, "loss": 0.256, "step": 598 }, { "epoch": 1.299349240780911, "grad_norm": 0.21989801828531605, "learning_rate": 3.3007849229787516e-05, "loss": 0.2764, "step": 599 }, { "epoch": 1.3015184381778742, "grad_norm": 0.21680377382599092, "learning_rate": 3.282976829862227e-05, "loss": 0.2566, "step": 600 }, { "epoch": 1.3015184381778742, "eval_loss": 0.3054097592830658, "eval_runtime": 39.7394, "eval_samples_per_second": 0.478, "eval_steps_per_second": 0.126, "step": 600 }, { "epoch": 1.3036876355748372, "grad_norm": 0.20308783149132958, "learning_rate": 3.2651933952267245e-05, "loss": 0.2112, "step": 601 }, { "epoch": 1.3058568329718003, "grad_norm": 0.21690298502452043, "learning_rate": 3.247434874463372e-05, "loss": 0.2649, "step": 602 }, { "epoch": 1.3080260303687636, "grad_norm": 0.2055181727108075, "learning_rate": 3.2297015226055076e-05, "loss": 0.2824, "step": 603 }, { "epoch": 1.3101952277657267, "grad_norm": 0.2320338793221242, "learning_rate": 3.211993594325009e-05, "loss": 0.2456, "step": 604 }, { "epoch": 1.3123644251626898, "grad_norm": 0.20493923957441781, "learning_rate": 3.194311343928649e-05, "loss": 0.2366, "step": 605 }, { "epoch": 1.314533622559653, "grad_norm": 0.2030392074075536, "learning_rate": 3.176655025354425e-05, "loss": 0.2508, "step": 606 }, { "epoch": 1.316702819956616, "grad_norm": 0.2110656534399716, "learning_rate": 3.159024892167935e-05, "loss": 0.2502, "step": 607 }, { "epoch": 1.3188720173535793, "grad_norm": 0.25854440170290666, "learning_rate": 3.141421197558713e-05, "loss": 0.3055, "step": 608 }, { "epoch": 1.3210412147505424, "grad_norm": 0.19420699816129983, "learning_rate": 3.123844194336613e-05, "loss": 0.2271, "step": 609 }, { "epoch": 1.3232104121475055, "grad_norm": 0.21496315608117952, "learning_rate": 3.1062941349281594e-05, "loss": 0.2673, "step": 610 }, { "epoch": 1.3253796095444685, "grad_norm": 0.20106583560494895, "learning_rate": 3.0887712713729435e-05, "loss": 0.2365, "step": 611 }, { "epoch": 1.3275488069414316, "grad_norm": 0.21120123263668, "learning_rate": 3.071275855319979e-05, "loss": 0.2554, "step": 612 }, { "epoch": 1.3297180043383947, "grad_norm": 0.21665345900294192, "learning_rate": 3.053808138024113e-05, "loss": 0.2701, "step": 613 }, { "epoch": 1.3318872017353578, "grad_norm": 0.21025986355262039, "learning_rate": 3.036368370342396e-05, "loss": 0.243, "step": 614 }, { "epoch": 1.334056399132321, "grad_norm": 0.2060375691889601, "learning_rate": 3.018956802730497e-05, "loss": 0.275, "step": 615 }, { "epoch": 1.3362255965292842, "grad_norm": 0.22136533659676028, "learning_rate": 3.0015736852390918e-05, "loss": 0.257, "step": 616 }, { "epoch": 1.3383947939262473, "grad_norm": 0.21449871481305618, "learning_rate": 2.984219267510285e-05, "loss": 0.2704, "step": 617 }, { "epoch": 1.3405639913232104, "grad_norm": 0.24014701061684474, "learning_rate": 2.966893798774012e-05, "loss": 0.2891, "step": 618 }, { "epoch": 1.3427331887201737, "grad_norm": 0.2244269288798775, "learning_rate": 2.9495975278444743e-05, "loss": 0.2935, "step": 619 }, { "epoch": 1.3449023861171367, "grad_norm": 0.21096761572427544, "learning_rate": 2.9323307031165503e-05, "loss": 0.2739, "step": 620 }, { "epoch": 1.3470715835140998, "grad_norm": 0.20262736058046335, "learning_rate": 2.9150935725622434e-05, "loss": 0.2499, "step": 621 }, { "epoch": 1.349240780911063, "grad_norm": 0.19820653616274597, "learning_rate": 2.8978863837271096e-05, "loss": 0.2393, "step": 622 }, { "epoch": 1.351409978308026, "grad_norm": 0.23042991309507332, "learning_rate": 2.8807093837267062e-05, "loss": 0.302, "step": 623 }, { "epoch": 1.353579175704989, "grad_norm": 0.1909580869184299, "learning_rate": 2.8635628192430457e-05, "loss": 0.2492, "step": 624 }, { "epoch": 1.3557483731019522, "grad_norm": 0.1954261314272082, "learning_rate": 2.8464469365210476e-05, "loss": 0.2369, "step": 625 }, { "epoch": 1.3579175704989155, "grad_norm": 0.21435896485747777, "learning_rate": 2.829361981365008e-05, "loss": 0.2825, "step": 626 }, { "epoch": 1.3600867678958786, "grad_norm": 0.20366176793808735, "learning_rate": 2.812308199135064e-05, "loss": 0.2455, "step": 627 }, { "epoch": 1.3622559652928417, "grad_norm": 0.220096792471076, "learning_rate": 2.795285834743674e-05, "loss": 0.2281, "step": 628 }, { "epoch": 1.3644251626898047, "grad_norm": 0.23939270387667466, "learning_rate": 2.7782951326521e-05, "loss": 0.3235, "step": 629 }, { "epoch": 1.3665943600867678, "grad_norm": 0.2501946465822018, "learning_rate": 2.7613363368668933e-05, "loss": 0.3561, "step": 630 }, { "epoch": 1.3687635574837311, "grad_norm": 0.19776494315538312, "learning_rate": 2.744409690936396e-05, "loss": 0.235, "step": 631 }, { "epoch": 1.3709327548806942, "grad_norm": 0.22972123156189964, "learning_rate": 2.7275154379472383e-05, "loss": 0.2781, "step": 632 }, { "epoch": 1.3731019522776573, "grad_norm": 0.22520549969079728, "learning_rate": 2.7106538205208503e-05, "loss": 0.2952, "step": 633 }, { "epoch": 1.3752711496746204, "grad_norm": 0.2056228650880094, "learning_rate": 2.6938250808099765e-05, "loss": 0.2337, "step": 634 }, { "epoch": 1.3774403470715835, "grad_norm": 0.2286918375083098, "learning_rate": 2.677029460495199e-05, "loss": 0.2896, "step": 635 }, { "epoch": 1.3796095444685466, "grad_norm": 0.19803374847789326, "learning_rate": 2.6602672007814657e-05, "loss": 0.2445, "step": 636 }, { "epoch": 1.3817787418655096, "grad_norm": 0.21204934763741792, "learning_rate": 2.6435385423946268e-05, "loss": 0.2565, "step": 637 }, { "epoch": 1.383947939262473, "grad_norm": 0.2623356517904308, "learning_rate": 2.6268437255779795e-05, "loss": 0.2574, "step": 638 }, { "epoch": 1.386117136659436, "grad_norm": 0.22828545407934486, "learning_rate": 2.610182990088813e-05, "loss": 0.3083, "step": 639 }, { "epoch": 1.3882863340563991, "grad_norm": 0.21316635288288857, "learning_rate": 2.5935565751949708e-05, "loss": 0.2884, "step": 640 }, { "epoch": 1.3904555314533622, "grad_norm": 0.18200487727356005, "learning_rate": 2.5769647196714115e-05, "loss": 0.1971, "step": 641 }, { "epoch": 1.3926247288503255, "grad_norm": 0.20394759124623543, "learning_rate": 2.5604076617967797e-05, "loss": 0.2531, "step": 642 }, { "epoch": 1.3947939262472886, "grad_norm": 0.2211940525129031, "learning_rate": 2.5438856393499856e-05, "loss": 0.2642, "step": 643 }, { "epoch": 1.3969631236442517, "grad_norm": 0.21677701478466002, "learning_rate": 2.527398889606789e-05, "loss": 0.2471, "step": 644 }, { "epoch": 1.3991323210412148, "grad_norm": 0.19805770191187302, "learning_rate": 2.5109476493363903e-05, "loss": 0.2275, "step": 645 }, { "epoch": 1.4013015184381779, "grad_norm": 0.370645367139852, "learning_rate": 2.494532154798036e-05, "loss": 0.2652, "step": 646 }, { "epoch": 1.403470715835141, "grad_norm": 0.4096140214680738, "learning_rate": 2.4781526417376167e-05, "loss": 0.2214, "step": 647 }, { "epoch": 1.405639913232104, "grad_norm": 0.20027276095982202, "learning_rate": 2.4618093453842904e-05, "loss": 0.2283, "step": 648 }, { "epoch": 1.407809110629067, "grad_norm": 0.2042256862448447, "learning_rate": 2.4455025004470983e-05, "loss": 0.2317, "step": 649 }, { "epoch": 1.4099783080260304, "grad_norm": 0.2170785919045779, "learning_rate": 2.4292323411115963e-05, "loss": 0.2582, "step": 650 }, { "epoch": 1.4121475054229935, "grad_norm": 0.5111593541353036, "learning_rate": 2.412999101036493e-05, "loss": 0.3233, "step": 651 }, { "epoch": 1.4143167028199566, "grad_norm": 0.22766256947955918, "learning_rate": 2.3968030133502916e-05, "loss": 0.2548, "step": 652 }, { "epoch": 1.4164859002169197, "grad_norm": 0.21004376919072829, "learning_rate": 2.380644310647944e-05, "loss": 0.2463, "step": 653 }, { "epoch": 1.418655097613883, "grad_norm": 0.22454612498741602, "learning_rate": 2.364523224987508e-05, "loss": 0.2186, "step": 654 }, { "epoch": 1.420824295010846, "grad_norm": 0.21635926190963323, "learning_rate": 2.3484399878868185e-05, "loss": 0.2574, "step": 655 }, { "epoch": 1.4229934924078091, "grad_norm": 0.20472709985073356, "learning_rate": 2.3323948303201586e-05, "loss": 0.2453, "step": 656 }, { "epoch": 1.4251626898047722, "grad_norm": 0.2204401348193557, "learning_rate": 2.3163879827149448e-05, "loss": 0.2507, "step": 657 }, { "epoch": 1.4273318872017353, "grad_norm": 0.24175915571290157, "learning_rate": 2.300419674948418e-05, "loss": 0.2907, "step": 658 }, { "epoch": 1.4295010845986984, "grad_norm": 0.22445756380382137, "learning_rate": 2.2844901363443404e-05, "loss": 0.2791, "step": 659 }, { "epoch": 1.4316702819956615, "grad_norm": 0.22386370995786847, "learning_rate": 2.2685995956697038e-05, "loss": 0.2779, "step": 660 }, { "epoch": 1.4338394793926248, "grad_norm": 0.23249928031701667, "learning_rate": 2.2527482811314437e-05, "loss": 0.311, "step": 661 }, { "epoch": 1.4360086767895879, "grad_norm": 0.2342495773652486, "learning_rate": 2.2369364203731618e-05, "loss": 0.2679, "step": 662 }, { "epoch": 1.438177874186551, "grad_norm": 0.20675103233277622, "learning_rate": 2.221164240471857e-05, "loss": 0.2313, "step": 663 }, { "epoch": 1.440347071583514, "grad_norm": 0.20528823101739593, "learning_rate": 2.205431967934664e-05, "loss": 0.2492, "step": 664 }, { "epoch": 1.4425162689804774, "grad_norm": 0.2128126904380516, "learning_rate": 2.1897398286956012e-05, "loss": 0.2777, "step": 665 }, { "epoch": 1.4446854663774404, "grad_norm": 0.21518099561214468, "learning_rate": 2.1740880481123238e-05, "loss": 0.2518, "step": 666 }, { "epoch": 1.4468546637744035, "grad_norm": 0.2186083154676726, "learning_rate": 2.1584768509628922e-05, "loss": 0.2453, "step": 667 }, { "epoch": 1.4490238611713666, "grad_norm": 0.25175528122891083, "learning_rate": 2.142906461442537e-05, "loss": 0.3037, "step": 668 }, { "epoch": 1.4511930585683297, "grad_norm": 0.22763386297900118, "learning_rate": 2.1273771031604456e-05, "loss": 0.2644, "step": 669 }, { "epoch": 1.4533622559652928, "grad_norm": 0.2295090408465574, "learning_rate": 2.1118889991365476e-05, "loss": 0.2686, "step": 670 }, { "epoch": 1.4555314533622559, "grad_norm": 0.20891584135932628, "learning_rate": 2.096442371798313e-05, "loss": 0.2204, "step": 671 }, { "epoch": 1.457700650759219, "grad_norm": 0.2254247187316148, "learning_rate": 2.0810374429775565e-05, "loss": 0.2913, "step": 672 }, { "epoch": 1.4598698481561823, "grad_norm": 0.24216942402374153, "learning_rate": 2.0656744339072542e-05, "loss": 0.3349, "step": 673 }, { "epoch": 1.4620390455531453, "grad_norm": 0.22222297204476354, "learning_rate": 2.0503535652183643e-05, "loss": 0.2793, "step": 674 }, { "epoch": 1.4642082429501084, "grad_norm": 0.20870048389264126, "learning_rate": 2.035075056936659e-05, "loss": 0.2655, "step": 675 }, { "epoch": 1.4663774403470715, "grad_norm": 0.22841755003782033, "learning_rate": 2.0198391284795664e-05, "loss": 0.3155, "step": 676 }, { "epoch": 1.4685466377440348, "grad_norm": 0.22269122758511214, "learning_rate": 2.004645998653017e-05, "loss": 0.2491, "step": 677 }, { "epoch": 1.470715835140998, "grad_norm": 0.22332389169722103, "learning_rate": 1.9894958856483026e-05, "loss": 0.2648, "step": 678 }, { "epoch": 1.472885032537961, "grad_norm": 0.214833703844543, "learning_rate": 1.974389007038942e-05, "loss": 0.2609, "step": 679 }, { "epoch": 1.475054229934924, "grad_norm": 0.20682197278555356, "learning_rate": 1.9593255797775577e-05, "loss": 0.2328, "step": 680 }, { "epoch": 1.4772234273318872, "grad_norm": 0.23230767426539825, "learning_rate": 1.9443058201927588e-05, "loss": 0.2347, "step": 681 }, { "epoch": 1.4793926247288502, "grad_norm": 0.2261449038214272, "learning_rate": 1.9293299439860396e-05, "loss": 0.2758, "step": 682 }, { "epoch": 1.4815618221258133, "grad_norm": 0.3168804778663666, "learning_rate": 1.9143981662286665e-05, "loss": 0.273, "step": 683 }, { "epoch": 1.4837310195227766, "grad_norm": 0.22898408446614574, "learning_rate": 1.8995107013586137e-05, "loss": 0.2713, "step": 684 }, { "epoch": 1.4859002169197397, "grad_norm": 0.22912219172967715, "learning_rate": 1.8846677631774575e-05, "loss": 0.2937, "step": 685 }, { "epoch": 1.4880694143167028, "grad_norm": 0.20900331783691264, "learning_rate": 1.869869564847329e-05, "loss": 0.2269, "step": 686 }, { "epoch": 1.490238611713666, "grad_norm": 0.21015606821149282, "learning_rate": 1.8551163188878313e-05, "loss": 0.2299, "step": 687 }, { "epoch": 1.4924078091106292, "grad_norm": 0.2250692099687625, "learning_rate": 1.840408237173011e-05, "loss": 0.2892, "step": 688 }, { "epoch": 1.4945770065075923, "grad_norm": 0.2145016212764728, "learning_rate": 1.8257455309282882e-05, "loss": 0.2476, "step": 689 }, { "epoch": 1.4967462039045554, "grad_norm": 0.2236334148717297, "learning_rate": 1.811128410727454e-05, "loss": 0.2259, "step": 690 }, { "epoch": 1.4989154013015185, "grad_norm": 0.22580326511659854, "learning_rate": 1.7965570864896138e-05, "loss": 0.2611, "step": 691 }, { "epoch": 1.5010845986984815, "grad_norm": 0.23566517211163768, "learning_rate": 1.7820317674762034e-05, "loss": 0.3256, "step": 692 }, { "epoch": 1.5032537960954446, "grad_norm": 0.2387014338194217, "learning_rate": 1.767552662287955e-05, "loss": 0.2169, "step": 693 }, { "epoch": 1.5054229934924077, "grad_norm": 0.2131558457240097, "learning_rate": 1.7531199788619305e-05, "loss": 0.2166, "step": 694 }, { "epoch": 1.5075921908893708, "grad_norm": 0.21663448852517472, "learning_rate": 1.738733924468507e-05, "loss": 0.2473, "step": 695 }, { "epoch": 1.509761388286334, "grad_norm": 0.2302024457559936, "learning_rate": 1.7243947057084252e-05, "loss": 0.2579, "step": 696 }, { "epoch": 1.5119305856832972, "grad_norm": 0.21162459671820377, "learning_rate": 1.7101025285097988e-05, "loss": 0.2367, "step": 697 }, { "epoch": 1.5140997830802603, "grad_norm": 0.22899234818180603, "learning_rate": 1.695857598125183e-05, "loss": 0.2487, "step": 698 }, { "epoch": 1.5162689804772236, "grad_norm": 0.21180166251652038, "learning_rate": 1.681660119128598e-05, "loss": 0.2454, "step": 699 }, { "epoch": 1.5184381778741867, "grad_norm": 0.21900430622153325, "learning_rate": 1.6675102954126204e-05, "loss": 0.2505, "step": 700 }, { "epoch": 1.5206073752711498, "grad_norm": 0.22388111292395452, "learning_rate": 1.6534083301854287e-05, "loss": 0.256, "step": 701 }, { "epoch": 1.5227765726681128, "grad_norm": 0.19613947069940627, "learning_rate": 1.639354425967904e-05, "loss": 0.2029, "step": 702 }, { "epoch": 1.524945770065076, "grad_norm": 0.20905100864213258, "learning_rate": 1.6253487845907122e-05, "loss": 0.2275, "step": 703 }, { "epoch": 1.527114967462039, "grad_norm": 0.21880038965460166, "learning_rate": 1.6113916071914082e-05, "loss": 0.2617, "step": 704 }, { "epoch": 1.529284164859002, "grad_norm": 0.21208440823468386, "learning_rate": 1.5974830942115472e-05, "loss": 0.242, "step": 705 }, { "epoch": 1.5314533622559652, "grad_norm": 0.23456256487858015, "learning_rate": 1.5836234453938054e-05, "loss": 0.2549, "step": 706 }, { "epoch": 1.5336225596529283, "grad_norm": 0.2396806217154987, "learning_rate": 1.5698128597791122e-05, "loss": 0.3393, "step": 707 }, { "epoch": 1.5357917570498916, "grad_norm": 0.2031182271876605, "learning_rate": 1.5560515357037898e-05, "loss": 0.2152, "step": 708 }, { "epoch": 1.5379609544468547, "grad_norm": 0.22000222658603283, "learning_rate": 1.542339670796712e-05, "loss": 0.2648, "step": 709 }, { "epoch": 1.5401301518438177, "grad_norm": 0.265809393717544, "learning_rate": 1.528677461976451e-05, "loss": 0.192, "step": 710 }, { "epoch": 1.542299349240781, "grad_norm": 0.20877625917509104, "learning_rate": 1.5150651054484705e-05, "loss": 0.2257, "step": 711 }, { "epoch": 1.5444685466377441, "grad_norm": 0.21683968747146132, "learning_rate": 1.5015027967022838e-05, "loss": 0.2702, "step": 712 }, { "epoch": 1.5466377440347072, "grad_norm": 0.20896474897637557, "learning_rate": 1.4879907305086721e-05, "loss": 0.2459, "step": 713 }, { "epoch": 1.5488069414316703, "grad_norm": 0.2615773501556193, "learning_rate": 1.4745291009168616e-05, "loss": 0.2782, "step": 714 }, { "epoch": 1.5509761388286334, "grad_norm": 0.22282625503762615, "learning_rate": 1.461118101251761e-05, "loss": 0.2885, "step": 715 }, { "epoch": 1.5531453362255965, "grad_norm": 0.21801422944358295, "learning_rate": 1.4477579241111616e-05, "loss": 0.2406, "step": 716 }, { "epoch": 1.5553145336225596, "grad_norm": 0.22073898585184953, "learning_rate": 1.4344487613629958e-05, "loss": 0.2656, "step": 717 }, { "epoch": 1.5574837310195226, "grad_norm": 0.2187048550685909, "learning_rate": 1.4211908041425565e-05, "loss": 0.2613, "step": 718 }, { "epoch": 1.559652928416486, "grad_norm": 0.2101381320422253, "learning_rate": 1.4079842428497764e-05, "loss": 0.2333, "step": 719 }, { "epoch": 1.561822125813449, "grad_norm": 0.20391298465277313, "learning_rate": 1.3948292671464708e-05, "loss": 0.2343, "step": 720 }, { "epoch": 1.5639913232104121, "grad_norm": 0.22749006672655192, "learning_rate": 1.3817260659536368e-05, "loss": 0.2563, "step": 721 }, { "epoch": 1.5661605206073754, "grad_norm": 0.22567622662999204, "learning_rate": 1.368674827448716e-05, "loss": 0.3026, "step": 722 }, { "epoch": 1.5683297180043385, "grad_norm": 0.22202307546172664, "learning_rate": 1.355675739062916e-05, "loss": 0.2365, "step": 723 }, { "epoch": 1.5704989154013016, "grad_norm": 0.20324438958594626, "learning_rate": 1.3427289874784965e-05, "loss": 0.2113, "step": 724 }, { "epoch": 1.5726681127982647, "grad_norm": 0.21573774200754933, "learning_rate": 1.3298347586261101e-05, "loss": 0.2256, "step": 725 }, { "epoch": 1.5748373101952278, "grad_norm": 0.2107630411490319, "learning_rate": 1.3169932376821087e-05, "loss": 0.2243, "step": 726 }, { "epoch": 1.5770065075921909, "grad_norm": 0.2250209366455123, "learning_rate": 1.3042046090659082e-05, "loss": 0.2311, "step": 727 }, { "epoch": 1.579175704989154, "grad_norm": 0.21327462482312162, "learning_rate": 1.2914690564373172e-05, "loss": 0.2025, "step": 728 }, { "epoch": 1.581344902386117, "grad_norm": 0.21048416380580592, "learning_rate": 1.278786762693921e-05, "loss": 0.2505, "step": 729 }, { "epoch": 1.58351409978308, "grad_norm": 0.23805779361424165, "learning_rate": 1.2661579099684345e-05, "loss": 0.2993, "step": 730 }, { "epoch": 1.5856832971800434, "grad_norm": 0.2236199058892429, "learning_rate": 1.2535826796261058e-05, "loss": 0.2559, "step": 731 }, { "epoch": 1.5878524945770065, "grad_norm": 0.2267692199206878, "learning_rate": 1.2410612522620923e-05, "loss": 0.2631, "step": 732 }, { "epoch": 1.5900216919739696, "grad_norm": 0.22527317437513994, "learning_rate": 1.2285938076988879e-05, "loss": 0.2381, "step": 733 }, { "epoch": 1.592190889370933, "grad_norm": 0.20643186489302753, "learning_rate": 1.2161805249837189e-05, "loss": 0.2261, "step": 734 }, { "epoch": 1.594360086767896, "grad_norm": 0.2363410474128828, "learning_rate": 1.2038215823859944e-05, "loss": 0.2848, "step": 735 }, { "epoch": 1.596529284164859, "grad_norm": 0.22064343187390295, "learning_rate": 1.1915171573947231e-05, "loss": 0.2114, "step": 736 }, { "epoch": 1.5986984815618221, "grad_norm": 0.2241110750390057, "learning_rate": 1.179267426715988e-05, "loss": 0.273, "step": 737 }, { "epoch": 1.6008676789587852, "grad_norm": 0.24065816206033924, "learning_rate": 1.1670725662703907e-05, "loss": 0.2667, "step": 738 }, { "epoch": 1.6030368763557483, "grad_norm": 0.216540199511313, "learning_rate": 1.1549327511905322e-05, "loss": 0.2603, "step": 739 }, { "epoch": 1.6052060737527114, "grad_norm": 0.22867224484504703, "learning_rate": 1.1428481558184984e-05, "loss": 0.2835, "step": 740 }, { "epoch": 1.6073752711496745, "grad_norm": 0.21664360434007227, "learning_rate": 1.1308189537033532e-05, "loss": 0.2799, "step": 741 }, { "epoch": 1.6095444685466378, "grad_norm": 0.21600464380836015, "learning_rate": 1.1188453175986502e-05, "loss": 0.2555, "step": 742 }, { "epoch": 1.6117136659436009, "grad_norm": 0.22003089603782433, "learning_rate": 1.1069274194599477e-05, "loss": 0.2764, "step": 743 }, { "epoch": 1.613882863340564, "grad_norm": 0.21966503118494438, "learning_rate": 1.0950654304423408e-05, "loss": 0.243, "step": 744 }, { "epoch": 1.6160520607375273, "grad_norm": 0.2134291281940422, "learning_rate": 1.0832595208980052e-05, "loss": 0.2235, "step": 745 }, { "epoch": 1.6182212581344904, "grad_norm": 0.22637670128987095, "learning_rate": 1.0715098603737473e-05, "loss": 0.2597, "step": 746 }, { "epoch": 1.6203904555314534, "grad_norm": 0.21648611339436485, "learning_rate": 1.0598166176085722e-05, "loss": 0.2542, "step": 747 }, { "epoch": 1.6225596529284165, "grad_norm": 0.23629777995443355, "learning_rate": 1.0481799605312598e-05, "loss": 0.3228, "step": 748 }, { "epoch": 1.6247288503253796, "grad_norm": 0.21725498937866275, "learning_rate": 1.0366000562579509e-05, "loss": 0.2344, "step": 749 }, { "epoch": 1.6268980477223427, "grad_norm": 0.21952448011527256, "learning_rate": 1.0250770710897512e-05, "loss": 0.3131, "step": 750 }, { "epoch": 1.6290672451193058, "grad_norm": 0.2011634686461227, "learning_rate": 1.0136111705103384e-05, "loss": 0.2069, "step": 751 }, { "epoch": 1.6312364425162689, "grad_norm": 0.22463127569065536, "learning_rate": 1.0022025191835905e-05, "loss": 0.2281, "step": 752 }, { "epoch": 1.633405639913232, "grad_norm": 0.2062264163167854, "learning_rate": 9.90851280951216e-06, "loss": 0.2179, "step": 753 }, { "epoch": 1.6355748373101953, "grad_norm": 0.22691208712294778, "learning_rate": 9.795576188304068e-06, "loss": 0.3075, "step": 754 }, { "epoch": 1.6377440347071583, "grad_norm": 0.21132869155040268, "learning_rate": 9.68321695011491e-06, "loss": 0.1903, "step": 755 }, { "epoch": 1.6399132321041214, "grad_norm": 0.21551342495874198, "learning_rate": 9.571436708556076e-06, "loss": 0.221, "step": 756 }, { "epoch": 1.6420824295010847, "grad_norm": 0.2174975697740617, "learning_rate": 9.460237068923883e-06, "loss": 0.2587, "step": 757 }, { "epoch": 1.6442516268980478, "grad_norm": 0.2160546965912458, "learning_rate": 9.34961962817652e-06, "loss": 0.2303, "step": 758 }, { "epoch": 1.646420824295011, "grad_norm": 0.2376163843745843, "learning_rate": 9.239585974911074e-06, "loss": 0.3243, "step": 759 }, { "epoch": 1.648590021691974, "grad_norm": 0.21539719738458052, "learning_rate": 9.130137689340839e-06, "loss": 0.2571, "step": 760 }, { "epoch": 1.650759219088937, "grad_norm": 0.2410719861687306, "learning_rate": 9.021276343272434e-06, "loss": 0.2633, "step": 761 }, { "epoch": 1.6529284164859002, "grad_norm": 0.2079469970240713, "learning_rate": 8.913003500083438e-06, "loss": 0.2379, "step": 762 }, { "epoch": 1.6550976138828633, "grad_norm": 0.22772263664943265, "learning_rate": 8.805320714699728e-06, "loss": 0.2395, "step": 763 }, { "epoch": 1.6572668112798263, "grad_norm": 0.20307660213818468, "learning_rate": 8.698229533573338e-06, "loss": 0.1864, "step": 764 }, { "epoch": 1.6594360086767896, "grad_norm": 0.2148519135414743, "learning_rate": 8.591731494660132e-06, "loss": 0.2053, "step": 765 }, { "epoch": 1.6616052060737527, "grad_norm": 0.229814247011586, "learning_rate": 8.485828127397749e-06, "loss": 0.2875, "step": 766 }, { "epoch": 1.6637744034707158, "grad_norm": 0.21643960802023465, "learning_rate": 8.380520952683646e-06, "loss": 0.2599, "step": 767 }, { "epoch": 1.6659436008676791, "grad_norm": 0.22244934877456704, "learning_rate": 8.275811482853245e-06, "loss": 0.2598, "step": 768 }, { "epoch": 1.6681127982646422, "grad_norm": 0.23343908563620214, "learning_rate": 8.17170122165824e-06, "loss": 0.3098, "step": 769 }, { "epoch": 1.6702819956616053, "grad_norm": 0.19803997345115992, "learning_rate": 8.068191664244945e-06, "loss": 0.2029, "step": 770 }, { "epoch": 1.6724511930585684, "grad_norm": 0.2190516176521641, "learning_rate": 7.965284297132896e-06, "loss": 0.241, "step": 771 }, { "epoch": 1.6746203904555315, "grad_norm": 0.21290567846006564, "learning_rate": 7.862980598193442e-06, "loss": 0.2063, "step": 772 }, { "epoch": 1.6767895878524945, "grad_norm": 0.23699837862628473, "learning_rate": 7.761282036628548e-06, "loss": 0.3082, "step": 773 }, { "epoch": 1.6789587852494576, "grad_norm": 0.22947643986082064, "learning_rate": 7.660190072949692e-06, "loss": 0.2628, "step": 774 }, { "epoch": 1.6811279826464207, "grad_norm": 0.23210641035367943, "learning_rate": 7.559706158956898e-06, "loss": 0.2715, "step": 775 }, { "epoch": 1.6832971800433838, "grad_norm": 0.2294766385634235, "learning_rate": 7.459831737717859e-06, "loss": 0.2749, "step": 776 }, { "epoch": 1.685466377440347, "grad_norm": 0.20107115872856784, "learning_rate": 7.360568243547261e-06, "loss": 0.219, "step": 777 }, { "epoch": 1.6876355748373102, "grad_norm": 0.20113176199889593, "learning_rate": 7.261917101986127e-06, "loss": 0.2074, "step": 778 }, { "epoch": 1.6898047722342733, "grad_norm": 0.23609135019264865, "learning_rate": 7.163879729781392e-06, "loss": 0.2667, "step": 779 }, { "epoch": 1.6919739696312366, "grad_norm": 0.2228940425973063, "learning_rate": 7.066457534865528e-06, "loss": 0.2643, "step": 780 }, { "epoch": 1.6941431670281997, "grad_norm": 0.23796760069402623, "learning_rate": 6.969651916336334e-06, "loss": 0.2249, "step": 781 }, { "epoch": 1.6963123644251628, "grad_norm": 0.2327272256132086, "learning_rate": 6.8734642644368576e-06, "loss": 0.2543, "step": 782 }, { "epoch": 1.6984815618221258, "grad_norm": 0.2472213046502843, "learning_rate": 6.7778959605353906e-06, "loss": 0.2585, "step": 783 }, { "epoch": 1.700650759219089, "grad_norm": 0.22682131480226772, "learning_rate": 6.682948377105686e-06, "loss": 0.226, "step": 784 }, { "epoch": 1.702819956616052, "grad_norm": 0.23469843862434467, "learning_rate": 6.588622877707196e-06, "loss": 0.2933, "step": 785 }, { "epoch": 1.704989154013015, "grad_norm": 0.2244645457364394, "learning_rate": 6.49492081696551e-06, "loss": 0.2965, "step": 786 }, { "epoch": 1.7071583514099782, "grad_norm": 0.20869135290493288, "learning_rate": 6.401843540552921e-06, "loss": 0.2295, "step": 787 }, { "epoch": 1.7093275488069413, "grad_norm": 0.21494691194019463, "learning_rate": 6.309392385169066e-06, "loss": 0.2418, "step": 788 }, { "epoch": 1.7114967462039046, "grad_norm": 0.2521558215114822, "learning_rate": 6.217568678521746e-06, "loss": 0.3596, "step": 789 }, { "epoch": 1.7136659436008677, "grad_norm": 0.21627894705194553, "learning_rate": 6.126373739307856e-06, "loss": 0.2358, "step": 790 }, { "epoch": 1.715835140997831, "grad_norm": 0.2520399170899315, "learning_rate": 6.035808877194454e-06, "loss": 0.3594, "step": 791 }, { "epoch": 1.718004338394794, "grad_norm": 0.22790741465610556, "learning_rate": 5.945875392799944e-06, "loss": 0.3138, "step": 792 }, { "epoch": 1.7201735357917571, "grad_norm": 0.21149040475910053, "learning_rate": 5.8565745776754e-06, "loss": 0.2251, "step": 793 }, { "epoch": 1.7223427331887202, "grad_norm": 0.23452143004512307, "learning_rate": 5.7679077142860135e-06, "loss": 0.2652, "step": 794 }, { "epoch": 1.7245119305856833, "grad_norm": 0.22627084576919643, "learning_rate": 5.679876075992685e-06, "loss": 0.2856, "step": 795 }, { "epoch": 1.7266811279826464, "grad_norm": 0.23065836475523283, "learning_rate": 5.592480927033733e-06, "loss": 0.2407, "step": 796 }, { "epoch": 1.7288503253796095, "grad_norm": 0.21666121524543802, "learning_rate": 5.505723522506734e-06, "loss": 0.2533, "step": 797 }, { "epoch": 1.7310195227765726, "grad_norm": 0.21851910440097064, "learning_rate": 5.419605108350501e-06, "loss": 0.1987, "step": 798 }, { "epoch": 1.7331887201735356, "grad_norm": 0.22502982751463446, "learning_rate": 5.334126921327193e-06, "loss": 0.2522, "step": 799 }, { "epoch": 1.735357917570499, "grad_norm": 0.24282939171015397, "learning_rate": 5.249290189004552e-06, "loss": 0.3036, "step": 800 }, { "epoch": 1.735357917570499, "eval_loss": 0.2993714511394501, "eval_runtime": 39.7253, "eval_samples_per_second": 0.478, "eval_steps_per_second": 0.126, "step": 800 }, { "epoch": 1.737527114967462, "grad_norm": 0.22684517743489496, "learning_rate": 5.165096129738267e-06, "loss": 0.2412, "step": 801 }, { "epoch": 1.7396963123644251, "grad_norm": 0.21496799405491135, "learning_rate": 5.081545952654493e-06, "loss": 0.2373, "step": 802 }, { "epoch": 1.7418655097613884, "grad_norm": 0.22310608285043565, "learning_rate": 4.998640857632464e-06, "loss": 0.2496, "step": 803 }, { "epoch": 1.7440347071583515, "grad_norm": 0.2320457312319671, "learning_rate": 4.916382035287276e-06, "loss": 0.2582, "step": 804 }, { "epoch": 1.7462039045553146, "grad_norm": 0.22615551698433095, "learning_rate": 4.8347706669527985e-06, "loss": 0.2548, "step": 805 }, { "epoch": 1.7483731019522777, "grad_norm": 0.23006147126943094, "learning_rate": 4.7538079246646825e-06, "loss": 0.2776, "step": 806 }, { "epoch": 1.7505422993492408, "grad_norm": 0.22519218682482992, "learning_rate": 4.673494971143538e-06, "loss": 0.2457, "step": 807 }, { "epoch": 1.7527114967462039, "grad_norm": 0.21484715264028661, "learning_rate": 4.5938329597782824e-06, "loss": 0.2163, "step": 808 }, { "epoch": 1.754880694143167, "grad_norm": 0.22724812121756507, "learning_rate": 4.514823034609455e-06, "loss": 0.2568, "step": 809 }, { "epoch": 1.75704989154013, "grad_norm": 0.2192304364191693, "learning_rate": 4.436466330312966e-06, "loss": 0.235, "step": 810 }, { "epoch": 1.7592190889370931, "grad_norm": 0.1833973092649833, "learning_rate": 4.358763972183599e-06, "loss": 0.166, "step": 811 }, { "epoch": 1.7613882863340564, "grad_norm": 0.22170941113081136, "learning_rate": 4.281717076119057e-06, "loss": 0.2543, "step": 812 }, { "epoch": 1.7635574837310195, "grad_norm": 0.2181063063924005, "learning_rate": 4.205326748603744e-06, "loss": 0.2231, "step": 813 }, { "epoch": 1.7657266811279828, "grad_norm": 0.23650416189819906, "learning_rate": 4.129594086693012e-06, "loss": 0.2647, "step": 814 }, { "epoch": 1.767895878524946, "grad_norm": 0.23715186447220826, "learning_rate": 4.0545201779973564e-06, "loss": 0.2395, "step": 815 }, { "epoch": 1.770065075921909, "grad_norm": 0.25382512329158124, "learning_rate": 3.980106100666781e-06, "loss": 0.2909, "step": 816 }, { "epoch": 1.772234273318872, "grad_norm": 0.21476601477039556, "learning_rate": 3.906352923375345e-06, "loss": 0.2326, "step": 817 }, { "epoch": 1.7744034707158352, "grad_norm": 0.31142043160518457, "learning_rate": 3.833261705305796e-06, "loss": 0.2506, "step": 818 }, { "epoch": 1.7765726681127982, "grad_norm": 0.23754127992716897, "learning_rate": 3.7608334961343695e-06, "loss": 0.2753, "step": 819 }, { "epoch": 1.7787418655097613, "grad_norm": 0.2606633745028959, "learning_rate": 3.6890693360157105e-06, "loss": 0.2735, "step": 820 }, { "epoch": 1.7809110629067244, "grad_norm": 0.2308719975444426, "learning_rate": 3.6179702555679264e-06, "loss": 0.2391, "step": 821 }, { "epoch": 1.7830802603036875, "grad_norm": 0.20821914212005022, "learning_rate": 3.5475372758577984e-06, "loss": 0.2041, "step": 822 }, { "epoch": 1.7852494577006508, "grad_norm": 0.27553949669527766, "learning_rate": 3.4777714083861268e-06, "loss": 0.2641, "step": 823 }, { "epoch": 1.7874186550976139, "grad_norm": 0.2298326829683291, "learning_rate": 3.4086736550731747e-06, "loss": 0.2694, "step": 824 }, { "epoch": 1.789587852494577, "grad_norm": 0.23725507049973954, "learning_rate": 3.340245008244308e-06, "loss": 0.2672, "step": 825 }, { "epoch": 1.7917570498915403, "grad_norm": 0.23615628058747723, "learning_rate": 3.272486450615725e-06, "loss": 0.298, "step": 826 }, { "epoch": 1.7939262472885034, "grad_norm": 0.24396047621014613, "learning_rate": 3.2053989552803586e-06, "loss": 0.3171, "step": 827 }, { "epoch": 1.7960954446854664, "grad_norm": 0.22614687587834664, "learning_rate": 3.1389834856938916e-06, "loss": 0.2379, "step": 828 }, { "epoch": 1.7982646420824295, "grad_norm": 0.22756878563712518, "learning_rate": 3.0732409956609197e-06, "loss": 0.2636, "step": 829 }, { "epoch": 1.8004338394793926, "grad_norm": 0.19317698277364906, "learning_rate": 3.0081724293212653e-06, "loss": 0.1733, "step": 830 }, { "epoch": 1.8026030368763557, "grad_norm": 0.2290227505004182, "learning_rate": 2.943778721136403e-06, "loss": 0.281, "step": 831 }, { "epoch": 1.8047722342733188, "grad_norm": 0.21862342101596383, "learning_rate": 2.8800607958760497e-06, "loss": 0.2335, "step": 832 }, { "epoch": 1.8069414316702819, "grad_norm": 0.2951658359969376, "learning_rate": 2.8170195686048837e-06, "loss": 0.2809, "step": 833 }, { "epoch": 1.809110629067245, "grad_norm": 0.24062215600970813, "learning_rate": 2.7546559446693876e-06, "loss": 0.2618, "step": 834 }, { "epoch": 1.8112798264642083, "grad_norm": 0.222892598517785, "learning_rate": 2.692970819684898e-06, "loss": 0.2255, "step": 835 }, { "epoch": 1.8134490238611713, "grad_norm": 0.22574094208163784, "learning_rate": 2.6319650795226414e-06, "loss": 0.2844, "step": 836 }, { "epoch": 1.8156182212581344, "grad_norm": 0.2143823574732022, "learning_rate": 2.571639600297143e-06, "loss": 0.2179, "step": 837 }, { "epoch": 1.8177874186550977, "grad_norm": 0.20810428440270837, "learning_rate": 2.5119952483535214e-06, "loss": 0.2273, "step": 838 }, { "epoch": 1.8199566160520608, "grad_norm": 0.2219262917061002, "learning_rate": 2.4530328802551516e-06, "loss": 0.2469, "step": 839 }, { "epoch": 1.822125813449024, "grad_norm": 0.22814813313597437, "learning_rate": 2.3947533427712666e-06, "loss": 0.2717, "step": 840 }, { "epoch": 1.824295010845987, "grad_norm": 0.20474219388181666, "learning_rate": 2.3371574728648928e-06, "loss": 0.2206, "step": 841 }, { "epoch": 1.82646420824295, "grad_norm": 0.2393953186831936, "learning_rate": 2.2802460976807384e-06, "loss": 0.2773, "step": 842 }, { "epoch": 1.8286334056399132, "grad_norm": 0.20242210841493188, "learning_rate": 2.2240200345333972e-06, "loss": 0.241, "step": 843 }, { "epoch": 1.8308026030368763, "grad_norm": 0.2479672969940867, "learning_rate": 2.1684800908955362e-06, "loss": 0.304, "step": 844 }, { "epoch": 1.8329718004338393, "grad_norm": 0.2924875615305496, "learning_rate": 2.1136270643863807e-06, "loss": 0.1846, "step": 845 }, { "epoch": 1.8351409978308026, "grad_norm": 0.23730285407509868, "learning_rate": 2.0594617427601603e-06, "loss": 0.2475, "step": 846 }, { "epoch": 1.8373101952277657, "grad_norm": 0.22663266708243113, "learning_rate": 2.0059849038949084e-06, "loss": 0.2662, "step": 847 }, { "epoch": 1.8394793926247288, "grad_norm": 0.24163203609153988, "learning_rate": 1.953197315781169e-06, "loss": 0.2949, "step": 848 }, { "epoch": 1.8416485900216921, "grad_norm": 0.22227338821415707, "learning_rate": 1.9010997365110971e-06, "loss": 0.2483, "step": 849 }, { "epoch": 1.8438177874186552, "grad_norm": 0.20465862377896551, "learning_rate": 1.8496929142674424e-06, "loss": 0.2406, "step": 850 }, { "epoch": 1.8459869848156183, "grad_norm": 0.2364370583373017, "learning_rate": 1.7989775873129278e-06, "loss": 0.2902, "step": 851 }, { "epoch": 1.8481561822125814, "grad_norm": 0.26218568934111874, "learning_rate": 1.7489544839795314e-06, "loss": 0.2492, "step": 852 }, { "epoch": 1.8503253796095445, "grad_norm": 0.21511824786196265, "learning_rate": 1.6996243226581388e-06, "loss": 0.2368, "step": 853 }, { "epoch": 1.8524945770065075, "grad_norm": 0.2198436062463957, "learning_rate": 1.6509878117881073e-06, "loss": 0.2208, "step": 854 }, { "epoch": 1.8546637744034706, "grad_norm": 0.21644370095781967, "learning_rate": 1.6030456498472124e-06, "loss": 0.212, "step": 855 }, { "epoch": 1.8568329718004337, "grad_norm": 0.22133422953240703, "learning_rate": 1.5557985253415119e-06, "loss": 0.2553, "step": 856 }, { "epoch": 1.8590021691973968, "grad_norm": 0.22714457687645023, "learning_rate": 1.509247116795548e-06, "loss": 0.2462, "step": 857 }, { "epoch": 1.86117136659436, "grad_norm": 0.222946120475819, "learning_rate": 1.463392092742516e-06, "loss": 0.2306, "step": 858 }, { "epoch": 1.8633405639913232, "grad_norm": 0.23418079900923416, "learning_rate": 1.4182341117147501e-06, "loss": 0.2696, "step": 859 }, { "epoch": 1.8655097613882863, "grad_norm": 0.24002442145224728, "learning_rate": 1.373773822234181e-06, "loss": 0.3045, "step": 860 }, { "epoch": 1.8676789587852496, "grad_norm": 0.23645720857751817, "learning_rate": 1.3300118628031044e-06, "loss": 0.2677, "step": 861 }, { "epoch": 1.8698481561822127, "grad_norm": 0.20964939053220596, "learning_rate": 1.2869488618949488e-06, "loss": 0.2387, "step": 862 }, { "epoch": 1.8720173535791758, "grad_norm": 0.23651470700477456, "learning_rate": 1.2445854379452726e-06, "loss": 0.2706, "step": 863 }, { "epoch": 1.8741865509761388, "grad_norm": 0.22140918135138807, "learning_rate": 1.2029221993428873e-06, "loss": 0.2607, "step": 864 }, { "epoch": 1.876355748373102, "grad_norm": 0.2348213137855099, "learning_rate": 1.1619597444211206e-06, "loss": 0.2672, "step": 865 }, { "epoch": 1.878524945770065, "grad_norm": 0.21784895708257698, "learning_rate": 1.121698661449211e-06, "loss": 0.2285, "step": 866 }, { "epoch": 1.880694143167028, "grad_norm": 0.22922875389410033, "learning_rate": 1.0821395286238656e-06, "loss": 0.2343, "step": 867 }, { "epoch": 1.8828633405639912, "grad_norm": 0.23166059465595806, "learning_rate": 1.0432829140609723e-06, "loss": 0.2714, "step": 868 }, { "epoch": 1.8850325379609545, "grad_norm": 0.2360632284131841, "learning_rate": 1.0051293757874002e-06, "loss": 0.2716, "step": 869 }, { "epoch": 1.8872017353579176, "grad_norm": 0.2314940438713054, "learning_rate": 9.67679461733051e-07, "loss": 0.2835, "step": 870 }, { "epoch": 1.8893709327548807, "grad_norm": 0.21435494553958442, "learning_rate": 9.309337097229087e-07, "loss": 0.2131, "step": 871 }, { "epoch": 1.891540130151844, "grad_norm": 0.2210305803057693, "learning_rate": 8.94892647469403e-07, "loss": 0.2575, "step": 872 }, { "epoch": 1.893709327548807, "grad_norm": 0.24869986480220152, "learning_rate": 8.595567925647363e-07, "loss": 0.2738, "step": 873 }, { "epoch": 1.8958785249457701, "grad_norm": 0.2305734211734528, "learning_rate": 8.249266524735455e-07, "loss": 0.2733, "step": 874 }, { "epoch": 1.8980477223427332, "grad_norm": 0.2536697393369725, "learning_rate": 7.910027245255413e-07, "loss": 0.3261, "step": 875 }, { "epoch": 1.9002169197396963, "grad_norm": 0.23661550044126228, "learning_rate": 7.577854959084085e-07, "loss": 0.3014, "step": 876 }, { "epoch": 1.9023861171366594, "grad_norm": 0.22995644681345337, "learning_rate": 7.252754436607834e-07, "loss": 0.2957, "step": 877 }, { "epoch": 1.9045553145336225, "grad_norm": 0.22988735331559226, "learning_rate": 6.934730346654316e-07, "loss": 0.2418, "step": 878 }, { "epoch": 1.9067245119305856, "grad_norm": 0.22730994636228313, "learning_rate": 6.623787256424984e-07, "loss": 0.2746, "step": 879 }, { "epoch": 1.9088937093275486, "grad_norm": 0.21461778114516813, "learning_rate": 6.319929631430077e-07, "loss": 0.2225, "step": 880 }, { "epoch": 1.911062906724512, "grad_norm": 0.23234603239697021, "learning_rate": 6.023161835423896e-07, "loss": 0.2969, "step": 881 }, { "epoch": 1.913232104121475, "grad_norm": 0.21550966631710464, "learning_rate": 5.733488130342635e-07, "loss": 0.2503, "step": 882 }, { "epoch": 1.9154013015184381, "grad_norm": 0.2231153295751743, "learning_rate": 5.45091267624287e-07, "loss": 0.249, "step": 883 }, { "epoch": 1.9175704989154014, "grad_norm": 0.2293111442665084, "learning_rate": 5.175439531241999e-07, "loss": 0.29, "step": 884 }, { "epoch": 1.9197396963123645, "grad_norm": 0.2378040770485283, "learning_rate": 4.907072651459621e-07, "loss": 0.2891, "step": 885 }, { "epoch": 1.9219088937093276, "grad_norm": 0.21536664945034864, "learning_rate": 4.645815890961358e-07, "loss": 0.22, "step": 886 }, { "epoch": 1.9240780911062907, "grad_norm": 0.21799292627016253, "learning_rate": 4.391673001702734e-07, "loss": 0.2168, "step": 887 }, { "epoch": 1.9262472885032538, "grad_norm": 0.2272046818418607, "learning_rate": 4.14464763347594e-07, "loss": 0.2555, "step": 888 }, { "epoch": 1.9284164859002169, "grad_norm": 0.26748625735516185, "learning_rate": 3.904743333857097e-07, "loss": 0.2186, "step": 889 }, { "epoch": 1.93058568329718, "grad_norm": 0.23511293905776262, "learning_rate": 3.671963548155244e-07, "loss": 0.2622, "step": 890 }, { "epoch": 1.932754880694143, "grad_norm": 0.2132059321481103, "learning_rate": 3.446311619363207e-07, "loss": 0.2162, "step": 891 }, { "epoch": 1.9349240780911063, "grad_norm": 0.21597717686523027, "learning_rate": 3.227790788109197e-07, "loss": 0.2307, "step": 892 }, { "epoch": 1.9370932754880694, "grad_norm": 0.2978176072682262, "learning_rate": 3.0164041926104e-07, "loss": 0.2577, "step": 893 }, { "epoch": 1.9392624728850325, "grad_norm": 0.23250354980532184, "learning_rate": 2.8121548686281805e-07, "loss": 0.2714, "step": 894 }, { "epoch": 1.9414316702819958, "grad_norm": 0.23296800107183185, "learning_rate": 2.6150457494240057e-07, "loss": 0.2414, "step": 895 }, { "epoch": 1.943600867678959, "grad_norm": 0.2136515878728428, "learning_rate": 2.4250796657177e-07, "loss": 0.2173, "step": 896 }, { "epoch": 1.945770065075922, "grad_norm": 0.20347321610165708, "learning_rate": 2.2422593456466468e-07, "loss": 0.2152, "step": 897 }, { "epoch": 1.947939262472885, "grad_norm": 0.23997716902494964, "learning_rate": 2.0665874147265395e-07, "loss": 0.2305, "step": 898 }, { "epoch": 1.9501084598698482, "grad_norm": 0.22269717072616485, "learning_rate": 1.8980663958139133e-07, "loss": 0.2665, "step": 899 }, { "epoch": 1.9522776572668112, "grad_norm": 0.21024780049164776, "learning_rate": 1.736698709069673e-07, "loss": 0.1967, "step": 900 }, { "epoch": 1.9544468546637743, "grad_norm": 0.2267046711412435, "learning_rate": 1.5824866719243436e-07, "loss": 0.2445, "step": 901 }, { "epoch": 1.9566160520607374, "grad_norm": 0.22809702055070344, "learning_rate": 1.4354324990449863e-07, "loss": 0.2569, "step": 902 }, { "epoch": 1.9587852494577005, "grad_norm": 0.23382391297633118, "learning_rate": 1.295538302303223e-07, "loss": 0.2876, "step": 903 }, { "epoch": 1.9609544468546638, "grad_norm": 0.20472909033825798, "learning_rate": 1.1628060907449834e-07, "loss": 0.1924, "step": 904 }, { "epoch": 1.9631236442516269, "grad_norm": 0.2096943890587732, "learning_rate": 1.0372377705616942e-07, "loss": 0.233, "step": 905 }, { "epoch": 1.96529284164859, "grad_norm": 0.211371475954759, "learning_rate": 9.188351450626353e-08, "loss": 0.2166, "step": 906 }, { "epoch": 1.9674620390455533, "grad_norm": 0.21471581374101012, "learning_rate": 8.075999146494595e-08, "loss": 0.2282, "step": 907 }, { "epoch": 1.9696312364425164, "grad_norm": 0.2660219035351609, "learning_rate": 7.035336767914346e-08, "loss": 0.2735, "step": 908 }, { "epoch": 1.9718004338394794, "grad_norm": 0.22465441666056135, "learning_rate": 6.066379260026845e-08, "loss": 0.2765, "step": 909 }, { "epoch": 1.9739696312364425, "grad_norm": 0.22816673191661355, "learning_rate": 5.169140538207051e-08, "loss": 0.2685, "step": 910 }, { "epoch": 1.9761388286334056, "grad_norm": 0.24062260875061986, "learning_rate": 4.343633487861598e-08, "loss": 0.2479, "step": 911 }, { "epoch": 1.9783080260303687, "grad_norm": 0.23204079070364966, "learning_rate": 3.589869964248371e-08, "loss": 0.2544, "step": 912 }, { "epoch": 1.9804772234273318, "grad_norm": 0.2339821898793586, "learning_rate": 2.907860792301098e-08, "loss": 0.3132, "step": 913 }, { "epoch": 1.9826464208242949, "grad_norm": 0.21123283163235765, "learning_rate": 2.297615766477801e-08, "loss": 0.2454, "step": 914 }, { "epoch": 1.9848156182212582, "grad_norm": 0.23481457672784686, "learning_rate": 1.7591436506170235e-08, "loss": 0.2803, "step": 915 }, { "epoch": 1.9869848156182213, "grad_norm": 0.22666968506581125, "learning_rate": 1.2924521778151511e-08, "loss": 0.262, "step": 916 }, { "epoch": 1.9891540130151844, "grad_norm": 0.22371033015850025, "learning_rate": 8.975480503126133e-09, "loss": 0.2253, "step": 917 }, { "epoch": 1.9913232104121477, "grad_norm": 0.24195113149305236, "learning_rate": 5.74436939398959e-09, "loss": 0.2679, "step": 918 }, { "epoch": 1.9934924078091107, "grad_norm": 0.22275331811796895, "learning_rate": 3.2312348533236614e-09, "loss": 0.3069, "step": 919 }, { "epoch": 1.9956616052060738, "grad_norm": 0.2209663606112055, "learning_rate": 1.4361129727025191e-09, "loss": 0.2256, "step": 920 }, { "epoch": 1.997830802603037, "grad_norm": 0.2356916076109314, "learning_rate": 3.590295321986847e-10, "loss": 0.2807, "step": 921 }, { "epoch": 2.0, "grad_norm": 0.22113655920342676, "learning_rate": 0.0, "loss": 0.2328, "step": 922 }, { "epoch": 2.0, "step": 922, "total_flos": 4515096717950976.0, "train_loss": 0.30544997376759503, "train_runtime": 16133.364, "train_samples_per_second": 0.228, "train_steps_per_second": 0.057 } ], "logging_steps": 1, "max_steps": 922, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4515096717950976.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }